diff options
Diffstat (limited to 'fs')
235 files changed, 7385 insertions, 5858 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 9ee534159cc6..42e102e2e74a 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -823,28 +823,21 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, if (IS_ERR(dfid)) return ERR_CAST(dfid); - name = dentry->d_name.name; - fid = p9_client_walk(dfid, 1, &name, 1); - if (IS_ERR(fid)) { - if (fid == ERR_PTR(-ENOENT)) { - d_add(dentry, NULL); - return NULL; - } - return ERR_CAST(fid); - } /* * Make sure we don't use a wrong inode due to parallel * unlink. For cached mode create calls request for new * inode. But with cache disabled, lookup should do this. */ - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) + name = dentry->d_name.name; + fid = p9_client_walk(dfid, 1, &name, 1); + if (fid == ERR_PTR(-ENOENT)) + inode = NULL; + else if (IS_ERR(fid)) + inode = ERR_CAST(fid); + else if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); else inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); - if (IS_ERR(inode)) { - p9_client_clunk(fid); - return ERR_CAST(inode); - } /* * If we had a rename on the server and a parallel lookup * for the new name, then make sure we instantiate with @@ -853,12 +846,14 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, * k/b. */ res = d_splice_alias(inode, dentry); - if (!res) - v9fs_fid_add(dentry, fid); - else if (!IS_ERR(res)) - v9fs_fid_add(res, fid); - else - p9_client_clunk(fid); + if (!IS_ERR(fid)) { + if (!res) + v9fs_fid_add(dentry, fid); + else if (!IS_ERR(res)) + v9fs_fid_add(res, fid); + else + p9_client_clunk(fid); + } return res; } diff --git a/fs/Kconfig b/fs/Kconfig index bc821a86d965..ac4ac908f001 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -196,7 +196,7 @@ config HUGETLBFS help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read - <file:Documentation/vm/hugetlbpage.txt> for details. + <file:Documentation/admin-guide/mm/hugetlbpage.rst> for details. If unsure, say N. diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 29444c83da48..e18eff854e1a 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -146,20 +146,6 @@ adfs_dir_lookup_byname(struct inode *inode, const struct qstr *name, struct obje obj->parent_id = inode->i_ino; - /* - * '.' is handled by reserved_lookup() in fs/namei.c - */ - if (name->len == 2 && name->name[0] == '.' && name->name[1] == '.') { - /* - * Currently unable to fill in the rest of 'obj', - * but this is better than nothing. We need to - * ascend one level to find it's parent. - */ - obj->name_len = 0; - obj->file_id = obj->parent_id; - goto free_out; - } - read_lock(&adfs_dir_lock); ret = ops->setpos(&dir, 0); @@ -266,17 +252,17 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj); if (error == 0) { - error = -EACCES; /* * This only returns NULL if get_empty_inode * fails. */ inode = adfs_iget(dir->i_sb, &obj); - if (inode) - error = 0; + if (!inode) + inode = ERR_PTR(-EACCES); + } else if (error != -ENOENT) { + inode = ERR_PTR(error); } - d_add(dentry, inode); - return ERR_PTR(error); + return d_splice_alias(inode, dentry); } /* diff --git a/fs/affs/namei.c b/fs/affs/namei.c index d8aa0ae3d037..41c5749f4db7 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -201,14 +201,16 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) struct super_block *sb = dir->i_sb; struct buffer_head *bh; struct inode *inode = NULL; + struct dentry *res; pr_debug("%s(\"%pd\")\n", __func__, dentry); affs_lock_dir(dir); bh = affs_find_entry(dir, dentry); - affs_unlock_dir(dir); - if (IS_ERR(bh)) + if (IS_ERR(bh)) { + affs_unlock_dir(dir); return ERR_CAST(bh); + } if (bh) { u32 ino = bh->b_blocknr; @@ -222,11 +224,12 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) } affs_brelse(bh); inode = affs_iget(sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); } - d_add(dentry, inode); - return NULL; + res = d_splice_alias(inode, dentry); + if (!IS_ERR_OR_NULL(res)) + res->d_fsdata = dentry->d_fsdata; + affs_unlock_dir(dir); + return res; } int diff --git a/fs/affs/super.c b/fs/affs/super.c index e602619aed9d..d1ad11a8a4a5 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -241,6 +241,7 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, affs_set_opt(*mount_opts, SF_NO_TRUNCATE); break; case Opt_prefix: + kfree(*prefix); *prefix = match_strdup(&args[0]); if (!*prefix) return 0; diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c index 3bedfed608a2..7587fb665ff1 100644 --- a/fs/afs/addr_list.c +++ b/fs/afs/addr_list.c @@ -121,7 +121,7 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, p = text; do { struct sockaddr_rxrpc *srx = &alist->addrs[alist->nr_addrs]; - char tdelim = delim; + const char *q, *stop; if (*p == delim) { p++; @@ -130,28 +130,33 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, if (*p == '[') { p++; - tdelim = ']'; + q = memchr(p, ']', end - p); + } else { + for (q = p; q < end; q++) + if (*q == '+' || *q == delim) + break; } - if (in4_pton(p, end - p, + if (in4_pton(p, q - p, (u8 *)&srx->transport.sin6.sin6_addr.s6_addr32[3], - tdelim, &p)) { + -1, &stop)) { srx->transport.sin6.sin6_addr.s6_addr32[0] = 0; srx->transport.sin6.sin6_addr.s6_addr32[1] = 0; srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff); - } else if (in6_pton(p, end - p, + } else if (in6_pton(p, q - p, srx->transport.sin6.sin6_addr.s6_addr, - tdelim, &p)) { + -1, &stop)) { /* Nothing to do */ } else { goto bad_address; } - if (tdelim == ']') { - if (p == end || *p != ']') - goto bad_address; + if (stop != q) + goto bad_address; + + p = q; + if (q < end && *q == ']') p++; - } if (p < end) { if (*p == '+') { diff --git a/fs/afs/callback.c b/fs/afs/callback.c index abd9a84f4e88..571437dcb252 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -23,36 +23,55 @@ /* * Set up an interest-in-callbacks record for a volume on a server and * register it with the server. - * - Called with volume->server_sem held. + * - Called with vnode->io_lock held. */ int afs_register_server_cb_interest(struct afs_vnode *vnode, - struct afs_server_entry *entry) + struct afs_server_list *slist, + unsigned int index) { - struct afs_cb_interest *cbi = entry->cb_interest, *vcbi, *new, *x; + struct afs_server_entry *entry = &slist->servers[index]; + struct afs_cb_interest *cbi, *vcbi, *new, *old; struct afs_server *server = entry->server; again: + if (vnode->cb_interest && + likely(vnode->cb_interest == entry->cb_interest)) + return 0; + + read_lock(&slist->lock); + cbi = afs_get_cb_interest(entry->cb_interest); + read_unlock(&slist->lock); + vcbi = vnode->cb_interest; if (vcbi) { - if (vcbi == cbi) + if (vcbi == cbi) { + afs_put_cb_interest(afs_v2net(vnode), cbi); return 0; + } + /* Use a new interest in the server list for the same server + * rather than an old one that's still attached to a vnode. + */ if (cbi && vcbi->server == cbi->server) { write_seqlock(&vnode->cb_lock); - vnode->cb_interest = afs_get_cb_interest(cbi); + old = vnode->cb_interest; + vnode->cb_interest = cbi; write_sequnlock(&vnode->cb_lock); - afs_put_cb_interest(afs_v2net(vnode), cbi); + afs_put_cb_interest(afs_v2net(vnode), old); return 0; } + /* Re-use the one attached to the vnode. */ if (!cbi && vcbi->server == server) { - afs_get_cb_interest(vcbi); - x = cmpxchg(&entry->cb_interest, cbi, vcbi); - if (x != cbi) { - cbi = x; - afs_put_cb_interest(afs_v2net(vnode), vcbi); + write_lock(&slist->lock); + if (entry->cb_interest) { + write_unlock(&slist->lock); + afs_put_cb_interest(afs_v2net(vnode), cbi); goto again; } + + entry->cb_interest = cbi; + write_unlock(&slist->lock); return 0; } } @@ -72,13 +91,16 @@ again: list_add_tail(&new->cb_link, &server->cb_interests); write_unlock(&server->cb_break_lock); - x = cmpxchg(&entry->cb_interest, cbi, new); - if (x == cbi) { + write_lock(&slist->lock); + if (!entry->cb_interest) { + entry->cb_interest = afs_get_cb_interest(new); cbi = new; + new = NULL; } else { - cbi = x; - afs_put_cb_interest(afs_v2net(vnode), new); + cbi = afs_get_cb_interest(entry->cb_interest); } + write_unlock(&slist->lock); + afs_put_cb_interest(afs_v2net(vnode), new); } ASSERT(cbi); @@ -88,11 +110,14 @@ again: */ write_seqlock(&vnode->cb_lock); - vnode->cb_interest = afs_get_cb_interest(cbi); + old = vnode->cb_interest; + vnode->cb_interest = cbi; vnode->cb_s_break = cbi->server->cb_s_break; + vnode->cb_v_break = vnode->volume->cb_v_break; clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); write_sequnlock(&vnode->cb_lock); + afs_put_cb_interest(afs_v2net(vnode), old); return 0; } @@ -171,13 +196,24 @@ static void afs_break_one_callback(struct afs_server *server, if (cbi->vid != fid->vid) continue; - data.volume = NULL; - data.fid = *fid; - inode = ilookup5_nowait(cbi->sb, fid->vnode, afs_iget5_test, &data); - if (inode) { - vnode = AFS_FS_I(inode); - afs_break_callback(vnode); - iput(inode); + if (fid->vnode == 0 && fid->unique == 0) { + /* The callback break applies to an entire volume. */ + struct afs_super_info *as = AFS_FS_S(cbi->sb); + struct afs_volume *volume = as->volume; + + write_lock(&volume->cb_break_lock); + volume->cb_v_break++; + write_unlock(&volume->cb_break_lock); + } else { + data.volume = NULL; + data.fid = *fid; + inode = ilookup5_nowait(cbi->sb, fid->vnode, + afs_iget5_test, &data); + if (inode) { + vnode = AFS_FS_I(inode); + afs_break_callback(vnode); + iput(inode); + } } } @@ -195,6 +231,8 @@ void afs_break_callbacks(struct afs_server *server, size_t count, ASSERT(server != NULL); ASSERTCMP(count, <=, AFSCBMAX); + /* TODO: Sort the callback break list by volume ID */ + for (; count > 0; callbacks++, count--) { _debug("- Fid { vl=%08x n=%u u=%u } CB { v=%u x=%u t=%u }", callbacks->fid.vid, diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 357de908df3a..c332c95a6940 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -133,21 +133,10 @@ bool afs_cm_incoming_call(struct afs_call *call) } /* - * clean up a cache manager call + * Clean up a cache manager call. */ static void afs_cm_destructor(struct afs_call *call) { - _enter(""); - - /* Break the callbacks here so that we do it after the final ACK is - * received. The step number here must match the final number in - * afs_deliver_cb_callback(). - */ - if (call->unmarshall == 5) { - ASSERT(call->cm_server && call->count && call->request); - afs_break_callbacks(call->cm_server, call->count, call->request); - } - kfree(call->buffer); call->buffer = NULL; } @@ -161,14 +150,14 @@ static void SRXAFSCB_CallBack(struct work_struct *work) _enter(""); - /* be sure to send the reply *before* attempting to spam the AFS server - * with FSFetchStatus requests on the vnodes with broken callbacks lest - * the AFS server get into a vicious cycle of trying to break further - * callbacks because it hadn't received completion of the CBCallBack op - * yet */ - afs_send_empty_reply(call); + /* We need to break the callbacks before sending the reply as the + * server holds up change visibility till it receives our reply so as + * to maintain cache coherency. + */ + if (call->cm_server) + afs_break_callbacks(call->cm_server, call->count, call->request); - afs_break_callbacks(call->cm_server, call->count, call->request); + afs_send_empty_reply(call); afs_put_call(call); _leave(""); } @@ -180,7 +169,6 @@ static int afs_deliver_cb_callback(struct afs_call *call) { struct afs_callback_break *cb; struct sockaddr_rxrpc srx; - struct afs_server *server; __be32 *bp; int ret, loop; @@ -267,15 +255,6 @@ static int afs_deliver_cb_callback(struct afs_call *call) call->offset = 0; call->unmarshall++; - - /* Record that the message was unmarshalled successfully so - * that the call destructor can know do the callback breaking - * work, even if the final ACK isn't received. - * - * If the step number changes, then afs_cm_destructor() must be - * updated also. - */ - call->unmarshall++; case 5: break; } @@ -286,10 +265,9 @@ static int afs_deliver_cb_callback(struct afs_call *call) /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); - server = afs_find_server(call->net, &srx); - if (!server) - return -ENOTCONN; - call->cm_server = server; + call->cm_server = afs_find_server(call->net, &srx); + if (!call->cm_server) + trace_afs_cm_no_server(call, &srx); return afs_queue_call_work(call); } @@ -303,7 +281,8 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work) _enter("{%p}", call->cm_server); - afs_init_callback_state(call->cm_server); + if (call->cm_server) + afs_init_callback_state(call->cm_server); afs_send_empty_reply(call); afs_put_call(call); _leave(""); @@ -315,7 +294,6 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work) static int afs_deliver_cb_init_call_back_state(struct afs_call *call) { struct sockaddr_rxrpc srx; - struct afs_server *server; int ret; _enter(""); @@ -328,10 +306,9 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call) /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - server = afs_find_server(call->net, &srx); - if (!server) - return -ENOTCONN; - call->cm_server = server; + call->cm_server = afs_find_server(call->net, &srx); + if (!call->cm_server) + trace_afs_cm_no_server(call, &srx); return afs_queue_call_work(call); } @@ -341,8 +318,6 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call) */ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) { - struct sockaddr_rxrpc srx; - struct afs_server *server; struct afs_uuid *r; unsigned loop; __be32 *b; @@ -398,11 +373,11 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); - server = afs_find_server(call->net, &srx); - if (!server) - return -ENOTCONN; - call->cm_server = server; + rcu_read_lock(); + call->cm_server = afs_find_server_by_uuid(call->net, call->request); + rcu_read_unlock(); + if (!call->cm_server) + trace_afs_cm_no_server_u(call, call->request); return afs_queue_call_work(call); } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 5889f70d4d27..7d623008157f 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -180,6 +180,7 @@ static int afs_dir_open(struct inode *inode, struct file *file) * get reclaimed during the iteration. */ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) + __acquires(&dvnode->validate_lock) { struct afs_read *req; loff_t i_size; @@ -261,18 +262,21 @@ retry: /* If we're going to reload, we need to lock all the pages to prevent * races. */ - if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { - ret = -ERESTARTSYS; - for (i = 0; i < req->nr_pages; i++) - if (lock_page_killable(req->pages[i]) < 0) - goto error_unlock; + ret = -ERESTARTSYS; + if (down_read_killable(&dvnode->validate_lock) < 0) + goto error; - if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) - goto success; + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + goto success; + + up_read(&dvnode->validate_lock); + if (down_write_killable(&dvnode->validate_lock) < 0) + goto error; + if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { ret = afs_fetch_data(dvnode, key, req); if (ret < 0) - goto error_unlock_all; + goto error_unlock; task_io_account_read(PAGE_SIZE * req->nr_pages); @@ -284,33 +288,26 @@ retry: for (i = 0; i < req->nr_pages; i++) if (!afs_dir_check_page(dvnode, req->pages[i], req->actual_len)) - goto error_unlock_all; + goto error_unlock; // TODO: Trim excess pages set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags); } + downgrade_write(&dvnode->validate_lock); success: - i = req->nr_pages; - while (i > 0) - unlock_page(req->pages[--i]); return req; -error_unlock_all: - i = req->nr_pages; error_unlock: - while (i > 0) - unlock_page(req->pages[--i]); + up_write(&dvnode->validate_lock); error: afs_put_read(req); _leave(" = %d", ret); return ERR_PTR(ret); content_has_grown: - i = req->nr_pages; - while (i > 0) - unlock_page(req->pages[--i]); + up_write(&dvnode->validate_lock); afs_put_read(req); goto retry; } @@ -473,6 +470,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, } out: + up_read(&dvnode->validate_lock); afs_put_read(req); _leave(" = %d", ret); return ret; @@ -1143,7 +1141,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(dvnode); afs_fs_create(&fc, dentry->d_name.name, mode, data_version, &newfid, &newstatus, &newcb); } @@ -1213,7 +1211,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(dvnode); afs_fs_remove(&fc, dentry->d_name.name, true, data_version); } @@ -1316,7 +1314,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(dvnode); afs_fs_remove(&fc, dentry->d_name.name, false, data_version); } @@ -1373,7 +1371,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(dvnode); afs_fs_create(&fc, dentry->d_name.name, mode, data_version, &newfid, &newstatus, &newcb); } @@ -1443,8 +1441,8 @@ static int afs_link(struct dentry *from, struct inode *dir, } while (afs_select_fileserver(&fc)) { - fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; - fc.cb_break_2 = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(dvnode); + fc.cb_break_2 = afs_calc_vnode_cb_break(vnode); afs_fs_link(&fc, vnode, dentry->d_name.name, data_version); } @@ -1512,7 +1510,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(dvnode); afs_fs_symlink(&fc, dentry->d_name.name, content, data_version, &newfid, &newstatus); @@ -1588,8 +1586,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, } } while (afs_select_fileserver(&fc)) { - fc.cb_break = orig_dvnode->cb_break + orig_dvnode->cb_s_break; - fc.cb_break_2 = new_dvnode->cb_break + new_dvnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(orig_dvnode); + fc.cb_break_2 = afs_calc_vnode_cb_break(new_dvnode); afs_fs_rename(&fc, old_dentry->d_name.name, new_dvnode, new_dentry->d_name.name, orig_data_version, new_data_version); diff --git a/fs/afs/file.c b/fs/afs/file.c index c24c08016dd9..7d4f26198573 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -238,7 +238,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_fetch_data(&fc, desc); } diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 7a0e017070ec..dc62d15a964b 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -86,7 +86,7 @@ static int afs_set_lock(struct afs_vnode *vnode, struct key *key, ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_set_lock(&fc, type); } @@ -117,7 +117,7 @@ static int afs_extend_lock(struct afs_vnode *vnode, struct key *key) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, key)) { while (afs_select_current_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_extend_lock(&fc); } @@ -148,7 +148,7 @@ static int afs_release_lock(struct afs_vnode *vnode, struct key *key) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, key)) { while (afs_select_current_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_release_lock(&fc); } diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index efacdb7c1dee..b273e1d60478 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -134,6 +134,7 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call, struct afs_read *read_req) { const struct afs_xdr_AFSFetchStatus *xdr = (const void *)*_bp; + bool inline_error = (call->operation_ID == afs_FS_InlineBulkStatus); u64 data_version, size; u32 type, abort_code; u8 flags = 0; @@ -142,13 +143,32 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call, if (vnode) write_seqlock(&vnode->cb_lock); + abort_code = ntohl(xdr->abort_code); + if (xdr->if_version != htonl(AFS_FSTATUS_VERSION)) { + if (xdr->if_version == htonl(0) && + abort_code != 0 && + inline_error) { + /* The OpenAFS fileserver has a bug in FS.InlineBulkStatus + * whereby it doesn't set the interface version in the error + * case. + */ + status->abort_code = abort_code; + ret = 0; + goto out; + } + pr_warn("Unknown AFSFetchStatus version %u\n", ntohl(xdr->if_version)); goto bad; } + if (abort_code != 0 && inline_error) { + status->abort_code = abort_code; + ret = 0; + goto out; + } + type = ntohl(xdr->type); - abort_code = ntohl(xdr->abort_code); switch (type) { case AFS_FTYPE_FILE: case AFS_FTYPE_DIR: @@ -165,13 +185,6 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call, } status->type = type; break; - case AFS_FTYPE_INVALID: - if (abort_code != 0) { - status->abort_code = abort_code; - ret = 0; - goto out; - } - /* Fall through */ default: goto bad; } @@ -248,7 +261,7 @@ static void xdr_decode_AFSCallBack(struct afs_call *call, write_seqlock(&vnode->cb_lock); - if (call->cb_break == (vnode->cb_break + cbi->server->cb_s_break)) { + if (call->cb_break == afs_cb_break_sum(vnode, cbi)) { vnode->cb_version = ntohl(*bp++); cb_expiry = ntohl(*bp++); vnode->cb_type = ntohl(*bp++); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 06194cfe9724..479b7fdda124 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -108,7 +108,7 @@ int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool new_inode) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_fetch_file_status(&fc, NULL, new_inode); } @@ -393,15 +393,18 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) read_seqlock_excl(&vnode->cb_lock); if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { - if (vnode->cb_s_break != vnode->cb_interest->server->cb_s_break) { + if (vnode->cb_s_break != vnode->cb_interest->server->cb_s_break || + vnode->cb_v_break != vnode->volume->cb_v_break) { vnode->cb_s_break = vnode->cb_interest->server->cb_s_break; + vnode->cb_v_break = vnode->volume->cb_v_break; + valid = false; } else if (vnode->status.type == AFS_FTYPE_DIR && test_bit(AFS_VNODE_DIR_VALID, &vnode->flags) && vnode->cb_expires_at - 10 > now) { - valid = true; + valid = true; } else if (!test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) && vnode->cb_expires_at - 10 > now) { - valid = true; + valid = true; } } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { valid = true; @@ -415,7 +418,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) if (valid) goto valid; - mutex_lock(&vnode->validate_lock); + down_write(&vnode->validate_lock); /* if the promise has expired, we need to check the server again to get * a new promise - note that if the (parent) directory's metadata was @@ -444,13 +447,13 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) * different */ if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) afs_zap_data(vnode); - mutex_unlock(&vnode->validate_lock); + up_write(&vnode->validate_lock); valid: _leave(" = 0"); return 0; error_unlock: - mutex_unlock(&vnode->validate_lock); + up_write(&vnode->validate_lock); _leave(" = %d", ret); return ret; } @@ -574,7 +577,7 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_setattr(&fc, attr); } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index f8086ec95e24..e3f8a46663db 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -396,6 +396,7 @@ struct afs_server { #define AFS_SERVER_FL_PROBED 5 /* The fileserver has been probed */ #define AFS_SERVER_FL_PROBING 6 /* Fileserver is being probed */ #define AFS_SERVER_FL_NO_IBULK 7 /* Fileserver doesn't support FS.InlineBulkStatus */ +#define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */ atomic_t usage; u32 addr_version; /* Address list version */ @@ -433,6 +434,7 @@ struct afs_server_list { unsigned short index; /* Server currently in use */ unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */ unsigned int seq; /* Set to ->servers_seq when installed */ + rwlock_t lock; struct afs_server_entry servers[]; }; @@ -459,6 +461,9 @@ struct afs_volume { rwlock_t servers_lock; /* Lock for ->servers */ unsigned int servers_seq; /* Incremented each time ->servers changes */ + unsigned cb_v_break; /* Break-everything counter. */ + rwlock_t cb_break_lock; + afs_voltype_t type; /* type of volume */ short error; char type_force; /* force volume type (suppress R/O -> R/W) */ @@ -494,7 +499,7 @@ struct afs_vnode { #endif struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */ struct mutex io_lock; /* Lock for serialising I/O on this mutex */ - struct mutex validate_lock; /* lock for validating this vnode */ + struct rw_semaphore validate_lock; /* lock for validating this vnode */ spinlock_t wb_lock; /* lock for wb_keys */ spinlock_t lock; /* waitqueue/flags lock */ unsigned long flags; @@ -519,6 +524,7 @@ struct afs_vnode { /* outstanding callback notification on this file */ struct afs_cb_interest *cb_interest; /* Server on which this resides */ unsigned int cb_s_break; /* Mass break counter on ->server */ + unsigned int cb_v_break; /* Mass break counter on ->volume */ unsigned int cb_break; /* Break counter on vnode */ seqlock_t cb_lock; /* Lock for ->cb_interest, ->status, ->cb_*break */ @@ -648,16 +654,29 @@ extern void afs_init_callback_state(struct afs_server *); extern void afs_break_callback(struct afs_vnode *); extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback_break*); -extern int afs_register_server_cb_interest(struct afs_vnode *, struct afs_server_entry *); +extern int afs_register_server_cb_interest(struct afs_vnode *, + struct afs_server_list *, unsigned int); extern void afs_put_cb_interest(struct afs_net *, struct afs_cb_interest *); extern void afs_clear_callback_interests(struct afs_net *, struct afs_server_list *); static inline struct afs_cb_interest *afs_get_cb_interest(struct afs_cb_interest *cbi) { - refcount_inc(&cbi->usage); + if (cbi) + refcount_inc(&cbi->usage); return cbi; } +static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode) +{ + return vnode->cb_break + vnode->cb_s_break + vnode->cb_v_break; +} + +static inline unsigned int afs_cb_break_sum(struct afs_vnode *vnode, + struct afs_cb_interest *cbi) +{ + return vnode->cb_break + cbi->server->cb_s_break + vnode->volume->cb_v_break; +} + /* * cell.c */ diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 839a22280606..3aad32762989 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -62,7 +62,6 @@ static const struct file_operations afs_proc_rootcell_fops = { .llseek = no_llseek, }; -static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file); static void *afs_proc_cell_volumes_start(struct seq_file *p, loff_t *pos); static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, loff_t *pos); @@ -76,15 +75,6 @@ static const struct seq_operations afs_proc_cell_volumes_ops = { .show = afs_proc_cell_volumes_show, }; -static const struct file_operations afs_proc_cell_volumes_fops = { - .open = afs_proc_cell_volumes_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int afs_proc_cell_vlservers_open(struct inode *inode, - struct file *file); static void *afs_proc_cell_vlservers_start(struct seq_file *p, loff_t *pos); static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, loff_t *pos); @@ -98,14 +88,6 @@ static const struct seq_operations afs_proc_cell_vlservers_ops = { .show = afs_proc_cell_vlservers_show, }; -static const struct file_operations afs_proc_cell_vlservers_fops = { - .open = afs_proc_cell_vlservers_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int afs_proc_servers_open(struct inode *inode, struct file *file); static void *afs_proc_servers_start(struct seq_file *p, loff_t *pos); static void *afs_proc_servers_next(struct seq_file *p, void *v, loff_t *pos); @@ -119,13 +101,6 @@ static const struct seq_operations afs_proc_servers_ops = { .show = afs_proc_servers_show, }; -static const struct file_operations afs_proc_servers_fops = { - .open = afs_proc_servers_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int afs_proc_sysname_open(struct inode *inode, struct file *file); static int afs_proc_sysname_release(struct inode *inode, struct file *file); static void *afs_proc_sysname_start(struct seq_file *p, loff_t *pos); @@ -152,7 +127,7 @@ static const struct file_operations afs_proc_sysname_fops = { .write = afs_proc_sysname_write, }; -static const struct file_operations afs_proc_stats_fops; +static int afs_proc_stats_show(struct seq_file *m, void *v); /* * initialise the /proc/fs/afs/ directory @@ -167,8 +142,8 @@ int afs_proc_init(struct afs_net *net) if (!proc_create("cells", 0644, net->proc_afs, &afs_proc_cells_fops) || !proc_create("rootcell", 0644, net->proc_afs, &afs_proc_rootcell_fops) || - !proc_create("servers", 0644, net->proc_afs, &afs_proc_servers_fops) || - !proc_create("stats", 0644, net->proc_afs, &afs_proc_stats_fops) || + !proc_create_seq("servers", 0644, net->proc_afs, &afs_proc_servers_ops) || + !proc_create_single("stats", 0644, net->proc_afs, afs_proc_stats_show) || !proc_create("sysname", 0644, net->proc_afs, &afs_proc_sysname_fops)) goto error_tree; @@ -196,16 +171,7 @@ void afs_proc_cleanup(struct afs_net *net) */ static int afs_proc_cells_open(struct inode *inode, struct file *file) { - struct seq_file *m; - int ret; - - ret = seq_open(file, &afs_proc_cells_ops); - if (ret < 0) - return ret; - - m = file->private_data; - m->private = PDE_DATA(inode); - return 0; + return seq_open(file, &afs_proc_cells_ops); } /* @@ -430,10 +396,11 @@ int afs_proc_cell_setup(struct afs_net *net, struct afs_cell *cell) if (!dir) goto error_dir; - if (!proc_create_data("vlservers", 0, dir, - &afs_proc_cell_vlservers_fops, cell) || - !proc_create_data("volumes", 0, dir, - &afs_proc_cell_volumes_fops, cell)) + if (!proc_create_seq_data("vlservers", 0, dir, + &afs_proc_cell_vlservers_ops, cell)) + goto error_tree; + if (!proc_create_seq_data("volumes", 0, dir, &afs_proc_cell_volumes_ops, + cell)) goto error_tree; _leave(" = 0"); @@ -459,36 +426,13 @@ void afs_proc_cell_remove(struct afs_net *net, struct afs_cell *cell) } /* - * open "/proc/fs/afs/<cell>/volumes" which provides a summary of extant cells - */ -static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file) -{ - struct afs_cell *cell; - struct seq_file *m; - int ret; - - cell = PDE_DATA(inode); - if (!cell) - return -ENOENT; - - ret = seq_open(file, &afs_proc_cell_volumes_ops); - if (ret < 0) - return ret; - - m = file->private_data; - m->private = cell; - - return 0; -} - -/* * set up the iterator to start reading from the cells list and return the * first item */ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) __acquires(cell->proc_lock) { - struct afs_cell *cell = m->private; + struct afs_cell *cell = PDE_DATA(file_inode(m->file)); _enter("cell=%p pos=%Ld", cell, *_pos); @@ -502,7 +446,7 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, loff_t *_pos) { - struct afs_cell *cell = p->private; + struct afs_cell *cell = PDE_DATA(file_inode(p->file)); _enter("cell=%p pos=%Ld", cell, *_pos); return seq_list_next(v, &cell->proc_volumes, _pos); @@ -514,7 +458,7 @@ static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v) __releases(cell->proc_lock) { - struct afs_cell *cell = p->private; + struct afs_cell *cell = PDE_DATA(file_inode(p->file)); read_unlock(&cell->proc_lock); } @@ -530,7 +474,7 @@ static const char afs_vol_types[3][3] = { */ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) { - struct afs_cell *cell = m->private; + struct afs_cell *cell = PDE_DATA(file_inode(m->file)); struct afs_volume *vol = list_entry(v, struct afs_volume, proc_link); /* Display header on line 1 */ @@ -547,30 +491,6 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) } /* - * open "/proc/fs/afs/<cell>/vlservers" which provides a list of volume - * location server - */ -static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file) -{ - struct afs_cell *cell; - struct seq_file *m; - int ret; - - cell = PDE_DATA(inode); - if (!cell) - return -ENOENT; - - ret = seq_open(file, &afs_proc_cell_vlservers_ops); - if (ret<0) - return ret; - - m = file->private_data; - m->private = cell; - - return 0; -} - -/* * set up the iterator to start reading from the cells list and return the * first item */ @@ -578,7 +498,7 @@ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) __acquires(rcu) { struct afs_addr_list *alist; - struct afs_cell *cell = m->private; + struct afs_cell *cell = PDE_DATA(file_inode(m->file)); loff_t pos = *_pos; rcu_read_lock(); @@ -603,7 +523,7 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, loff_t *_pos) { struct afs_addr_list *alist; - struct afs_cell *cell = p->private; + struct afs_cell *cell = PDE_DATA(file_inode(p->file)); loff_t pos; alist = rcu_dereference(cell->vl_addrs); @@ -644,15 +564,6 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) } /* - * open "/proc/fs/afs/servers" which provides a summary of active - * servers - */ -static int afs_proc_servers_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &afs_proc_servers_ops); -} - -/* * Set up the iterator to start reading from the server list and return the * first item. */ @@ -931,18 +842,3 @@ static int afs_proc_stats_show(struct seq_file *m, void *v) atomic_long_read(&net->n_store_bytes)); return 0; } - -/* - * Open "/proc/fs/afs/stats" to allow reading of the stat counters. - */ -static int afs_proc_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, afs_proc_stats_show, NULL); -} - -static const struct file_operations afs_proc_stats_fops = { - .open = afs_proc_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index ac0feac9d746..e065bc0768e6 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -179,7 +179,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) */ if (fc->flags & AFS_FS_CURSOR_VNOVOL) { fc->ac.error = -EREMOTEIO; - goto failed; + goto next_server; } write_lock(&vnode->volume->servers_lock); @@ -201,7 +201,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) */ if (vnode->volume->servers == fc->server_list) { fc->ac.error = -EREMOTEIO; - goto failed; + goto next_server; } /* Try again */ @@ -350,8 +350,8 @@ use_server: * break request before we've finished decoding the reply and * installing the vnode. */ - fc->ac.error = afs_register_server_cb_interest( - vnode, &fc->server_list->servers[fc->index]); + fc->ac.error = afs_register_server_cb_interest(vnode, fc->server_list, + fc->index); if (fc->ac.error < 0) goto failed; @@ -369,8 +369,16 @@ use_server: if (!test_bit(AFS_SERVER_FL_PROBED, &server->flags)) { fc->ac.alist = afs_get_addrlist(alist); - if (!afs_probe_fileserver(fc)) - goto failed; + if (!afs_probe_fileserver(fc)) { + switch (fc->ac.error) { + case -ENOMEM: + case -ERESTARTSYS: + case -EINTR: + goto failed; + default: + goto next_server; + } + } } if (!fc->ac.alist) diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 5c6263972ec9..08735948f15d 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -41,6 +41,7 @@ int afs_open_socket(struct afs_net *net) { struct sockaddr_rxrpc srx; struct socket *socket; + unsigned int min_level; int ret; _enter(""); @@ -60,6 +61,12 @@ int afs_open_socket(struct afs_net *net) srx.transport.sin6.sin6_family = AF_INET6; srx.transport.sin6.sin6_port = htons(AFS_CM_PORT); + min_level = RXRPC_SECURITY_ENCRYPT; + ret = kernel_setsockopt(socket, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL, + (void *)&min_level, sizeof(min_level)); + if (ret < 0) + goto error_2; + ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); if (ret == -EADDRINUSE) { srx.transport.sin6.sin6_port = 0; @@ -482,8 +489,12 @@ static void afs_deliver_to_call(struct afs_call *call) state = READ_ONCE(call->state); switch (ret) { case 0: - if (state == AFS_CALL_CL_PROC_REPLY) + if (state == AFS_CALL_CL_PROC_REPLY) { + if (call->cbi) + set_bit(AFS_SERVER_FL_MAY_HAVE_CB, + &call->cbi->server->flags); goto call_complete; + } ASSERTCMP(state, >, AFS_CALL_CL_PROC_REPLY); goto done; case -EINPROGRESS: @@ -493,11 +504,6 @@ static void afs_deliver_to_call(struct afs_call *call) case -ECONNABORTED: ASSERTCMP(state, ==, AFS_CALL_COMPLETE); goto done; - case -ENOTCONN: - abort_code = RX_CALL_DEAD; - rxrpc_kernel_abort_call(call->net->socket, call->rxcall, - abort_code, ret, "KNC"); - goto local_abort; case -ENOTSUPP: abort_code = RXGEN_OPCODE; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, diff --git a/fs/afs/security.c b/fs/afs/security.c index cea2fff313dc..81dfedb7879f 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -147,8 +147,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, break; } - if (cb_break != (vnode->cb_break + - vnode->cb_interest->server->cb_s_break)) { + if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest)) { changed = true; break; } @@ -178,7 +177,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, } } - if (cb_break != (vnode->cb_break + vnode->cb_interest->server->cb_s_break)) + if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest)) goto someone_else_changed_it; /* We need a ref on any permits list we want to copy as we'll have to @@ -257,7 +256,7 @@ found: spin_lock(&vnode->lock); zap = rcu_access_pointer(vnode->permit_cache); - if (cb_break == (vnode->cb_break + vnode->cb_interest->server->cb_s_break) && + if (cb_break == afs_cb_break_sum(vnode, vnode->cb_interest) && zap == permits) rcu_assign_pointer(vnode->permit_cache, replacement); else @@ -373,18 +372,14 @@ int afs_permission(struct inode *inode, int mask) mask, access, S_ISDIR(inode->i_mode) ? "dir" : "file"); if (S_ISDIR(inode->i_mode)) { - if (mask & MAY_EXEC) { + if (mask & (MAY_EXEC | MAY_READ | MAY_CHDIR)) { if (!(access & AFS_ACE_LOOKUP)) goto permission_denied; - } else if (mask & MAY_READ) { - if (!(access & AFS_ACE_LOOKUP)) - goto permission_denied; - } else if (mask & MAY_WRITE) { + } + if (mask & MAY_WRITE) { if (!(access & (AFS_ACE_DELETE | /* rmdir, unlink, rename from */ AFS_ACE_INSERT))) /* create, mkdir, symlink, rename to */ goto permission_denied; - } else { - BUG(); } } else { if (!(access & AFS_ACE_LOOKUP)) diff --git a/fs/afs/server.c b/fs/afs/server.c index 629c74986cff..3af4625e2f8c 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -67,12 +67,6 @@ struct afs_server *afs_find_server(struct afs_net *net, sizeof(struct in6_addr)); if (diff == 0) goto found; - if (diff < 0) { - // TODO: Sort the list - //if (i == alist->nr_ipv4) - // goto not_found; - break; - } } } } else { @@ -87,17 +81,10 @@ struct afs_server *afs_find_server(struct afs_net *net, (u32 __force)b->sin6_addr.s6_addr32[3]); if (diff == 0) goto found; - if (diff < 0) { - // TODO: Sort the list - //if (i == 0) - // goto not_found; - break; - } } } } - //not_found: server = NULL; found: if (server && !atomic_inc_not_zero(&server->usage)) @@ -395,14 +382,16 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server) struct afs_addr_list *alist = rcu_access_pointer(server->addresses); struct afs_addr_cursor ac = { .alist = alist, - .addr = &alist->addrs[0], .start = alist->index, - .index = alist->index, + .index = 0, + .addr = &alist->addrs[alist->index], .error = 0, }; _enter("%p", server); - afs_fs_give_up_all_callbacks(net, server, &ac, NULL); + if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags)) + afs_fs_give_up_all_callbacks(net, server, &ac, NULL); + call_rcu(&server->rcu, afs_server_rcu); afs_dec_servers_outstanding(net); } diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index 0f8dc4c8f07c..8a5760aa5832 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -49,6 +49,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, goto error; refcount_set(&slist->usage, 1); + rwlock_init(&slist->lock); /* Make sure a records exists for each server in the list. */ for (i = 0; i < vldb->nr_servers; i++) { @@ -64,9 +65,11 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, goto error_2; } - /* Insertion-sort by server pointer */ + /* Insertion-sort by UUID */ for (j = 0; j < slist->nr_servers; j++) - if (slist->servers[j].server >= server) + if (memcmp(&slist->servers[j].server->uuid, + &server->uuid, + sizeof(server->uuid)) >= 0) break; if (j < slist->nr_servers) { if (slist->servers[j].server == server) { diff --git a/fs/afs/super.c b/fs/afs/super.c index 65081ec3c36e..9e5d7966621c 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -590,7 +590,7 @@ static void afs_i_init_once(void *_vnode) memset(vnode, 0, sizeof(*vnode)); inode_init_once(&vnode->vfs_inode); mutex_init(&vnode->io_lock); - mutex_init(&vnode->validate_lock); + init_rwsem(&vnode->validate_lock); spin_lock_init(&vnode->wb_lock); spin_lock_init(&vnode->lock); INIT_LIST_HEAD(&vnode->wb_keys); @@ -688,7 +688,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf) if (afs_begin_vnode_operation(&fc, vnode, key)) { fc.flags |= AFS_FS_CURSOR_NO_VSLEEP; while (afs_select_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_get_volume_status(&fc, &vs); } diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index 1ed7e2fd2f35..c3b740813fc7 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -23,7 +23,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) struct afs_uvldbentry__xdr *uvldb; struct afs_vldb_entry *entry; bool new_only = false; - u32 tmp, nr_servers; + u32 tmp, nr_servers, vlflags; int i, ret; _enter(""); @@ -55,6 +55,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) new_only = true; } + vlflags = ntohl(uvldb->flags); for (i = 0; i < nr_servers; i++) { struct afs_uuid__xdr *xdr; struct afs_uuid *uuid; @@ -64,12 +65,13 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) if (tmp & AFS_VLSF_DONTUSE || (new_only && !(tmp & AFS_VLSF_NEWREPSITE))) continue; - if (tmp & AFS_VLSF_RWVOL) + if (tmp & AFS_VLSF_RWVOL) { entry->fs_mask[i] |= AFS_VOL_VTM_RW; + if (vlflags & AFS_VLF_BACKEXISTS) + entry->fs_mask[i] |= AFS_VOL_VTM_BAK; + } if (tmp & AFS_VLSF_ROVOL) entry->fs_mask[i] |= AFS_VOL_VTM_RO; - if (tmp & AFS_VLSF_BACKVOL) - entry->fs_mask[i] |= AFS_VOL_VTM_BAK; if (!entry->fs_mask[i]) continue; @@ -89,15 +91,14 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) for (i = 0; i < AFS_MAXTYPES; i++) entry->vid[i] = ntohl(uvldb->volumeId[i]); - tmp = ntohl(uvldb->flags); - if (tmp & AFS_VLF_RWEXISTS) + if (vlflags & AFS_VLF_RWEXISTS) __set_bit(AFS_VLDB_HAS_RW, &entry->flags); - if (tmp & AFS_VLF_ROEXISTS) + if (vlflags & AFS_VLF_ROEXISTS) __set_bit(AFS_VLDB_HAS_RO, &entry->flags); - if (tmp & AFS_VLF_BACKEXISTS) + if (vlflags & AFS_VLF_BACKEXISTS) __set_bit(AFS_VLDB_HAS_BAK, &entry->flags); - if (!(tmp & (AFS_VLF_RWEXISTS | AFS_VLF_ROEXISTS | AFS_VLF_BACKEXISTS))) { + if (!(vlflags & (AFS_VLF_RWEXISTS | AFS_VLF_ROEXISTS | AFS_VLF_BACKEXISTS))) { entry->error = -ENOMEDIUM; __set_bit(AFS_VLDB_QUERY_ERROR, &entry->flags); } diff --git a/fs/afs/write.c b/fs/afs/write.c index c164698dc304..8b39e6ebb40b 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -351,7 +351,7 @@ found_key: ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, wbk->key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_store_data(&fc, mapping, first, last, offset, to); } @@ -5,6 +5,7 @@ * Implements an efficient asynchronous io interface. * * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. + * Copyright 2018 Christoph Hellwig. * * See ../COPYING for licensing terms. */ @@ -46,6 +47,8 @@ #include "internal.h" +#define KIOCB_KEY 0 + #define AIO_RING_MAGIC 0xa10a10a1 #define AIO_RING_COMPAT_FEATURES 1 #define AIO_RING_INCOMPAT_FEATURES 0 @@ -156,21 +159,29 @@ struct kioctx { unsigned id; }; -/* - * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either - * cancelled or completed (this makes a certain amount of sense because - * successful cancellation - io_cancel() - does deliver the completion to - * userspace). - * - * And since most things don't implement kiocb cancellation and we'd really like - * kiocb completion to be lockless when possible, we use ki_cancel to - * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED - * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel(). - */ -#define KIOCB_CANCELLED ((void *) (~0ULL)) +struct fsync_iocb { + struct work_struct work; + struct file *file; + bool datasync; +}; + +struct poll_iocb { + struct file *file; + __poll_t events; + struct wait_queue_head *head; + + union { + struct wait_queue_entry wait; + struct work_struct work; + }; +}; struct aio_kiocb { - struct kiocb common; + union { + struct kiocb rw; + struct fsync_iocb fsync; + struct poll_iocb poll; + }; struct kioctx *ki_ctx; kiocb_cancel_fn *ki_cancel; @@ -264,9 +275,6 @@ static int __init aio_setup(void) kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); - - pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page)); - return 0; } __initcall(aio_setup); @@ -552,42 +560,20 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel) { - struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common); + struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw); struct kioctx *ctx = req->ki_ctx; unsigned long flags; - spin_lock_irqsave(&ctx->ctx_lock, flags); - - if (!req->ki_list.next) - list_add(&req->ki_list, &ctx->active_reqs); + if (WARN_ON_ONCE(!list_empty(&req->ki_list))) + return; + spin_lock_irqsave(&ctx->ctx_lock, flags); + list_add_tail(&req->ki_list, &ctx->active_reqs); req->ki_cancel = cancel; - spin_unlock_irqrestore(&ctx->ctx_lock, flags); } EXPORT_SYMBOL(kiocb_set_cancel_fn); -static int kiocb_cancel(struct aio_kiocb *kiocb) -{ - kiocb_cancel_fn *old, *cancel; - - /* - * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it - * actually has a cancel function, hence the cmpxchg() - */ - - cancel = READ_ONCE(kiocb->ki_cancel); - do { - if (!cancel || cancel == KIOCB_CANCELLED) - return -EINVAL; - - old = cancel; - cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); - } while (cancel != old); - - return cancel(&kiocb->common); -} - /* * free_ioctx() should be RCU delayed to synchronize against the RCU * protected lookup_ioctx() and also needs process context to call @@ -634,9 +620,8 @@ static void free_ioctx_users(struct percpu_ref *ref) while (!list_empty(&ctx->active_reqs)) { req = list_first_entry(&ctx->active_reqs, struct aio_kiocb, ki_list); - + req->ki_cancel(&req->rw); list_del_init(&req->ki_list); - kiocb_cancel(req); } spin_unlock_irq(&ctx->ctx_lock); @@ -1042,7 +1027,7 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) goto out_put; percpu_ref_get(&ctx->reqs); - + INIT_LIST_HEAD(&req->ki_list); req->ki_ctx = ctx; return req; out_put: @@ -1050,15 +1035,6 @@ out_put: return NULL; } -static void kiocb_free(struct aio_kiocb *req) -{ - if (req->common.ki_filp) - fput(req->common.ki_filp); - if (req->ki_eventfd != NULL) - eventfd_ctx_put(req->ki_eventfd); - kmem_cache_free(kiocb_cachep, req); -} - static struct kioctx *lookup_ioctx(unsigned long ctx_id) { struct aio_ring __user *ring = (void __user *)ctx_id; @@ -1078,8 +1054,8 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) ctx = rcu_dereference(table->table[id]); if (ctx && ctx->user_id == ctx_id) { - percpu_ref_get(&ctx->users); - ret = ctx; + if (percpu_ref_tryget_live(&ctx->users)) + ret = ctx; } out: rcu_read_unlock(); @@ -1089,44 +1065,14 @@ out: /* aio_complete * Called when the io request on the given iocb is complete. */ -static void aio_complete(struct kiocb *kiocb, long res, long res2) +static void aio_complete(struct aio_kiocb *iocb, long res, long res2) { - struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common); struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; struct io_event *ev_page, *event; unsigned tail, pos, head; unsigned long flags; - if (kiocb->ki_flags & IOCB_WRITE) { - struct file *file = kiocb->ki_filp; - - /* - * Tell lockdep we inherited freeze protection from submission - * thread. - */ - if (S_ISREG(file_inode(file)->i_mode)) - __sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE); - file_end_write(file); - } - - /* - * Special case handling for sync iocbs: - * - events go directly into the iocb for fast handling - * - the sync task with the iocb in its stack holds the single iocb - * ref, no other paths have a way to get another ref - * - the sync task helpfully left a reference to itself in the iocb - */ - BUG_ON(is_sync_kiocb(kiocb)); - - if (iocb->ki_list.next) { - unsigned long flags; - - spin_lock_irqsave(&ctx->ctx_lock, flags); - list_del(&iocb->ki_list); - spin_unlock_irqrestore(&ctx->ctx_lock, flags); - } - /* * Add a completion event to the ring buffer. Must be done holding * ctx->completion_lock to prevent other code from messing with the tail @@ -1180,11 +1126,12 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2) * eventfd. The eventfd_signal() function is safe to be called * from IRQ context. */ - if (iocb->ki_eventfd != NULL) + if (iocb->ki_eventfd) { eventfd_signal(iocb->ki_eventfd, 1); + eventfd_ctx_put(iocb->ki_eventfd); + } - /* everything turned out well, dispose of the aiocb. */ - kiocb_free(iocb); + kmem_cache_free(kiocb_cachep, iocb); /* * We have to order our ring_info tail store above and test @@ -1250,14 +1197,13 @@ static long aio_read_events_ring(struct kioctx *ctx, if (head == tail) break; - avail = min(avail, nr - ret); - avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - - ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE)); - pos = head + AIO_EVENTS_OFFSET; page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]; pos %= AIO_EVENTS_PER_PAGE; + avail = min(avail, nr - ret); + avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos); + ev = kmap(page); copy_ret = copy_to_user(event + ret, ev + pos, sizeof(*ev) * avail); @@ -1328,10 +1274,6 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, wait_event_interruptible_hrtimeout(ctx->wait, aio_read_events(ctx, min_nr, nr, event, &ret), until); - - if (!ret && signal_pending(current)) - ret = -EINTR; - return ret; } @@ -1447,6 +1389,58 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) return -EINVAL; } +static void aio_remove_iocb(struct aio_kiocb *iocb) +{ + struct kioctx *ctx = iocb->ki_ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->ctx_lock, flags); + list_del(&iocb->ki_list); + spin_unlock_irqrestore(&ctx->ctx_lock, flags); +} + +static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) +{ + struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw); + + if (!list_empty_careful(&iocb->ki_list)) + aio_remove_iocb(iocb); + + if (kiocb->ki_flags & IOCB_WRITE) { + struct inode *inode = file_inode(kiocb->ki_filp); + + /* + * Tell lockdep we inherited freeze protection from submission + * thread. + */ + if (S_ISREG(inode->i_mode)) + __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); + file_end_write(kiocb->ki_filp); + } + + fput(kiocb->ki_filp); + aio_complete(iocb, res, res2); +} + +static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) +{ + int ret; + + req->ki_filp = fget(iocb->aio_fildes); + if (unlikely(!req->ki_filp)) + return -EBADF; + req->ki_complete = aio_complete_rw; + req->ki_pos = iocb->aio_offset; + req->ki_flags = iocb_flags(req->ki_filp); + if (iocb->aio_flags & IOCB_FLAG_RESFD) + req->ki_flags |= IOCB_EVENTFD; + req->ki_hint = file_write_hint(req->ki_filp); + ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); + if (unlikely(ret)) + fput(req->ki_filp); + return ret; +} + static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec, bool vectored, bool compat, struct iov_iter *iter) { @@ -1466,11 +1460,11 @@ static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec, return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter); } -static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret) +static inline void aio_rw_done(struct kiocb *req, ssize_t ret) { switch (ret) { case -EIOCBQUEUED: - return ret; + break; case -ERESTARTSYS: case -ERESTARTNOINTR: case -ERESTARTNOHAND: @@ -1482,85 +1476,270 @@ static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret) ret = -EINTR; /*FALLTHRU*/ default: - aio_complete(req, ret, 0); - return 0; + aio_complete_rw(req, ret, 0); } } static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored, bool compat) { - struct file *file = req->ki_filp; struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; + struct file *file; ssize_t ret; + ret = aio_prep_rw(req, iocb); + if (ret) + return ret; + file = req->ki_filp; + + ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_READ))) - return -EBADF; + goto out_fput; + ret = -EINVAL; if (unlikely(!file->f_op->read_iter)) - return -EINVAL; + goto out_fput; ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter); if (ret) - return ret; + goto out_fput; ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) - ret = aio_ret(req, call_read_iter(file, req, &iter)); + aio_rw_done(req, call_read_iter(file, req, &iter)); kfree(iovec); +out_fput: + if (unlikely(ret)) + fput(file); return ret; } static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored, bool compat) { - struct file *file = req->ki_filp; struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; + struct file *file; ssize_t ret; + ret = aio_prep_rw(req, iocb); + if (ret) + return ret; + file = req->ki_filp; + + ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_WRITE))) - return -EBADF; + goto out_fput; + ret = -EINVAL; if (unlikely(!file->f_op->write_iter)) - return -EINVAL; + goto out_fput; ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter); if (ret) - return ret; + goto out_fput; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) { - req->ki_flags |= IOCB_WRITE; - file_start_write(file); - ret = aio_ret(req, call_write_iter(file, req, &iter)); /* - * We release freeze protection in aio_complete(). Fool lockdep - * by telling it the lock got released so that it doesn't - * complain about held lock when we return to userspace. + * Open-code file_start_write here to grab freeze protection, + * which will be released by another thread in + * aio_complete_rw(). Fool lockdep by telling it the lock got + * released so that it doesn't complain about the held lock when + * we return to userspace. */ - if (S_ISREG(file_inode(file)->i_mode)) + if (S_ISREG(file_inode(file)->i_mode)) { + __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true); __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); + } + req->ki_flags |= IOCB_WRITE; + aio_rw_done(req, call_write_iter(file, req, &iter)); } kfree(iovec); +out_fput: + if (unlikely(ret)) + fput(file); return ret; } +static void aio_fsync_work(struct work_struct *work) +{ + struct fsync_iocb *req = container_of(work, struct fsync_iocb, work); + int ret; + + ret = vfs_fsync(req->file, req->datasync); + fput(req->file); + aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0); +} + +static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync) +{ + if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes || + iocb->aio_rw_flags)) + return -EINVAL; + req->file = fget(iocb->aio_fildes); + if (unlikely(!req->file)) + return -EBADF; + if (unlikely(!req->file->f_op->fsync)) { + fput(req->file); + return -EINVAL; + } + + req->datasync = datasync; + INIT_WORK(&req->work, aio_fsync_work); + schedule_work(&req->work); + return 0; +} + +/* need to use list_del_init so we can check if item was present */ +static inline bool __aio_poll_remove(struct poll_iocb *req) +{ + if (list_empty(&req->wait.entry)) + return false; + list_del_init(&req->wait.entry); + return true; +} + +static inline void __aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) +{ + fput(iocb->poll.file); + aio_complete(iocb, mangle_poll(mask), 0); +} + +static void aio_poll_work(struct work_struct *work) +{ + struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, poll.work); + + if (!list_empty_careful(&iocb->ki_list)) + aio_remove_iocb(iocb); + __aio_poll_complete(iocb, iocb->poll.events); +} + +static int aio_poll_cancel(struct kiocb *iocb) +{ + struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw); + struct poll_iocb *req = &aiocb->poll; + struct wait_queue_head *head = req->head; + bool found = false; + + spin_lock(&head->lock); + found = __aio_poll_remove(req); + spin_unlock(&head->lock); + + if (found) { + req->events = 0; + INIT_WORK(&req->work, aio_poll_work); + schedule_work(&req->work); + } + return 0; +} + +static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, + void *key) +{ + struct poll_iocb *req = container_of(wait, struct poll_iocb, wait); + struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); + struct file *file = req->file; + __poll_t mask = key_to_poll(key); + + assert_spin_locked(&req->head->lock); + + /* for instances that support it check for an event match first: */ + if (mask && !(mask & req->events)) + return 0; + + mask = file->f_op->poll_mask(file, req->events); + if (!mask) + return 0; + + __aio_poll_remove(req); + + /* + * Try completing without a context switch if we can acquire ctx_lock + * without spinning. Otherwise we need to defer to a workqueue to + * avoid a deadlock due to the lock order. + */ + if (spin_trylock(&iocb->ki_ctx->ctx_lock)) { + list_del_init(&iocb->ki_list); + spin_unlock(&iocb->ki_ctx->ctx_lock); + + __aio_poll_complete(iocb, mask); + } else { + req->events = mask; + INIT_WORK(&req->work, aio_poll_work); + schedule_work(&req->work); + } + + return 1; +} + +static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb) +{ + struct kioctx *ctx = aiocb->ki_ctx; + struct poll_iocb *req = &aiocb->poll; + __poll_t mask; + + /* reject any unknown events outside the normal event mask. */ + if ((u16)iocb->aio_buf != iocb->aio_buf) + return -EINVAL; + /* reject fields that are not defined for poll */ + if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags) + return -EINVAL; + + req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; + req->file = fget(iocb->aio_fildes); + if (unlikely(!req->file)) + return -EBADF; + if (!file_has_poll_mask(req->file)) + goto out_fail; + + req->head = req->file->f_op->get_poll_head(req->file, req->events); + if (!req->head) + goto out_fail; + if (IS_ERR(req->head)) { + mask = EPOLLERR; + goto done; + } + + init_waitqueue_func_entry(&req->wait, aio_poll_wake); + aiocb->ki_cancel = aio_poll_cancel; + + spin_lock_irq(&ctx->ctx_lock); + spin_lock(&req->head->lock); + mask = req->file->f_op->poll_mask(req->file, req->events); + if (!mask) { + __add_wait_queue(req->head, &req->wait); + list_add_tail(&aiocb->ki_list, &ctx->active_reqs); + } + spin_unlock(&req->head->lock); + spin_unlock_irq(&ctx->ctx_lock); +done: + if (mask) + __aio_poll_complete(aiocb, mask); + return 0; +out_fail: + fput(req->file); + return -EINVAL; /* same as no support for IOCB_CMD_POLL */ +} + static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - struct iocb *iocb, bool compat) + bool compat) { struct aio_kiocb *req; - struct file *file; + struct iocb iocb; ssize_t ret; + if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) + return -EFAULT; + /* enforce forwards compatibility on users */ - if (unlikely(iocb->aio_reserved2)) { + if (unlikely(iocb.aio_reserved2)) { pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } /* prevent overflows */ if (unlikely( - (iocb->aio_buf != (unsigned long)iocb->aio_buf) || - (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || - ((ssize_t)iocb->aio_nbytes < 0) + (iocb.aio_buf != (unsigned long)iocb.aio_buf) || + (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) || + ((ssize_t)iocb.aio_nbytes < 0) )) { pr_debug("EINVAL: overflow check\n"); return -EINVAL; @@ -1570,37 +1749,19 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, if (unlikely(!req)) return -EAGAIN; - req->common.ki_filp = file = fget(iocb->aio_fildes); - if (unlikely(!req->common.ki_filp)) { - ret = -EBADF; - goto out_put_req; - } - req->common.ki_pos = iocb->aio_offset; - req->common.ki_complete = aio_complete; - req->common.ki_flags = iocb_flags(req->common.ki_filp); - req->common.ki_hint = file_write_hint(file); - - if (iocb->aio_flags & IOCB_FLAG_RESFD) { + if (iocb.aio_flags & IOCB_FLAG_RESFD) { /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an * instance of the file* now. The file descriptor must be * an eventfd() fd, and will be signaled for each completed * event using the eventfd_signal() function. */ - req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd); + req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd); if (IS_ERR(req->ki_eventfd)) { ret = PTR_ERR(req->ki_eventfd); req->ki_eventfd = NULL; goto out_put_req; } - - req->common.ki_flags |= IOCB_EVENTFD; - } - - ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags); - if (unlikely(ret)) { - pr_debug("EINVAL: aio_rw_flags\n"); - goto out_put_req; } ret = put_user(KIOCB_KEY, &user_iocb->aio_key); @@ -1610,41 +1771,67 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, } req->ki_user_iocb = user_iocb; - req->ki_user_data = iocb->aio_data; + req->ki_user_data = iocb.aio_data; - get_file(file); - switch (iocb->aio_lio_opcode) { + switch (iocb.aio_lio_opcode) { case IOCB_CMD_PREAD: - ret = aio_read(&req->common, iocb, false, compat); + ret = aio_read(&req->rw, &iocb, false, compat); break; case IOCB_CMD_PWRITE: - ret = aio_write(&req->common, iocb, false, compat); + ret = aio_write(&req->rw, &iocb, false, compat); break; case IOCB_CMD_PREADV: - ret = aio_read(&req->common, iocb, true, compat); + ret = aio_read(&req->rw, &iocb, true, compat); break; case IOCB_CMD_PWRITEV: - ret = aio_write(&req->common, iocb, true, compat); + ret = aio_write(&req->rw, &iocb, true, compat); + break; + case IOCB_CMD_FSYNC: + ret = aio_fsync(&req->fsync, &iocb, false); + break; + case IOCB_CMD_FDSYNC: + ret = aio_fsync(&req->fsync, &iocb, true); + break; + case IOCB_CMD_POLL: + ret = aio_poll(req, &iocb); break; default: - pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode); + pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode); ret = -EINVAL; break; } - fput(file); - if (ret && ret != -EIOCBQUEUED) + /* + * If ret is 0, we'd either done aio_complete() ourselves or have + * arranged for that to be done asynchronously. Anything non-zero + * means that we need to destroy req ourselves. + */ + if (ret) goto out_put_req; return 0; out_put_req: put_reqs_available(ctx, 1); percpu_ref_put(&ctx->reqs); - kiocb_free(req); + if (req->ki_eventfd) + eventfd_ctx_put(req->ki_eventfd); + kmem_cache_free(kiocb_cachep, req); return ret; } -static long do_io_submit(aio_context_t ctx_id, long nr, - struct iocb __user *__user *iocbpp, bool compat) +/* sys_io_submit: + * Queue the nr iocbs pointed to by iocbpp for processing. Returns + * the number of iocbs queued. May return -EINVAL if the aio_context + * specified by ctx_id is invalid, if nr is < 0, if the iocb at + * *iocbpp[0] is not properly initialized, if the operation specified + * is invalid for the file descriptor in the iocb. May fail with + * -EFAULT if any of the data structures point to invalid data. May + * fail with -EBADF if the file descriptor specified in the first + * iocb is invalid. May fail with -EAGAIN if insufficient resources + * are available to queue any iocbs. Will return 0 if nr is 0. Will + * fail with -ENOSYS if not implemented. + */ +SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, + struct iocb __user * __user *, iocbpp) { struct kioctx *ctx; long ret = 0; @@ -1654,39 +1841,25 @@ static long do_io_submit(aio_context_t ctx_id, long nr, if (unlikely(nr < 0)) return -EINVAL; - if (unlikely(nr > LONG_MAX/sizeof(*iocbpp))) - nr = LONG_MAX/sizeof(*iocbpp); - - if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp))))) - return -EFAULT; - ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) { pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } - blk_start_plug(&plug); + if (nr > ctx->nr_events) + nr = ctx->nr_events; - /* - * AKPM: should this return a partial result if some of the IOs were - * successfully submitted? - */ - for (i=0; i<nr; i++) { + blk_start_plug(&plug); + for (i = 0; i < nr; i++) { struct iocb __user *user_iocb; - struct iocb tmp; - if (unlikely(__get_user(user_iocb, iocbpp + i))) { + if (unlikely(get_user(user_iocb, iocbpp + i))) { ret = -EFAULT; break; } - if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) { - ret = -EFAULT; - break; - } - - ret = io_submit_one(ctx, user_iocb, &tmp, compat); + ret = io_submit_one(ctx, user_iocb, false); if (ret) break; } @@ -1696,59 +1869,44 @@ static long do_io_submit(aio_context_t ctx_id, long nr, return i ? i : ret; } -/* sys_io_submit: - * Queue the nr iocbs pointed to by iocbpp for processing. Returns - * the number of iocbs queued. May return -EINVAL if the aio_context - * specified by ctx_id is invalid, if nr is < 0, if the iocb at - * *iocbpp[0] is not properly initialized, if the operation specified - * is invalid for the file descriptor in the iocb. May fail with - * -EFAULT if any of the data structures point to invalid data. May - * fail with -EBADF if the file descriptor specified in the first - * iocb is invalid. May fail with -EAGAIN if insufficient resources - * are available to queue any iocbs. Will return 0 if nr is 0. Will - * fail with -ENOSYS if not implemented. - */ -SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, - struct iocb __user * __user *, iocbpp) -{ - return do_io_submit(ctx_id, nr, iocbpp, 0); -} - #ifdef CONFIG_COMPAT -static inline long -copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) +COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, + int, nr, compat_uptr_t __user *, iocbpp) { - compat_uptr_t uptr; - int i; + struct kioctx *ctx; + long ret = 0; + int i = 0; + struct blk_plug plug; - for (i = 0; i < nr; ++i) { - if (get_user(uptr, ptr32 + i)) - return -EFAULT; - if (put_user(compat_ptr(uptr), ptr64 + i)) - return -EFAULT; + if (unlikely(nr < 0)) + return -EINVAL; + + ctx = lookup_ioctx(ctx_id); + if (unlikely(!ctx)) { + pr_debug("EINVAL: invalid context id\n"); + return -EINVAL; } - return 0; -} -#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *)) + if (nr > ctx->nr_events) + nr = ctx->nr_events; -COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, - int, nr, u32 __user *, iocb) -{ - struct iocb __user * __user *iocb64; - long ret; + blk_start_plug(&plug); + for (i = 0; i < nr; i++) { + compat_uptr_t user_iocb; - if (unlikely(nr < 0)) - return -EINVAL; + if (unlikely(get_user(user_iocb, iocbpp + i))) { + ret = -EFAULT; + break; + } - if (nr > MAX_AIO_SUBMITS) - nr = MAX_AIO_SUBMITS; + ret = io_submit_one(ctx, compat_ptr(user_iocb), true); + if (ret) + break; + } + blk_finish_plug(&plug); - iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); - ret = copy_iocb(nr, iocb, iocb64); - if (!ret) - ret = do_io_submit(ctx_id, nr, iocb64, 1); - return ret; + percpu_ref_put(&ctx->users); + return i ? i : ret; } #endif @@ -1756,15 +1914,12 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, * Finds a given iocb for cancellation. */ static struct aio_kiocb * -lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key) +lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb) { struct aio_kiocb *kiocb; assert_spin_locked(&ctx->ctx_lock); - if (key != KIOCB_KEY) - return NULL; - /* TODO: use a hash or array, this sucks. */ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { if (kiocb->ki_user_iocb == iocb) @@ -1788,25 +1943,24 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, { struct kioctx *ctx; struct aio_kiocb *kiocb; + int ret = -EINVAL; u32 key; - int ret; - ret = get_user(key, &iocb->aio_key); - if (unlikely(ret)) + if (unlikely(get_user(key, &iocb->aio_key))) return -EFAULT; + if (unlikely(key != KIOCB_KEY)) + return -EINVAL; ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) return -EINVAL; spin_lock_irq(&ctx->ctx_lock); - - kiocb = lookup_kiocb(ctx, iocb, key); - if (kiocb) - ret = kiocb_cancel(kiocb); - else - ret = -EINVAL; - + kiocb = lookup_kiocb(ctx, iocb); + if (kiocb) { + ret = kiocb->ki_cancel(&kiocb->rw); + list_del_init(&kiocb->ki_list); + } spin_unlock_irq(&ctx->ctx_lock); if (!ret) { @@ -1861,13 +2015,60 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, struct timespec __user *, timeout) { struct timespec64 ts; + int ret; + + if (timeout && unlikely(get_timespec64(&ts, timeout))) + return -EFAULT; + + ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); + if (!ret && signal_pending(current)) + ret = -EINTR; + return ret; +} + +SYSCALL_DEFINE6(io_pgetevents, + aio_context_t, ctx_id, + long, min_nr, + long, nr, + struct io_event __user *, events, + struct timespec __user *, timeout, + const struct __aio_sigset __user *, usig) +{ + struct __aio_sigset ksig = { NULL, }; + sigset_t ksigmask, sigsaved; + struct timespec64 ts; + int ret; + + if (timeout && unlikely(get_timespec64(&ts, timeout))) + return -EFAULT; + + if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) + return -EFAULT; - if (timeout) { - if (unlikely(get_timespec64(&ts, timeout))) + if (ksig.sigmask) { + if (ksig.sigsetsize != sizeof(sigset_t)) + return -EINVAL; + if (copy_from_user(&ksigmask, ksig.sigmask, sizeof(ksigmask))) return -EFAULT; + sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); + if (signal_pending(current)) { + if (ksig.sigmask) { + current->saved_sigmask = sigsaved; + set_restore_sigmask(); + } + + if (!ret) + ret = -ERESTARTNOHAND; + } else { + if (ksig.sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); } - return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); + return ret; } #ifdef CONFIG_COMPAT @@ -1878,13 +2079,64 @@ COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, struct compat_timespec __user *, timeout) { struct timespec64 t; + int ret; + + if (timeout && compat_get_timespec64(&t, timeout)) + return -EFAULT; + + ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); + if (!ret && signal_pending(current)) + ret = -EINTR; + return ret; +} + + +struct __compat_aio_sigset { + compat_sigset_t __user *sigmask; + compat_size_t sigsetsize; +}; + +COMPAT_SYSCALL_DEFINE6(io_pgetevents, + compat_aio_context_t, ctx_id, + compat_long_t, min_nr, + compat_long_t, nr, + struct io_event __user *, events, + struct compat_timespec __user *, timeout, + const struct __compat_aio_sigset __user *, usig) +{ + struct __compat_aio_sigset ksig = { NULL, }; + sigset_t ksigmask, sigsaved; + struct timespec64 t; + int ret; + + if (timeout && compat_get_timespec64(&t, timeout)) + return -EFAULT; + + if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) + return -EFAULT; - if (timeout) { - if (compat_get_timespec64(&t, timeout)) + if (ksig.sigmask) { + if (ksig.sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (get_compat_sigset(&ksigmask, ksig.sigmask)) return -EFAULT; + sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); + if (signal_pending(current)) { + if (ksig.sigmask) { + current->saved_sigmask = sigsaved; + set_restore_sigmask(); + } + if (!ret) + ret = -ERESTARTNOHAND; + } else { + if (ksig.sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); } - return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); + return ret; } #endif diff --git a/fs/attr.c b/fs/attr.c index 12ffdb6fb63c..d0b4d34878fb 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -18,6 +18,32 @@ #include <linux/evm.h> #include <linux/ima.h> +static bool chown_ok(const struct inode *inode, kuid_t uid) +{ + if (uid_eq(current_fsuid(), inode->i_uid) && + uid_eq(uid, inode->i_uid)) + return true; + if (capable_wrt_inode_uidgid(inode, CAP_CHOWN)) + return true; + if (uid_eq(inode->i_uid, INVALID_UID) && + ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) + return true; + return false; +} + +static bool chgrp_ok(const struct inode *inode, kgid_t gid) +{ + if (uid_eq(current_fsuid(), inode->i_uid) && + (in_group_p(gid) || gid_eq(gid, inode->i_gid))) + return true; + if (capable_wrt_inode_uidgid(inode, CAP_CHOWN)) + return true; + if (gid_eq(inode->i_gid, INVALID_GID) && + ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) + return true; + return false; +} + /** * setattr_prepare - check if attribute changes to a dentry are allowed * @dentry: dentry to check @@ -52,17 +78,11 @@ int setattr_prepare(struct dentry *dentry, struct iattr *attr) goto kill_priv; /* Make sure a caller can chown. */ - if ((ia_valid & ATTR_UID) && - (!uid_eq(current_fsuid(), inode->i_uid) || - !uid_eq(attr->ia_uid, inode->i_uid)) && - !capable_wrt_inode_uidgid(inode, CAP_CHOWN)) + if ((ia_valid & ATTR_UID) && !chown_ok(inode, attr->ia_uid)) return -EPERM; /* Make sure caller can chgrp. */ - if ((ia_valid & ATTR_GID) && - (!uid_eq(current_fsuid(), inode->i_uid) || - (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) && - !capable_wrt_inode_uidgid(inode, CAP_CHOWN)) + if ((ia_valid & ATTR_GID) && !chgrp_ok(inode, attr->ia_gid)) return -EPERM; /* Make sure a caller can chmod. */ diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index af2832aaeec5..4700b4534439 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -198,23 +198,16 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) if (ret == BEFS_BT_NOT_FOUND) { befs_debug(sb, "<--- %s %pd not found", __func__, dentry); - d_add(dentry, NULL); - return ERR_PTR(-ENOENT); - + inode = NULL; } else if (ret != BEFS_OK || offset == 0) { befs_error(sb, "<--- %s Error", __func__); - return ERR_PTR(-ENODATA); + inode = ERR_PTR(-ENODATA); + } else { + inode = befs_iget(dir->i_sb, (ino_t) offset); } - - inode = befs_iget(dir->i_sb, (ino_t) offset); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - d_add(dentry, inode); - befs_debug(sb, "<--- %s", __func__); - return NULL; + return d_splice_alias(inode, dentry); } static int diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index ee832ca5f734..f32f21c3bbc7 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -21,10 +21,9 @@ #define dprintf(x...) #endif -static int bfs_add_entry(struct inode *dir, const unsigned char *name, - int namelen, int ino); +static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino); static struct buffer_head *bfs_find_entry(struct inode *dir, - const unsigned char *name, int namelen, + const struct qstr *child, struct bfs_dirent **res_dir); static int bfs_readdir(struct file *f, struct dir_context *ctx) @@ -111,8 +110,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, mark_inode_dirty(inode); bfs_dump_imap("create", s); - err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len, - inode->i_ino); + err = bfs_add_entry(dir, &dentry->d_name, inode->i_ino); if (err) { inode_dec_link_count(inode); mutex_unlock(&info->bfs_lock); @@ -136,19 +134,14 @@ static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ENAMETOOLONG); mutex_lock(&info->bfs_lock); - bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de); + bh = bfs_find_entry(dir, &dentry->d_name, &de); if (bh) { unsigned long ino = (unsigned long)le16_to_cpu(de->ino); brelse(bh); inode = bfs_iget(dir->i_sb, ino); - if (IS_ERR(inode)) { - mutex_unlock(&info->bfs_lock); - return ERR_CAST(inode); - } } mutex_unlock(&info->bfs_lock); - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } static int bfs_link(struct dentry *old, struct inode *dir, @@ -159,8 +152,7 @@ static int bfs_link(struct dentry *old, struct inode *dir, int err; mutex_lock(&info->bfs_lock); - err = bfs_add_entry(dir, new->d_name.name, new->d_name.len, - inode->i_ino); + err = bfs_add_entry(dir, &new->d_name, inode->i_ino); if (err) { mutex_unlock(&info->bfs_lock); return err; @@ -183,7 +175,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry) struct bfs_sb_info *info = BFS_SB(inode->i_sb); mutex_lock(&info->bfs_lock); - bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de); + bh = bfs_find_entry(dir, &dentry->d_name, &de); if (!bh || (le16_to_cpu(de->ino) != inode->i_ino)) goto out_brelse; @@ -228,27 +220,21 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, info = BFS_SB(old_inode->i_sb); mutex_lock(&info->bfs_lock); - old_bh = bfs_find_entry(old_dir, - old_dentry->d_name.name, - old_dentry->d_name.len, &old_de); + old_bh = bfs_find_entry(old_dir, &old_dentry->d_name, &old_de); if (!old_bh || (le16_to_cpu(old_de->ino) != old_inode->i_ino)) goto end_rename; error = -EPERM; new_inode = d_inode(new_dentry); - new_bh = bfs_find_entry(new_dir, - new_dentry->d_name.name, - new_dentry->d_name.len, &new_de); + new_bh = bfs_find_entry(new_dir, &new_dentry->d_name, &new_de); if (new_bh && !new_inode) { brelse(new_bh); new_bh = NULL; } if (!new_bh) { - error = bfs_add_entry(new_dir, - new_dentry->d_name.name, - new_dentry->d_name.len, + error = bfs_add_entry(new_dir, &new_dentry->d_name, old_inode->i_ino); if (error) goto end_rename; @@ -278,9 +264,10 @@ const struct inode_operations bfs_dir_inops = { .rename = bfs_rename, }; -static int bfs_add_entry(struct inode *dir, const unsigned char *name, - int namelen, int ino) +static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino) { + const unsigned char *name = child->name; + int namelen = child->len; struct buffer_head *bh; struct bfs_dirent *de; int block, sblock, eblock, off, pos; @@ -332,12 +319,14 @@ static inline int bfs_namecmp(int len, const unsigned char *name, } static struct buffer_head *bfs_find_entry(struct inode *dir, - const unsigned char *name, int namelen, + const struct qstr *child, struct bfs_dirent **res_dir) { unsigned long block = 0, offset = 0; struct buffer_head *bh = NULL; struct bfs_dirent *de; + const unsigned char *name = child->name; + int namelen = child->len; *res_dir = NULL; if (namelen > BFS_NAMELEN) diff --git a/fs/block_dev.c b/fs/block_dev.c index 7ec920e27065..bef6934b6189 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -272,7 +272,7 @@ struct blkdev_dio { struct bio bio; }; -static struct bio_set *blkdev_dio_pool __read_mostly; +static struct bio_set blkdev_dio_pool; static void blkdev_bio_end_io(struct bio *bio) { @@ -334,7 +334,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, blkdev_dio_pool); + bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool); bio_get(bio); /* extra ref for the completion handler */ dio = container_of(bio, struct blkdev_dio, bio); @@ -432,10 +432,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static __init int blkdev_init(void) { - blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS); - if (!blkdev_dio_pool) - return -ENOMEM; - return 0; + return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS); } module_init(blkdev_init); @@ -1322,27 +1319,30 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty) * check_disk_size_change - checks for disk size change and adjusts bdev size. * @disk: struct gendisk to check * @bdev: struct bdev to adjust. + * @verbose: if %true log a message about a size change if there is any * * This routine checks to see if the bdev size does not match the disk size * and adjusts it if it differs. When shrinking the bdev size, its all caches * are freed. */ -void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) +void check_disk_size_change(struct gendisk *disk, struct block_device *bdev, + bool verbose) { loff_t disk_size, bdev_size; disk_size = (loff_t)get_capacity(disk) << 9; bdev_size = i_size_read(bdev->bd_inode); if (disk_size != bdev_size) { - printk(KERN_INFO - "%s: detected capacity change from %lld to %lld\n", - disk->disk_name, bdev_size, disk_size); + if (verbose) { + printk(KERN_INFO + "%s: detected capacity change from %lld to %lld\n", + disk->disk_name, bdev_size, disk_size); + } i_size_write(bdev->bd_inode, disk_size); if (bdev_size > disk_size) flush_disk(bdev, false); } } -EXPORT_SYMBOL(check_disk_size_change); /** * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back @@ -1364,7 +1364,7 @@ int revalidate_disk(struct gendisk *disk) return ret; mutex_lock(&bdev->bd_mutex); - check_disk_size_change(disk, bdev); + check_disk_size_change(disk, bdev, ret == 0); bdev->bd_invalidated = 0; mutex_unlock(&bdev->bd_mutex); bdput(bdev); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 234bae55b85d..7e075343daa5 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -19,17 +19,17 @@ * ordered operations list so that we make sure to flush out any * new data the application may have written before commit. */ -#define BTRFS_INODE_ORDERED_DATA_CLOSE 0 -#define BTRFS_INODE_ORPHAN_META_RESERVED 1 -#define BTRFS_INODE_DUMMY 2 -#define BTRFS_INODE_IN_DEFRAG 3 -#define BTRFS_INODE_HAS_ORPHAN_ITEM 4 -#define BTRFS_INODE_HAS_ASYNC_EXTENT 5 -#define BTRFS_INODE_NEEDS_FULL_SYNC 6 -#define BTRFS_INODE_COPY_EVERYTHING 7 -#define BTRFS_INODE_IN_DELALLOC_LIST 8 -#define BTRFS_INODE_READDIO_NEED_LOCK 9 -#define BTRFS_INODE_HAS_PROPS 10 +enum { + BTRFS_INODE_ORDERED_DATA_CLOSE = 0, + BTRFS_INODE_DUMMY, + BTRFS_INODE_IN_DEFRAG, + BTRFS_INODE_HAS_ASYNC_EXTENT, + BTRFS_INODE_NEEDS_FULL_SYNC, + BTRFS_INODE_COPY_EVERYTHING, + BTRFS_INODE_IN_DELALLOC_LIST, + BTRFS_INODE_READDIO_NEED_LOCK, + BTRFS_INODE_HAS_PROPS, +}; /* in memory btrfs inode */ struct btrfs_inode { diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 1061575a7d25..d3e447b45bf7 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -990,12 +990,7 @@ static void __free_workspace(int type, struct list_head *workspace, btrfs_compress_op[idx]->free_workspace(workspace); atomic_dec(total_ws); wake: - /* - * Make sure counter is updated before we wake up waiters. - */ - smp_mb(); - if (waitqueue_active(ws_wait)) - wake_up(ws_wait); + cond_wake_up(ws_wait); } static void free_workspace(int type, struct list_head *ws) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index cc605f7b23fb..ddda9b80bf20 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -6,6 +6,8 @@ #ifndef BTRFS_COMPRESSION_H #define BTRFS_COMPRESSION_H +#include <linux/sizes.h> + /* * We want to make sure that amount of RAM required to uncompress an extent is * reasonable, so we limit the total size in ram of a compressed extent to diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 3fd44835b386..4bc326df472e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2330,7 +2330,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level, no_skips = 1; t = path->nodes[i]; - if (i >= lowest_unlock && i > skip_level && path->locks[i]) { + if (i >= lowest_unlock && i > skip_level) { btrfs_tree_unlock_rw(t, path->locks[i]); path->locks[i] = 0; if (write_lock_level && @@ -2432,14 +2432,11 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, btrfs_unlock_up_safe(p, level + 1); btrfs_set_path_blocking(p); - free_extent_buffer(tmp); if (p->reada != READA_NONE) reada_for_search(fs_info, p, level, slot, key->objectid); - btrfs_release_path(p); - ret = -EAGAIN; - tmp = read_tree_block(fs_info, blocknr, 0, parent_level - 1, + tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1, &first_key); if (!IS_ERR(tmp)) { /* @@ -2448,12 +2445,14 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, * and give up so that our caller doesn't loop forever * on our EAGAINs. */ - if (!btrfs_buffer_uptodate(tmp, 0, 0)) + if (!extent_buffer_uptodate(tmp)) ret = -EIO; free_extent_buffer(tmp); } else { ret = PTR_ERR(tmp); } + + btrfs_release_path(p); return ret; } @@ -2599,6 +2598,78 @@ int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, return 0; } +static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, + struct btrfs_path *p, + int write_lock_level) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *b; + int root_lock; + int level = 0; + + /* We try very hard to do read locks on the root */ + root_lock = BTRFS_READ_LOCK; + + if (p->search_commit_root) { + /* The commit roots are read only so we always do read locks */ + if (p->need_commit_sem) + down_read(&fs_info->commit_root_sem); + b = root->commit_root; + extent_buffer_get(b); + level = btrfs_header_level(b); + if (p->need_commit_sem) + up_read(&fs_info->commit_root_sem); + /* + * Ensure that all callers have set skip_locking when + * p->search_commit_root = 1. + */ + ASSERT(p->skip_locking == 1); + + goto out; + } + + if (p->skip_locking) { + b = btrfs_root_node(root); + level = btrfs_header_level(b); + goto out; + } + + /* + * If the level is set to maximum, we can skip trying to get the read + * lock. + */ + if (write_lock_level < BTRFS_MAX_LEVEL) { + /* + * We don't know the level of the root node until we actually + * have it read locked + */ + b = btrfs_read_lock_root_node(root); + level = btrfs_header_level(b); + if (level > write_lock_level) + goto out; + + /* Whoops, must trade for write lock */ + btrfs_tree_read_unlock(b); + free_extent_buffer(b); + } + + b = btrfs_lock_root_node(root); + root_lock = BTRFS_WRITE_LOCK; + + /* The level might have changed, check again */ + level = btrfs_header_level(b); + +out: + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = root_lock; + /* + * Callers are responsible for dropping b's references. + */ + return b; +} + + /* * btrfs_search_slot - look for a key in a tree and perform necessary * modifications to preserve tree invariants. @@ -2635,7 +2706,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, int err; int level; int lowest_unlock = 1; - int root_lock; /* everything at write_lock_level or lower must be write locked */ int write_lock_level = 0; u8 lowest_level = 0; @@ -2673,50 +2743,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, again: prev_cmp = -1; - /* - * we try very hard to do read locks on the root - */ - root_lock = BTRFS_READ_LOCK; - level = 0; - if (p->search_commit_root) { - /* - * the commit roots are read only - * so we always do read locks - */ - if (p->need_commit_sem) - down_read(&fs_info->commit_root_sem); - b = root->commit_root; - extent_buffer_get(b); - level = btrfs_header_level(b); - if (p->need_commit_sem) - up_read(&fs_info->commit_root_sem); - if (!p->skip_locking) - btrfs_tree_read_lock(b); - } else { - if (p->skip_locking) { - b = btrfs_root_node(root); - level = btrfs_header_level(b); - } else { - /* we don't know the level of the root node - * until we actually have it read locked - */ - b = btrfs_read_lock_root_node(root); - level = btrfs_header_level(b); - if (level <= write_lock_level) { - /* whoops, must trade for write lock */ - btrfs_tree_read_unlock(b); - free_extent_buffer(b); - b = btrfs_lock_root_node(root); - root_lock = BTRFS_WRITE_LOCK; - - /* the level might have changed, check again */ - level = btrfs_header_level(b); - } - } - } - p->nodes[level] = b; - if (!p->skip_locking) - p->locks[level] = root_lock; + b = btrfs_search_slot_get_root(root, p, write_lock_level); while (b) { level = btrfs_header_level(b); @@ -5414,12 +5441,24 @@ int btrfs_compare_trees(struct btrfs_root *left_root, down_read(&fs_info->commit_root_sem); left_level = btrfs_header_level(left_root->commit_root); left_root_level = left_level; - left_path->nodes[left_level] = left_root->commit_root; + left_path->nodes[left_level] = + btrfs_clone_extent_buffer(left_root->commit_root); + if (!left_path->nodes[left_level]) { + up_read(&fs_info->commit_root_sem); + ret = -ENOMEM; + goto out; + } extent_buffer_get(left_path->nodes[left_level]); right_level = btrfs_header_level(right_root->commit_root); right_root_level = right_level; - right_path->nodes[right_level] = right_root->commit_root; + right_path->nodes[right_level] = + btrfs_clone_extent_buffer(right_root->commit_root); + if (!right_path->nodes[right_level]) { + up_read(&fs_info->commit_root_sem); + ret = -ENOMEM; + goto out; + } extent_buffer_get(right_path->nodes[right_level]); up_read(&fs_info->commit_root_sem); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2771cc56a622..f4bf7874c24a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -739,6 +739,12 @@ struct btrfs_delayed_root; */ #define BTRFS_FS_NEED_ASYNC_COMMIT 17 +/* + * Indicate that balance has been set up from the ioctl and is in the main + * phase. The fs_info::balance_ctl is initialized. + */ +#define BTRFS_FS_BALANCE_RUNNING 18 + struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; @@ -838,7 +844,6 @@ struct btrfs_fs_info { struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; struct mutex chunk_mutex; - struct mutex volume_mutex; /* * this is taken to make sure we don't set block groups ro after @@ -1004,7 +1009,6 @@ struct btrfs_fs_info { /* restriper state */ spinlock_t balance_lock; struct mutex balance_mutex; - atomic_t balance_running; atomic_t balance_pause_req; atomic_t balance_cancel_req; struct btrfs_balance_control *balance_ctl; @@ -1219,9 +1223,6 @@ struct btrfs_root { spinlock_t log_extents_lock[2]; struct list_head logged_list[2]; - spinlock_t orphan_lock; - atomic_t orphan_inodes; - struct btrfs_block_rsv *orphan_block_rsv; int orphan_cleanup_state; spinlock_t inode_lock; @@ -2764,13 +2765,9 @@ void btrfs_delalloc_release_space(struct inode *inode, void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); -int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode); -void btrfs_orphan_release_metadata(struct btrfs_inode *inode); int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, - int nitems, - u64 *qgroup_reserved, bool use_global_rsv); + int nitems, bool use_global_rsv); void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, @@ -2828,7 +2825,7 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); void check_system_chunk(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, const u64 type); u64 add_new_free_space(struct btrfs_block_group_cache *block_group, - struct btrfs_fs_info *info, u64 start, u64 end); + u64 start, u64 end); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, @@ -3042,11 +3039,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root); /* uuid-tree.c */ -int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u8 *uuid, u8 type, +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); -int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u8 *uuid, u8 type, +int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, int (*check_func)(struct btrfs_fs_info *, u8 *, u8, @@ -3163,18 +3158,6 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, struct extent_map *em); /* inode.c */ -struct btrfs_delalloc_work { - struct inode *inode; - int delay_iput; - struct completion completion; - struct list_head list; - struct btrfs_work work; -}; - -struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, - int delay_iput); -void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); - struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create); @@ -3182,6 +3165,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes); +void __btrfs_del_delalloc_inode(struct btrfs_root *root, + struct btrfs_inode *inode); struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, @@ -3191,10 +3176,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const char *name, int name_len, int add_backref, u64 index); -int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *dir, u64 objectid, - const char *name, int name_len); +int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, int front); int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, @@ -3202,9 +3184,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct inode *inode, u64 new_size, u32 min_type); -int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, - int nr); +int btrfs_start_delalloc_inodes(struct btrfs_root *root); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state, int dedupe); @@ -3238,10 +3219,7 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); -void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); -void btrfs_invalidate_inodes(struct btrfs_root *root); void btrfs_add_delayed_iput(struct inode *inode); void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); int btrfs_prealloc_file_range(struct inode *inode, int mode, @@ -3260,14 +3238,14 @@ void btrfs_test_inode_set_ops(struct inode *inode); long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int btrfs_ioctl_get_supported_features(void __user *arg); -void btrfs_update_iflags(struct inode *inode); +void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); int btrfs_is_empty_uuid(u8 *uuid); int btrfs_defrag_file(struct inode *inode, struct file *file, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_pages); void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); -void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, +void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, struct file *dst_file, u64 dst_loff); @@ -3765,4 +3743,26 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) return 0; } +static inline void cond_wake_up(struct wait_queue_head *wq) +{ + /* + * This implies a full smp_mb barrier, see comments for + * waitqueue_active why. + */ + if (wq_has_sleeper(wq)) + wake_up(wq); +} + +static inline void cond_wake_up_nomb(struct wait_queue_head *wq) +{ + /* + * Special case for conditional wakeup where the barrier required for + * waitqueue_active is implied by some of the preceding code. Eg. one + * of such atomic operations (atomic_dec_and_return, ...), or a + * unlock/lock sequence, etc. + */ + if (waitqueue_active(wq)) + wake_up(wq); +} + #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index a8d492dbd3e7..fe6caa7e698b 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -460,13 +460,10 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root) { int seq = atomic_inc_return(&delayed_root->items_seq); - /* - * atomic_dec_return implies a barrier for waitqueue_active - */ + /* atomic_dec_return implies a barrier */ if ((atomic_dec_return(&delayed_root->items) < - BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) && - waitqueue_active(&delayed_root->wait)) - wake_up(&delayed_root->wait); + BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0)) + cond_wake_up_nomb(&delayed_root->wait); } static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index e1b0651686f7..03dec673d12a 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -286,10 +286,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, } void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_node *ref; struct rb_node *node; u64 seq = 0; @@ -323,9 +323,7 @@ again: } } -int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - u64 seq) +int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) { struct seq_list *elem; int ret = 0; @@ -336,10 +334,9 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct seq_list, list); if (seq >= elem->seq) { btrfs_debug(fs_info, - "holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)", + "holding back delayed_ref %#x.%x, lowest is %#x.%x", (u32)(seq >> 32), (u32)seq, - (u32)(elem->seq >> 32), (u32)elem->seq, - delayed_refs); + (u32)(elem->seq >> 32), (u32)elem->seq); ret = 1; } } @@ -529,33 +526,20 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, spin_unlock(&existing->lock); } -/* - * helper function to actually insert a head node into the rbtree. - * this does all the dirty work in terms of maintaining the correct - * overall modification count. - */ -static noinline struct btrfs_delayed_ref_head * -add_delayed_ref_head(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head_ref, - struct btrfs_qgroup_extent_record *qrecord, - u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, - int action, int is_data, int is_system, - int *qrecord_inserted_ret, - int *old_ref_mod, int *new_ref_mod) - +static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr, u64 num_bytes, u64 ref_root, + u64 reserved, int action, bool is_data, + bool is_system) { - struct btrfs_delayed_ref_head *existing; - struct btrfs_delayed_ref_root *delayed_refs; int count_mod = 1; int must_insert_reserved = 0; - int qrecord_inserted = 0; /* If reserved is provided, it must be a data extent. */ BUG_ON(!is_data && reserved); /* - * the head node stores the sum of all the mods, so dropping a ref + * The head node stores the sum of all the mods, so dropping a ref * should drop the sum in the head node by one. */ if (action == BTRFS_UPDATE_DELAYED_HEAD) @@ -564,12 +548,11 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, count_mod = -1; /* - * BTRFS_ADD_DELAYED_EXTENT means that we need to update - * the reserved accounting when the extent is finally added, or - * if a later modification deletes the delayed ref without ever - * inserting the extent into the extent allocation tree. - * ref->must_insert_reserved is the flag used to record - * that accounting mods are required. + * BTRFS_ADD_DELAYED_EXTENT means that we need to update the reserved + * accounting when the extent is finally added, or if a later + * modification deletes the delayed ref without ever inserting the + * extent into the extent allocation tree. ref->must_insert_reserved + * is the flag used to record that accounting mods are required. * * Once we record must_insert_reserved, switch the action to * BTRFS_ADD_DELAYED_REF because other special casing is not required. @@ -579,8 +562,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, else must_insert_reserved = 0; - delayed_refs = &trans->transaction->delayed_refs; - refcount_set(&head_ref->refs, 1); head_ref->bytenr = bytenr; head_ref->num_bytes = num_bytes; @@ -598,7 +579,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); - /* Record qgroup extent info if provided */ if (qrecord) { if (ref_root && reserved) { head_ref->qgroup_ref_root = ref_root; @@ -608,20 +588,44 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, qrecord->bytenr = bytenr; qrecord->num_bytes = num_bytes; qrecord->old_roots = NULL; + } +} + +/* + * helper function to actually insert a head node into the rbtree. + * this does all the dirty work in terms of maintaining the correct + * overall modification count. + */ +static noinline struct btrfs_delayed_ref_head * +add_delayed_ref_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head_ref, + struct btrfs_qgroup_extent_record *qrecord, + int action, int *qrecord_inserted_ret, + int *old_ref_mod, int *new_ref_mod) +{ + struct btrfs_delayed_ref_head *existing; + struct btrfs_delayed_ref_root *delayed_refs; + int qrecord_inserted = 0; - if(btrfs_qgroup_trace_extent_nolock(fs_info, + delayed_refs = &trans->transaction->delayed_refs; + + /* Record qgroup extent info if provided */ + if (qrecord) { + if (btrfs_qgroup_trace_extent_nolock(trans->fs_info, delayed_refs, qrecord)) kfree(qrecord); else qrecord_inserted = 1; } - trace_add_delayed_ref_head(fs_info, head_ref, action); + trace_add_delayed_ref_head(trans->fs_info, head_ref, action); existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { - WARN_ON(ref_root && reserved && existing->qgroup_ref_root + WARN_ON(qrecord && head_ref->qgroup_ref_root + && head_ref->qgroup_reserved + && existing->qgroup_ref_root && existing->qgroup_reserved); update_existing_head_ref(delayed_refs, existing, head_ref, old_ref_mod); @@ -634,8 +638,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, } else { if (old_ref_mod) *old_ref_mod = 0; - if (is_data && count_mod < 0) - delayed_refs->pending_csums += num_bytes; + if (head_ref->is_data && head_ref->ref_mod < 0) + delayed_refs->pending_csums += head_ref->num_bytes; delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); @@ -645,90 +649,48 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, *qrecord_inserted_ret = qrecord_inserted; if (new_ref_mod) *new_ref_mod = head_ref->total_ref_mod; - return head_ref; -} - -/* - * helper to insert a delayed tree ref into the rbtree. - */ -static noinline void -add_delayed_tree_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head_ref, - struct btrfs_delayed_ref_node *ref, u64 bytenr, - u64 num_bytes, u64 parent, u64 ref_root, int level, - int action) -{ - struct btrfs_delayed_tree_ref *full_ref; - struct btrfs_delayed_ref_root *delayed_refs; - u64 seq = 0; - int ret; - - if (action == BTRFS_ADD_DELAYED_EXTENT) - action = BTRFS_ADD_DELAYED_REF; - if (is_fstree(ref_root)) - seq = atomic64_read(&fs_info->tree_mod_seq); - delayed_refs = &trans->transaction->delayed_refs; - - /* first set the basic ref node struct up */ - refcount_set(&ref->refs, 1); - ref->bytenr = bytenr; - ref->num_bytes = num_bytes; - ref->ref_mod = 1; - ref->action = action; - ref->is_head = 0; - ref->in_tree = 1; - ref->seq = seq; - RB_CLEAR_NODE(&ref->ref_node); - INIT_LIST_HEAD(&ref->add_list); - - full_ref = btrfs_delayed_node_to_tree_ref(ref); - full_ref->parent = parent; - full_ref->root = ref_root; - if (parent) - ref->type = BTRFS_SHARED_BLOCK_REF_KEY; - else - ref->type = BTRFS_TREE_BLOCK_REF_KEY; - full_ref->level = level; - - trace_add_delayed_tree_ref(fs_info, ref, full_ref, action); - - ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref); - - /* - * XXX: memory should be freed at the same level allocated. - * But bad practice is anywhere... Follow it now. Need cleanup. - */ - if (ret > 0) - kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); + return head_ref; } /* - * helper to insert a delayed data ref into the rbtree. + * init_delayed_ref_common - Initialize the structure which represents a + * modification to a an extent. + * + * @fs_info: Internal to the mounted filesystem mount structure. + * + * @ref: The structure which is going to be initialized. + * + * @bytenr: The logical address of the extent for which a modification is + * going to be recorded. + * + * @num_bytes: Size of the extent whose modification is being recorded. + * + * @ref_root: The id of the root where this modification has originated, this + * can be either one of the well-known metadata trees or the + * subvolume id which references this extent. + * + * @action: Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or + * BTRFS_ADD_DELAYED_EXTENT + * + * @ref_type: Holds the type of the extent which is being recorded, can be + * one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY + * when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/ + * BTRFS_EXTENT_DATA_REF_KEY when recording data extent */ -static noinline void -add_delayed_data_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head_ref, - struct btrfs_delayed_ref_node *ref, u64 bytenr, - u64 num_bytes, u64 parent, u64 ref_root, u64 owner, - u64 offset, int action) +static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, u64 ref_root, + int action, u8 ref_type) { - struct btrfs_delayed_data_ref *full_ref; - struct btrfs_delayed_ref_root *delayed_refs; u64 seq = 0; - int ret; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; - delayed_refs = &trans->transaction->delayed_refs; - if (is_fstree(ref_root)) seq = atomic64_read(&fs_info->tree_mod_seq); - /* first set the basic ref node struct up */ refcount_set(&ref->refs, 1); ref->bytenr = bytenr; ref->num_bytes = num_bytes; @@ -737,25 +699,9 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; ref->seq = seq; + ref->type = ref_type; RB_CLEAR_NODE(&ref->ref_node); INIT_LIST_HEAD(&ref->add_list); - - full_ref = btrfs_delayed_node_to_data_ref(ref); - full_ref->parent = parent; - full_ref->root = ref_root; - if (parent) - ref->type = BTRFS_SHARED_DATA_REF_KEY; - else - ref->type = BTRFS_EXTENT_DATA_REF_KEY; - - full_ref->objectid = owner; - full_ref->offset = offset; - - trace_add_delayed_data_ref(fs_info, ref, full_ref, action); - - ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref); - if (ret > 0) - kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); } /* @@ -775,13 +721,25 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; int qrecord_inserted; - int is_system = (ref_root == BTRFS_CHUNK_TREE_OBJECTID); + bool is_system = (ref_root == BTRFS_CHUNK_TREE_OBJECTID); + int ret; + u8 ref_type; BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; + if (parent) + ref_type = BTRFS_SHARED_BLOCK_REF_KEY; + else + ref_type = BTRFS_TREE_BLOCK_REF_KEY; + init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, + ref_root, action, ref_type); + ref->root = ref_root; + ref->parent = parent; + ref->level = level; + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) goto free_ref; @@ -793,6 +751,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, goto free_head_ref; } + init_delayed_ref_head(head_ref, record, bytenr, num_bytes, + ref_root, 0, action, false, is_system); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; @@ -802,15 +762,19 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, - bytenr, num_bytes, 0, 0, action, 0, - is_system, &qrecord_inserted, + head_ref = add_delayed_ref_head(trans, head_ref, record, + action, &qrecord_inserted, old_ref_mod, new_ref_mod); - add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, - num_bytes, parent, ref_root, level, action); + ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); + trace_add_delayed_tree_ref(fs_info, &ref->node, ref, + action == BTRFS_ADD_DELAYED_EXTENT ? + BTRFS_ADD_DELAYED_REF : action); + if (ret > 0) + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + if (qrecord_inserted) btrfs_qgroup_trace_extent_post(fs_info, record); @@ -839,11 +803,25 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; int qrecord_inserted; + int ret; + u8 ref_type; ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; + if (parent) + ref_type = BTRFS_SHARED_DATA_REF_KEY; + else + ref_type = BTRFS_EXTENT_DATA_REF_KEY; + init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, + ref_root, action, ref_type); + ref->root = ref_root; + ref->parent = parent; + ref->objectid = owner; + ref->offset = offset; + + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) { kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); @@ -861,6 +839,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, } } + init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root, + reserved, action, true, false); head_ref->extent_op = NULL; delayed_refs = &trans->transaction->delayed_refs; @@ -870,16 +850,20 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, - bytenr, num_bytes, ref_root, reserved, - action, 1, 0, &qrecord_inserted, + head_ref = add_delayed_ref_head(trans, head_ref, record, + action, &qrecord_inserted, old_ref_mod, new_ref_mod); - add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, - num_bytes, parent, ref_root, owner, offset, - action); + ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); + trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref, + action == BTRFS_ADD_DELAYED_EXTENT ? + BTRFS_ADD_DELAYED_REF : action); + if (ret > 0) + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + + if (qrecord_inserted) return btrfs_qgroup_trace_extent_post(fs_info, record); return 0; @@ -897,19 +881,16 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, if (!head_ref) return -ENOMEM; + init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0, + BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data, + false); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - /* - * extent_ops just modify the flags of an extent and they don't result - * in ref count changes, hence it's safe to pass false/0 for is_system - * argument - */ - add_delayed_ref_head(fs_info, trans, head_ref, NULL, bytenr, - num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, - extent_op->is_data, 0, NULL, NULL, NULL); + add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD, + NULL, NULL, NULL); spin_unlock(&delayed_refs->lock); return 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 7f00db50bd24..ea1aecb6a50d 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -251,7 +251,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); @@ -269,9 +268,7 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) struct btrfs_delayed_ref_head * btrfs_select_ref_head(struct btrfs_trans_handle *trans); -int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - u64 seq); +int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); /* * helper functions to cast a node into its container diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index f82be266ba4b..e2ba0419297a 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -33,8 +33,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( struct btrfs_device *srcdev, struct btrfs_device *tgtdev); static int btrfs_dev_replace_kthread(void *data); -static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info); - int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) { @@ -179,6 +177,105 @@ out: } /* + * Initialize a new device for device replace target from a given source dev + * and path. + * + * Return 0 and new device in @device_out, otherwise return < 0 + */ +static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, + const char *device_path, + struct btrfs_device *srcdev, + struct btrfs_device **device_out) +{ + struct btrfs_device *device; + struct block_device *bdev; + struct list_head *devices; + struct rcu_string *name; + u64 devid = BTRFS_DEV_REPLACE_DEVID; + int ret = 0; + + *device_out = NULL; + if (fs_info->fs_devices->seeding) { + btrfs_err(fs_info, "the filesystem is a seed filesystem!"); + return -EINVAL; + } + + bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, + fs_info->bdev_holder); + if (IS_ERR(bdev)) { + btrfs_err(fs_info, "target device %s is invalid!", device_path); + return PTR_ERR(bdev); + } + + filemap_write_and_wait(bdev->bd_inode->i_mapping); + + devices = &fs_info->fs_devices->devices; + list_for_each_entry(device, devices, dev_list) { + if (device->bdev == bdev) { + btrfs_err(fs_info, + "target device is in the filesystem!"); + ret = -EEXIST; + goto error; + } + } + + + if (i_size_read(bdev->bd_inode) < + btrfs_device_get_total_bytes(srcdev)) { + btrfs_err(fs_info, + "target device is smaller than source device!"); + ret = -EINVAL; + goto error; + } + + + device = btrfs_alloc_device(NULL, &devid, NULL); + if (IS_ERR(device)) { + ret = PTR_ERR(device); + goto error; + } + + name = rcu_string_strdup(device_path, GFP_KERNEL); + if (!name) { + btrfs_free_device(device); + ret = -ENOMEM; + goto error; + } + rcu_assign_pointer(device->name, name); + + mutex_lock(&fs_info->fs_devices->device_list_mutex); + set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + device->generation = 0; + device->io_width = fs_info->sectorsize; + device->io_align = fs_info->sectorsize; + device->sector_size = fs_info->sectorsize; + device->total_bytes = btrfs_device_get_total_bytes(srcdev); + device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); + device->bytes_used = btrfs_device_get_bytes_used(srcdev); + device->commit_total_bytes = srcdev->commit_total_bytes; + device->commit_bytes_used = device->bytes_used; + device->fs_info = fs_info; + device->bdev = bdev; + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); + device->mode = FMODE_EXCL; + device->dev_stats_valid = 1; + set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); + device->fs_devices = fs_info->fs_devices; + list_add(&device->dev_list, &fs_info->fs_devices->devices); + fs_info->fs_devices->num_devices++; + fs_info->fs_devices->open_devices++; + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + + *device_out = device; + return 0; + +error: + blkdev_put(bdev, FMODE_EXCL); + return ret; +} + +/* * called from commit_transaction. Writes changed device replace state to * disk. */ @@ -317,18 +414,13 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; - /* the disk copy procedure reuses the scrub code */ - mutex_lock(&fs_info->volume_mutex); ret = btrfs_find_device_by_devspec(fs_info, srcdevid, srcdev_name, &src_device); - if (ret) { - mutex_unlock(&fs_info->volume_mutex); + if (ret) return ret; - } ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, src_device, &tgt_device); - mutex_unlock(&fs_info->volume_mutex); if (ret) return ret; @@ -360,7 +452,6 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, dev_replace->cont_reading_from_srcdev_mode = read_src; WARN_ON(!src_device); dev_replace->srcdev = src_device; - WARN_ON(!tgt_device); dev_replace->tgtdev = tgt_device; btrfs_info_in_rcu(fs_info, @@ -503,7 +594,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_roots(fs_info, 0, -1); + ret = btrfs_start_delalloc_roots(fs_info, -1); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; @@ -518,7 +609,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, ret = btrfs_commit_transaction(trans); WARN_ON(ret); - mutex_lock(&uuid_mutex); /* keep away write_all_supers() during the finishing procedure */ mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); @@ -545,7 +635,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_dev_replace_write_unlock(dev_replace); mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - mutex_unlock(&uuid_mutex); btrfs_rm_dev_replace_blocked(fs_info); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); @@ -596,7 +685,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, */ mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - mutex_unlock(&uuid_mutex); /* replace the sysfs entry */ btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device); @@ -800,7 +888,17 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) } btrfs_dev_replace_write_unlock(dev_replace); - WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); + /* + * This could collide with a paused balance, but the exclusive op logic + * should never allow both to start and pause. We don't want to allow + * dev-replace to start anyway. + */ + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + btrfs_info(fs_info, + "cannot resume dev-replace, other exclusive operation running"); + return 0; + } + task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); return PTR_ERR_OR_ZERO(task); } @@ -810,6 +908,7 @@ static int btrfs_dev_replace_kthread(void *data) struct btrfs_fs_info *fs_info = data; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; u64 progress; + int ret; progress = btrfs_dev_replace_progress(fs_info); progress = div_u64(progress, 10); @@ -820,23 +919,14 @@ static int btrfs_dev_replace_kthread(void *data) btrfs_dev_name(dev_replace->tgtdev), (unsigned int)progress); - btrfs_dev_replace_continue_on_mount(fs_info); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); - - return 0; -} - -static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info) -{ - struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - int ret; - ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, dev_replace->committed_cursor_left, btrfs_device_get_total_bytes(dev_replace->srcdev), &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(fs_info, ret); WARN_ON(ret); + + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); return 0; } @@ -916,9 +1006,9 @@ void btrfs_dev_replace_clear_lock_blocking( ASSERT(atomic_read(&dev_replace->read_locks) > 0); ASSERT(atomic_read(&dev_replace->blocking_readers) > 0); read_lock(&dev_replace->lock); - if (atomic_dec_and_test(&dev_replace->blocking_readers) && - waitqueue_active(&dev_replace->read_lock_wq)) - wake_up(&dev_replace->read_lock_wq); + /* Barrier implied by atomic_dec_and_test */ + if (atomic_dec_and_test(&dev_replace->blocking_readers)) + cond_wake_up_nomb(&dev_replace->read_lock_wq); } void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) @@ -929,9 +1019,7 @@ void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) { percpu_counter_sub(&fs_info->bio_counter, amount); - - if (waitqueue_active(&fs_info->replace_wait)) - wake_up(&fs_info->replace_wait); + cond_wake_up_nomb(&fs_info->replace_wait); } void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 60caa68c3618..205092dc9390 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -55,7 +55,6 @@ static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); -static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); @@ -416,7 +415,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, static int verify_level_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int level, - struct btrfs_key *first_key) + struct btrfs_key *first_key, u64 parent_transid) { int found_level; struct btrfs_key found_key; @@ -454,10 +453,11 @@ static int verify_level_key(struct btrfs_fs_info *fs_info, if (ret) { WARN_ON(1); btrfs_err(fs_info, -"tree first key mismatch detected, bytenr=%llu key expected=(%llu, %u, %llu) has=(%llu, %u, %llu)", - eb->start, first_key->objectid, first_key->type, - first_key->offset, found_key.objectid, - found_key.type, found_key.offset); +"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", + eb->start, parent_transid, first_key->objectid, + first_key->type, first_key->offset, + found_key.objectid, found_key.type, + found_key.offset); } #endif return ret; @@ -493,7 +493,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, parent_transid, 0)) ret = -EIO; else if (verify_level_key(fs_info, eb, level, - first_key)) + first_key, parent_transid)) ret = -EUCLEAN; else break; @@ -1185,7 +1185,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->inode_tree = RB_ROOT; INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); root->block_rsv = NULL; - root->orphan_block_rsv = NULL; INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->root_list); @@ -1195,7 +1194,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&root->ordered_root); INIT_LIST_HEAD(&root->logged_list[0]); INIT_LIST_HEAD(&root->logged_list[1]); - spin_lock_init(&root->orphan_lock); spin_lock_init(&root->inode_lock); spin_lock_init(&root->delalloc_lock); spin_lock_init(&root->ordered_extent_lock); @@ -1216,7 +1214,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, atomic_set(&root->log_commit[1], 0); atomic_set(&root->log_writers, 0); atomic_set(&root->log_batch, 0); - atomic_set(&root->orphan_inodes, 0); refcount_set(&root->refs, 1); atomic_set(&root->will_be_snapshotted, 0); root->log_transid = 0; @@ -2164,7 +2161,6 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info) { spin_lock_init(&fs_info->balance_lock); mutex_init(&fs_info->balance_mutex); - atomic_set(&fs_info->balance_running, 0); atomic_set(&fs_info->balance_pause_req, 0); atomic_set(&fs_info->balance_cancel_req, 0); fs_info->balance_ctl = NULL; @@ -2442,6 +2438,211 @@ out: return ret; } +/* + * Real super block validation + * NOTE: super csum type and incompat features will not be checked here. + * + * @sb: super block to check + * @mirror_num: the super block number to check its bytenr: + * 0 the primary (1st) sb + * 1, 2 2nd and 3rd backup copy + * -1 skip bytenr check + */ +static int validate_super(struct btrfs_fs_info *fs_info, + struct btrfs_super_block *sb, int mirror_num) +{ + u64 nodesize = btrfs_super_nodesize(sb); + u64 sectorsize = btrfs_super_sectorsize(sb); + int ret = 0; + + if (btrfs_super_magic(sb) != BTRFS_MAGIC) { + btrfs_err(fs_info, "no valid FS found"); + ret = -EINVAL; + } + if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) { + btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu", + btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); + ret = -EINVAL; + } + if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { + btrfs_err(fs_info, "tree_root level too big: %d >= %d", + btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) { + btrfs_err(fs_info, "chunk_root level too big: %d >= %d", + btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) { + btrfs_err(fs_info, "log_root level too big: %d >= %d", + btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + + /* + * Check sectorsize and nodesize first, other check will need it. + * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. + */ + if (!is_power_of_2(sectorsize) || sectorsize < 4096 || + sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { + btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); + ret = -EINVAL; + } + /* Only PAGE SIZE is supported yet */ + if (sectorsize != PAGE_SIZE) { + btrfs_err(fs_info, + "sectorsize %llu not supported yet, only support %lu", + sectorsize, PAGE_SIZE); + ret = -EINVAL; + } + if (!is_power_of_2(nodesize) || nodesize < sectorsize || + nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { + btrfs_err(fs_info, "invalid nodesize %llu", nodesize); + ret = -EINVAL; + } + if (nodesize != le32_to_cpu(sb->__unused_leafsize)) { + btrfs_err(fs_info, "invalid leafsize %u, should be %llu", + le32_to_cpu(sb->__unused_leafsize), nodesize); + ret = -EINVAL; + } + + /* Root alignment check */ + if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) { + btrfs_warn(fs_info, "tree_root block unaligned: %llu", + btrfs_super_root(sb)); + ret = -EINVAL; + } + if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) { + btrfs_warn(fs_info, "chunk_root block unaligned: %llu", + btrfs_super_chunk_root(sb)); + ret = -EINVAL; + } + if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) { + btrfs_warn(fs_info, "log_root block unaligned: %llu", + btrfs_super_log_root(sb)); + ret = -EINVAL; + } + + if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { + btrfs_err(fs_info, + "dev_item UUID does not match fsid: %pU != %pU", + fs_info->fsid, sb->dev_item.fsid); + ret = -EINVAL; + } + + /* + * Hint to catch really bogus numbers, bitflips or so, more exact checks are + * done later + */ + if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) { + btrfs_err(fs_info, "bytes_used is too small %llu", + btrfs_super_bytes_used(sb)); + ret = -EINVAL; + } + if (!is_power_of_2(btrfs_super_stripesize(sb))) { + btrfs_err(fs_info, "invalid stripesize %u", + btrfs_super_stripesize(sb)); + ret = -EINVAL; + } + if (btrfs_super_num_devices(sb) > (1UL << 31)) + btrfs_warn(fs_info, "suspicious number of devices: %llu", + btrfs_super_num_devices(sb)); + if (btrfs_super_num_devices(sb) == 0) { + btrfs_err(fs_info, "number of devices is 0"); + ret = -EINVAL; + } + + if (mirror_num >= 0 && + btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) { + btrfs_err(fs_info, "super offset mismatch %llu != %u", + btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET); + ret = -EINVAL; + } + + /* + * Obvious sys_chunk_array corruptions, it must hold at least one key + * and one chunk + */ + if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + btrfs_err(fs_info, "system chunk array too big %u > %u", + btrfs_super_sys_array_size(sb), + BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); + ret = -EINVAL; + } + if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) + + sizeof(struct btrfs_chunk)) { + btrfs_err(fs_info, "system chunk array too small %u < %zu", + btrfs_super_sys_array_size(sb), + sizeof(struct btrfs_disk_key) + + sizeof(struct btrfs_chunk)); + ret = -EINVAL; + } + + /* + * The generation is a global counter, we'll trust it more than the others + * but it's still possible that it's the one that's wrong. + */ + if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb)) + btrfs_warn(fs_info, + "suspicious: generation < chunk_root_generation: %llu < %llu", + btrfs_super_generation(sb), + btrfs_super_chunk_root_generation(sb)); + if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb) + && btrfs_super_cache_generation(sb) != (u64)-1) + btrfs_warn(fs_info, + "suspicious: generation < cache_generation: %llu < %llu", + btrfs_super_generation(sb), + btrfs_super_cache_generation(sb)); + + return ret; +} + +/* + * Validation of super block at mount time. + * Some checks already done early at mount time, like csum type and incompat + * flags will be skipped. + */ +static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info) +{ + return validate_super(fs_info, fs_info->super_copy, 0); +} + +/* + * Validation of super block at write time. + * Some checks like bytenr check will be skipped as their values will be + * overwritten soon. + * Extra checks like csum type and incompat flags will be done here. + */ +static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, + struct btrfs_super_block *sb) +{ + int ret; + + ret = validate_super(fs_info, sb, -1); + if (ret < 0) + goto out; + if (btrfs_super_csum_type(sb) != BTRFS_CSUM_TYPE_CRC32) { + ret = -EUCLEAN; + btrfs_err(fs_info, "invalid csum type, has %u want %u", + btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32); + goto out; + } + if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) { + ret = -EUCLEAN; + btrfs_err(fs_info, + "invalid incompat flags, has 0x%llx valid mask 0x%llx", + btrfs_super_incompat_flags(sb), + (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP); + goto out; + } +out: + if (ret < 0) + btrfs_err(fs_info, + "super block corruption detected before writing it to disk"); + return ret; +} + int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) @@ -2601,7 +2802,6 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); - mutex_init(&fs_info->volume_mutex); mutex_init(&fs_info->ro_block_group_mutex); init_rwsem(&fs_info->commit_root_sem); init_rwsem(&fs_info->cleanup_work_sem); @@ -2668,7 +2868,7 @@ int open_ctree(struct super_block *sb, memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); - ret = btrfs_check_super_valid(fs_info); + ret = btrfs_validate_mount_super(fs_info); if (ret) { btrfs_err(fs_info, "superblock contains fatal errors"); err = -EINVAL; @@ -3523,7 +3723,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { if (raid_type == BTRFS_RAID_SINGLE) continue; - if (!(flags & btrfs_raid_group[raid_type])) + if (!(flags & btrfs_raid_array[raid_type].bg_flag)) continue; min_tolerated = min(min_tolerated, btrfs_raid_array[raid_type]. @@ -3603,6 +3803,14 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) flags = btrfs_super_flags(sb); btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); + ret = btrfs_validate_write_super(fs_info, sb); + if (ret < 0) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + btrfs_handle_fs_error(fs_info, -EUCLEAN, + "unexpected superblock corruption detected"); + return -EUCLEAN; + } + ret = write_dev_supers(dev, sb, max_mirrors); if (ret) total_errors++; @@ -3674,8 +3882,6 @@ static void free_fs_root(struct btrfs_root *root) { iput(root->ino_cache_inode); WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); - btrfs_free_block_rsv(root->fs_info, root->orphan_block_rsv); - root->orphan_block_rsv = NULL; if (root->anon_dev) free_anon_bdev(root->anon_dev); if (root->subv_writers) @@ -3766,7 +3972,6 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info) void close_ctree(struct btrfs_fs_info *fs_info) { - struct btrfs_root *root = fs_info->tree_root; int ret; set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); @@ -3818,6 +4023,7 @@ void close_ctree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags); btrfs_free_qgroup_config(fs_info); + ASSERT(list_empty(&fs_info->delalloc_roots)); if (percpu_counter_sum(&fs_info->delalloc_bytes)) { btrfs_info(fs_info, "at unmount delalloc count %lld", @@ -3861,9 +4067,6 @@ void close_ctree(struct btrfs_fs_info *fs_info) btrfs_free_stripe_hash_table(fs_info); btrfs_free_ref_cache(fs_info); - __btrfs_free_block_rsv(root->orphan_block_rsv); - root->orphan_block_rsv = NULL; - while (!list_empty(&fs_info->pinned_chunks)) { struct extent_map *em; @@ -3974,166 +4177,17 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, level, first_key); } -static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info) -{ - struct btrfs_super_block *sb = fs_info->super_copy; - u64 nodesize = btrfs_super_nodesize(sb); - u64 sectorsize = btrfs_super_sectorsize(sb); - int ret = 0; - - if (btrfs_super_magic(sb) != BTRFS_MAGIC) { - btrfs_err(fs_info, "no valid FS found"); - ret = -EINVAL; - } - if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) { - btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu", - btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); - ret = -EINVAL; - } - if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { - btrfs_err(fs_info, "tree_root level too big: %d >= %d", - btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); - ret = -EINVAL; - } - if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) { - btrfs_err(fs_info, "chunk_root level too big: %d >= %d", - btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL); - ret = -EINVAL; - } - if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) { - btrfs_err(fs_info, "log_root level too big: %d >= %d", - btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL); - ret = -EINVAL; - } - - /* - * Check sectorsize and nodesize first, other check will need it. - * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. - */ - if (!is_power_of_2(sectorsize) || sectorsize < 4096 || - sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { - btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); - ret = -EINVAL; - } - /* Only PAGE SIZE is supported yet */ - if (sectorsize != PAGE_SIZE) { - btrfs_err(fs_info, - "sectorsize %llu not supported yet, only support %lu", - sectorsize, PAGE_SIZE); - ret = -EINVAL; - } - if (!is_power_of_2(nodesize) || nodesize < sectorsize || - nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { - btrfs_err(fs_info, "invalid nodesize %llu", nodesize); - ret = -EINVAL; - } - if (nodesize != le32_to_cpu(sb->__unused_leafsize)) { - btrfs_err(fs_info, "invalid leafsize %u, should be %llu", - le32_to_cpu(sb->__unused_leafsize), nodesize); - ret = -EINVAL; - } - - /* Root alignment check */ - if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) { - btrfs_warn(fs_info, "tree_root block unaligned: %llu", - btrfs_super_root(sb)); - ret = -EINVAL; - } - if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) { - btrfs_warn(fs_info, "chunk_root block unaligned: %llu", - btrfs_super_chunk_root(sb)); - ret = -EINVAL; - } - if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) { - btrfs_warn(fs_info, "log_root block unaligned: %llu", - btrfs_super_log_root(sb)); - ret = -EINVAL; - } - - if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { - btrfs_err(fs_info, - "dev_item UUID does not match fsid: %pU != %pU", - fs_info->fsid, sb->dev_item.fsid); - ret = -EINVAL; - } - - /* - * Hint to catch really bogus numbers, bitflips or so, more exact checks are - * done later - */ - if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) { - btrfs_err(fs_info, "bytes_used is too small %llu", - btrfs_super_bytes_used(sb)); - ret = -EINVAL; - } - if (!is_power_of_2(btrfs_super_stripesize(sb))) { - btrfs_err(fs_info, "invalid stripesize %u", - btrfs_super_stripesize(sb)); - ret = -EINVAL; - } - if (btrfs_super_num_devices(sb) > (1UL << 31)) - btrfs_warn(fs_info, "suspicious number of devices: %llu", - btrfs_super_num_devices(sb)); - if (btrfs_super_num_devices(sb) == 0) { - btrfs_err(fs_info, "number of devices is 0"); - ret = -EINVAL; - } - - if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) { - btrfs_err(fs_info, "super offset mismatch %llu != %u", - btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET); - ret = -EINVAL; - } - - /* - * Obvious sys_chunk_array corruptions, it must hold at least one key - * and one chunk - */ - if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { - btrfs_err(fs_info, "system chunk array too big %u > %u", - btrfs_super_sys_array_size(sb), - BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); - ret = -EINVAL; - } - if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) - + sizeof(struct btrfs_chunk)) { - btrfs_err(fs_info, "system chunk array too small %u < %zu", - btrfs_super_sys_array_size(sb), - sizeof(struct btrfs_disk_key) - + sizeof(struct btrfs_chunk)); - ret = -EINVAL; - } - - /* - * The generation is a global counter, we'll trust it more than the others - * but it's still possible that it's the one that's wrong. - */ - if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb)) - btrfs_warn(fs_info, - "suspicious: generation < chunk_root_generation: %llu < %llu", - btrfs_super_generation(sb), - btrfs_super_chunk_root_generation(sb)); - if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb) - && btrfs_super_cache_generation(sb) != (u64)-1) - btrfs_warn(fs_info, - "suspicious: generation < cache_generation: %llu < %llu", - btrfs_super_generation(sb), - btrfs_super_cache_generation(sb)); - - return ret; -} - static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) { + /* cleanup FS via transaction */ + btrfs_cleanup_transaction(fs_info); + mutex_lock(&fs_info->cleaner_mutex); btrfs_run_delayed_iputs(fs_info); mutex_unlock(&fs_info->cleaner_mutex); down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); - - /* cleanup FS via transaction */ - btrfs_cleanup_transaction(fs_info); } static void btrfs_destroy_ordered_extents(struct btrfs_root *root) @@ -4258,19 +4312,23 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { + struct inode *inode = NULL; btrfs_inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes); - - list_del_init(&btrfs_inode->delalloc_inodes); - clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &btrfs_inode->runtime_flags); + __btrfs_del_delalloc_inode(root, btrfs_inode); spin_unlock(&root->delalloc_lock); - btrfs_invalidate_inodes(btrfs_inode->root); - + /* + * Make sure we get a live inode and that it'll not disappear + * meanwhile. + */ + inode = igrab(&btrfs_inode->vfs_inode); + if (inode) { + invalidate_inode_pages2(inode->i_mapping); + iput(inode); + } spin_lock(&root->delalloc_lock); } - spin_unlock(&root->delalloc_lock); } @@ -4286,7 +4344,6 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) while (!list_empty(&splice)) { root = list_first_entry(&splice, struct btrfs_root, delalloc_root); - list_del_init(&root->delalloc_root); root = btrfs_grab_fs_root(root); BUG_ON(!root); spin_unlock(&fs_info->delalloc_root_lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 51b5e2da708c..3d9fe58c0080 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -66,10 +66,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 flags, u64 owner, u64 offset, struct btrfs_key *ins, int ref_mod); static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - u64 parent, u64 root_objectid, - u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins); + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op); static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 flags, int force); @@ -256,7 +254,7 @@ static int exclude_super_stripes(struct btrfs_fs_info *fs_info, for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); ret = btrfs_rmap_block(fs_info, cache->key.objectid, - bytenr, 0, &logical, &nr, &stripe_len); + bytenr, &logical, &nr, &stripe_len); if (ret) return ret; @@ -343,8 +341,9 @@ static void fragment_free_space(struct btrfs_block_group_cache *block_group) * since their free space will be released as soon as the transaction commits. */ u64 add_new_free_space(struct btrfs_block_group_cache *block_group, - struct btrfs_fs_info *info, u64 start, u64 end) + u64 start, u64 end) { + struct btrfs_fs_info *info = block_group->fs_info; u64 extent_start, extent_end, size, total_added = 0; int ret; @@ -489,8 +488,7 @@ next: if (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY) { - total_found += add_new_free_space(block_group, - fs_info, last, + total_found += add_new_free_space(block_group, last, key.objectid); if (key.type == BTRFS_METADATA_ITEM_KEY) last = key.objectid + @@ -508,7 +506,7 @@ next: } ret = 0; - total_found += add_new_free_space(block_group, fs_info, last, + total_found += add_new_free_space(block_group, last, block_group->key.objectid + block_group->key.offset); caching_ctl->progress = (u64)-1; @@ -744,12 +742,12 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, } static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes, - u64 owner, u64 root_objectid) + bool metadata, u64 root_objectid) { struct btrfs_space_info *space_info; u64 flags; - if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (metadata) { if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) flags = BTRFS_BLOCK_GROUP_SYSTEM; else @@ -2200,8 +2198,11 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, &old_ref_mod, &new_ref_mod); } - if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) - add_pinned_bytes(fs_info, -num_bytes, owner, root_objectid); + if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) { + bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID; + + add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid); + } return ret; } @@ -2428,10 +2429,8 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, { int ret = 0; struct btrfs_delayed_tree_ref *ref; - struct btrfs_key ins; u64 parent = 0; u64 ref_root = 0; - bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); ref = btrfs_delayed_node_to_tree_ref(node); trace_run_delayed_tree_ref(fs_info, node, ref, node->action); @@ -2440,15 +2439,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, parent = ref->parent; ref_root = ref->root; - ins.objectid = node->bytenr; - if (skinny_metadata) { - ins.offset = ref->level; - ins.type = BTRFS_METADATA_ITEM_KEY; - } else { - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; - } - if (node->ref_mod != 1) { btrfs_err(fs_info, "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu", @@ -2458,11 +2448,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, } if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { BUG_ON(!extent_op || !extent_op->update_flags); - ret = alloc_reserved_tree_block(trans, fs_info, - parent, ref_root, - extent_op->flags_to_set, - &extent_op->key, - ref->level, &ins); + ret = alloc_reserved_tree_block(trans, node, extent_op); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, fs_info, node, parent, ref_root, @@ -2594,8 +2580,8 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, delayed_refs->num_heads--; rb_erase(&head->href_node, &delayed_refs->href_root); RB_CLEAR_NODE(&head->href_node); - spin_unlock(&delayed_refs->lock); spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); atomic_dec(&delayed_refs->num_entries); trace_run_delayed_ref_head(fs_info, head, 0); @@ -2700,17 +2686,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, * insert_inline_extent_backref()). */ spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, - locked_ref); + btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); - /* - * locked_ref is the head node, so we have to go one - * node back for any delayed ref updates - */ ref = select_delayed_ref(locked_ref); if (ref && ref->seq && - btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { + btrfs_check_delayed_seq(fs_info, ref->seq)) { spin_unlock(&locked_ref->lock); unselect_delayed_ref_head(delayed_refs, locked_ref); locked_ref = NULL; @@ -3291,7 +3272,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, path = btrfs_alloc_path(); if (!path) - return -ENOENT; + return -ENOMEM; do { ret = check_committed_ref(root, path, objectid, @@ -4026,8 +4007,7 @@ static const char *alloc_name(u64 flags) }; } -static int create_space_info(struct btrfs_fs_info *info, u64 flags, - struct btrfs_space_info **new) +static int create_space_info(struct btrfs_fs_info *info, u64 flags) { struct btrfs_space_info *space_info; @@ -4065,7 +4045,6 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags, return ret; } - *new = space_info; list_add_rcu(&space_info->list, &info->space_info); if (flags & BTRFS_BLOCK_GROUP_DATA) info->data_sinfo = space_info; @@ -4122,7 +4101,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * returns target flags in extended format or 0 if restripe for this * chunk_type is not in progress * - * should be called with either volume_mutex or balance_lock held + * should be called with balance_lock held */ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) { @@ -4178,7 +4157,7 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) /* First, mask out the RAID levels which aren't possible */ for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { if (num_devices >= btrfs_raid_array[raid_type].devs_min) - allowed |= btrfs_raid_group[raid_type]; + allowed |= btrfs_raid_array[raid_type].bg_flag; } allowed &= flags; @@ -4341,7 +4320,7 @@ commit_trans: need_commit--; if (need_commit > 0) { - btrfs_start_delalloc_roots(fs_info, 0, -1); + btrfs_start_delalloc_roots(fs_info, -1); btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } @@ -4678,12 +4657,14 @@ again: trans->allocating_chunk = false; spin_lock(&space_info->lock); - if (ret < 0 && ret != -ENOSPC) - goto out; - if (ret) - space_info->full = 1; - else + if (ret < 0) { + if (ret == -ENOSPC) + space_info->full = 1; + else + goto out; + } else { ret = 1; + } space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; out: @@ -4792,7 +4773,7 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, * the filesystem is readonly(all dirty pages are written to * the disk). */ - btrfs_start_delalloc_roots(fs_info, 0, nr_items); + btrfs_start_delalloc_roots(fs_info, nr_items); if (!current->journal_info) btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); } @@ -5949,44 +5930,6 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) trans->chunk_bytes_reserved = 0; } -/* Can only return 0 or -ENOSPC */ -int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - struct btrfs_root *root = inode->root; - /* - * We always use trans->block_rsv here as we will have reserved space - * for our orphan when starting the transaction, using get_block_rsv() - * here will sometimes make us choose the wrong block rsv as we could be - * doing a reloc inode for a non refcounted root. - */ - struct btrfs_block_rsv *src_rsv = trans->block_rsv; - struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; - - /* - * We need to hold space in order to delete our orphan item once we've - * added it, so this takes the reservation so we can release it later - * when we are truly done with the orphan item. - */ - u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - - trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), - num_bytes, 1); - return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); -} - -void btrfs_orphan_release_metadata(struct btrfs_inode *inode) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - struct btrfs_root *root = inode->root; - u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - - trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), - num_bytes, 0); - btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, num_bytes); -} - /* * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation * root: the root of the parent directory @@ -6004,7 +5947,6 @@ void btrfs_orphan_release_metadata(struct btrfs_inode *inode) int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int items, - u64 *qgroup_reserved, bool use_global_rsv) { u64 num_bytes; @@ -6022,8 +5964,6 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, num_bytes = 0; } - *qgroup_reserved = num_bytes; - num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); rsv->space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); @@ -6033,8 +5973,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (ret == -ENOSPC && use_global_rsv) ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1); - if (ret && *qgroup_reserved) - btrfs_qgroup_free_meta_prealloc(root, *qgroup_reserved); + if (ret && num_bytes) + btrfs_qgroup_free_meta_prealloc(root, num_bytes); return ret; } @@ -6354,6 +6294,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_lock(&info->unused_bgs_lock); if (list_empty(&cache->bg_list)) { btrfs_get_block_group(cache); + trace_btrfs_add_unused_block_group(cache); list_add_tail(&cache->bg_list, &info->unused_bgs); } @@ -6511,6 +6452,7 @@ int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info, struct btrfs_key key; int found_type; int i; + int ret = 0; if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) return 0; @@ -6527,10 +6469,12 @@ int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info, continue; key.objectid = btrfs_file_extent_disk_bytenr(eb, item); key.offset = btrfs_file_extent_disk_num_bytes(eb, item); - __exclude_logged_extent(fs_info, key.objectid, key.offset); + ret = __exclude_logged_extent(fs_info, key.objectid, key.offset); + if (ret) + break; } - return 0; + return ret; } static void @@ -7122,7 +7066,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } - ret = add_to_free_space_tree(trans, info, bytenr, num_bytes); + ret = add_to_free_space_tree(trans, bytenr, num_bytes); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -7266,7 +7210,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, } out: if (pin) - add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf), + add_pinned_bytes(fs_info, buf->len, true, root->root_key.objectid); if (last_ref) { @@ -7320,8 +7264,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, &old_ref_mod, &new_ref_mod); } - if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) - add_pinned_bytes(fs_info, num_bytes, owner, root_objectid); + if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) { + bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID; + + add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid); + } return ret; } @@ -7373,24 +7320,6 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) return ret; } -static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { - [BTRFS_RAID_RAID10] = "raid10", - [BTRFS_RAID_RAID1] = "raid1", - [BTRFS_RAID_DUP] = "dup", - [BTRFS_RAID_RAID0] = "raid0", - [BTRFS_RAID_SINGLE] = "single", - [BTRFS_RAID_RAID5] = "raid5", - [BTRFS_RAID_RAID6] = "raid6", -}; - -static const char *get_raid_name(enum btrfs_raid_types type) -{ - if (type >= BTRFS_NR_RAID_TYPES) - return NULL; - - return btrfs_raid_type_names[type]; -} - enum btrfs_loop_type { LOOP_CACHING_NOWAIT = 0, LOOP_CACHING_WAIT = 1, @@ -7662,7 +7591,7 @@ have_block_group: if (offset) { /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(fs_info, + trace_btrfs_reserve_extent_cluster( used_block_group, search_start, num_bytes); if (used_block_group != block_group) { @@ -7735,7 +7664,7 @@ refill_cluster: if (offset) { /* we found one, proceed */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(fs_info, + trace_btrfs_reserve_extent_cluster( block_group, search_start, num_bytes); goto checks; @@ -7835,8 +7764,7 @@ checks: ins->objectid = search_start; ins->offset = num_bytes; - trace_btrfs_reserve_extent(fs_info, block_group, - search_start, num_bytes); + trace_btrfs_reserve_extent(block_group, search_start, num_bytes); btrfs_release_block_group(block_group, delalloc); break; loop: @@ -8184,8 +8112,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, - ins->offset); + ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset); if (ret) return ret; @@ -8200,37 +8127,52 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, } static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - u64 parent, u64 root_objectid, - u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins) + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op) { + struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_extent_item *extent_item; + struct btrfs_key extent_key; struct btrfs_tree_block_info *block_info; struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; struct extent_buffer *leaf; + struct btrfs_delayed_tree_ref *ref; u32 size = sizeof(*extent_item) + sizeof(*iref); - u64 num_bytes = ins->offset; + u64 num_bytes; + u64 flags = extent_op->flags_to_set; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); - if (!skinny_metadata) + ref = btrfs_delayed_node_to_tree_ref(node); + + extent_key.objectid = node->bytenr; + if (skinny_metadata) { + extent_key.offset = ref->level; + extent_key.type = BTRFS_METADATA_ITEM_KEY; + num_bytes = fs_info->nodesize; + } else { + extent_key.offset = node->num_bytes; + extent_key.type = BTRFS_EXTENT_ITEM_KEY; size += sizeof(*block_info); + num_bytes = node->num_bytes; + } path = btrfs_alloc_path(); if (!path) { - btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid, + btrfs_free_and_pin_reserved_extent(fs_info, + extent_key.objectid, fs_info->nodesize); return -ENOMEM; } path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, - ins, size); + &extent_key, size); if (ret) { btrfs_free_path(path); - btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid, + btrfs_free_and_pin_reserved_extent(fs_info, + extent_key.objectid, fs_info->nodesize); return ret; } @@ -8245,42 +8187,41 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (skinny_metadata) { iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); - num_bytes = fs_info->nodesize; } else { block_info = (struct btrfs_tree_block_info *)(extent_item + 1); - btrfs_set_tree_block_key(leaf, block_info, key); - btrfs_set_tree_block_level(leaf, block_info, level); + btrfs_set_tree_block_key(leaf, block_info, &extent_op->key); + btrfs_set_tree_block_level(leaf, block_info, ref->level); iref = (struct btrfs_extent_inline_ref *)(block_info + 1); } - if (parent > 0) { + if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_SHARED_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent); } else { btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root); } btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, + ret = remove_from_free_space_tree(trans, extent_key.objectid, num_bytes); if (ret) return ret; - ret = update_block_group(trans, fs_info, ins->objectid, + ret = update_block_group(trans, fs_info, extent_key.objectid, fs_info->nodesize, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", - ins->objectid, ins->offset); + extent_key.objectid, extent_key.offset); BUG(); } - trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, + trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid, fs_info->nodesize); return ret; } @@ -10173,8 +10114,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) } else if (btrfs_block_group_used(&cache->item) == 0) { cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; - add_new_free_space(cache, info, - found_key.objectid, + add_new_free_space(cache, found_key.objectid, found_key.objectid + found_key.offset); free_excluded_extents(info, cache); @@ -10204,6 +10144,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) /* Should always be true but just in case. */ if (list_empty(&cache->bg_list)) { btrfs_get_block_group(cache); + trace_btrfs_add_unused_block_group(cache); list_add_tail(&cache->bg_list, &info->unused_bgs); } @@ -10269,7 +10210,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) key.offset); if (ret) btrfs_abort_transaction(trans, ret); - add_block_group_free_space(trans, fs_info, block_group); + add_block_group_free_space(trans, block_group); /* already aborted the transaction if it failed. */ next: list_del_init(&block_group->bg_list); @@ -10310,7 +10251,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, return ret; } - add_new_free_space(cache, fs_info, chunk_offset, chunk_offset + size); + add_new_free_space(cache, chunk_offset, chunk_offset + size); free_excluded_extents(fs_info, cache); @@ -10391,6 +10332,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, BUG_ON(!block_group); BUG_ON(!block_group->ro); + trace_btrfs_remove_block_group(block_group); /* * Free the reserved super bytes from this block group before * remove it. @@ -10648,7 +10590,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, mutex_unlock(&fs_info->chunk_mutex); - ret = remove_block_group_free_space(trans, fs_info, block_group); + ret = remove_block_group_free_space(trans, block_group); if (ret) goto out; @@ -10755,6 +10697,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * the ro check in case balance is currently acting on * this block group. */ + trace_btrfs_skip_unused_block_group(block_group); spin_unlock(&block_group->lock); up_write(&space_info->groups_sem); goto next; @@ -10877,7 +10820,6 @@ next: int btrfs_init_space_info(struct btrfs_fs_info *fs_info) { - struct btrfs_space_info *space_info; struct btrfs_super_block *disk_super; u64 features; u64 flags; @@ -10893,21 +10835,21 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) mixed = 1; flags = BTRFS_BLOCK_GROUP_SYSTEM; - ret = create_space_info(fs_info, flags, &space_info); + ret = create_space_info(fs_info, flags); if (ret) goto out; if (mixed) { flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; - ret = create_space_info(fs_info, flags, &space_info); + ret = create_space_info(fs_info, flags); } else { flags = BTRFS_BLOCK_GROUP_METADATA; - ret = create_space_info(fs_info, flags, &space_info); + ret = create_space_info(fs_info, flags); if (ret) goto out; flags = BTRFS_BLOCK_GROUP_DATA; - ret = create_space_info(fs_info, flags, &space_info); + ret = create_space_info(fs_info, flags); } out: return ret; @@ -11092,12 +11034,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) void btrfs_end_write_no_snapshotting(struct btrfs_root *root) { percpu_counter_dec(&root->subv_writers->counter); - /* - * Make sure counter is updated before we wake up waiters. - */ - smp_mb(); - if (waitqueue_active(&root->subv_writers->wait)) - wake_up(&root->subv_writers->wait); + cond_wake_up(&root->subv_writers->wait); } int btrfs_start_write_no_snapshotting(struct btrfs_root *root) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e99b329002cf..51fc015c7d2c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -26,7 +26,7 @@ static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; -static struct bio_set *btrfs_bioset; +static struct bio_set btrfs_bioset; static inline bool extent_state_in_tree(const struct extent_state *state) { @@ -162,20 +162,18 @@ int __init extent_io_init(void) if (!extent_buffer_cache) goto free_state_cache; - btrfs_bioset = bioset_create(BIO_POOL_SIZE, - offsetof(struct btrfs_io_bio, bio), - BIOSET_NEED_BVECS); - if (!btrfs_bioset) + if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_io_bio, bio), + BIOSET_NEED_BVECS)) goto free_buffer_cache; - if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE)) + if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE)) goto free_bioset; return 0; free_bioset: - bioset_free(btrfs_bioset); - btrfs_bioset = NULL; + bioset_exit(&btrfs_bioset); free_buffer_cache: kmem_cache_destroy(extent_buffer_cache); @@ -198,8 +196,7 @@ void __cold extent_io_exit(void) rcu_barrier(); kmem_cache_destroy(extent_state_cache); kmem_cache_destroy(extent_buffer_cache); - if (btrfs_bioset) - bioset_free(btrfs_bioset); + bioset_exit(&btrfs_bioset); } void extent_io_tree_init(struct extent_io_tree *tree, @@ -2679,7 +2676,7 @@ struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte) { struct bio *bio; - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset); + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset); bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = first_byte >> 9; btrfs_io_bio_init(btrfs_io_bio(bio)); @@ -2692,7 +2689,7 @@ struct bio *btrfs_bio_clone(struct bio *bio) struct bio *new; /* Bio allocation backed by a bioset does not fail */ - new = bio_clone_fast(bio, GFP_NOFS, btrfs_bioset); + new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset); btrfs_bio = btrfs_io_bio(new); btrfs_io_bio_init(btrfs_bio); btrfs_bio->iter = bio->bi_iter; @@ -2704,7 +2701,7 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs) struct bio *bio; /* Bio allocation backed by a bioset does not fail */ - bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, btrfs_bioset); + bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; } @@ -2715,7 +2712,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) struct btrfs_io_bio *btrfs_bio; /* this will never fail when it's backed by a bioset */ - bio = bio_clone_fast(orig, GFP_NOFS, btrfs_bioset); + bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset); ASSERT(bio); btrfs_bio = btrfs_io_bio(bio); @@ -4109,14 +4106,13 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, return ret; } -int extent_writepages(struct extent_io_tree *tree, - struct address_space *mapping, +int extent_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret = 0; struct extent_page_data epd = { .bio = NULL, - .tree = tree, + .tree = &BTRFS_I(mapping->host)->io_tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; @@ -4126,9 +4122,8 @@ int extent_writepages(struct extent_io_tree *tree, return ret; } -int extent_readpages(struct extent_io_tree *tree, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +int extent_readpages(struct address_space *mapping, struct list_head *pages, + unsigned nr_pages) { struct bio *bio = NULL; unsigned page_idx; @@ -4136,6 +4131,7 @@ int extent_readpages(struct extent_io_tree *tree, struct page *pagepool[16]; struct page *page; struct extent_map *em_cached = NULL; + struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; int nr = 0; u64 prev_em_start = (u64)-1; @@ -4202,8 +4198,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, * are locked or under IO and drops the related state bits if it is safe * to drop the page. */ -static int try_release_extent_state(struct extent_map_tree *map, - struct extent_io_tree *tree, +static int try_release_extent_state(struct extent_io_tree *tree, struct page *page, gfp_t mask) { u64 start = page_offset(page); @@ -4238,13 +4233,13 @@ static int try_release_extent_state(struct extent_map_tree *map, * in the range corresponding to the page, both state records and extent * map records are removed */ -int try_release_extent_mapping(struct extent_map_tree *map, - struct extent_io_tree *tree, struct page *page, - gfp_t mask) +int try_release_extent_mapping(struct page *page, gfp_t mask) { struct extent_map *em; u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; + struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree; + struct extent_map_tree *map = &BTRFS_I(page->mapping->host)->extent_tree; if (gfpflags_allow_blocking(mask) && page->mapping->host->i_size > SZ_16M) { @@ -4278,7 +4273,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, free_extent_map(em); } } - return try_release_extent_state(map, tree, page, mask); + return try_release_extent_state(tree, page, mask); } /* @@ -5620,46 +5615,6 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, } } -void le_bitmap_set(u8 *map, unsigned int start, int len) -{ - u8 *p = map + BIT_BYTE(start); - const unsigned int size = start + len; - int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE); - u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start); - - while (len - bits_to_set >= 0) { - *p |= mask_to_set; - len -= bits_to_set; - bits_to_set = BITS_PER_BYTE; - mask_to_set = ~0; - p++; - } - if (len) { - mask_to_set &= BITMAP_LAST_BYTE_MASK(size); - *p |= mask_to_set; - } -} - -void le_bitmap_clear(u8 *map, unsigned int start, int len) -{ - u8 *p = map + BIT_BYTE(start); - const unsigned int size = start + len; - int bits_to_clear = BITS_PER_BYTE - (start % BITS_PER_BYTE); - u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(start); - - while (len - bits_to_clear >= 0) { - *p &= ~mask_to_clear; - len -= bits_to_clear; - bits_to_clear = BITS_PER_BYTE; - mask_to_clear = ~0; - p++; - } - if (len) { - mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); - *p &= ~mask_to_clear; - } -} - /* * eb_bitmap_offset() - calculate the page and offset of the byte containing the * given bit number diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a53009694b16..0bfd4aeb822d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -79,14 +79,6 @@ #define BITMAP_LAST_BYTE_MASK(nbits) \ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) -static inline int le_test_bit(int nr, const u8 *addr) -{ - return 1U & (addr[BIT_BYTE(nr)] >> (nr & (BITS_PER_BYTE-1))); -} - -void le_bitmap_set(u8 *map, unsigned int start, int len); -void le_bitmap_clear(u8 *map, unsigned int start, int len); - struct extent_state; struct btrfs_root; struct btrfs_inode; @@ -278,9 +270,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, int create); void extent_io_tree_init(struct extent_io_tree *tree, void *private_data); -int try_release_extent_mapping(struct extent_map_tree *map, - struct extent_io_tree *tree, struct page *page, - gfp_t mask); +int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached); @@ -421,14 +411,12 @@ int extent_invalidatepage(struct extent_io_tree *tree, int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); -int extent_writepages(struct extent_io_tree *tree, - struct address_space *mapping, +int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); -int extent_readpages(struct extent_io_tree *tree, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages); +int extent_readpages(struct address_space *mapping, struct list_head *pages, + unsigned nr_pages); int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); void set_page_extent_mapped(struct page *page); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 1b8a078f92eb..6648d55e5339 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -518,6 +518,7 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, /** * btrfs_add_extent_mapping - add extent mapping into em_tree + * @fs_info - used for tracepoint * @em_tree - the extent tree into which we want to insert the extent mapping * @em_in - extent we are inserting * @start - start of the logical range btrfs_get_extent() is requesting @@ -535,7 +536,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, * Return 0 on success, otherwise -EEXIST. * */ -int btrfs_add_extent_mapping(struct extent_map_tree *em_tree, +int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree, struct extent_map **em_in, u64 start, u64 len) { int ret; @@ -553,7 +555,7 @@ int btrfs_add_extent_mapping(struct extent_map_tree *em_tree, existing = search_extent_mapping(em_tree, start, len); - trace_btrfs_handle_em_exist(existing, em, start, len); + trace_btrfs_handle_em_exist(fs_info, existing, em, start, len); /* * existing will always be non-NULL, since there must be diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 5fcb80a6ce37..25d985e7532a 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -92,7 +92,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); -int btrfs_add_extent_mapping(struct extent_map_tree *em_tree, +int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree, struct extent_map **em_in, u64 start, u64 len); #endif diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index e5b569bebc73..d5f80cb300be 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -253,10 +253,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, truncate_pagecache(inode, 0); /* - * We don't need an orphan item because truncating the free space cache - * will never be split across transactions. - * We don't need to check for -EAGAIN because we're a free space - * cache inode + * We skip the throttling logic for free space cache inodes, so we don't + * need to check for -EAGAIN. */ ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 32a0f6cb5594..b5950aacd697 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -12,7 +12,6 @@ #include "transaction.h" static int __add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); @@ -45,11 +44,10 @@ void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache) } static int add_new_free_space_info(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) { - struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_root *root = trans->fs_info->free_space_root; struct btrfs_free_space_info *info; struct btrfs_key key; struct extent_buffer *leaf; @@ -138,10 +136,11 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize) return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE); } -static u8 *alloc_bitmap(u32 bitmap_size) +static unsigned long *alloc_bitmap(u32 bitmap_size) { - u8 *ret; + unsigned long *ret; unsigned int nofs_flag; + u32 bitmap_rounded_size = round_up(bitmap_size, sizeof(unsigned long)); /* * GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse @@ -152,21 +151,42 @@ static u8 *alloc_bitmap(u32 bitmap_size) * know that recursion is unsafe. */ nofs_flag = memalloc_nofs_save(); - ret = kvzalloc(bitmap_size, GFP_KERNEL); + ret = kvzalloc(bitmap_rounded_size, GFP_KERNEL); memalloc_nofs_restore(nofs_flag); return ret; } +static void le_bitmap_set(unsigned long *map, unsigned int start, int len) +{ + u8 *p = ((u8 *)map) + BIT_BYTE(start); + const unsigned int size = start + len; + int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE); + u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start); + + while (len - bits_to_set >= 0) { + *p |= mask_to_set; + len -= bits_to_set; + bits_to_set = BITS_PER_BYTE; + mask_to_set = ~0; + p++; + } + if (len) { + mask_to_set &= BITMAP_LAST_BYTE_MASK(size); + *p |= mask_to_set; + } +} + int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->free_space_root; struct btrfs_free_space_info *info; struct btrfs_key key, found_key; struct extent_buffer *leaf; - u8 *bitmap, *bitmap_cursor; + unsigned long *bitmap; + char *bitmap_cursor; u64 start, end; u64 bitmap_range, i; u32 bitmap_size, flags, expected_extent_count; @@ -255,7 +275,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, goto out; } - bitmap_cursor = bitmap; + bitmap_cursor = (char *)bitmap; bitmap_range = fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; i = start; while (i < end) { @@ -296,21 +316,18 @@ out: } int convert_free_space_to_extents(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->free_space_root; struct btrfs_free_space_info *info; struct btrfs_key key, found_key; struct extent_buffer *leaf; - u8 *bitmap; + unsigned long *bitmap; u64 start, end; - /* Initialize to silence GCC. */ - u64 extent_start = 0; - u64 offset; u32 bitmap_size, flags, expected_extent_count; - int prev_bit = 0, bit, bitnr; + unsigned long nrbits, start_bit, end_bit; u32 extent_count = 0; int done = 0, nr; int ret; @@ -348,7 +365,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, break; } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { unsigned long ptr; - u8 *bitmap_cursor; + char *bitmap_cursor; u32 bitmap_pos, data_size; ASSERT(found_key.objectid >= start); @@ -358,7 +375,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, bitmap_pos = div_u64(found_key.objectid - start, fs_info->sectorsize * BITS_PER_BYTE); - bitmap_cursor = bitmap + bitmap_pos; + bitmap_cursor = ((char *)bitmap) + bitmap_pos; data_size = free_space_bitmap_size(found_key.offset, fs_info->sectorsize); @@ -392,32 +409,16 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - offset = start; - bitnr = 0; - while (offset < end) { - bit = !!le_test_bit(bitnr, bitmap); - if (prev_bit == 0 && bit == 1) { - extent_start = offset; - } else if (prev_bit == 1 && bit == 0) { - key.objectid = extent_start; - key.type = BTRFS_FREE_SPACE_EXTENT_KEY; - key.offset = offset - extent_start; - - ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - if (ret) - goto out; - btrfs_release_path(path); + nrbits = div_u64(block_group->key.offset, block_group->fs_info->sectorsize); + start_bit = find_next_bit_le(bitmap, nrbits, 0); - extent_count++; - } - prev_bit = bit; - offset += fs_info->sectorsize; - bitnr++; - } - if (prev_bit == 1) { - key.objectid = extent_start; + while (start_bit < nrbits) { + end_bit = find_next_zero_bit_le(bitmap, nrbits, start_bit); + ASSERT(start_bit < end_bit); + + key.objectid = start + start_bit * block_group->fs_info->sectorsize; key.type = BTRFS_FREE_SPACE_EXTENT_KEY; - key.offset = end - extent_start; + key.offset = (end_bit - start_bit) * block_group->fs_info->sectorsize; ret = btrfs_insert_empty_item(trans, root, path, &key, 0); if (ret) @@ -425,6 +426,8 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, btrfs_release_path(path); extent_count++; + + start_bit = find_next_bit_le(bitmap, nrbits, end_bit); } if (extent_count != expected_extent_count) { @@ -446,7 +449,6 @@ out: } static int update_free_space_extent_count(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, int new_extents) @@ -459,7 +461,8 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans, if (new_extents == 0) return 0; - info = search_free_space_info(trans, fs_info, block_group, path, 1); + info = search_free_space_info(trans, trans->fs_info, block_group, path, + 1); if (IS_ERR(info)) { ret = PTR_ERR(info); goto out; @@ -474,12 +477,10 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans, if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) && extent_count > block_group->bitmap_high_thresh) { - ret = convert_free_space_to_bitmaps(trans, fs_info, block_group, - path); + ret = convert_free_space_to_bitmaps(trans, block_group, path); } else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) && extent_count < block_group->bitmap_low_thresh) { - ret = convert_free_space_to_extents(trans, fs_info, block_group, - path); + ret = convert_free_space_to_extents(trans, block_group, path); } out: @@ -576,12 +577,11 @@ static int free_space_next_bitmap(struct btrfs_trans_handle *trans, * the bitmap. */ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size, int remove) { - struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_root *root = block_group->fs_info->free_space_root; struct btrfs_key key; u64 end = start + size; u64 cur_start, cur_size; @@ -682,7 +682,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - ret = update_free_space_extent_count(trans, fs_info, block_group, path, + ret = update_free_space_extent_count(trans, block_group, path, new_extents); out: @@ -690,12 +690,11 @@ out: } static int remove_free_space_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size) { - struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_root *root = trans->fs_info->free_space_root; struct btrfs_key key; u64 found_start, found_end; u64 end = start + size; @@ -769,7 +768,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - ret = update_free_space_extent_count(trans, fs_info, block_group, path, + ret = update_free_space_extent_count(trans, block_group, path, new_extents); out: @@ -777,7 +776,6 @@ out: } int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size) { @@ -786,36 +784,35 @@ int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, int ret; if (block_group->needs_free_space) { - ret = __add_block_group_free_space(trans, fs_info, block_group, - path); + ret = __add_block_group_free_space(trans, block_group, path); if (ret) return ret; } - info = search_free_space_info(NULL, fs_info, block_group, path, 0); + info = search_free_space_info(NULL, trans->fs_info, block_group, path, + 0); if (IS_ERR(info)) return PTR_ERR(info); flags = btrfs_free_space_flags(path->nodes[0], info); btrfs_release_path(path); if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { - return modify_free_space_bitmap(trans, fs_info, block_group, - path, start, size, 1); + return modify_free_space_bitmap(trans, block_group, path, + start, size, 1); } else { - return remove_free_space_extent(trans, fs_info, block_group, - path, start, size); + return remove_free_space_extent(trans, block_group, path, + start, size); } } int remove_from_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 start, u64 size) { struct btrfs_block_group_cache *block_group; struct btrfs_path *path; int ret; - if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) return 0; path = btrfs_alloc_path(); @@ -824,7 +821,7 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, goto out; } - block_group = btrfs_lookup_block_group(fs_info, start); + block_group = btrfs_lookup_block_group(trans->fs_info, start); if (!block_group) { ASSERT(0); ret = -ENOENT; @@ -832,8 +829,8 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, } mutex_lock(&block_group->free_space_lock); - ret = __remove_from_free_space_tree(trans, fs_info, block_group, path, - start, size); + ret = __remove_from_free_space_tree(trans, block_group, path, start, + size); mutex_unlock(&block_group->free_space_lock); btrfs_put_block_group(block_group); @@ -845,12 +842,11 @@ out: } static int add_free_space_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size) { - struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_root *root = trans->fs_info->free_space_root; struct btrfs_key key, new_key; u64 found_start, found_end; u64 end = start + size; @@ -965,7 +961,7 @@ insert: goto out; btrfs_release_path(path); - ret = update_free_space_extent_count(trans, fs_info, block_group, path, + ret = update_free_space_extent_count(trans, block_group, path, new_extents); out: @@ -973,17 +969,16 @@ out: } int __add_to_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_free_space_info *info; u32 flags; int ret; if (block_group->needs_free_space) { - ret = __add_block_group_free_space(trans, fs_info, block_group, - path); + ret = __add_block_group_free_space(trans, block_group, path); if (ret) return ret; } @@ -995,23 +990,22 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { - return modify_free_space_bitmap(trans, fs_info, block_group, - path, start, size, 0); + return modify_free_space_bitmap(trans, block_group, path, + start, size, 0); } else { - return add_free_space_extent(trans, fs_info, block_group, path, - start, size); + return add_free_space_extent(trans, block_group, path, start, + size); } } int add_to_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 start, u64 size) { struct btrfs_block_group_cache *block_group; struct btrfs_path *path; int ret; - if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) return 0; path = btrfs_alloc_path(); @@ -1020,7 +1014,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans, goto out; } - block_group = btrfs_lookup_block_group(fs_info, start); + block_group = btrfs_lookup_block_group(trans->fs_info, start); if (!block_group) { ASSERT(0); ret = -ENOENT; @@ -1028,8 +1022,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans, } mutex_lock(&block_group->free_space_lock); - ret = __add_to_free_space_tree(trans, fs_info, block_group, path, start, - size); + ret = __add_to_free_space_tree(trans, block_group, path, start, size); mutex_unlock(&block_group->free_space_lock); btrfs_put_block_group(block_group); @@ -1046,10 +1039,9 @@ out: * through the normal add/remove hooks. */ static int populate_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group) { - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_root *extent_root = trans->fs_info->extent_root; struct btrfs_path *path, *path2; struct btrfs_key key; u64 start, end; @@ -1066,7 +1058,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, return -ENOMEM; } - ret = add_new_free_space_info(trans, fs_info, block_group, path2); + ret = add_new_free_space_info(trans, block_group, path2); if (ret) goto out; @@ -1099,7 +1091,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, break; if (start < key.objectid) { - ret = __add_to_free_space_tree(trans, fs_info, + ret = __add_to_free_space_tree(trans, block_group, path2, start, key.objectid - @@ -1109,7 +1101,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, } start = key.objectid; if (key.type == BTRFS_METADATA_ITEM_KEY) - start += fs_info->nodesize; + start += trans->fs_info->nodesize; else start += key.offset; } else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { @@ -1124,8 +1116,8 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, break; } if (start < end) { - ret = __add_to_free_space_tree(trans, fs_info, block_group, - path2, start, end - start); + ret = __add_to_free_space_tree(trans, block_group, path2, + start, end - start); if (ret) goto out_locked; } @@ -1165,7 +1157,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) while (node) { block_group = rb_entry(node, struct btrfs_block_group_cache, cache_node); - ret = populate_free_space_tree(trans, fs_info, block_group); + ret = populate_free_space_tree(trans, block_group); if (ret) goto abort; node = rb_next(node); @@ -1269,7 +1261,6 @@ abort: } static int __add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) { @@ -1277,19 +1268,19 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, block_group->needs_free_space = 0; - ret = add_new_free_space_info(trans, fs_info, block_group, path); + ret = add_new_free_space_info(trans, block_group, path); if (ret) return ret; - return __add_to_free_space_tree(trans, fs_info, block_group, path, + return __add_to_free_space_tree(trans, block_group, path, block_group->key.objectid, block_group->key.offset); } int add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_path *path = NULL; int ret = 0; @@ -1306,7 +1297,7 @@ int add_block_group_free_space(struct btrfs_trans_handle *trans, goto out; } - ret = __add_block_group_free_space(trans, fs_info, block_group, path); + ret = __add_block_group_free_space(trans, block_group, path); out: btrfs_free_path(path); @@ -1317,10 +1308,9 @@ out: } int remove_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group) { - struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_root *root = trans->fs_info->free_space_root; struct btrfs_path *path; struct btrfs_key key, found_key; struct extent_buffer *leaf; @@ -1328,7 +1318,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, int done = 0, nr; int ret; - if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) return 0; if (block_group->needs_free_space) { @@ -1439,7 +1429,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, extent_start = offset; } else if (prev_bit == 1 && bit == 0) { total_found += add_new_free_space(block_group, - fs_info, extent_start, offset); if (total_found > CACHING_CTL_WAKE_UP) { @@ -1453,8 +1442,8 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, } } if (prev_bit == 1) { - total_found += add_new_free_space(block_group, fs_info, - extent_start, end); + total_found += add_new_free_space(block_group, extent_start, + end); extent_count++; } @@ -1511,8 +1500,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, caching_ctl->progress = key.objectid; - total_found += add_new_free_space(block_group, fs_info, - key.objectid, + total_found += add_new_free_space(block_group, key.objectid, key.objectid + key.offset); if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index 874b4feecad2..3133651d7d70 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -19,16 +19,12 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info); int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info); int load_free_space_tree(struct btrfs_caching_control *caching_ctl); int add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group); int remove_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group); int add_to_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 start, u64 size); int remove_from_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 start, u64 size); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -38,19 +34,15 @@ search_free_space_info(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, int cow); int __add_to_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size); int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size); int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); int convert_free_space_to_extents(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); int free_space_test_bit(struct btrfs_block_group_cache *block_group, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d241285a0d2a..89b208201783 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1018,8 +1018,10 @@ static noinline int cow_file_range(struct inode *inode, ram_size, /* ram_bytes */ BTRFS_COMPRESS_NONE, /* compress_type */ BTRFS_ORDERED_REGULAR /* type */); - if (IS_ERR(em)) + if (IS_ERR(em)) { + ret = PTR_ERR(em); goto out_reserve; + } free_extent_map(em); ret = btrfs_add_ordered_extent(inode, start, ins.objectid, @@ -1156,13 +1158,10 @@ static noinline void async_cow_submit(struct btrfs_work *work) nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >> PAGE_SHIFT; - /* - * atomic_sub_return implies a barrier for waitqueue_active - */ + /* atomic_sub_return implies a barrier */ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < - 5 * SZ_1M && - waitqueue_active(&fs_info->async_submit_wait)) - wake_up(&fs_info->async_submit_wait); + 5 * SZ_1M) + cond_wake_up_nomb(&fs_info->async_submit_wait); if (async_cow->inode) submit_compressed_extents(async_cow->inode, async_cow); @@ -1373,6 +1372,13 @@ next_slot: btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) goto out_check; + /* + * Do the same check as in btrfs_cross_ref_exist but + * without the unnecessary search. + */ + if (btrfs_file_extent_generation(leaf, fi) <= + btrfs_root_last_snapshot(&root->root_item)) + goto out_check; if (extent_type == BTRFS_FILE_EXTENT_REG && !force) goto out_check; if (btrfs_extent_readonly(fs_info, disk_bytenr)) @@ -1742,24 +1748,32 @@ static void btrfs_add_delalloc_inodes(struct btrfs_root *root, spin_unlock(&root->delalloc_lock); } -static void btrfs_del_delalloc_inode(struct btrfs_root *root, - struct btrfs_inode *inode) + +void __btrfs_del_delalloc_inode(struct btrfs_root *root, + struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - spin_lock(&root->delalloc_lock); if (!list_empty(&inode->delalloc_inodes)) { list_del_init(&inode->delalloc_inodes); clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags); root->nr_delalloc_inodes--; if (!root->nr_delalloc_inodes) { + ASSERT(list_empty(&root->delalloc_inodes)); spin_lock(&fs_info->delalloc_root_lock); BUG_ON(list_empty(&root->delalloc_root)); list_del_init(&root->delalloc_root); spin_unlock(&fs_info->delalloc_root_lock); } } +} + +static void btrfs_del_delalloc_inode(struct btrfs_root *root, + struct btrfs_inode *inode) +{ + spin_lock(&root->delalloc_lock); + __btrfs_del_delalloc_inode(root, inode); spin_unlock(&root->delalloc_lock); } @@ -3151,6 +3165,9 @@ out: /* once for the tree */ btrfs_put_ordered_extent(ordered_extent); + /* Try to release some metadata so we don't get an OOM but don't wait */ + btrfs_btree_balance_dirty_nodelay(fs_info); + return ret; } @@ -3293,177 +3310,31 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) } /* - * This is called in transaction commit time. If there are no orphan - * files in the subvolume, it removes orphan item and frees block_rsv - * structure. - */ -void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *block_rsv; - int ret; - - if (atomic_read(&root->orphan_inodes) || - root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) - return; - - spin_lock(&root->orphan_lock); - if (atomic_read(&root->orphan_inodes)) { - spin_unlock(&root->orphan_lock); - return; - } - - if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) { - spin_unlock(&root->orphan_lock); - return; - } - - block_rsv = root->orphan_block_rsv; - root->orphan_block_rsv = NULL; - spin_unlock(&root->orphan_lock); - - if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) && - btrfs_root_refs(&root->root_item) > 0) { - ret = btrfs_del_orphan_item(trans, fs_info->tree_root, - root->root_key.objectid); - if (ret) - btrfs_abort_transaction(trans, ret); - else - clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, - &root->state); - } - - if (block_rsv) { - WARN_ON(block_rsv->size > 0); - btrfs_free_block_rsv(fs_info, block_rsv); - } -} - -/* - * This creates an orphan entry for the given inode in case something goes - * wrong in the middle of an unlink/truncate. - * - * NOTE: caller of this function should reserve 5 units of metadata for - * this function. + * This creates an orphan entry for the given inode in case something goes wrong + * in the middle of an unlink. */ int btrfs_orphan_add(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode) + struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - struct btrfs_root *root = inode->root; - struct btrfs_block_rsv *block_rsv = NULL; - int reserve = 0; - bool insert = false; int ret; - if (!root->orphan_block_rsv) { - block_rsv = btrfs_alloc_block_rsv(fs_info, - BTRFS_BLOCK_RSV_TEMP); - if (!block_rsv) - return -ENOMEM; - } - - if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &inode->runtime_flags)) - insert = true; - - if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, - &inode->runtime_flags)) - reserve = 1; - - spin_lock(&root->orphan_lock); - /* If someone has created ->orphan_block_rsv, be happy to use it. */ - if (!root->orphan_block_rsv) { - root->orphan_block_rsv = block_rsv; - } else if (block_rsv) { - btrfs_free_block_rsv(fs_info, block_rsv); - block_rsv = NULL; - } - - if (insert) - atomic_inc(&root->orphan_inodes); - spin_unlock(&root->orphan_lock); - - /* grab metadata reservation from transaction handle */ - if (reserve) { - ret = btrfs_orphan_reserve_metadata(trans, inode); - ASSERT(!ret); - if (ret) { - /* - * dec doesn't need spin_lock as ->orphan_block_rsv - * would be released only if ->orphan_inodes is - * zero. - */ - atomic_dec(&root->orphan_inodes); - clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, - &inode->runtime_flags); - if (insert) - clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &inode->runtime_flags); - return ret; - } - } - - /* insert an orphan item to track this unlinked/truncated file */ - if (insert) { - ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); - if (ret) { - if (reserve) { - clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, - &inode->runtime_flags); - btrfs_orphan_release_metadata(inode); - } - /* - * btrfs_orphan_commit_root may race with us and set - * ->orphan_block_rsv to zero, in order to avoid that, - * decrease ->orphan_inodes after everything is done. - */ - atomic_dec(&root->orphan_inodes); - if (ret != -EEXIST) { - clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &inode->runtime_flags); - btrfs_abort_transaction(trans, ret); - return ret; - } - } - ret = 0; + ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode)); + if (ret && ret != -EEXIST) { + btrfs_abort_transaction(trans, ret); + return ret; } return 0; } /* - * We have done the truncate/delete so we can go ahead and remove the orphan - * item for this particular inode. + * We have done the delete so we can go ahead and remove the orphan item for + * this particular inode. */ static int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { - struct btrfs_root *root = inode->root; - int delete_item = 0; - int ret = 0; - - if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &inode->runtime_flags)) - delete_item = 1; - - if (delete_item && trans) - ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); - - if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, - &inode->runtime_flags)) - btrfs_orphan_release_metadata(inode); - - /* - * btrfs_orphan_commit_root may race with us and set ->orphan_block_rsv - * to zero, in order to avoid that, decrease ->orphan_inodes after - * everything is done. - */ - if (delete_item) - atomic_dec(&root->orphan_inodes); - - return ret; + return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode)); } /* @@ -3479,7 +3350,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) struct btrfs_trans_handle *trans; struct inode *inode; u64 last_objectid = 0; - int ret = 0, nr_unlink = 0, nr_truncate = 0; + int ret = 0, nr_unlink = 0; if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) return 0; @@ -3579,12 +3450,31 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) key.offset = found_key.objectid - 1; continue; } + } + /* - * Inode is already gone but the orphan item is still there, - * kill the orphan item. + * If we have an inode with links, there are a couple of + * possibilities. Old kernels (before v3.12) used to create an + * orphan item for truncate indicating that there were possibly + * extent items past i_size that needed to be deleted. In v3.12, + * truncate was changed to update i_size in sync with the extent + * items, but the (useless) orphan item was still created. Since + * v4.18, we don't create the orphan item for truncate at all. + * + * So, this item could mean that we need to do a truncate, but + * only if this filesystem was last used on a pre-v3.12 kernel + * and was not cleanly unmounted. The odds of that are quite + * slim, and it's a pain to do the truncate now, so just delete + * the orphan item. + * + * It's also possible that this orphan item was supposed to be + * deleted but wasn't. The inode number may have been reused, + * but either way, we can delete the orphan item. */ - if (ret == -ENOENT) { + if (ret == -ENOENT || inode->i_nlink) { + if (!ret) + iput(inode); trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -3600,42 +3490,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) continue; } - /* - * add this inode to the orphan list so btrfs_orphan_del does - * the proper thing when we hit it - */ - set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &BTRFS_I(inode)->runtime_flags); - atomic_inc(&root->orphan_inodes); - - /* if we have links, this was a truncate, lets do that */ - if (inode->i_nlink) { - if (WARN_ON(!S_ISREG(inode->i_mode))) { - iput(inode); - continue; - } - nr_truncate++; - - /* 1 for the orphan item deletion. */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - iput(inode); - ret = PTR_ERR(trans); - goto out; - } - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); - btrfs_end_transaction(trans); - if (ret) { - iput(inode); - goto out; - } - - ret = btrfs_truncate(inode, false); - if (ret) - btrfs_orphan_del(NULL, BTRFS_I(inode)); - } else { - nr_unlink++; - } + nr_unlink++; /* this will do delete_inode and everything for us */ iput(inode); @@ -3647,12 +3502,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; - if (root->orphan_block_rsv) - btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, - (u64)-1); - - if (root->orphan_block_rsv || - test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { + if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { trans = btrfs_join_transaction(root); if (!IS_ERR(trans)) btrfs_end_transaction(trans); @@ -3660,8 +3510,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) if (nr_unlink) btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink); - if (nr_truncate) - btrfs_debug(fs_info, "truncated %d orphans", nr_truncate); out: if (ret) @@ -3924,7 +3772,7 @@ cache_acl: break; } - btrfs_update_iflags(inode); + btrfs_sync_inode_flags_to_i_flags(inode); return 0; make_bad: @@ -4238,7 +4086,7 @@ out: return ret; } -int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, +static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *dir, u64 objectid, const char *name, int name_len) @@ -4319,6 +4167,262 @@ out: return ret; } +/* + * Helper to check if the subvolume references other subvolumes or if it's + * default. + */ +static noinline int may_destroy_subvol(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + struct btrfs_dir_item *di; + struct btrfs_key key; + u64 dir_id; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* Make sure this root isn't set as the default subvol */ + dir_id = btrfs_super_root_dir(fs_info->super_copy); + di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, + dir_id, "default", 7, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); + if (key.objectid == root->root_key.objectid) { + ret = -EPERM; + btrfs_err(fs_info, + "deleting default subvolume %llu is not allowed", + key.objectid); + goto out; + } + btrfs_release_path(path); + } + + key.objectid = root->root_key.objectid; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + ret = 0; + if (path->slots[0] > 0) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid == root->root_key.objectid && + key.type == BTRFS_ROOT_REF_KEY) + ret = -ENOTEMPTY; + } +out: + btrfs_free_path(path); + return ret; +} + +/* Delete all dentries for inodes belonging to the root */ +static void btrfs_prune_dentries(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct rb_node *node; + struct rb_node *prev; + struct btrfs_inode *entry; + struct inode *inode; + u64 objectid = 0; + + if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + WARN_ON(btrfs_root_refs(&root->root_item) != 0); + + spin_lock(&root->inode_lock); +again: + node = root->inode_tree.rb_node; + prev = NULL; + while (node) { + prev = node; + entry = rb_entry(node, struct btrfs_inode, rb_node); + + if (objectid < btrfs_ino(BTRFS_I(&entry->vfs_inode))) + node = node->rb_left; + else if (objectid > btrfs_ino(BTRFS_I(&entry->vfs_inode))) + node = node->rb_right; + else + break; + } + if (!node) { + while (prev) { + entry = rb_entry(prev, struct btrfs_inode, rb_node); + if (objectid <= btrfs_ino(BTRFS_I(&entry->vfs_inode))) { + node = prev; + break; + } + prev = rb_next(prev); + } + } + while (node) { + entry = rb_entry(node, struct btrfs_inode, rb_node); + objectid = btrfs_ino(BTRFS_I(&entry->vfs_inode)) + 1; + inode = igrab(&entry->vfs_inode); + if (inode) { + spin_unlock(&root->inode_lock); + if (atomic_read(&inode->i_count) > 1) + d_prune_aliases(inode); + /* + * btrfs_drop_inode will have it removed from the inode + * cache when its usage count hits zero. + */ + iput(inode); + cond_resched(); + spin_lock(&root->inode_lock); + goto again; + } + + if (cond_resched_lock(&root->inode_lock)) + goto again; + + node = rb_next(node); + } + spin_unlock(&root->inode_lock); +} + +int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = d_inode(dentry); + struct btrfs_root *dest = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_block_rsv block_rsv; + u64 root_flags; + int ret; + int err; + + /* + * Don't allow to delete a subvolume with send in progress. This is + * inside the inode lock so the error handling that has to drop the bit + * again is not run concurrently. + */ + spin_lock(&dest->root_item_lock); + root_flags = btrfs_root_flags(&dest->root_item); + if (dest->send_in_progress == 0) { + btrfs_set_root_flags(&dest->root_item, + root_flags | BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); + } else { + spin_unlock(&dest->root_item_lock); + btrfs_warn(fs_info, + "attempt to delete subvolume %llu during send", + dest->root_key.objectid); + return -EPERM; + } + + down_write(&fs_info->subvol_sem); + + err = may_destroy_subvol(dest); + if (err) + goto out_up_write; + + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); + /* + * One for dir inode, + * two for dir entries, + * two for root ref/backref. + */ + err = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); + if (err) + goto out_up_write; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_release; + } + trans->block_rsv = &block_rsv; + trans->bytes_reserved = block_rsv.size; + + btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); + + ret = btrfs_unlink_subvol(trans, root, dir, + dest->root_key.objectid, + dentry->d_name.name, + dentry->d_name.len); + if (ret) { + err = ret; + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } + + btrfs_record_root_in_trans(trans, dest); + + memset(&dest->root_item.drop_progress, 0, + sizeof(dest->root_item.drop_progress)); + dest->root_item.drop_level = 0; + btrfs_set_root_refs(&dest->root_item, 0); + + if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { + ret = btrfs_insert_orphan_item(trans, + fs_info->tree_root, + dest->root_key.objectid); + if (ret) { + btrfs_abort_transaction(trans, ret); + err = ret; + goto out_end_trans; + } + } + + ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid, + BTRFS_UUID_KEY_SUBVOL, + dest->root_key.objectid); + if (ret && ret != -ENOENT) { + btrfs_abort_transaction(trans, ret); + err = ret; + goto out_end_trans; + } + if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { + ret = btrfs_uuid_tree_remove(trans, + dest->root_item.received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + dest->root_key.objectid); + if (ret && ret != -ENOENT) { + btrfs_abort_transaction(trans, ret); + err = ret; + goto out_end_trans; + } + } + +out_end_trans: + trans->block_rsv = NULL; + trans->bytes_reserved = 0; + ret = btrfs_end_transaction(trans); + if (ret && !err) + err = ret; + inode->i_flags |= S_DEAD; +out_release: + btrfs_subvolume_release_metadata(fs_info, &block_rsv); +out_up_write: + up_write(&fs_info->subvol_sem); + if (err) { + spin_lock(&dest->root_item_lock); + root_flags = btrfs_root_flags(&dest->root_item); + btrfs_set_root_flags(&dest->root_item, + root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); + } else { + d_invalidate(dentry); + btrfs_prune_dentries(dest); + ASSERT(dest->send_in_progress == 0); + + /* the last ref */ + if (dest->ino_cache_inode) { + iput(dest->ino_cache_inode); + dest->ino_cache_inode = NULL; + } + } + + return err; +} + static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); @@ -4330,7 +4434,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) - return -EPERM; + return btrfs_delete_subvolume(dir, dentry); trans = __unlink_start_trans(dir); if (IS_ERR(trans)) @@ -4442,7 +4546,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int pending_del_slot = 0; int extent_type = -1; int ret; - int err = 0; u64 ino = btrfs_ino(BTRFS_I(inode)); u64 bytes_deleted = 0; bool be_nice = false; @@ -4494,22 +4597,19 @@ search_again: * up a huge file in a single leaf. Most of the time that * bytes_deleted is > 0, it will be huge by the time we get here */ - if (be_nice && bytes_deleted > SZ_32M) { - if (btrfs_should_end_transaction(trans)) { - err = -EAGAIN; - goto error; - } + if (be_nice && bytes_deleted > SZ_32M && + btrfs_should_end_transaction(trans)) { + ret = -EAGAIN; + goto out; } - path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } if (ret > 0) { + ret = 0; /* there are no items in the tree for us to truncate, we're * done */ @@ -4620,7 +4720,7 @@ search_again: * We have to bail so the last_size is set to * just before this extent. */ - err = NEED_TRUNCATE_BLOCK; + ret = NEED_TRUNCATE_BLOCK; break; } @@ -4659,7 +4759,10 @@ delete: extent_num_bytes, 0, btrfs_header_owner(leaf), ino, extent_offset); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } if (btrfs_should_throttle_delayed_refs(trans, fs_info)) btrfs_async_run_delayed_refs(fs_info, trans->delayed_ref_updates * 2, @@ -4687,7 +4790,7 @@ delete: pending_del_nr); if (ret) { btrfs_abort_transaction(trans, ret); - goto error; + break; } pending_del_nr = 0; } @@ -4698,8 +4801,8 @@ delete: trans->delayed_ref_updates = 0; ret = btrfs_run_delayed_refs(trans, updates * 2); - if (ret && !err) - err = ret; + if (ret) + break; } } /* @@ -4707,8 +4810,8 @@ delete: * and let the transaction restart */ if (should_end) { - err = -EAGAIN; - goto error; + ret = -EAGAIN; + break; } goto search_again; } else { @@ -4716,32 +4819,37 @@ delete: } } out: - if (pending_del_nr) { - ret = btrfs_del_items(trans, root, path, pending_del_slot, + if (ret >= 0 && pending_del_nr) { + int err; + + err = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); - if (ret) - btrfs_abort_transaction(trans, ret); + if (err) { + btrfs_abort_transaction(trans, err); + ret = err; + } } -error: if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ASSERT(last_size >= new_size); - if (!err && last_size > new_size) + if (!ret && last_size > new_size) last_size = new_size; btrfs_ordered_update_i_size(inode, last_size, NULL); } btrfs_free_path(path); - if (be_nice && bytes_deleted > SZ_32M) { + if (be_nice && bytes_deleted > SZ_32M && (ret >= 0 || ret == -EAGAIN)) { unsigned long updates = trans->delayed_ref_updates; + int err; + if (updates) { trans->delayed_ref_updates = 0; - ret = btrfs_run_delayed_refs(trans, updates * 2); - if (ret && !err) - err = ret; + err = btrfs_run_delayed_refs(trans, updates * 2); + if (err) + ret = err; } } - return err; + return ret; } /* @@ -5083,30 +5191,6 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, &BTRFS_I(inode)->runtime_flags); - /* - * 1 for the orphan item we're going to add - * 1 for the orphan item deletion. - */ - trans = btrfs_start_transaction(root, 2); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - /* - * We need to do this in case we fail at _any_ point during the - * actual truncate. Once we do the truncate_setsize we could - * invalidate pages which forces any outstanding ordered io to - * be instantly completed which will give us extents that need - * to be truncated. If we fail to get an orphan inode down we - * could have left over extents that were never meant to live, - * so we need to guarantee from this point on that everything - * will be consistent. - */ - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); - btrfs_end_transaction(trans); - if (ret) - return ret; - - /* we don't support swapfiles, so vmtruncate shouldn't fail */ truncate_setsize(inode, newsize); /* Disable nonlocked read DIO to avoid the end less truncate */ @@ -5118,29 +5202,16 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) if (ret && inode->i_nlink) { int err; - /* To get a stable disk_i_size */ - err = btrfs_wait_ordered_range(inode, 0, (u64)-1); - if (err) { - btrfs_orphan_del(NULL, BTRFS_I(inode)); - return err; - } - /* - * failed to truncate, disk_i_size is only adjusted down - * as we remove extents, so it should represent the true - * size of the inode, so reset the in memory size and - * delete our orphan entry. + * Truncate failed, so fix up the in-memory size. We + * adjusted disk_i_size down as we removed extents, so + * wait for disk_i_size to be stable and then update the + * in-memory size to match. */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - btrfs_orphan_del(NULL, BTRFS_I(inode)); - return ret; - } - i_size_write(inode, BTRFS_I(inode)->disk_i_size); - err = btrfs_orphan_del(trans, BTRFS_I(inode)); + err = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (err) - btrfs_abort_transaction(trans, err); - btrfs_end_transaction(trans); + return err; + i_size_write(inode, BTRFS_I(inode)->disk_i_size); } } @@ -5270,13 +5341,52 @@ static void evict_inode_truncate_pages(struct inode *inode) spin_unlock(&io_tree->lock); } +static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + u64 min_size) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + int failures = 0; + + for (;;) { + struct btrfs_trans_handle *trans; + int ret; + + ret = btrfs_block_rsv_refill(root, rsv, min_size, + BTRFS_RESERVE_FLUSH_LIMIT); + + if (ret && ++failures > 2) { + btrfs_warn(fs_info, + "could not allocate space for a delete; will truncate on mount"); + return ERR_PTR(-ENOSPC); + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans) || !ret) + return trans; + + /* + * Try to steal from the global reserve if there is space for + * it. + */ + if (!btrfs_check_space_for_delayed_refs(trans, fs_info) && + !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0)) + return trans; + + /* If not, commit and try again. */ + ret = btrfs_commit_transaction(trans); + if (ret) + return ERR_PTR(ret); + } +} + void btrfs_evict_inode(struct inode *inode) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_block_rsv *rsv, *global_rsv; - int steal_from_global = 0; + struct btrfs_block_rsv *rsv; u64 min_size; int ret; @@ -5297,21 +5407,16 @@ void btrfs_evict_inode(struct inode *inode) btrfs_is_free_space_inode(BTRFS_I(inode)))) goto no_delete; - if (is_bad_inode(inode)) { - btrfs_orphan_del(NULL, BTRFS_I(inode)); + if (is_bad_inode(inode)) goto no_delete; - } /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ if (!special_file(inode->i_mode)) btrfs_wait_ordered_range(inode, 0, (u64)-1); btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); - if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { - BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &BTRFS_I(inode)->runtime_flags)); + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) goto no_delete; - } if (inode->i_nlink > 0) { BUG_ON(btrfs_root_refs(&root->root_item) != 0 && @@ -5320,130 +5425,63 @@ void btrfs_evict_inode(struct inode *inode) } ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); - if (ret) { - btrfs_orphan_del(NULL, BTRFS_I(inode)); + if (ret) goto no_delete; - } rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); - if (!rsv) { - btrfs_orphan_del(NULL, BTRFS_I(inode)); + if (!rsv) goto no_delete; - } rsv->size = min_size; rsv->failfast = 1; - global_rsv = &fs_info->global_block_rsv; btrfs_i_size_write(BTRFS_I(inode), 0); - /* - * This is a bit simpler than btrfs_truncate since we've already - * reserved our space for our orphan item in the unlink, so we just - * need to reserve some slack space in case we add bytes and update - * inode item when doing the truncate. - */ while (1) { - ret = btrfs_block_rsv_refill(root, rsv, min_size, - BTRFS_RESERVE_FLUSH_LIMIT); - - /* - * Try and steal from the global reserve since we will - * likely not use this space anyway, we want to try as - * hard as possible to get this to work. - */ - if (ret) - steal_from_global++; - else - steal_from_global = 0; - ret = 0; - - /* - * steal_from_global == 0: we reserved stuff, hooray! - * steal_from_global == 1: we didn't reserve stuff, boo! - * steal_from_global == 2: we've committed, still not a lot of - * room but maybe we'll have room in the global reserve this - * time. - * steal_from_global == 3: abandon all hope! - */ - if (steal_from_global > 2) { - btrfs_warn(fs_info, - "Could not get space for a delete, will truncate on mount %d", - ret); - btrfs_orphan_del(NULL, BTRFS_I(inode)); - btrfs_free_block_rsv(fs_info, rsv); - goto no_delete; - } - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - btrfs_orphan_del(NULL, BTRFS_I(inode)); - btrfs_free_block_rsv(fs_info, rsv); - goto no_delete; - } - - /* - * We can't just steal from the global reserve, we need to make - * sure there is room to do it, if not we need to commit and try - * again. - */ - if (steal_from_global) { - if (!btrfs_check_space_for_delayed_refs(trans, fs_info)) - ret = btrfs_block_rsv_migrate(global_rsv, rsv, - min_size, 0); - else - ret = -ENOSPC; - } - - /* - * Couldn't steal from the global reserve, we have too much - * pending stuff built up, commit the transaction and try it - * again. - */ - if (ret) { - ret = btrfs_commit_transaction(trans); - if (ret) { - btrfs_orphan_del(NULL, BTRFS_I(inode)); - btrfs_free_block_rsv(fs_info, rsv); - goto no_delete; - } - continue; - } else { - steal_from_global = 0; - } + trans = evict_refill_and_join(root, rsv, min_size); + if (IS_ERR(trans)) + goto free_rsv; trans->block_rsv = rsv; ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); - if (ret != -ENOSPC && ret != -EAGAIN) - break; - trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); - trans = NULL; btrfs_btree_balance_dirty(fs_info); + if (ret && ret != -ENOSPC && ret != -EAGAIN) + goto free_rsv; + else if (!ret) + break; } - btrfs_free_block_rsv(fs_info, rsv); - /* - * Errors here aren't a big deal, it just means we leave orphan items - * in the tree. They will be cleaned up on the next mount. + * Errors here aren't a big deal, it just means we leave orphan items in + * the tree. They will be cleaned up on the next mount. If the inode + * number gets reused, cleanup deletes the orphan item without doing + * anything, and unlink reuses the existing orphan item. + * + * If it turns out that we are dropping too many of these, we might want + * to add a mechanism for retrying these after a commit. */ - if (ret == 0) { - trans->block_rsv = root->orphan_block_rsv; + trans = evict_refill_and_join(root, rsv, min_size); + if (!IS_ERR(trans)) { + trans->block_rsv = rsv; btrfs_orphan_del(trans, BTRFS_I(inode)); - } else { - btrfs_orphan_del(NULL, BTRFS_I(inode)); + trans->block_rsv = &fs_info->trans_block_rsv; + btrfs_end_transaction(trans); } - trans->block_rsv = &fs_info->trans_block_rsv; if (!(root == fs_info->tree_root || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode))); - btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(fs_info); +free_rsv: + btrfs_free_block_rsv(fs_info, rsv); no_delete: + /* + * If we didn't successfully delete, the orphan item will still be in + * the tree and we'll retry on the next mount. Again, we might also want + * to retry these periodically in the future. + */ btrfs_remove_delayed_node(BTRFS_I(inode)); clear_inode(inode); } @@ -5619,69 +5657,6 @@ static void inode_tree_del(struct inode *inode) } } -void btrfs_invalidate_inodes(struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct rb_node *node; - struct rb_node *prev; - struct btrfs_inode *entry; - struct inode *inode; - u64 objectid = 0; - - if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) - WARN_ON(btrfs_root_refs(&root->root_item) != 0); - - spin_lock(&root->inode_lock); -again: - node = root->inode_tree.rb_node; - prev = NULL; - while (node) { - prev = node; - entry = rb_entry(node, struct btrfs_inode, rb_node); - - if (objectid < btrfs_ino(BTRFS_I(&entry->vfs_inode))) - node = node->rb_left; - else if (objectid > btrfs_ino(BTRFS_I(&entry->vfs_inode))) - node = node->rb_right; - else - break; - } - if (!node) { - while (prev) { - entry = rb_entry(prev, struct btrfs_inode, rb_node); - if (objectid <= btrfs_ino(BTRFS_I(&entry->vfs_inode))) { - node = prev; - break; - } - prev = rb_next(prev); - } - } - while (node) { - entry = rb_entry(node, struct btrfs_inode, rb_node); - objectid = btrfs_ino(BTRFS_I(&entry->vfs_inode)) + 1; - inode = igrab(&entry->vfs_inode); - if (inode) { - spin_unlock(&root->inode_lock); - if (atomic_read(&inode->i_count) > 1) - d_prune_aliases(inode); - /* - * btrfs_drop_inode will have it removed from - * the inode cache when its usage count - * hits zero. - */ - iput(inode); - cond_resched(); - spin_lock(&root->inode_lock); - goto again; - } - - if (cond_resched_lock(&root->inode_lock)) - goto again; - - node = rb_next(node); - } - spin_unlock(&root->inode_lock); -} static int btrfs_init_locked_inode(struct inode *inode, void *p) { @@ -5843,11 +5818,6 @@ static int btrfs_dentry_delete(const struct dentry *dentry) return 0; } -static void btrfs_dentry_release(struct dentry *dentry) -{ - kfree(dentry->d_fsdata); -} - static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { @@ -6263,7 +6233,7 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; } - btrfs_update_iflags(inode); + btrfs_sync_inode_flags_to_i_flags(inode); } static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, @@ -6579,8 +6549,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto out_unlock_inode; } else { btrfs_update_inode(trans, root, inode); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); } out_unlock: @@ -6656,8 +6625,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, goto out_unlock_inode; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); out_unlock: btrfs_end_transaction(trans); @@ -6700,8 +6668,9 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, * 2 items for inode and inode ref * 2 items for dir items * 1 item for parent inode + * 1 item for orphan item deletion if O_TMPFILE */ - trans = btrfs_start_transaction(root, 5); + trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6); if (IS_ERR(trans)) { err = PTR_ERR(trans); trans = NULL; @@ -6802,12 +6771,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (err) goto out_fail_inode; - d_instantiate(dentry, inode); - /* - * mkdir is special. We're unlocking after we call d_instantiate - * to avoid a race with nfsd calling d_instantiate. - */ - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); drop_on_err = 0; out_fail: @@ -7083,7 +7047,7 @@ insert: err = 0; write_lock(&em_tree->lock); - err = btrfs_add_extent_mapping(em_tree, &em, start, len); + err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); write_unlock(&em_tree->lock); out: @@ -7368,6 +7332,14 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, btrfs_file_extent_other_encoding(leaf, fi)) goto out; + /* + * Do the same check as in btrfs_cross_ref_exist but without the + * unnecessary search. + */ + if (btrfs_file_extent_generation(leaf, fi) <= + btrfs_root_last_snapshot(&root->root_item)) + goto out; + backref_offset = btrfs_file_extent_offset(leaf, fi); if (orig_start) { @@ -7568,6 +7540,125 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, return em; } + +static int btrfs_get_blocks_direct_read(struct extent_map *em, + struct buffer_head *bh_result, + struct inode *inode, + u64 start, u64 len) +{ + if (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + return -ENOENT; + + len = min(len, em->len - (start - em->start)); + + bh_result->b_blocknr = (em->block_start + (start - em->start)) >> + inode->i_blkbits; + bh_result->b_size = len; + bh_result->b_bdev = em->bdev; + set_buffer_mapped(bh_result); + + return 0; +} + +static int btrfs_get_blocks_direct_write(struct extent_map **map, + struct buffer_head *bh_result, + struct inode *inode, + struct btrfs_dio_data *dio_data, + u64 start, u64 len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct extent_map *em = *map; + int ret = 0; + + /* + * We don't allocate a new extent in the following cases + * + * 1) The inode is marked as NODATACOW. In this case we'll just use the + * existing extent. + * 2) The extent is marked as PREALLOC. We're good to go here and can + * just use the extent. + * + */ + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + em->block_start != EXTENT_MAP_HOLE)) { + int type; + u64 block_start, orig_start, orig_block_len, ram_bytes; + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + type = BTRFS_ORDERED_PREALLOC; + else + type = BTRFS_ORDERED_NOCOW; + len = min(len, em->len - (start - em->start)); + block_start = em->block_start + (start - em->start); + + if (can_nocow_extent(inode, start, &len, &orig_start, + &orig_block_len, &ram_bytes) == 1 && + btrfs_inc_nocow_writers(fs_info, block_start)) { + struct extent_map *em2; + + em2 = btrfs_create_dio_extent(inode, start, len, + orig_start, block_start, + len, orig_block_len, + ram_bytes, type); + btrfs_dec_nocow_writers(fs_info, block_start); + if (type == BTRFS_ORDERED_PREALLOC) { + free_extent_map(em); + *map = em = em2; + } + + if (em2 && IS_ERR(em2)) { + ret = PTR_ERR(em2); + goto out; + } + /* + * For inode marked NODATACOW or extent marked PREALLOC, + * use the existing or preallocated extent, so does not + * need to adjust btrfs_space_info's bytes_may_use. + */ + btrfs_free_reserved_data_space_noquota(inode, start, + len); + goto skip_cow; + } + } + + /* this will cow the extent */ + len = bh_result->b_size; + free_extent_map(em); + *map = em = btrfs_new_extent_direct(inode, start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + len = min(len, em->len - (start - em->start)); + +skip_cow: + bh_result->b_blocknr = (em->block_start + (start - em->start)) >> + inode->i_blkbits; + bh_result->b_size = len; + bh_result->b_bdev = em->bdev; + set_buffer_mapped(bh_result); + + if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + set_buffer_new(bh_result); + + /* + * Need to update the i_size under the extent lock so buffered + * readers will get the updated i_size when we unlock. + */ + if (!dio_data->overwrite && start + len > i_size_read(inode)) + i_size_write(inode, start + len); + + WARN_ON(dio_data->reserve < len); + dio_data->reserve -= len; + dio_data->unsubmitted_oe_range_end = start + len; + current->journal_info = dio_data; +out: + return ret; +} + static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -7636,116 +7727,36 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, goto unlock_err; } - /* Just a good old fashioned hole, return */ - if (!create && (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - free_extent_map(em); - goto unlock_err; - } - - /* - * We don't allocate a new extent in the following cases - * - * 1) The inode is marked as NODATACOW. In this case we'll just use the - * existing extent. - * 2) The extent is marked as PREALLOC. We're good to go here and can - * just use the extent. - * - */ - if (!create) { - len = min(len, em->len - (start - em->start)); - lockstart = start + len; - goto unlock; - } - - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || - ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && - em->block_start != EXTENT_MAP_HOLE)) { - int type; - u64 block_start, orig_start, orig_block_len, ram_bytes; - - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - type = BTRFS_ORDERED_PREALLOC; - else - type = BTRFS_ORDERED_NOCOW; - len = min(len, em->len - (start - em->start)); - block_start = em->block_start + (start - em->start); - - if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes) == 1 && - btrfs_inc_nocow_writers(fs_info, block_start)) { - struct extent_map *em2; - - em2 = btrfs_create_dio_extent(inode, start, len, - orig_start, block_start, - len, orig_block_len, - ram_bytes, type); - btrfs_dec_nocow_writers(fs_info, block_start); - if (type == BTRFS_ORDERED_PREALLOC) { - free_extent_map(em); - em = em2; - } - if (em2 && IS_ERR(em2)) { - ret = PTR_ERR(em2); - goto unlock_err; - } - /* - * For inode marked NODATACOW or extent marked PREALLOC, - * use the existing or preallocated extent, so does not - * need to adjust btrfs_space_info's bytes_may_use. - */ - btrfs_free_reserved_data_space_noquota(inode, - start, len); - goto unlock; - } - } - - /* - * this will cow the extent, reset the len in case we changed - * it above - */ - len = bh_result->b_size; - free_extent_map(em); - em = btrfs_new_extent_direct(inode, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto unlock_err; - } - len = min(len, em->len - (start - em->start)); -unlock: - bh_result->b_blocknr = (em->block_start + (start - em->start)) >> - inode->i_blkbits; - bh_result->b_size = len; - bh_result->b_bdev = em->bdev; - set_buffer_mapped(bh_result); if (create) { - if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - set_buffer_new(bh_result); + ret = btrfs_get_blocks_direct_write(&em, bh_result, inode, + dio_data, start, len); + if (ret < 0) + goto unlock_err; + /* clear and unlock the entire range */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + unlock_bits, 1, 0, &cached_state); + } else { + ret = btrfs_get_blocks_direct_read(em, bh_result, inode, + start, len); + /* Can be negative only if we read from a hole */ + if (ret < 0) { + ret = 0; + free_extent_map(em); + goto unlock_err; + } /* - * Need to update the i_size under the extent lock so buffered - * readers will get the updated i_size when we unlock. + * We need to unlock only the end area that we aren't using. + * The rest is going to be unlocked by the endio routine. */ - if (!dio_data->overwrite && start + len > i_size_read(inode)) - i_size_write(inode, start + len); - - WARN_ON(dio_data->reserve < len); - dio_data->reserve -= len; - dio_data->unsubmitted_oe_range_end = start + len; - current->journal_info = dio_data; - } - - /* - * In the case of write we need to clear and unlock the entire range, - * in the case of read we need to unlock only the end area that we - * aren't using if there is any left over space. - */ - if (lockstart < lockend) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, unlock_bits, 1, 0, - &cached_state); - } else { - free_extent_state(cached_state); + lockstart = start + bh_result->b_size; + if (lockstart < lockend) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, unlock_bits, 1, 0, + &cached_state); + } else { + free_extent_state(cached_state); + } } free_extent_map(em); @@ -8131,7 +8142,6 @@ static void __endio_write_update_ordered(struct inode *inode, u64 ordered_offset = offset; u64 ordered_bytes = bytes; u64 last_offset; - int ret; if (btrfs_is_free_space_inode(BTRFS_I(inode))) { wq = fs_info->endio_freespace_worker; @@ -8141,32 +8151,31 @@ static void __endio_write_update_ordered(struct inode *inode, func = btrfs_endio_write_helper; } -again: - last_offset = ordered_offset; - ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, - &ordered_offset, - ordered_bytes, - uptodate); - if (!ret) - goto out_test; - - btrfs_init_work(&ordered->work, func, finish_ordered_fn, NULL, NULL); - btrfs_queue_work(wq, &ordered->work); -out_test: - /* - * If btrfs_dec_test_ordered_pending does not find any ordered extent - * in the range, we can exit. - */ - if (ordered_offset == last_offset) - return; - /* - * our bio might span multiple ordered extents. If we haven't - * completed the accounting for the whole dio, go back and try again - */ - if (ordered_offset < offset + bytes) { - ordered_bytes = offset + bytes - ordered_offset; - ordered = NULL; - goto again; + while (ordered_offset < offset + bytes) { + last_offset = ordered_offset; + if (btrfs_dec_test_first_ordered_pending(inode, &ordered, + &ordered_offset, + ordered_bytes, + uptodate)) { + btrfs_init_work(&ordered->work, func, + finish_ordered_fn, + NULL, NULL); + btrfs_queue_work(wq, &ordered->work); + } + /* + * If btrfs_dec_test_ordered_pending does not find any ordered + * extent in the range, we can exit. + */ + if (ordered_offset == last_offset) + return; + /* + * Our bio might span multiple ordered extents. In this case + * we keep goin until we have accounted the whole dio. + */ + if (ordered_offset < offset + bytes) { + ordered_bytes = offset + bytes - ordered_offset; + ordered = NULL; + } } } @@ -8705,29 +8714,19 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc) static int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct extent_io_tree *tree; - - tree = &BTRFS_I(mapping->host)->io_tree; - return extent_writepages(tree, mapping, wbc); + return extent_writepages(mapping, wbc); } static int btrfs_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - struct extent_io_tree *tree; - tree = &BTRFS_I(mapping->host)->io_tree; - return extent_readpages(tree, mapping, pages, nr_pages); + return extent_readpages(mapping, pages, nr_pages); } + static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) { - struct extent_io_tree *tree; - struct extent_map_tree *map; - int ret; - - tree = &BTRFS_I(page->mapping->host)->io_tree; - map = &BTRFS_I(page->mapping->host)->extent_tree; - ret = try_release_extent_mapping(map, tree, page, gfp_flags); + int ret = try_release_extent_mapping(page, gfp_flags); if (ret == 1) { ClearPagePrivate(page); set_page_private(page, 0); @@ -8868,8 +8867,8 @@ again: * * We are not allowed to take the i_mutex here so we have to play games to * protect against truncate races as the page could now be beyond EOF. Because - * vmtruncate() writes the inode size before removing pages, once we have the - * page lock we can determine safely if the page is beyond EOF. If it is not + * truncate_setsize() writes the inode size before removing pages, once we have + * the page lock we can determine safely if the page is beyond EOF. If it is not * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. */ @@ -9031,8 +9030,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv; - int ret = 0; - int err = 0; + int ret; struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1); @@ -9045,39 +9043,31 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) } /* - * Yes ladies and gentlemen, this is indeed ugly. The fact is we have - * 3 things going on here - * - * 1) We need to reserve space for our orphan item and the space to - * delete our orphan item. Lord knows we don't want to have a dangling - * orphan item because we didn't reserve space to remove it. + * Yes ladies and gentlemen, this is indeed ugly. We have a couple of + * things going on here: * - * 2) We need to reserve space to update our inode. + * 1) We need to reserve space to update our inode. * - * 3) We need to have something to cache all the space that is going to + * 2) We need to have something to cache all the space that is going to * be free'd up by the truncate operation, but also have some slack * space reserved in case it uses space during the truncate (thank you * very much snapshotting). * - * And we need these to all be separate. The fact is we can use a lot of + * And we need these to be separate. The fact is we can use a lot of * space doing the truncate, and we have no earthly idea how much space * we will use, so we need the truncate reservation to be separate so it - * doesn't end up using space reserved for updating the inode or - * removing the orphan item. We also need to be able to stop the - * transaction and start a new one, which means we need to be able to - * update the inode several times, and we have no idea of knowing how - * many times that will be, so we can't just reserve 1 item for the - * entirety of the operation, so that has to be done separately as well. - * Then there is the orphan item, which does indeed need to be held on - * to for the whole operation, and we need nobody to touch this reserved - * space except the orphan code. + * doesn't end up using space reserved for updating the inode. We also + * need to be able to stop the transaction and start a new one, which + * means we need to be able to update the inode several times, and we + * have no idea of knowing how many times that will be, so we can't just + * reserve 1 item for the entirety of the operation, so that has to be + * done separately as well. * * So that leaves us with * - * 1) root->orphan_block_rsv - for the orphan deletion. - * 2) rsv - for the truncate reservation, which we will steal from the + * 1) rsv - for the truncate reservation, which we will steal from the * transaction reservation. - * 3) fs_info->trans_block_rsv - this will have 1 items worth left for + * 2) fs_info->trans_block_rsv - this will have 1 items worth left for * updating the inode. */ rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); @@ -9092,7 +9082,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) */ trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out; } @@ -9116,23 +9106,19 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) inode->i_size, BTRFS_EXTENT_DATA_KEY); trans->block_rsv = &fs_info->trans_block_rsv; - if (ret != -ENOSPC && ret != -EAGAIN) { - err = ret; + if (ret != -ENOSPC && ret != -EAGAIN) break; - } ret = btrfs_update_inode(trans, root, inode); - if (ret) { - err = ret; + if (ret) break; - } btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { - ret = err = PTR_ERR(trans); + ret = PTR_ERR(trans); trans = NULL; break; } @@ -9165,29 +9151,23 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) btrfs_ordered_update_i_size(inode, inode->i_size, NULL); } - if (ret == 0 && inode->i_nlink > 0) { - trans->block_rsv = root->orphan_block_rsv; - ret = btrfs_orphan_del(trans, BTRFS_I(inode)); - if (ret) - err = ret; - } - if (trans) { + int ret2; + trans->block_rsv = &fs_info->trans_block_rsv; - ret = btrfs_update_inode(trans, root, inode); - if (ret && !err) - err = ret; + ret2 = btrfs_update_inode(trans, root, inode); + if (ret2 && !ret) + ret = ret2; - ret = btrfs_end_transaction(trans); + ret2 = btrfs_end_transaction(trans); + if (ret2 && !ret) + ret = ret2; btrfs_btree_balance_dirty(fs_info); } out: btrfs_free_block_rsv(fs_info, rsv); - if (ret && !err) - err = ret; - - return err; + return ret; } /* @@ -9323,13 +9303,6 @@ void btrfs_destroy_inode(struct inode *inode) if (!root) goto free; - if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &BTRFS_I(inode)->runtime_flags)) { - btrfs_info(fs_info, "inode %llu still on the orphan list", - btrfs_ino(BTRFS_I(inode))); - atomic_dec(&root->orphan_inodes); - } - while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); if (!ordered) @@ -9963,6 +9936,13 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry, return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } +struct btrfs_delalloc_work { + struct inode *inode; + struct completion completion; + struct list_head list; + struct btrfs_work work; +}; + static void btrfs_run_delalloc_work(struct btrfs_work *work) { struct btrfs_delalloc_work *delalloc_work; @@ -9976,15 +9956,11 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work) &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); - if (delalloc_work->delay_iput) - btrfs_add_delayed_iput(inode); - else - iput(inode); + iput(inode); complete(&delalloc_work->completion); } -struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, - int delay_iput) +static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode) { struct btrfs_delalloc_work *work; @@ -9995,7 +9971,6 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, init_completion(&work->completion); INIT_LIST_HEAD(&work->list); work->inode = inode; - work->delay_iput = delay_iput; WARN_ON_ONCE(!inode); btrfs_init_work(&work->work, btrfs_flush_delalloc_helper, btrfs_run_delalloc_work, NULL, NULL); @@ -10003,18 +9978,11 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, return work; } -void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) -{ - wait_for_completion(&work->completion); - kfree(work); -} - /* * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, - int nr) +static int start_delalloc_inodes(struct btrfs_root *root, int nr) { struct btrfs_inode *binode; struct inode *inode; @@ -10042,12 +10010,9 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, } spin_unlock(&root->delalloc_lock); - work = btrfs_alloc_delalloc_work(inode, delay_iput); + work = btrfs_alloc_delalloc_work(inode); if (!work) { - if (delay_iput) - btrfs_add_delayed_iput(inode); - else - iput(inode); + iput(inode); ret = -ENOMEM; goto out; } @@ -10065,10 +10030,11 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, out: list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); - btrfs_wait_and_free_delalloc_work(work); + wait_for_completion(&work->completion); + kfree(work); } - if (!list_empty_careful(&splice)) { + if (!list_empty(&splice)) { spin_lock(&root->delalloc_lock); list_splice_tail(&splice, &root->delalloc_inodes); spin_unlock(&root->delalloc_lock); @@ -10077,7 +10043,7 @@ out: return ret; } -int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +int btrfs_start_delalloc_inodes(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -10085,14 +10051,13 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; - ret = __start_delalloc_inodes(root, delay_iput, -1); + ret = start_delalloc_inodes(root, -1); if (ret > 0) ret = 0; return ret; } -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, - int nr) +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) { struct btrfs_root *root; struct list_head splice; @@ -10115,7 +10080,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = __start_delalloc_inodes(root, delay_iput, nr); + ret = start_delalloc_inodes(root, nr); btrfs_put_fs_root(root); if (ret < 0) goto out; @@ -10130,7 +10095,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, ret = 0; out: - if (!list_empty_careful(&splice)) { + if (!list_empty(&splice)) { spin_lock(&fs_info->delalloc_root_lock); list_splice_tail(&splice, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); @@ -10250,8 +10215,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, goto out_unlock_inode; } - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); out_unlock: btrfs_end_transaction(trans); @@ -10669,5 +10633,4 @@ static const struct inode_operations btrfs_symlink_inode_operations = { const struct dentry_operations btrfs_dentry_operations = { .d_delete = btrfs_dentry_delete, - .d_release = btrfs_dentry_release, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 632e26d6f7ce..d29992f7dc63 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -93,20 +93,22 @@ static int btrfs_clone(struct inode *src, struct inode *inode, int no_time_update); /* Mask out flags that are inappropriate for the given type of inode. */ -static unsigned int btrfs_mask_flags(umode_t mode, unsigned int flags) +static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, + unsigned int flags) { - if (S_ISDIR(mode)) + if (S_ISDIR(inode->i_mode)) return flags; - else if (S_ISREG(mode)) + else if (S_ISREG(inode->i_mode)) return flags & ~FS_DIRSYNC_FL; else return flags & (FS_NODUMP_FL | FS_NOATIME_FL); } /* - * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl. + * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS + * ioctl. */ -static unsigned int btrfs_flags_to_ioctl(unsigned int flags) +static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags) { unsigned int iflags = 0; @@ -136,20 +138,20 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags) /* * Update inode->i_flags based on the btrfs internal flags. */ -void btrfs_update_iflags(struct inode *inode) +void btrfs_sync_inode_flags_to_i_flags(struct inode *inode) { - struct btrfs_inode *ip = BTRFS_I(inode); + struct btrfs_inode *binode = BTRFS_I(inode); unsigned int new_fl = 0; - if (ip->flags & BTRFS_INODE_SYNC) + if (binode->flags & BTRFS_INODE_SYNC) new_fl |= S_SYNC; - if (ip->flags & BTRFS_INODE_IMMUTABLE) + if (binode->flags & BTRFS_INODE_IMMUTABLE) new_fl |= S_IMMUTABLE; - if (ip->flags & BTRFS_INODE_APPEND) + if (binode->flags & BTRFS_INODE_APPEND) new_fl |= S_APPEND; - if (ip->flags & BTRFS_INODE_NOATIME) + if (binode->flags & BTRFS_INODE_NOATIME) new_fl |= S_NOATIME; - if (ip->flags & BTRFS_INODE_DIRSYNC) + if (binode->flags & BTRFS_INODE_DIRSYNC) new_fl |= S_DIRSYNC; set_mask_bits(&inode->i_flags, @@ -159,15 +161,16 @@ void btrfs_update_iflags(struct inode *inode) static int btrfs_ioctl_getflags(struct file *file, void __user *arg) { - struct btrfs_inode *ip = BTRFS_I(file_inode(file)); - unsigned int flags = btrfs_flags_to_ioctl(ip->flags); + struct btrfs_inode *binode = BTRFS_I(file_inode(file)); + unsigned int flags = btrfs_inode_flags_to_fsflags(binode->flags); if (copy_to_user(arg, &flags, sizeof(flags))) return -EFAULT; return 0; } -static int check_flags(unsigned int flags) +/* Check if @flags are a supported and valid set of FS_*_FL flags */ +static int check_fsflags(unsigned int flags) { if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ FS_NOATIME_FL | FS_NODUMP_FL | \ @@ -186,13 +189,13 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_inode *ip = BTRFS_I(inode); - struct btrfs_root *root = ip->root; + struct btrfs_inode *binode = BTRFS_I(inode); + struct btrfs_root *root = binode->root; struct btrfs_trans_handle *trans; - unsigned int flags, oldflags; + unsigned int fsflags, old_fsflags; int ret; - u64 ip_oldflags; - unsigned int i_oldflags; + u64 old_flags; + unsigned int old_i_flags; umode_t mode; if (!inode_owner_or_capable(inode)) @@ -201,10 +204,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (btrfs_root_readonly(root)) return -EROFS; - if (copy_from_user(&flags, arg, sizeof(flags))) + if (copy_from_user(&fsflags, arg, sizeof(fsflags))) return -EFAULT; - ret = check_flags(flags); + ret = check_fsflags(fsflags); if (ret) return ret; @@ -214,44 +217,44 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) inode_lock(inode); - ip_oldflags = ip->flags; - i_oldflags = inode->i_flags; + old_flags = binode->flags; + old_i_flags = inode->i_flags; mode = inode->i_mode; - flags = btrfs_mask_flags(inode->i_mode, flags); - oldflags = btrfs_flags_to_ioctl(ip->flags); - if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + fsflags = btrfs_mask_fsflags_for_type(inode, fsflags); + old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags); + if ((fsflags ^ old_fsflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { ret = -EPERM; goto out_unlock; } } - if (flags & FS_SYNC_FL) - ip->flags |= BTRFS_INODE_SYNC; + if (fsflags & FS_SYNC_FL) + binode->flags |= BTRFS_INODE_SYNC; else - ip->flags &= ~BTRFS_INODE_SYNC; - if (flags & FS_IMMUTABLE_FL) - ip->flags |= BTRFS_INODE_IMMUTABLE; + binode->flags &= ~BTRFS_INODE_SYNC; + if (fsflags & FS_IMMUTABLE_FL) + binode->flags |= BTRFS_INODE_IMMUTABLE; else - ip->flags &= ~BTRFS_INODE_IMMUTABLE; - if (flags & FS_APPEND_FL) - ip->flags |= BTRFS_INODE_APPEND; + binode->flags &= ~BTRFS_INODE_IMMUTABLE; + if (fsflags & FS_APPEND_FL) + binode->flags |= BTRFS_INODE_APPEND; else - ip->flags &= ~BTRFS_INODE_APPEND; - if (flags & FS_NODUMP_FL) - ip->flags |= BTRFS_INODE_NODUMP; + binode->flags &= ~BTRFS_INODE_APPEND; + if (fsflags & FS_NODUMP_FL) + binode->flags |= BTRFS_INODE_NODUMP; else - ip->flags &= ~BTRFS_INODE_NODUMP; - if (flags & FS_NOATIME_FL) - ip->flags |= BTRFS_INODE_NOATIME; + binode->flags &= ~BTRFS_INODE_NODUMP; + if (fsflags & FS_NOATIME_FL) + binode->flags |= BTRFS_INODE_NOATIME; else - ip->flags &= ~BTRFS_INODE_NOATIME; - if (flags & FS_DIRSYNC_FL) - ip->flags |= BTRFS_INODE_DIRSYNC; + binode->flags &= ~BTRFS_INODE_NOATIME; + if (fsflags & FS_DIRSYNC_FL) + binode->flags |= BTRFS_INODE_DIRSYNC; else - ip->flags &= ~BTRFS_INODE_DIRSYNC; - if (flags & FS_NOCOW_FL) { + binode->flags &= ~BTRFS_INODE_DIRSYNC; + if (fsflags & FS_NOCOW_FL) { if (S_ISREG(mode)) { /* * It's safe to turn csums off here, no extents exist. @@ -259,10 +262,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) * status of the file and will not set it. */ if (inode->i_size == 0) - ip->flags |= BTRFS_INODE_NODATACOW - | BTRFS_INODE_NODATASUM; + binode->flags |= BTRFS_INODE_NODATACOW + | BTRFS_INODE_NODATASUM; } else { - ip->flags |= BTRFS_INODE_NODATACOW; + binode->flags |= BTRFS_INODE_NODATACOW; } } else { /* @@ -270,10 +273,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) */ if (S_ISREG(mode)) { if (inode->i_size == 0) - ip->flags &= ~(BTRFS_INODE_NODATACOW + binode->flags &= ~(BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM); } else { - ip->flags &= ~BTRFS_INODE_NODATACOW; + binode->flags &= ~BTRFS_INODE_NODATACOW; } } @@ -282,18 +285,18 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) * flag may be changed automatically if compression code won't make * things smaller. */ - if (flags & FS_NOCOMP_FL) { - ip->flags &= ~BTRFS_INODE_COMPRESS; - ip->flags |= BTRFS_INODE_NOCOMPRESS; + if (fsflags & FS_NOCOMP_FL) { + binode->flags &= ~BTRFS_INODE_COMPRESS; + binode->flags |= BTRFS_INODE_NOCOMPRESS; ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0); if (ret && ret != -ENODATA) goto out_drop; - } else if (flags & FS_COMPR_FL) { + } else if (fsflags & FS_COMPR_FL) { const char *comp; - ip->flags |= BTRFS_INODE_COMPRESS; - ip->flags &= ~BTRFS_INODE_NOCOMPRESS; + binode->flags |= BTRFS_INODE_COMPRESS; + binode->flags &= ~BTRFS_INODE_NOCOMPRESS; comp = btrfs_compress_type2str(fs_info->compress_type); if (!comp || comp[0] == 0) @@ -308,7 +311,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0); if (ret && ret != -ENODATA) goto out_drop; - ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); + binode->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } trans = btrfs_start_transaction(root, 1); @@ -317,7 +320,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) goto out_drop; } - btrfs_update_iflags(inode); + btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); inode->i_ctime = current_time(inode); ret = btrfs_update_inode(trans, root, inode); @@ -325,8 +328,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) btrfs_end_transaction(trans); out_drop: if (ret) { - ip->flags = ip_oldflags; - inode->i_flags = i_oldflags; + binode->flags = old_flags; + inode->i_flags = old_i_flags; } out_unlock: @@ -335,6 +338,148 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) return ret; } +/* + * Translate btrfs internal inode flags to xflags as expected by the + * FS_IOC_FSGETXATT ioctl. Filter only the supported ones, unknown flags are + * silently dropped. + */ +static unsigned int btrfs_inode_flags_to_xflags(unsigned int flags) +{ + unsigned int xflags = 0; + + if (flags & BTRFS_INODE_APPEND) + xflags |= FS_XFLAG_APPEND; + if (flags & BTRFS_INODE_IMMUTABLE) + xflags |= FS_XFLAG_IMMUTABLE; + if (flags & BTRFS_INODE_NOATIME) + xflags |= FS_XFLAG_NOATIME; + if (flags & BTRFS_INODE_NODUMP) + xflags |= FS_XFLAG_NODUMP; + if (flags & BTRFS_INODE_SYNC) + xflags |= FS_XFLAG_SYNC; + + return xflags; +} + +/* Check if @flags are a supported and valid set of FS_XFLAGS_* flags */ +static int check_xflags(unsigned int flags) +{ + if (flags & ~(FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE | FS_XFLAG_NOATIME | + FS_XFLAG_NODUMP | FS_XFLAG_SYNC)) + return -EOPNOTSUPP; + return 0; +} + +/* + * Set the xflags from the internal inode flags. The remaining items of fsxattr + * are zeroed. + */ +static int btrfs_ioctl_fsgetxattr(struct file *file, void __user *arg) +{ + struct btrfs_inode *binode = BTRFS_I(file_inode(file)); + struct fsxattr fa; + + memset(&fa, 0, sizeof(fa)); + fa.fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags); + + if (copy_to_user(arg, &fa, sizeof(fa))) + return -EFAULT; + + return 0; +} + +static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg) +{ + struct inode *inode = file_inode(file); + struct btrfs_inode *binode = BTRFS_I(inode); + struct btrfs_root *root = binode->root; + struct btrfs_trans_handle *trans; + struct fsxattr fa; + unsigned old_flags; + unsigned old_i_flags; + int ret = 0; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (btrfs_root_readonly(root)) + return -EROFS; + + memset(&fa, 0, sizeof(fa)); + if (copy_from_user(&fa, arg, sizeof(fa))) + return -EFAULT; + + ret = check_xflags(fa.fsx_xflags); + if (ret) + return ret; + + if (fa.fsx_extsize != 0 || fa.fsx_projid != 0 || fa.fsx_cowextsize != 0) + return -EOPNOTSUPP; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + inode_lock(inode); + + old_flags = binode->flags; + old_i_flags = inode->i_flags; + + /* We need the capabilities to change append-only or immutable inode */ + if (((old_flags & (BTRFS_INODE_APPEND | BTRFS_INODE_IMMUTABLE)) || + (fa.fsx_xflags & (FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE))) && + !capable(CAP_LINUX_IMMUTABLE)) { + ret = -EPERM; + goto out_unlock; + } + + if (fa.fsx_xflags & FS_XFLAG_SYNC) + binode->flags |= BTRFS_INODE_SYNC; + else + binode->flags &= ~BTRFS_INODE_SYNC; + if (fa.fsx_xflags & FS_XFLAG_IMMUTABLE) + binode->flags |= BTRFS_INODE_IMMUTABLE; + else + binode->flags &= ~BTRFS_INODE_IMMUTABLE; + if (fa.fsx_xflags & FS_XFLAG_APPEND) + binode->flags |= BTRFS_INODE_APPEND; + else + binode->flags &= ~BTRFS_INODE_APPEND; + if (fa.fsx_xflags & FS_XFLAG_NODUMP) + binode->flags |= BTRFS_INODE_NODUMP; + else + binode->flags &= ~BTRFS_INODE_NODUMP; + if (fa.fsx_xflags & FS_XFLAG_NOATIME) + binode->flags |= BTRFS_INODE_NOATIME; + else + binode->flags &= ~BTRFS_INODE_NOATIME; + + /* 1 item for the inode */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_unlock; + } + + btrfs_sync_inode_flags_to_i_flags(inode); + inode_inc_iversion(inode); + inode->i_ctime = current_time(inode); + ret = btrfs_update_inode(trans, root, inode); + + btrfs_end_transaction(trans); + +out_unlock: + if (ret) { + binode->flags = old_flags; + inode->i_flags = old_i_flags; + } + + inode_unlock(inode); + mnt_drop_write_file(file); + + return ret; +} + static int btrfs_ioctl_getversion(struct file *file, int __user *arg) { struct inode *inode = file_inode(file); @@ -424,7 +569,6 @@ static noinline int create_subvol(struct inode *dir, u64 objectid; u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; - u64 qgroup_reserved; uuid_le new_uuid; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); @@ -449,8 +593,7 @@ static noinline int create_subvol(struct inode *dir, * The same as the snapshot creation, please see the comment * of create_snapshot(). */ - ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, - 8, &qgroup_reserved, false); + ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false); if (ret) goto fail_free; @@ -573,7 +716,7 @@ static noinline int create_subvol(struct inode *dir, btrfs_ino(BTRFS_I(dir)), index, name, namelen); BUG_ON(ret); - ret = btrfs_uuid_tree_add(trans, fs_info, root_item->uuid, + ret = btrfs_uuid_tree_add(trans, root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) btrfs_abort_transaction(trans, ret); @@ -640,7 +783,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, wait_event(root->subv_writers->wait, percpu_counter_sum(&root->subv_writers->counter) == 0); - ret = btrfs_start_delalloc_inodes(root, 0); + ret = btrfs_start_delalloc_inodes(root); if (ret) goto dec_and_free; @@ -658,7 +801,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, */ ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, &pending_snapshot->block_rsv, 8, - &pending_snapshot->qgroup_reserved, false); if (ret) goto dec_and_free; @@ -1457,7 +1599,6 @@ static noinline int btrfs_ioctl_resize(struct file *file, return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } - mutex_lock(&fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); @@ -1565,7 +1706,6 @@ static noinline int btrfs_ioctl_resize(struct file *file, out_free: kfree(vol_args); out: - mutex_unlock(&fs_info->volume_mutex); clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); mnt_drop_write_file(file); return ret; @@ -1832,60 +1972,6 @@ out: return ret; } -/* - * helper to check if the subvolume references other subvolumes - */ -static noinline int may_destroy_subvol(struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; - struct btrfs_dir_item *di; - struct btrfs_key key; - u64 dir_id; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - /* Make sure this root isn't set as the default subvol */ - dir_id = btrfs_super_root_dir(fs_info->super_copy); - di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, - dir_id, "default", 7, 0); - if (di && !IS_ERR(di)) { - btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); - if (key.objectid == root->root_key.objectid) { - ret = -EPERM; - btrfs_err(fs_info, - "deleting default subvolume %llu is not allowed", - key.objectid); - goto out; - } - btrfs_release_path(path); - } - - key.objectid = root->root_key.objectid; - key.type = BTRFS_ROOT_REF_KEY; - key.offset = (u64)-1; - - ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); - if (ret < 0) - goto out; - BUG_ON(ret == 0); - - ret = 0; - if (path->slots[0] > 0) { - path->slots[0]--; - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == root->root_key.objectid && - key.type == BTRFS_ROOT_REF_KEY) - ret = -ENOTEMPTY; - } -out: - btrfs_free_path(path); - return ret; -} - static noinline int key_in_sk(struct btrfs_key *key, struct btrfs_ioctl_search_key *sk) { @@ -2066,7 +2152,7 @@ static noinline int search_ioctl(struct inode *inode, root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { btrfs_free_path(path); - return -ENOENT; + return PTR_ERR(root); } } @@ -2200,8 +2286,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - btrfs_err(info, "could not find root %llu", tree_id); - ret = -ENOENT; + ret = PTR_ERR(root); goto out; } @@ -2256,6 +2341,165 @@ out: return ret; } +static int btrfs_search_path_in_tree_user(struct inode *inode, + struct btrfs_ioctl_ino_lookup_user_args *args) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct super_block *sb = inode->i_sb; + struct btrfs_key upper_limit = BTRFS_I(inode)->location; + u64 treeid = BTRFS_I(inode)->root->root_key.objectid; + u64 dirid = args->dirid; + unsigned long item_off; + unsigned long item_len; + struct btrfs_inode_ref *iref; + struct btrfs_root_ref *rref; + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key, key2; + struct extent_buffer *leaf; + struct inode *temp_inode; + char *ptr; + int slot; + int len; + int total_len = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * If the bottom subvolume does not exist directly under upper_limit, + * construct the path in from the bottom up. + */ + if (dirid != upper_limit.objectid) { + ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1]; + + key.objectid = treeid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } + + key.objectid = dirid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = btrfs_previous_item(root, path, dirid, + BTRFS_INODE_REF_KEY); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = -ENOENT; + goto out; + } + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + + iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(leaf, iref); + ptr -= len + 1; + total_len += len + 1; + if (ptr < args->path) { + ret = -ENAMETOOLONG; + goto out; + } + + *(ptr + len) = '/'; + read_extent_buffer(leaf, ptr, + (unsigned long)(iref + 1), len); + + /* Check the read+exec permission of this directory */ + ret = btrfs_previous_item(root, path, dirid, + BTRFS_INODE_ITEM_KEY); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key2, slot); + if (key2.objectid != dirid) { + ret = -ENOENT; + goto out; + } + + temp_inode = btrfs_iget(sb, &key2, root, NULL); + ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC); + iput(temp_inode); + if (ret) { + ret = -EACCES; + goto out; + } + + if (key.offset == upper_limit.objectid) + break; + if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) { + ret = -EACCES; + goto out; + } + + btrfs_release_path(path); + key.objectid = key.offset; + key.offset = (u64)-1; + dirid = key.objectid; + } + + memmove(args->path, ptr, total_len); + args->path[total_len] = '\0'; + btrfs_release_path(path); + } + + /* Get the bottom subvolume's name from ROOT_REF */ + root = fs_info->tree_root; + key.objectid = treeid; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = args->treeid; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + + item_off = btrfs_item_ptr_offset(leaf, slot); + item_len = btrfs_item_size_nr(leaf, slot); + /* Check if dirid in ROOT_REF corresponds to passed dirid */ + rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); + if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) { + ret = -EINVAL; + goto out; + } + + /* Copy subvolume's name */ + item_off += sizeof(struct btrfs_root_ref); + item_len -= sizeof(struct btrfs_root_ref); + read_extent_buffer(leaf, args->name, item_off, item_len); + args->name[item_len] = 0; + +out: + btrfs_free_path(path); + return ret; +} + static noinline int btrfs_ioctl_ino_lookup(struct file *file, void __user *argp) { @@ -2298,6 +2542,265 @@ out: return ret; } +/* + * Version of ino_lookup ioctl (unprivileged) + * + * The main differences from ino_lookup ioctl are: + * + * 1. Read + Exec permission will be checked using inode_permission() during + * path construction. -EACCES will be returned in case of failure. + * 2. Path construction will be stopped at the inode number which corresponds + * to the fd with which this ioctl is called. If constructed path does not + * exist under fd's inode, -EACCES will be returned. + * 3. The name of bottom subvolume is also searched and filled. + */ +static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp) +{ + struct btrfs_ioctl_ino_lookup_user_args *args; + struct inode *inode; + int ret; + + args = memdup_user(argp, sizeof(*args)); + if (IS_ERR(args)) + return PTR_ERR(args); + + inode = file_inode(file); + + if (args->dirid == BTRFS_FIRST_FREE_OBJECTID && + BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) { + /* + * The subvolume does not exist under fd with which this is + * called + */ + kfree(args); + return -EACCES; + } + + ret = btrfs_search_path_in_tree_user(inode, args); + + if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) + ret = -EFAULT; + + kfree(args); + return ret; +} + +/* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */ +static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) +{ + struct btrfs_ioctl_get_subvol_info_args *subvol_info; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_root_item *root_item; + struct btrfs_root_ref *rref; + struct extent_buffer *leaf; + unsigned long item_off; + unsigned long item_len; + struct inode *inode; + int slot; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL); + if (!subvol_info) { + btrfs_free_path(path); + return -ENOMEM; + } + + inode = file_inode(file); + fs_info = BTRFS_I(inode)->root->fs_info; + + /* Get root_item of inode's subvolume */ + key.objectid = BTRFS_I(inode)->root->root_key.objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } + root_item = &root->root_item; + + subvol_info->treeid = key.objectid; + + subvol_info->generation = btrfs_root_generation(root_item); + subvol_info->flags = btrfs_root_flags(root_item); + + memcpy(subvol_info->uuid, root_item->uuid, BTRFS_UUID_SIZE); + memcpy(subvol_info->parent_uuid, root_item->parent_uuid, + BTRFS_UUID_SIZE); + memcpy(subvol_info->received_uuid, root_item->received_uuid, + BTRFS_UUID_SIZE); + + subvol_info->ctransid = btrfs_root_ctransid(root_item); + subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item->ctime); + subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item->ctime); + + subvol_info->otransid = btrfs_root_otransid(root_item); + subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item->otime); + subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item->otime); + + subvol_info->stransid = btrfs_root_stransid(root_item); + subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item->stime); + subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item->stime); + + subvol_info->rtransid = btrfs_root_rtransid(root_item); + subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item->rtime); + subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item->rtime); + + if (key.objectid != BTRFS_FS_TREE_OBJECTID) { + /* Search root tree for ROOT_BACKREF of this subvolume */ + root = fs_info->tree_root; + + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (path->slots[0] >= + btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = -EUCLEAN; + goto out; + } + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid == subvol_info->treeid && + key.type == BTRFS_ROOT_BACKREF_KEY) { + subvol_info->parent_id = key.offset; + + rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); + subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref); + + item_off = btrfs_item_ptr_offset(leaf, slot) + + sizeof(struct btrfs_root_ref); + item_len = btrfs_item_size_nr(leaf, slot) + - sizeof(struct btrfs_root_ref); + read_extent_buffer(leaf, subvol_info->name, + item_off, item_len); + } else { + ret = -ENOENT; + goto out; + } + } + + if (copy_to_user(argp, subvol_info, sizeof(*subvol_info))) + ret = -EFAULT; + +out: + btrfs_free_path(path); + kzfree(subvol_info); + return ret; +} + +/* + * Return ROOT_REF information of the subvolume containing this inode + * except the subvolume name. + */ +static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp) +{ + struct btrfs_ioctl_get_subvol_rootref_args *rootrefs; + struct btrfs_root_ref *rref; + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf; + struct inode *inode; + u64 objectid; + int slot; + int ret; + u8 found; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + rootrefs = memdup_user(argp, sizeof(*rootrefs)); + if (IS_ERR(rootrefs)) { + btrfs_free_path(path); + return PTR_ERR(rootrefs); + } + + inode = file_inode(file); + root = BTRFS_I(inode)->root->fs_info->tree_root; + objectid = BTRFS_I(inode)->root->root_key.objectid; + + key.objectid = objectid; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = rootrefs->min_treeid; + found = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (path->slots[0] >= + btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = -EUCLEAN; + goto out; + } + } + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != objectid || key.type != BTRFS_ROOT_REF_KEY) { + ret = 0; + goto out; + } + + if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) { + ret = -EOVERFLOW; + goto out; + } + + rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); + rootrefs->rootref[found].treeid = key.offset; + rootrefs->rootref[found].dirid = + btrfs_root_ref_dirid(leaf, rref); + found++; + + ret = btrfs_next_item(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = -EUCLEAN; + goto out; + } + } + +out: + if (!ret || ret == -EOVERFLOW) { + rootrefs->num_items = found; + /* update min_treeid for next search */ + if (found) + rootrefs->min_treeid = + rootrefs->rootref[found - 1].treeid + 1; + if (copy_to_user(argp, rootrefs, sizeof(*rootrefs))) + ret = -EFAULT; + } + + kfree(rootrefs); + btrfs_free_path(path); + + return ret; +} + static noinline int btrfs_ioctl_snap_destroy(struct file *file, void __user *arg) { @@ -2309,12 +2812,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *dest = NULL; struct btrfs_ioctl_vol_args *vol_args; - struct btrfs_trans_handle *trans; - struct btrfs_block_rsv block_rsv; - u64 root_flags; - u64 qgroup_reserved; int namelen; - int ret; int err = 0; if (!S_ISDIR(dir->i_mode)) @@ -2398,133 +2896,11 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, } inode_lock(inode); - - /* - * Don't allow to delete a subvolume with send in progress. This is - * inside the i_mutex so the error handling that has to drop the bit - * again is not run concurrently. - */ - spin_lock(&dest->root_item_lock); - root_flags = btrfs_root_flags(&dest->root_item); - if (dest->send_in_progress == 0) { - btrfs_set_root_flags(&dest->root_item, - root_flags | BTRFS_ROOT_SUBVOL_DEAD); - spin_unlock(&dest->root_item_lock); - } else { - spin_unlock(&dest->root_item_lock); - btrfs_warn(fs_info, - "Attempt to delete subvolume %llu during send", - dest->root_key.objectid); - err = -EPERM; - goto out_unlock_inode; - } - - down_write(&fs_info->subvol_sem); - - err = may_destroy_subvol(dest); - if (err) - goto out_up_write; - - btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); - /* - * One for dir inode, two for dir entries, two for root - * ref/backref. - */ - err = btrfs_subvolume_reserve_metadata(root, &block_rsv, - 5, &qgroup_reserved, true); - if (err) - goto out_up_write; - - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - goto out_release; - } - trans->block_rsv = &block_rsv; - trans->bytes_reserved = block_rsv.size; - - btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); - - ret = btrfs_unlink_subvol(trans, root, dir, - dest->root_key.objectid, - dentry->d_name.name, - dentry->d_name.len); - if (ret) { - err = ret; - btrfs_abort_transaction(trans, ret); - goto out_end_trans; - } - - btrfs_record_root_in_trans(trans, dest); - - memset(&dest->root_item.drop_progress, 0, - sizeof(dest->root_item.drop_progress)); - dest->root_item.drop_level = 0; - btrfs_set_root_refs(&dest->root_item, 0); - - if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { - ret = btrfs_insert_orphan_item(trans, - fs_info->tree_root, - dest->root_key.objectid); - if (ret) { - btrfs_abort_transaction(trans, ret); - err = ret; - goto out_end_trans; - } - } - - ret = btrfs_uuid_tree_rem(trans, fs_info, dest->root_item.uuid, - BTRFS_UUID_KEY_SUBVOL, - dest->root_key.objectid); - if (ret && ret != -ENOENT) { - btrfs_abort_transaction(trans, ret); - err = ret; - goto out_end_trans; - } - if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { - ret = btrfs_uuid_tree_rem(trans, fs_info, - dest->root_item.received_uuid, - BTRFS_UUID_KEY_RECEIVED_SUBVOL, - dest->root_key.objectid); - if (ret && ret != -ENOENT) { - btrfs_abort_transaction(trans, ret); - err = ret; - goto out_end_trans; - } - } - -out_end_trans: - trans->block_rsv = NULL; - trans->bytes_reserved = 0; - ret = btrfs_end_transaction(trans); - if (ret && !err) - err = ret; - inode->i_flags |= S_DEAD; -out_release: - btrfs_subvolume_release_metadata(fs_info, &block_rsv); -out_up_write: - up_write(&fs_info->subvol_sem); - if (err) { - spin_lock(&dest->root_item_lock); - root_flags = btrfs_root_flags(&dest->root_item); - btrfs_set_root_flags(&dest->root_item, - root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); - spin_unlock(&dest->root_item_lock); - } -out_unlock_inode: + err = btrfs_delete_subvolume(dir, dentry); inode_unlock(inode); - if (!err) { - d_invalidate(dentry); - btrfs_invalidate_inodes(dest); + if (!err) d_delete(dentry); - ASSERT(dest->send_in_progress == 0); - /* the last ref */ - if (dest->ino_cache_inode) { - iput(dest->ino_cache_inode); - dest->ino_cache_inode = NULL; - } - } out_dput: dput(dentry); out_unlock_dir: @@ -2613,7 +2989,6 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; - mutex_lock(&fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); @@ -2628,7 +3003,6 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) kfree(vol_args); out: - mutex_unlock(&fs_info->volume_mutex); clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); return ret; } @@ -2654,8 +3028,10 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) } /* Check for compatibility reject unknown flags */ - if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) - return -EOPNOTSUPP; + if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) { + ret = -EOPNOTSUPP; + goto out; + } if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; @@ -2954,8 +3330,6 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp) put_page(pg); } } - kfree(cmp->src_pages); - kfree(cmp->dst_pages); } static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, @@ -2964,40 +3338,14 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, { int ret; int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT; - struct page **src_pgarr, **dst_pgarr; - /* - * We must gather up all the pages before we initiate our - * extent locking. We use an array for the page pointers. Size - * of the array is bounded by len, which is in turn bounded by - * BTRFS_MAX_DEDUPE_LEN. - */ - src_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL); - dst_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL); - if (!src_pgarr || !dst_pgarr) { - kfree(src_pgarr); - kfree(dst_pgarr); - return -ENOMEM; - } cmp->num_pages = num_pages; - cmp->src_pages = src_pgarr; - cmp->dst_pages = dst_pgarr; - - /* - * If deduping ranges in the same inode, locking rules make it mandatory - * to always lock pages in ascending order to avoid deadlocks with - * concurrent tasks (such as starting writeback/delalloc). - */ - if (src == dst && dst_loff < loff) { - swap(src_pgarr, dst_pgarr); - swap(loff, dst_loff); - } - ret = gather_extent_pages(src, src_pgarr, cmp->num_pages, loff); + ret = gather_extent_pages(src, cmp->src_pages, num_pages, loff); if (ret) goto out; - ret = gather_extent_pages(dst, dst_pgarr, cmp->num_pages, dst_loff); + ret = gather_extent_pages(dst, cmp->dst_pages, num_pages, dst_loff); out: if (ret) @@ -3067,31 +3415,23 @@ static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen, return 0; } -static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, - struct inode *dst, u64 dst_loff) +static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, + struct inode *dst, u64 dst_loff, + struct cmp_pages *cmp) { int ret; u64 len = olen; - struct cmp_pages cmp; bool same_inode = (src == dst); u64 same_lock_start = 0; u64 same_lock_len = 0; - if (len == 0) - return 0; - - if (same_inode) - inode_lock(src); - else - btrfs_double_inode_lock(src, dst); - ret = extent_same_check_offsets(src, loff, &len, olen); if (ret) - goto out_unlock; + return ret; ret = extent_same_check_offsets(dst, dst_loff, &len, olen); if (ret) - goto out_unlock; + return ret; if (same_inode) { /* @@ -3108,32 +3448,21 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, * allow an unaligned length so long as it ends at * i_size. */ - if (len != olen) { - ret = -EINVAL; - goto out_unlock; - } + if (len != olen) + return -EINVAL; /* Check for overlapping ranges */ - if (dst_loff + len > loff && dst_loff < loff + len) { - ret = -EINVAL; - goto out_unlock; - } + if (dst_loff + len > loff && dst_loff < loff + len) + return -EINVAL; same_lock_start = min_t(u64, loff, dst_loff); same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start; } - /* don't make the dst file partly checksummed */ - if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) { - ret = -EINVAL; - goto out_unlock; - } - again: - ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp); + ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, cmp); if (ret) - goto out_unlock; + return ret; if (same_inode) ret = lock_extent_range(src, same_lock_start, same_lock_len, @@ -3154,7 +3483,7 @@ again: * Ranges in the io trees already unlocked. Now unlock all * pages before waiting for all IO to complete. */ - btrfs_cmp_data_free(&cmp); + btrfs_cmp_data_free(cmp); if (same_inode) { btrfs_wait_ordered_range(src, same_lock_start, same_lock_len); @@ -3167,12 +3496,12 @@ again: ASSERT(ret == 0); if (WARN_ON(ret)) { /* ranges in the io trees already unlocked */ - btrfs_cmp_data_free(&cmp); + btrfs_cmp_data_free(cmp); return ret; } /* pass original length for comparison so we stay within i_size */ - ret = btrfs_cmp_data(olen, &cmp); + ret = btrfs_cmp_data(olen, cmp); if (ret == 0) ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); @@ -3182,18 +3511,91 @@ again: else btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); - btrfs_cmp_data_free(&cmp); + btrfs_cmp_data_free(cmp); + + return ret; +} + +#define BTRFS_MAX_DEDUPE_LEN SZ_16M + +static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, + struct inode *dst, u64 dst_loff) +{ + int ret; + struct cmp_pages cmp; + int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT; + bool same_inode = (src == dst); + u64 i, tail_len, chunk_count; + + if (olen == 0) + return 0; + + if (same_inode) + inode_lock(src); + else + btrfs_double_inode_lock(src, dst); + + /* don't make the dst file partly checksummed */ + if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != + (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) { + ret = -EINVAL; + goto out_unlock; + } + + tail_len = olen % BTRFS_MAX_DEDUPE_LEN; + chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); + if (chunk_count == 0) + num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT; + + /* + * If deduping ranges in the same inode, locking rules make it + * mandatory to always lock pages in ascending order to avoid deadlocks + * with concurrent tasks (such as starting writeback/delalloc). + */ + if (same_inode && dst_loff < loff) + swap(loff, dst_loff); + + /* + * We must gather up all the pages before we initiate our extent + * locking. We use an array for the page pointers. Size of the array is + * bounded by len, which is in turn bounded by BTRFS_MAX_DEDUPE_LEN. + */ + cmp.src_pages = kvmalloc_array(num_pages, sizeof(struct page *), + GFP_KERNEL | __GFP_ZERO); + cmp.dst_pages = kvmalloc_array(num_pages, sizeof(struct page *), + GFP_KERNEL | __GFP_ZERO); + if (!cmp.src_pages || !cmp.dst_pages) { + ret = -ENOMEM; + goto out_free; + } + + for (i = 0; i < chunk_count; i++) { + ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, + dst, dst_loff, &cmp); + if (ret) + goto out_unlock; + + loff += BTRFS_MAX_DEDUPE_LEN; + dst_loff += BTRFS_MAX_DEDUPE_LEN; + } + + if (tail_len > 0) + ret = btrfs_extent_same_range(src, loff, tail_len, dst, + dst_loff, &cmp); + out_unlock: if (same_inode) inode_unlock(src); else btrfs_double_inode_unlock(src, dst); +out_free: + kvfree(cmp.src_pages); + kvfree(cmp.dst_pages); + return ret; } -#define BTRFS_MAX_DEDUPE_LEN SZ_16M - ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, struct file *dst_file, u64 dst_loff) { @@ -3202,9 +3604,6 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; ssize_t res; - if (olen > BTRFS_MAX_DEDUPE_LEN) - olen = BTRFS_MAX_DEDUPE_LEN; - if (WARN_ON_ONCE(bs < PAGE_SIZE)) { /* * Btrfs does not support blocksize < page_size. As a @@ -3826,11 +4225,6 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, src->i_sb != inode->i_sb) return -EXDEV; - /* don't make the dst file partly checksummed */ - if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) - return -EINVAL; - if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) return -EISDIR; @@ -3840,6 +4234,13 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, inode_lock(src); } + /* don't make the dst file partly checksummed */ + if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != + (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + ret = -EINVAL; + goto out_unlock; + } + /* determine range to clone */ ret = -EINVAL; if (off + len > src->i_size || off + len < off) @@ -4007,8 +4408,8 @@ out: return ret; } -void btrfs_get_block_group_info(struct list_head *groups_list, - struct btrfs_ioctl_space_info *space) +static void get_block_group_info(struct list_head *groups_list, + struct btrfs_ioctl_space_info *space) { struct btrfs_block_group_cache *block_group; @@ -4124,8 +4525,8 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, down_read(&info->groups_sem); for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { if (!list_empty(&info->block_groups[c])) { - btrfs_get_block_group_info( - &info->block_groups[c], &space); + get_block_group_info(&info->block_groups[c], + &space); memcpy(dest, &space, sizeof(space)); dest++; space_args.total_spaces++; @@ -4490,14 +4891,14 @@ out_loi: return ret; } -void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, +void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; bargs->flags = bctl->flags; - if (atomic_read(&fs_info->balance_running)) + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) bargs->state |= BTRFS_BALANCE_STATE_RUNNING; if (atomic_read(&fs_info->balance_pause_req)) bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; @@ -4508,13 +4909,9 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); - if (lock) { - spin_lock(&fs_info->balance_lock); - memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); - spin_unlock(&fs_info->balance_lock); - } else { - memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); - } + spin_lock(&fs_info->balance_lock); + memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); + spin_unlock(&fs_info->balance_lock); } static long btrfs_ioctl_balance(struct file *file, void __user *arg) @@ -4535,7 +4932,6 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) again: if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { - mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); need_unlock = true; goto locked; @@ -4550,21 +4946,22 @@ again: mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) { /* this is either (2) or (3) */ - if (!atomic_read(&fs_info->balance_running)) { + if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { mutex_unlock(&fs_info->balance_mutex); - if (!mutex_trylock(&fs_info->volume_mutex)) - goto again; + /* + * Lock released to allow other waiters to continue, + * we'll reexamine the status again. + */ mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl && - !atomic_read(&fs_info->balance_running)) { + !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { /* this is (3) */ need_unlock = false; goto locked; } mutex_unlock(&fs_info->balance_mutex); - mutex_unlock(&fs_info->volume_mutex); goto again; } else { /* this is (2) */ @@ -4617,7 +5014,6 @@ locked: goto out_bargs; } - bctl->fs_info = fs_info; if (arg) { memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); @@ -4636,14 +5032,14 @@ locked: do_balance: /* - * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP - * goes to to btrfs_balance. bctl is freed in __cancel_balance, - * or, if restriper was paused all the way until unmount, in - * free_fs_info. The flag is cleared in __cancel_balance. + * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to + * btrfs_balance. bctl is freed in reset_balance_state, or, if + * restriper was paused all the way until unmount, in free_fs_info. + * The flag should be cleared after reset_balance_state. */ need_unlock = false; - ret = btrfs_balance(bctl, bargs); + ret = btrfs_balance(fs_info, bctl, bargs); bctl = NULL; if (arg) { @@ -4657,7 +5053,6 @@ out_bargs: kfree(bargs); out_unlock: mutex_unlock(&fs_info->balance_mutex); - mutex_unlock(&fs_info->volume_mutex); if (need_unlock) clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); out: @@ -4701,7 +5096,7 @@ static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info, goto out; } - update_ioctl_balance_args(fs_info, 1, bargs); + btrfs_update_ioctl_balance_args(fs_info, bargs); if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; @@ -5038,8 +5433,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, BTRFS_UUID_SIZE); if (received_uuid_changed && !btrfs_is_empty_uuid(root_item->received_uuid)) { - ret = btrfs_uuid_tree_rem(trans, fs_info, - root_item->received_uuid, + ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, root->root_key.objectid); if (ret && ret != -ENOENT) { @@ -5063,7 +5457,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, goto out; } if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) { - ret = btrfs_uuid_tree_add(trans, fs_info, sa->uuid, + ret = btrfs_uuid_tree_add(trans, sa->uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, root->root_key.objectid); if (ret < 0 && ret != -EEXIST) { @@ -5497,7 +5891,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SYNC: { int ret; - ret = btrfs_start_delalloc_roots(fs_info, 0, -1); + ret = btrfs_start_delalloc_roots(fs_info, -1); if (ret) return ret; ret = btrfs_sync_fs(inode->i_sb, 1); @@ -5565,6 +5959,16 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_get_features(file, argp); case BTRFS_IOC_SET_FEATURES: return btrfs_ioctl_set_features(file, argp); + case FS_IOC_FSGETXATTR: + return btrfs_ioctl_fsgetxattr(file, argp); + case FS_IOC_FSSETXATTR: + return btrfs_ioctl_fssetxattr(file, argp); + case BTRFS_IOC_GET_SUBVOL_INFO: + return btrfs_ioctl_get_subvol_info(file, argp); + case BTRFS_IOC_GET_SUBVOL_ROOTREF: + return btrfs_ioctl_get_subvol_rootref(file, argp); + case BTRFS_IOC_INO_LOOKUP_USER: + return btrfs_ioctl_ino_lookup_user(file, argp); } return -ENOTTY; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index e4faefac9d16..1da768e5ef75 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -66,22 +66,16 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) write_lock(&eb->lock); WARN_ON(atomic_read(&eb->spinning_writers)); atomic_inc(&eb->spinning_writers); - /* - * atomic_dec_and_test implies a barrier for waitqueue_active - */ - if (atomic_dec_and_test(&eb->blocking_writers) && - waitqueue_active(&eb->write_lock_wq)) - wake_up(&eb->write_lock_wq); + /* atomic_dec_and_test implies a barrier */ + if (atomic_dec_and_test(&eb->blocking_writers)) + cond_wake_up_nomb(&eb->write_lock_wq); } else if (rw == BTRFS_READ_LOCK_BLOCKING) { BUG_ON(atomic_read(&eb->blocking_readers) == 0); read_lock(&eb->lock); atomic_inc(&eb->spinning_readers); - /* - * atomic_dec_and_test implies a barrier for waitqueue_active - */ - if (atomic_dec_and_test(&eb->blocking_readers) && - waitqueue_active(&eb->read_lock_wq)) - wake_up(&eb->read_lock_wq); + /* atomic_dec_and_test implies a barrier */ + if (atomic_dec_and_test(&eb->blocking_readers)) + cond_wake_up_nomb(&eb->read_lock_wq); } } @@ -221,12 +215,9 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->blocking_readers) == 0); - /* - * atomic_dec_and_test implies a barrier for waitqueue_active - */ - if (atomic_dec_and_test(&eb->blocking_readers) && - waitqueue_active(&eb->read_lock_wq)) - wake_up(&eb->read_lock_wq); + /* atomic_dec_and_test implies a barrier */ + if (atomic_dec_and_test(&eb->blocking_readers)) + cond_wake_up_nomb(&eb->read_lock_wq); atomic_dec(&eb->read_locks); } @@ -275,12 +266,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb) if (blockers) { WARN_ON(atomic_read(&eb->spinning_writers)); atomic_dec(&eb->blocking_writers); - /* - * Make sure counter is updated before we wake up waiters. - */ + /* Use the lighter barrier after atomic */ smp_mb__after_atomic(); - if (waitqueue_active(&eb->write_lock_wq)) - wake_up(&eb->write_lock_wq); + cond_wake_up_nomb(&eb->write_lock_wq); } else { WARN_ON(atomic_read(&eb->spinning_writers) != 1); atomic_dec(&eb->spinning_writers); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 0667ea07f766..b6a4cc178bee 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -17,6 +17,43 @@ #define LZO_LEN 4 +/* + * Btrfs LZO compression format + * + * Regular and inlined LZO compressed data extents consist of: + * + * 1. Header + * Fixed size. LZO_LEN (4) bytes long, LE32. + * Records the total size (including the header) of compressed data. + * + * 2. Segment(s) + * Variable size. Each segment includes one segment header, followd by data + * payload. + * One regular LZO compressed extent can have one or more segments. + * For inlined LZO compressed extent, only one segment is allowed. + * One segment represents at most one page of uncompressed data. + * + * 2.1 Segment header + * Fixed size. LZO_LEN (4) bytes long, LE32. + * Records the total size of the segment (not including the header). + * Segment header never crosses page boundary, thus it's possible to + * have at most 3 padding zeros at the end of the page. + * + * 2.2 Data Payload + * Variable size. Size up limit should be lzo1x_worst_compress(PAGE_SIZE) + * which is 4419 for a 4KiB page. + * + * Example: + * Page 1: + * 0 0x2 0x4 0x6 0x8 0xa 0xc 0xe 0x10 + * 0x0000 | Header | SegHdr 01 | Data payload 01 ... | + * ... + * 0x0ff0 | SegHdr N | Data payload N ... |00| + * ^^ padding zeros + * Page 2: + * 0x1000 | SegHdr N+1| Data payload N+1 ... | + */ + struct workspace { void *mem; void *buf; /* where decompressed data goes */ @@ -258,6 +295,7 @@ static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) unsigned long working_bytes; size_t in_len; size_t out_len; + const size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE); unsigned long in_offset; unsigned long in_page_bytes_left; unsigned long tot_in; @@ -271,10 +309,22 @@ static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) data_in = kmap(pages_in[0]); tot_len = read_compress_length(data_in); + /* + * Compressed data header check. + * + * The real compressed size can't exceed the maximum extent length, and + * all pages should be used (whole unused page with just the segment + * header is not possible). If this happens it means the compressed + * extent is corrupted. + */ + if (tot_len > min_t(size_t, BTRFS_MAX_COMPRESSED, srclen) || + tot_len < srclen - PAGE_SIZE) { + ret = -EUCLEAN; + goto done; + } tot_in = LZO_LEN; in_offset = LZO_LEN; - tot_len = min_t(size_t, srclen, tot_len); in_page_bytes_left = PAGE_SIZE - LZO_LEN; tot_out = 0; @@ -285,6 +335,17 @@ static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) in_offset += LZO_LEN; tot_in += LZO_LEN; + /* + * Segment header check. + * + * The segment length must not exceed the maximum LZO + * compression size, nor the total compressed size. + */ + if (in_len > max_segment_len || tot_in + in_len > tot_len) { + ret = -EUCLEAN; + goto done; + } + tot_in += in_len; working_bytes = in_len; may_late_unmap = need_unmap = false; @@ -335,7 +396,7 @@ cont: } } - out_len = lzo1x_worst_compress(PAGE_SIZE); + out_len = max_segment_len; ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, &out_len); if (need_unmap) @@ -369,15 +430,24 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in, struct workspace *workspace = list_entry(ws, struct workspace, list); size_t in_len; size_t out_len; + size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE); int ret = 0; char *kaddr; unsigned long bytes; - BUG_ON(srclen < LZO_LEN); + if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2) + return -EUCLEAN; + in_len = read_compress_length(data_in); + if (in_len != srclen) + return -EUCLEAN; data_in += LZO_LEN; in_len = read_compress_length(data_in); + if (in_len != srclen - LZO_LEN * 2) { + ret = -EUCLEAN; + goto out; + } data_in += LZO_LEN; out_len = PAGE_SIZE; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 6db8bb2f2c28..2e1a1694a33d 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -343,11 +343,8 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); - /* - * Implicit memory barrier after test_and_set_bit - */ - if (waitqueue_active(&entry->wait)) - wake_up(&entry->wait); + /* test_and_set_bit implies a barrier */ + cond_wake_up_nomb(&entry->wait); } else { ret = 1; } @@ -410,11 +407,8 @@ have_entry: if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); - /* - * Implicit memory barrier after test_and_set_bit - */ - if (waitqueue_active(&entry->wait)) - wake_up(&entry->wait); + /* test_and_set_bit implies a barrier */ + cond_wake_up_nomb(&entry->wait); } else { ret = 1; } diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 21a831d3d087..a4e11cf04671 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -166,6 +166,25 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset, } } +/* + * Helper to output refs and locking status of extent buffer. Useful to debug + * race condition related problems. + */ +static void print_eb_refs_lock(struct extent_buffer *eb) +{ +#ifdef CONFIG_BTRFS_DEBUG + btrfs_info(eb->fs_info, +"refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u", + atomic_read(&eb->refs), atomic_read(&eb->write_locks), + atomic_read(&eb->read_locks), + atomic_read(&eb->blocking_writers), + atomic_read(&eb->blocking_readers), + atomic_read(&eb->spinning_writers), + atomic_read(&eb->spinning_readers), + eb->lock_owner, current->pid); +#endif +} + void btrfs_print_leaf(struct extent_buffer *l) { struct btrfs_fs_info *fs_info; @@ -193,6 +212,7 @@ void btrfs_print_leaf(struct extent_buffer *l) "leaf %llu gen %llu total ptrs %d free space %d owner %llu", btrfs_header_bytenr(l), btrfs_header_generation(l), nr, btrfs_leaf_free_space(fs_info, l), btrfs_header_owner(l)); + print_eb_refs_lock(l); for (i = 0 ; i < nr ; i++) { item = btrfs_item_nr(i); btrfs_item_key_to_cpu(l, &key, i); @@ -347,6 +367,7 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow) btrfs_header_bytenr(c), level, btrfs_header_generation(c), nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(fs_info) - nr, btrfs_header_owner(c)); + print_eb_refs_lock(c); for (i = 0; i < nr; i++) { btrfs_node_key_to_cpu(c, &key, i); pr_info("\tkey %d (%llu %u %llu) block %llu gen %llu\n", diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 53a8c95828e3..dc6140013ae8 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -380,6 +380,7 @@ static int prop_compression_apply(struct inode *inode, const char *value, size_t len) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int type; if (len == 0) { @@ -390,14 +391,17 @@ static int prop_compression_apply(struct inode *inode, return 0; } - if (!strncmp("lzo", value, 3)) + if (!strncmp("lzo", value, 3)) { type = BTRFS_COMPRESS_LZO; - else if (!strncmp("zlib", value, 4)) + btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); + } else if (!strncmp("zlib", value, 4)) { type = BTRFS_COMPRESS_ZLIB; - else if (!strncmp("zstd", value, len)) + } else if (!strncmp("zstd", value, len)) { type = BTRFS_COMPRESS_ZSTD; - else + btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); + } else { return -EINVAL; + } BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9fb758d5077a..1874a6d2e6f5 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1882,8 +1882,8 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); - trace_qgroup_update_counters(fs_info, qg->qgroupid, - cur_old_count, cur_new_count); + trace_qgroup_update_counters(fs_info, qg, cur_old_count, + cur_new_count); /* Rfer update part */ if (cur_old_count == 0 && cur_new_count > 0) { @@ -2014,8 +2014,8 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, BUG_ON(!fs_info->quota_root); - trace_btrfs_qgroup_account_extent(fs_info, bytenr, num_bytes, - nr_old_roots, nr_new_roots); + trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, + num_bytes, nr_old_roots, nr_new_roots); qgroups = ulist_alloc(GFP_NOFS); if (!qgroups) { @@ -2580,6 +2580,21 @@ out: } /* + * Check if the leaf is the last leaf. Which means all node pointers + * are at their last position. + */ +static bool is_last_leaf(struct btrfs_path *path) +{ + int i; + + for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { + if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1) + return false; + } + return true; +} + +/* * returns < 0 on error, 0 when more leafs are to be scanned. * returns 1 when done. */ @@ -2590,8 +2605,8 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_key found; struct extent_buffer *scratch_leaf = NULL; struct ulist *roots = NULL; - struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); u64 num_bytes; + bool done; int slot; int ret; @@ -2620,12 +2635,12 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, mutex_unlock(&fs_info->qgroup_rescan_lock); return ret; } + done = is_last_leaf(path); btrfs_item_key_to_cpu(path->nodes[0], &found, btrfs_header_nritems(path->nodes[0]) - 1); fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; - btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]); if (!scratch_leaf) { ret = -ENOMEM; @@ -2664,8 +2679,9 @@ out: btrfs_tree_read_unlock_blocking(scratch_leaf); free_extent_buffer(scratch_leaf); } - btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); + if (done && !ret) + ret = 1; return ret; } @@ -2681,6 +2697,12 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) path = btrfs_alloc_path(); if (!path) goto out; + /* + * Rescan should only search for commit root, and any later difference + * should be recorded by qgroup + */ + path->search_commit_root = 1; + path->skip_locking = 1; err = 0; while (!err && !btrfs_fs_closing(fs_info)) { @@ -2760,26 +2782,36 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, { int ret = 0; - if (!init_flags && - (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) || - !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) { - ret = -EINVAL; - goto err; + if (!init_flags) { + /* we're resuming qgroup rescan at mount time */ + if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)) + btrfs_warn(fs_info, + "qgroup rescan init failed, qgroup is not enabled"); + else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) + btrfs_warn(fs_info, + "qgroup rescan init failed, qgroup rescan is not queued"); + return -EINVAL; } mutex_lock(&fs_info->qgroup_rescan_lock); spin_lock(&fs_info->qgroup_lock); if (init_flags) { - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + btrfs_warn(fs_info, + "qgroup rescan is already in progress"); ret = -EINPROGRESS; - else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) + } else if (!(fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAG_ON)) { + btrfs_warn(fs_info, + "qgroup rescan init failed, qgroup is not enabled"); ret = -EINVAL; + } if (ret) { spin_unlock(&fs_info->qgroup_lock); mutex_unlock(&fs_info->qgroup_rescan_lock); - goto err; + return ret; } fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; } @@ -2798,13 +2830,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, btrfs_init_work(&fs_info->qgroup_rescan_work, btrfs_qgroup_rescan_helper, btrfs_qgroup_rescan_worker, NULL, NULL); - - if (ret) { -err: - btrfs_info(fs_info, "qgroup_rescan_init failed with %d", ret); - return ret; - } - return 0; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 9abd950e7f78..5e4ad134b9ad 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -163,6 +163,12 @@ struct btrfs_raid_bio { * bitmap to record which horizontal stripe has data */ unsigned long *dbitmap; + + /* allocated with real_stripes-many pointers for finish_*() calls */ + void **finish_pointers; + + /* allocated with stripe_npages-many bits for finish_*() calls */ + unsigned long *finish_pbitmap; }; static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); @@ -981,9 +987,14 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE); void *p; - rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 + - DIV_ROUND_UP(stripe_npages, BITS_PER_LONG) * - sizeof(long), GFP_NOFS); + rbio = kzalloc(sizeof(*rbio) + + sizeof(*rbio->stripe_pages) * num_pages + + sizeof(*rbio->bio_pages) * num_pages + + sizeof(*rbio->finish_pointers) * real_stripes + + sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) + + sizeof(*rbio->finish_pbitmap) * + BITS_TO_LONGS(stripe_npages), + GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -1005,13 +1016,20 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, atomic_set(&rbio->stripes_pending, 0); /* - * the stripe_pages and bio_pages array point to the extra + * the stripe_pages, bio_pages, etc arrays point to the extra * memory we allocated past the end of the rbio */ p = rbio + 1; - rbio->stripe_pages = p; - rbio->bio_pages = p + sizeof(struct page *) * num_pages; - rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2; +#define CONSUME_ALLOC(ptr, count) do { \ + ptr = p; \ + p = (unsigned char *)p + sizeof(*(ptr)) * (count); \ + } while (0) + CONSUME_ALLOC(rbio->stripe_pages, num_pages); + CONSUME_ALLOC(rbio->bio_pages, num_pages); + CONSUME_ALLOC(rbio->finish_pointers, real_stripes); + CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages)); + CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages)); +#undef CONSUME_ALLOC if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) nr_data = real_stripes - 1; @@ -1180,7 +1198,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) static noinline void finish_rmw(struct btrfs_raid_bio *rbio) { struct btrfs_bio *bbio = rbio->bbio; - void *pointers[rbio->real_stripes]; + void **pointers = rbio->finish_pointers; int nr_data = rbio->nr_data; int stripe; int pagenr; @@ -2350,8 +2368,8 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) { struct btrfs_bio *bbio = rbio->bbio; - void *pointers[rbio->real_stripes]; - DECLARE_BITMAP(pbitmap, rbio->stripe_npages); + void **pointers = rbio->finish_pointers; + unsigned long *pbitmap = rbio->finish_pbitmap; int nr_data = rbio->nr_data; int stripe; int pagenr; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b041b945a7ae..879b76fa881a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4299,7 +4299,7 @@ out: return inode; } -static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) +static struct reloc_control *alloc_reloc_control(void) { struct reloc_control *rc; @@ -4344,7 +4344,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info, DESCRIBE_FLAG(RAID5, "raid5"); DESCRIBE_FLAG(RAID6, "raid6"); if (flags) - snprintf(buf, buf - bp + sizeof(buf), "|0x%llx", flags); + snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags); #undef DESCRIBE_FLAG } @@ -4366,7 +4366,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) int rw = 0; int err = 0; - rc = alloc_reloc_control(fs_info); + rc = alloc_reloc_control(); if (!rc) return -ENOMEM; @@ -4562,7 +4562,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) if (list_empty(&reloc_roots)) goto out; - rc = alloc_reloc_control(fs_info); + rc = alloc_reloc_control(); if (!rc) { err = -ENOMEM; goto out; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 52b39a0924e9..a59005862010 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3984,6 +3984,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, spin_lock(&fs_info->unused_bgs_lock); if (list_empty(&cache->bg_list)) { btrfs_get_block_group(cache); + trace_btrfs_add_unused_block_group(cache); list_add_tail(&cache->bg_list, &fs_info->unused_bgs); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c0074d2d7d6d..c47f62b19226 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -235,6 +235,7 @@ struct orphan_dir_info { struct rb_node node; u64 ino; u64 gen; + u64 last_dir_index_offset; }; struct name_cache_entry { @@ -2844,12 +2845,6 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) struct rb_node *parent = NULL; struct orphan_dir_info *entry, *odi; - odi = kmalloc(sizeof(*odi), GFP_KERNEL); - if (!odi) - return ERR_PTR(-ENOMEM); - odi->ino = dir_ino; - odi->gen = 0; - while (*p) { parent = *p; entry = rb_entry(parent, struct orphan_dir_info, node); @@ -2858,11 +2853,17 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) } else if (dir_ino > entry->ino) { p = &(*p)->rb_right; } else { - kfree(odi); return entry; } } + odi = kmalloc(sizeof(*odi), GFP_KERNEL); + if (!odi) + return ERR_PTR(-ENOMEM); + odi->ino = dir_ino; + odi->gen = 0; + odi->last_dir_index_offset = 0; + rb_link_node(&odi->node, parent, p); rb_insert_color(&odi->node, &sctx->orphan_dirs); return odi; @@ -2917,6 +2918,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, struct btrfs_key found_key; struct btrfs_key loc; struct btrfs_dir_item *di; + struct orphan_dir_info *odi = NULL; /* * Don't try to rmdir the top/root subvolume dir. @@ -2931,6 +2933,11 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; + + odi = get_orphan_dir_info(sctx, dir); + if (odi) + key.offset = odi->last_dir_index_offset; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; @@ -2958,30 +2965,33 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, dm = get_waiting_dir_move(sctx, loc.objectid); if (dm) { - struct orphan_dir_info *odi; - odi = add_orphan_dir_info(sctx, dir); if (IS_ERR(odi)) { ret = PTR_ERR(odi); goto out; } odi->gen = dir_gen; + odi->last_dir_index_offset = found_key.offset; dm->rmdir_ino = dir; ret = 0; goto out; } if (loc.objectid > send_progress) { - struct orphan_dir_info *odi; - - odi = get_orphan_dir_info(sctx, dir); - free_orphan_dir_info(sctx, odi); + odi = add_orphan_dir_info(sctx, dir); + if (IS_ERR(odi)) { + ret = PTR_ERR(odi); + goto out; + } + odi->gen = dir_gen; + odi->last_dir_index_offset = found_key.offset; ret = 0; goto out; } path->slots[0]++; } + free_orphan_dir_info(sctx, odi); ret = 1; @@ -3259,13 +3269,16 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) if (rmdir_ino) { struct orphan_dir_info *odi; + u64 gen; odi = get_orphan_dir_info(sctx, rmdir_ino); if (!odi) { /* already deleted */ goto finish; } - ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino); + gen = odi->gen; + + ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino); if (ret < 0) goto out; if (!ret) @@ -3276,13 +3289,12 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) ret = -ENOMEM; goto out; } - ret = get_cur_path(sctx, rmdir_ino, odi->gen, name); + ret = get_cur_path(sctx, rmdir_ino, gen, name); if (ret < 0) goto out; ret = send_rmdir(sctx, name); if (ret < 0) goto out; - free_orphan_dir_info(sctx, odi); } finish: @@ -6454,7 +6466,7 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) */ if (root->send_in_progress < 0) btrfs_err(root->fs_info, - "send_in_progres unbalanced %d root %llu", + "send_in_progress unbalanced %d root %llu", root->send_in_progress, root->root_key.objectid); spin_unlock(&root->root_item_lock); } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0628092b0b1b..81107ad49f3a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -323,6 +323,7 @@ enum { Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_nossd_spread, Opt_subvol, + Opt_subvol_empty, Opt_subvolid, Opt_thread_pool, Opt_treelog, Opt_notreelog, @@ -388,6 +389,7 @@ static const match_table_t tokens = { {Opt_ssd_spread, "ssd_spread"}, {Opt_nossd_spread, "nossd_spread"}, {Opt_subvol, "subvol=%s"}, + {Opt_subvol_empty, "subvol="}, {Opt_subvolid, "subvolid=%s"}, {Opt_thread_pool, "thread_pool=%u"}, {Opt_treelog, "treelog"}, @@ -461,6 +463,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_opt(info->mount_opt, DEGRADED); break; case Opt_subvol: + case Opt_subvol_empty: case Opt_subvolid: case Opt_subvolrootid: case Opt_device: @@ -1782,10 +1785,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } ret = btrfs_parse_options(fs_info, data, *flags); - if (ret) { - ret = -EINVAL; + if (ret) goto restore; - } btrfs_remount_begin(fs_info, old_opts, *flags); btrfs_resize_thread_pool(fs_info, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 4848a4318fb5..4a4e960c7c66 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -210,12 +210,42 @@ static struct attribute *btrfs_supported_feature_attrs[] = { NULL }; +/* + * Features which depend on feature bits and may differ between each fs. + * + * /sys/fs/btrfs/features lists all available features of this kernel while + * /sys/fs/btrfs/UUID/features shows features of the fs which are enabled or + * can be changed online. + */ static const struct attribute_group btrfs_feature_attr_group = { .name = "features", .is_visible = btrfs_feature_visible, .attrs = btrfs_supported_feature_attrs, }; +static ssize_t rmdir_subvol_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "0\n"); +} +BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show); + +static struct attribute *btrfs_supported_static_feature_attrs[] = { + BTRFS_ATTR_PTR(static_feature, rmdir_subvol), + NULL +}; + +/* + * Features which only depend on kernel version. + * + * These are listed in /sys/fs/btrfs/features along with + * btrfs_feature_attr_group + */ +static const struct attribute_group btrfs_static_feature_attr_group = { + .name = "features", + .attrs = btrfs_supported_static_feature_attrs, +}; + static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf) { u64 val; @@ -514,10 +544,11 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) } #define NUM_FEATURE_BITS 64 -static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13]; -static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS]; +#define BTRFS_FEATURE_NAME_MAX 13 +static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX]; +static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS]; -static const u64 supported_feature_masks[3] = { +static const u64 supported_feature_masks[FEAT_MAX] = { [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP, @@ -589,7 +620,7 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) return; } - list_for_each_entry(fs_devs, fs_uuids, list) { + list_for_each_entry(fs_devs, fs_uuids, fs_list) { __btrfs_sysfs_remove_fsid(fs_devs); } } @@ -609,7 +640,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL); } -const char * const btrfs_feature_set_names[3] = { +const char * const btrfs_feature_set_names[FEAT_MAX] = { [FEAT_COMPAT] = "compat", [FEAT_COMPAT_RO] = "compat_ro", [FEAT_INCOMPAT] = "incompat", @@ -673,7 +704,7 @@ static void init_feature_attrs(void) if (fa->kobj_attr.attr.name) continue; - snprintf(name, 13, "%s:%u", + snprintf(name, BTRFS_FEATURE_NAME_MAX, "%s:%u", btrfs_feature_set_names[set], i); fa->kobj_attr.attr.name = name; @@ -900,8 +931,15 @@ int __init btrfs_init_sysfs(void) ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); if (ret) goto out2; + ret = sysfs_merge_group(&btrfs_kset->kobj, + &btrfs_static_feature_attr_group); + if (ret) + goto out_remove_group; return 0; + +out_remove_group: + sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); out2: debugfs_remove_recursive(btrfs_debugfs_root_dentry); out1: @@ -912,6 +950,8 @@ out1: void __cold btrfs_exit_sysfs(void) { + sysfs_unmerge_group(&btrfs_kset->kobj, + &btrfs_static_feature_attr_group); sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); kset_unregister(btrfs_kset); debugfs_remove_recursive(btrfs_debugfs_root_dentry); diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index b567560d9aa9..c6ee600aff89 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -9,7 +9,7 @@ extern u64 btrfs_debugfs_test; enum btrfs_feature_set { - FEAT_COMPAT, + FEAT_COMPAT = 0, FEAT_COMPAT_RO, FEAT_INCOMPAT, FEAT_MAX @@ -77,7 +77,7 @@ attr_to_btrfs_feature_attr(struct attribute *attr) } char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); -extern const char * const btrfs_feature_set_names[3]; +extern const char * const btrfs_feature_set_names[FEAT_MAX]; extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 30ed438da2a9..db72b3b6209e 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -219,11 +219,13 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache) kfree(cache); } -void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans) +void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) { memset(trans, 0, sizeof(*trans)); trans->transid = 1; trans->type = __TRANS_DUMMY; + trans->fs_info = fs_info; } int btrfs_run_sanity_tests(void) diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index a5a0b9500d3e..70ff9f9d86a1 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -9,7 +9,8 @@ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_run_sanity_tests(void); -#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) +#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__) +#define test_err(fmt, ...) pr_err("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__) struct btrfs_root; struct btrfs_trans_handle; @@ -28,7 +29,8 @@ void btrfs_free_dummy_root(struct btrfs_root *root); struct btrfs_block_group_cache * btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long length); void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache); -void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans); +void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); #else static inline int btrfs_run_sanity_tests(void) { diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index 31e8a9ec228c..7d72eab6d32c 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -26,31 +26,31 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) u32 value_len = strlen(value); int ret = 0; - test_msg("Running btrfs_split_item tests\n"); + test_msg("running btrfs_split_item tests"); fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_msg("Could not allocate fs_info\n"); + test_err("could not allocate fs_info"); return -ENOMEM; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_msg("Could not allocate root\n"); + test_err("could not allocate root"); ret = PTR_ERR(root); goto out; } path = btrfs_alloc_path(); if (!path) { - test_msg("Could not allocate path\n"); + test_err("could not allocate path"); ret = -ENOMEM; goto out; } path->nodes[0] = eb = alloc_dummy_extent_buffer(fs_info, nodesize); if (!eb) { - test_msg("Could not allocate dummy buffer\n"); + test_err("could not allocate dummy buffer"); ret = -ENOMEM; goto out; } @@ -75,7 +75,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) */ ret = btrfs_split_item(NULL, root, path, &key, 17); if (ret) { - test_msg("Split item failed %d\n", ret); + test_err("split item failed %d", ret); goto out; } @@ -86,14 +86,14 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) btrfs_item_key_to_cpu(eb, &key, 0); if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || key.offset != 0) { - test_msg("Invalid key at slot 0\n"); + test_err("invalid key at slot 0"); ret = -EINVAL; goto out; } item = btrfs_item_nr(0); if (btrfs_item_size(eb, item) != strlen(split1)) { - test_msg("Invalid len in the first split\n"); + test_err("invalid len in the first split"); ret = -EINVAL; goto out; } @@ -101,8 +101,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0), strlen(split1)); if (memcmp(buf, split1, strlen(split1))) { - test_msg("Data in the buffer doesn't match what it should " - "in the first split have='%.*s' want '%s'\n", + test_err( +"data in the buffer doesn't match what it should in the first split have='%.*s' want '%s'", (int)strlen(split1), buf, split1); ret = -EINVAL; goto out; @@ -111,14 +111,14 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) btrfs_item_key_to_cpu(eb, &key, 1); if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || key.offset != 3) { - test_msg("Invalid key at slot 1\n"); + test_err("invalid key at slot 1"); ret = -EINVAL; goto out; } item = btrfs_item_nr(1); if (btrfs_item_size(eb, item) != strlen(split2)) { - test_msg("Invalid len in the second split\n"); + test_err("invalid len in the second split"); ret = -EINVAL; goto out; } @@ -126,8 +126,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1), strlen(split2)); if (memcmp(buf, split2, strlen(split2))) { - test_msg("Data in the buffer doesn't match what it should " - "in the second split\n"); + test_err( + "data in the buffer doesn't match what it should in the second split"); ret = -EINVAL; goto out; } @@ -136,21 +136,21 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) /* Do it again so we test memmoving the other items in the leaf */ ret = btrfs_split_item(NULL, root, path, &key, 4); if (ret) { - test_msg("Second split item failed %d\n", ret); + test_err("second split item failed %d", ret); goto out; } btrfs_item_key_to_cpu(eb, &key, 0); if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || key.offset != 0) { - test_msg("Invalid key at slot 0\n"); + test_err("invalid key at slot 0"); ret = -EINVAL; goto out; } item = btrfs_item_nr(0); if (btrfs_item_size(eb, item) != strlen(split3)) { - test_msg("Invalid len in the first split\n"); + test_err("invalid len in the first split"); ret = -EINVAL; goto out; } @@ -158,8 +158,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0), strlen(split3)); if (memcmp(buf, split3, strlen(split3))) { - test_msg("Data in the buffer doesn't match what it should " - "in the third split"); + test_err( + "data in the buffer doesn't match what it should in the third split"); ret = -EINVAL; goto out; } @@ -167,14 +167,14 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) btrfs_item_key_to_cpu(eb, &key, 1); if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || key.offset != 1) { - test_msg("Invalid key at slot 1\n"); + test_err("invalid key at slot 1"); ret = -EINVAL; goto out; } item = btrfs_item_nr(1); if (btrfs_item_size(eb, item) != strlen(split4)) { - test_msg("Invalid len in the second split\n"); + test_err("invalid len in the second split"); ret = -EINVAL; goto out; } @@ -182,8 +182,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1), strlen(split4)); if (memcmp(buf, split4, strlen(split4))) { - test_msg("Data in the buffer doesn't match what it should " - "in the fourth split\n"); + test_err( + "data in the buffer doesn't match what it should in the fourth split"); ret = -EINVAL; goto out; } @@ -191,14 +191,14 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) btrfs_item_key_to_cpu(eb, &key, 2); if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || key.offset != 3) { - test_msg("Invalid key at slot 2\n"); + test_err("invalid key at slot 2"); ret = -EINVAL; goto out; } item = btrfs_item_nr(2); if (btrfs_item_size(eb, item) != strlen(split2)) { - test_msg("Invalid len in the second split\n"); + test_err("invalid len in the second split"); ret = -EINVAL; goto out; } @@ -206,8 +206,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 2), strlen(split2)); if (memcmp(buf, split2, strlen(split2))) { - test_msg("Data in the buffer doesn't match what it should " - "in the last chunk\n"); + test_err( + "data in the buffer doesn't match what it should in the last chunk"); ret = -EINVAL; goto out; } @@ -220,6 +220,6 @@ out: int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize) { - test_msg("Running extent buffer operation tests\n"); + test_msg("running extent buffer operation tests"); return test_btrfs_split_item(sectorsize, nodesize); } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 76aa5a678a96..d9269a531a4d 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -46,7 +46,9 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, cond_resched(); loops++; if (loops > 100000) { - printk(KERN_ERR "stuck in a loop, start %Lu, end %Lu, nr_pages %lu, ret %d\n", start, end, nr_pages, ret); + printk(KERN_ERR + "stuck in a loop, start %llu, end %llu, nr_pages %lu, ret %d\n", + start, end, nr_pages, ret); break; } } @@ -66,11 +68,11 @@ static int test_find_delalloc(u32 sectorsize) u64 found; int ret = -EINVAL; - test_msg("Running find delalloc tests\n"); + test_msg("running find delalloc tests"); inode = btrfs_new_test_inode(); if (!inode) { - test_msg("Failed to allocate test inode\n"); + test_err("failed to allocate test inode"); return -ENOMEM; } @@ -84,7 +86,7 @@ static int test_find_delalloc(u32 sectorsize) for (index = 0; index < (total_dirty >> PAGE_SHIFT); index++) { page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL); if (!page) { - test_msg("Failed to allocate test page\n"); + test_err("failed to allocate test page"); ret = -ENOMEM; goto out; } @@ -107,11 +109,11 @@ static int test_find_delalloc(u32 sectorsize) found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (!found) { - test_msg("Should have found at least one delalloc\n"); + test_err("should have found at least one delalloc"); goto out_bits; } if (start != 0 || end != (sectorsize - 1)) { - test_msg("Expected start 0 end %u, got start %llu end %llu\n", + test_err("expected start 0 end %u, got start %llu end %llu", sectorsize - 1, start, end); goto out_bits; } @@ -129,7 +131,7 @@ static int test_find_delalloc(u32 sectorsize) locked_page = find_lock_page(inode->i_mapping, test_start >> PAGE_SHIFT); if (!locked_page) { - test_msg("Couldn't find the locked page\n"); + test_err("couldn't find the locked page"); goto out_bits; } set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL); @@ -138,17 +140,17 @@ static int test_find_delalloc(u32 sectorsize) found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (!found) { - test_msg("Couldn't find delalloc in our range\n"); + test_err("couldn't find delalloc in our range"); goto out_bits; } if (start != test_start || end != max_bytes - 1) { - test_msg("Expected start %Lu end %Lu, got start %Lu, end " - "%Lu\n", test_start, max_bytes - 1, start, end); + test_err("expected start %llu end %llu, got start %llu, end %llu", + test_start, max_bytes - 1, start, end); goto out_bits; } if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) { - test_msg("There were unlocked pages in the range\n"); + test_err("there were unlocked pages in the range"); goto out_bits; } unlock_extent(&tmp, start, end); @@ -164,7 +166,7 @@ static int test_find_delalloc(u32 sectorsize) locked_page = find_lock_page(inode->i_mapping, test_start >> PAGE_SHIFT); if (!locked_page) { - test_msg("Couldn't find the locked page\n"); + test_err("couldn't find the locked page"); goto out_bits; } start = test_start; @@ -172,11 +174,11 @@ static int test_find_delalloc(u32 sectorsize) found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (found) { - test_msg("Found range when we shouldn't have\n"); + test_err("found range when we shouldn't have"); goto out_bits; } if (end != (u64)-1) { - test_msg("Did not return the proper end offset\n"); + test_err("did not return the proper end offset"); goto out_bits; } @@ -193,17 +195,17 @@ static int test_find_delalloc(u32 sectorsize) found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (!found) { - test_msg("Didn't find our range\n"); + test_err("didn't find our range"); goto out_bits; } if (start != test_start || end != total_dirty - 1) { - test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n", + test_err("expected start %llu end %llu, got start %llu end %llu", test_start, total_dirty - 1, start, end); goto out_bits; } if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) { - test_msg("Pages in range were not all locked\n"); + test_err("pages in range were not all locked"); goto out_bits; } unlock_extent(&tmp, start, end); @@ -215,7 +217,7 @@ static int test_find_delalloc(u32 sectorsize) page = find_get_page(inode->i_mapping, (max_bytes + SZ_1M) >> PAGE_SHIFT); if (!page) { - test_msg("Couldn't find our page\n"); + test_err("couldn't find our page"); goto out_bits; } ClearPageDirty(page); @@ -234,18 +236,17 @@ static int test_find_delalloc(u32 sectorsize) found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (!found) { - test_msg("Didn't find our range\n"); + test_err("didn't find our range"); goto out_bits; } if (start != test_start && end != test_start + PAGE_SIZE - 1) { - test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n", - test_start, test_start + PAGE_SIZE - 1, start, - end); + test_err("expected start %llu end %llu, got start %llu end %llu", + test_start, test_start + PAGE_SIZE - 1, start, end); goto out_bits; } if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) { - test_msg("Pages in range were not all locked\n"); + test_err("pages in range were not all locked"); goto out_bits; } ret = 0; @@ -271,14 +272,14 @@ static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb, bit = !!test_bit(i, bitmap); bit1 = !!extent_buffer_test_bit(eb, 0, i); if (bit1 != bit) { - test_msg("Bits do not match\n"); + test_err("bits do not match"); return -EINVAL; } bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE, i % BITS_PER_BYTE); if (bit1 != bit) { - test_msg("Offset bits do not match\n"); + test_err("offset bits do not match"); return -EINVAL; } } @@ -295,7 +296,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, memset(bitmap, 0, len); memzero_extent_buffer(eb, 0, len); if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { - test_msg("Bitmap was not zeroed\n"); + test_err("bitmap was not zeroed"); return -EINVAL; } @@ -303,7 +304,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); ret = check_eb_bitmap(bitmap, eb, len); if (ret) { - test_msg("Setting all bits failed\n"); + test_err("setting all bits failed"); return ret; } @@ -311,7 +312,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE); ret = check_eb_bitmap(bitmap, eb, len); if (ret) { - test_msg("Clearing all bits failed\n"); + test_err("clearing all bits failed"); return ret; } @@ -324,7 +325,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, sizeof(long) * BITS_PER_BYTE); ret = check_eb_bitmap(bitmap, eb, len); if (ret) { - test_msg("Setting straddling pages failed\n"); + test_err("setting straddling pages failed"); return ret; } @@ -337,7 +338,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, sizeof(long) * BITS_PER_BYTE); ret = check_eb_bitmap(bitmap, eb, len); if (ret) { - test_msg("Clearing straddling pages failed\n"); + test_err("clearing straddling pages failed"); return ret; } } @@ -361,7 +362,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, ret = check_eb_bitmap(bitmap, eb, len); if (ret) { - test_msg("Random bit pattern failed\n"); + test_err("random bit pattern failed"); return ret; } @@ -376,7 +377,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) struct extent_buffer *eb; int ret; - test_msg("Running extent buffer bitmap tests\n"); + test_msg("running extent buffer bitmap tests"); /* * In ppc64, sectorsize can be 64K, thus 4 * 64K will be larger than @@ -389,13 +390,13 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) bitmap = kmalloc(len, GFP_KERNEL); if (!bitmap) { - test_msg("Couldn't allocate test bitmap\n"); + test_err("couldn't allocate test bitmap"); return -ENOMEM; } eb = __alloc_dummy_extent_buffer(fs_info, 0, len); if (!eb) { - test_msg("Couldn't allocate test extent buffer\n"); + test_err("couldn't allocate test extent buffer"); kfree(bitmap); return -ENOMEM; } @@ -408,7 +409,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) free_extent_buffer(eb); eb = __alloc_dummy_extent_buffer(NULL, nodesize / 2, len); if (!eb) { - test_msg("Couldn't allocate test extent buffer\n"); + test_err("couldn't allocate test extent buffer"); kfree(bitmap); return -ENOMEM; } @@ -424,7 +425,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) { int ret; - test_msg("Running extent I/O tests\n"); + test_msg("running extent I/O tests"); ret = test_find_delalloc(sectorsize); if (ret) @@ -432,6 +433,6 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) ret = test_eb_bitmaps(sectorsize, nodesize); out: - test_msg("Extent I/O tests finished\n"); + test_msg("extent I/O tests finished"); return ret; } diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 79e0a5f4d9c9..385a5316e4bf 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -19,8 +19,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) #ifdef CONFIG_BTRFS_DEBUG if (refcount_read(&em->refs) != 1) { - test_msg( -"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d\n", + test_err( +"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d", em->start, em->len, em->block_start, em->block_len, refcount_read(&em->refs)); @@ -47,7 +47,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) * ->add_extent_mapping(0, 16K) * -> #handle -EEXIST */ -static void test_case_1(struct extent_map_tree *em_tree) +static void test_case_1(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree) { struct extent_map *em; u64 start = 0; @@ -90,14 +91,14 @@ static void test_case_1(struct extent_map_tree *em_tree) em->len = len; em->block_start = start; em->block_len = len; - ret = btrfs_add_extent_mapping(em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); if (ret) - test_msg("case1 [%llu %llu]: ret %d\n", start, start + len, ret); + test_err("case1 [%llu %llu]: ret %d", start, start + len, ret); if (em && (em->start != 0 || extent_map_end(em) != SZ_16K || em->block_start != 0 || em->block_len != SZ_16K)) - test_msg( -"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu\n", + test_err( +"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu", start, start + len, ret, em->start, em->len, em->block_start, em->block_len); free_extent_map(em); @@ -112,7 +113,8 @@ out: * Reading the inline ending up with EEXIST, ie. read an inline * extent and discard page cache and read it again. */ -static void test_case_2(struct extent_map_tree *em_tree) +static void test_case_2(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree) { struct extent_map *em; int ret; @@ -153,14 +155,14 @@ static void test_case_2(struct extent_map_tree *em_tree) em->len = SZ_1K; em->block_start = EXTENT_MAP_INLINE; em->block_len = (u64)-1; - ret = btrfs_add_extent_mapping(em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); if (ret) - test_msg("case2 [0 1K]: ret %d\n", ret); + test_err("case2 [0 1K]: ret %d", ret); if (em && (em->start != 0 || extent_map_end(em) != SZ_1K || em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) - test_msg( -"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu\n", + test_err( +"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu", ret, em->start, em->len, em->block_start, em->block_len); free_extent_map(em); @@ -169,7 +171,8 @@ out: free_extent_map_tree(em_tree); } -static void __test_case_3(struct extent_map_tree *em_tree, u64 start) +static void __test_case_3(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree, u64 start) { struct extent_map *em; u64 len = SZ_4K; @@ -198,9 +201,9 @@ static void __test_case_3(struct extent_map_tree *em_tree, u64 start) em->len = SZ_16K; em->block_start = 0; em->block_len = SZ_16K; - ret = btrfs_add_extent_mapping(em_tree, &em, start, len); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); if (ret) - test_msg("case3 [0x%llx 0x%llx): ret %d\n", + test_err("case3 [0x%llx 0x%llx): ret %d", start, start + len, ret); /* * Since bytes within em are contiguous, em->block_start is identical to @@ -209,8 +212,8 @@ static void __test_case_3(struct extent_map_tree *em_tree, u64 start) if (em && (start < em->start || start + len > extent_map_end(em) || em->start != em->block_start || em->len != em->block_len)) - test_msg( -"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)\n", + test_err( +"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)", start, start + len, ret, em->start, em->len, em->block_start, em->block_len); free_extent_map(em); @@ -235,14 +238,16 @@ out: * -> add_extent_mapping() * -> add_extent_mapping() */ -static void test_case_3(struct extent_map_tree *em_tree) +static void test_case_3(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree) { - __test_case_3(em_tree, 0); - __test_case_3(em_tree, SZ_8K); - __test_case_3(em_tree, (12 * 1024ULL)); + __test_case_3(fs_info, em_tree, 0); + __test_case_3(fs_info, em_tree, SZ_8K); + __test_case_3(fs_info, em_tree, (12 * 1024ULL)); } -static void __test_case_4(struct extent_map_tree *em_tree, u64 start) +static void __test_case_4(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree, u64 start) { struct extent_map *em; u64 len = SZ_4K; @@ -283,14 +288,14 @@ static void __test_case_4(struct extent_map_tree *em_tree, u64 start) em->len = SZ_32K; em->block_start = 0; em->block_len = SZ_32K; - ret = btrfs_add_extent_mapping(em_tree, &em, start, len); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); if (ret) - test_msg("case4 [0x%llx 0x%llx): ret %d\n", + test_err("case4 [0x%llx 0x%llx): ret %d", start, len, ret); if (em && (start < em->start || start + len > extent_map_end(em))) - test_msg( -"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)\n", + test_err( +"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)", start, len, ret, em->start, em->len, em->block_start, em->block_len); free_extent_map(em); @@ -324,30 +329,45 @@ out: * # handle -EEXIST when adding * # [0, 32K) */ -static void test_case_4(struct extent_map_tree *em_tree) +static void test_case_4(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree) { - __test_case_4(em_tree, 0); - __test_case_4(em_tree, SZ_4K); + __test_case_4(fs_info, em_tree, 0); + __test_case_4(fs_info, em_tree, SZ_4K); } int btrfs_test_extent_map(void) { + struct btrfs_fs_info *fs_info = NULL; struct extent_map_tree *em_tree; - test_msg("Running extent_map tests\n"); + test_msg("running extent_map tests"); + + /* + * Note: the fs_info is not set up completely, we only need + * fs_info::fsid for the tracepoint. + */ + fs_info = btrfs_alloc_dummy_fs_info(PAGE_SIZE, PAGE_SIZE); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info"); + return -ENOMEM; + } em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL); if (!em_tree) /* Skip the test on error. */ - return 0; + goto out; extent_map_tree_init(em_tree); - test_case_1(em_tree); - test_case_2(em_tree); - test_case_3(em_tree); - test_case_4(em_tree); + test_case_1(fs_info, em_tree); + test_case_2(fs_info, em_tree); + test_case_3(fs_info, em_tree); + test_case_4(fs_info, em_tree); kfree(em_tree); +out: + btrfs_free_dummy_fs_info(fs_info); + return 0; } diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index d3c9f8a59ba5..5c2f77e9439b 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -20,63 +20,63 @@ static int test_extents(struct btrfs_block_group_cache *cache) { int ret = 0; - test_msg("Running extent only tests\n"); + test_msg("running extent only tests"); /* First just make sure we can remove an entire entry */ ret = btrfs_add_free_space(cache, 0, SZ_4M); if (ret) { - test_msg("Error adding initial extents %d\n", ret); + test_err("error adding initial extents %d", ret); return ret; } ret = btrfs_remove_free_space(cache, 0, SZ_4M); if (ret) { - test_msg("Error removing extent %d\n", ret); + test_err("error removing extent %d", ret); return ret; } if (test_check_exists(cache, 0, SZ_4M)) { - test_msg("Full remove left some lingering space\n"); + test_err("full remove left some lingering space"); return -1; } /* Ok edge and middle cases now */ ret = btrfs_add_free_space(cache, 0, SZ_4M); if (ret) { - test_msg("Error adding half extent %d\n", ret); + test_err("error adding half extent %d", ret); return ret; } ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_1M); if (ret) { - test_msg("Error removing tail end %d\n", ret); + test_err("error removing tail end %d", ret); return ret; } ret = btrfs_remove_free_space(cache, 0, SZ_1M); if (ret) { - test_msg("Error removing front end %d\n", ret); + test_err("error removing front end %d", ret); return ret; } ret = btrfs_remove_free_space(cache, SZ_2M, 4096); if (ret) { - test_msg("Error removing middle piece %d\n", ret); + test_err("error removing middle piece %d", ret); return ret; } if (test_check_exists(cache, 0, SZ_1M)) { - test_msg("Still have space at the front\n"); + test_err("still have space at the front"); return -1; } if (test_check_exists(cache, SZ_2M, 4096)) { - test_msg("Still have space in the middle\n"); + test_err("still have space in the middle"); return -1; } if (test_check_exists(cache, 3 * SZ_1M, SZ_1M)) { - test_msg("Still have space at the end\n"); + test_err("still have space at the end"); return -1; } @@ -92,34 +92,34 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache, u64 next_bitmap_offset; int ret; - test_msg("Running bitmap only tests\n"); + test_msg("running bitmap only tests"); ret = test_add_free_space_entry(cache, 0, SZ_4M, 1); if (ret) { - test_msg("Couldn't create a bitmap entry %d\n", ret); + test_err("couldn't create a bitmap entry %d", ret); return ret; } ret = btrfs_remove_free_space(cache, 0, SZ_4M); if (ret) { - test_msg("Error removing bitmap full range %d\n", ret); + test_err("error removing bitmap full range %d", ret); return ret; } if (test_check_exists(cache, 0, SZ_4M)) { - test_msg("Left some space in bitmap\n"); + test_err("left some space in bitmap"); return -1; } ret = test_add_free_space_entry(cache, 0, SZ_4M, 1); if (ret) { - test_msg("Couldn't add to our bitmap entry %d\n", ret); + test_err("couldn't add to our bitmap entry %d", ret); return ret; } ret = btrfs_remove_free_space(cache, SZ_1M, SZ_2M); if (ret) { - test_msg("Couldn't remove middle chunk %d\n", ret); + test_err("couldn't remove middle chunk %d", ret); return ret; } @@ -133,19 +133,19 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache, ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M, SZ_4M, 1); if (ret) { - test_msg("Couldn't add space that straddles two bitmaps %d\n", + test_err("couldn't add space that straddles two bitmaps %d", ret); return ret; } ret = btrfs_remove_free_space(cache, next_bitmap_offset - SZ_1M, SZ_2M); if (ret) { - test_msg("Couldn't remove overlapping space %d\n", ret); + test_err("couldn't remove overlapping space %d", ret); return ret; } if (test_check_exists(cache, next_bitmap_offset - SZ_1M, SZ_2M)) { - test_msg("Left some space when removing overlapping\n"); + test_err("left some space when removing overlapping"); return -1; } @@ -161,7 +161,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache, u64 bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize); int ret; - test_msg("Running bitmap and extent tests\n"); + test_msg("running bitmap and extent tests"); /* * First let's do something simple, an extent at the same offset as the @@ -170,42 +170,42 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache, */ ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, 1); if (ret) { - test_msg("Couldn't create bitmap entry %d\n", ret); + test_err("couldn't create bitmap entry %d", ret); return ret; } ret = test_add_free_space_entry(cache, 0, SZ_1M, 0); if (ret) { - test_msg("Couldn't add extent entry %d\n", ret); + test_err("couldn't add extent entry %d", ret); return ret; } ret = btrfs_remove_free_space(cache, 0, SZ_1M); if (ret) { - test_msg("Couldn't remove extent entry %d\n", ret); + test_err("couldn't remove extent entry %d", ret); return ret; } if (test_check_exists(cache, 0, SZ_1M)) { - test_msg("Left remnants after our remove\n"); + test_err("left remnants after our remove"); return -1; } /* Now to add back the extent entry and remove from the bitmap */ ret = test_add_free_space_entry(cache, 0, SZ_1M, 0); if (ret) { - test_msg("Couldn't re-add extent entry %d\n", ret); + test_err("couldn't re-add extent entry %d", ret); return ret; } ret = btrfs_remove_free_space(cache, SZ_4M, SZ_1M); if (ret) { - test_msg("Couldn't remove from bitmap %d\n", ret); + test_err("couldn't remove from bitmap %d", ret); return ret; } if (test_check_exists(cache, SZ_4M, SZ_1M)) { - test_msg("Left remnants in the bitmap\n"); + test_err("left remnants in the bitmap"); return -1; } @@ -215,18 +215,18 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache, */ ret = test_add_free_space_entry(cache, SZ_1M, SZ_4M, 1); if (ret) { - test_msg("Couldn't add to a bitmap %d\n", ret); + test_err("couldn't add to a bitmap %d", ret); return ret; } ret = btrfs_remove_free_space(cache, SZ_512K, 3 * SZ_1M); if (ret) { - test_msg("Couldn't remove overlapping space %d\n", ret); + test_err("couldn't remove overlapping space %d", ret); return ret; } if (test_check_exists(cache, SZ_512K, 3 * SZ_1M)) { - test_msg("Left over pieces after removing overlapping\n"); + test_err("left over pieces after removing overlapping"); return -1; } @@ -235,24 +235,24 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache, /* Now with the extent entry offset into the bitmap */ ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1); if (ret) { - test_msg("Couldn't add space to the bitmap %d\n", ret); + test_err("couldn't add space to the bitmap %d", ret); return ret; } ret = test_add_free_space_entry(cache, SZ_2M, SZ_2M, 0); if (ret) { - test_msg("Couldn't add extent to the cache %d\n", ret); + test_err("couldn't add extent to the cache %d", ret); return ret; } ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_4M); if (ret) { - test_msg("Problem removing overlapping space %d\n", ret); + test_err("problem removing overlapping space %d", ret); return ret; } if (test_check_exists(cache, 3 * SZ_1M, SZ_4M)) { - test_msg("Left something behind when removing space"); + test_err("left something behind when removing space"); return -1; } @@ -269,25 +269,25 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache, __btrfs_remove_free_space_cache(cache->free_space_ctl); ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1); if (ret) { - test_msg("Couldn't add bitmap %d\n", ret); + test_err("couldn't add bitmap %d", ret); return ret; } ret = test_add_free_space_entry(cache, bitmap_offset - SZ_1M, 5 * SZ_1M, 0); if (ret) { - test_msg("Couldn't add extent entry %d\n", ret); + test_err("couldn't add extent entry %d", ret); return ret; } ret = btrfs_remove_free_space(cache, bitmap_offset + SZ_1M, 5 * SZ_1M); if (ret) { - test_msg("Failed to free our space %d\n", ret); + test_err("failed to free our space %d", ret); return ret; } if (test_check_exists(cache, bitmap_offset + SZ_1M, 5 * SZ_1M)) { - test_msg("Left stuff over\n"); + test_err("left stuff over"); return -1; } @@ -301,19 +301,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache, */ ret = test_add_free_space_entry(cache, SZ_1M, SZ_2M, 1); if (ret) { - test_msg("Couldn't add bitmap entry %d\n", ret); + test_err("couldn't add bitmap entry %d", ret); return ret; } ret = test_add_free_space_entry(cache, 3 * SZ_1M, SZ_1M, 0); if (ret) { - test_msg("Couldn't add extent entry %d\n", ret); + test_err("couldn't add extent entry %d", ret); return ret; } ret = btrfs_remove_free_space(cache, SZ_1M, 3 * SZ_1M); if (ret) { - test_msg("Error removing bitmap and extent overlapping %d\n", ret); + test_err("error removing bitmap and extent overlapping %d", ret); return ret; } @@ -335,12 +335,14 @@ check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache, const int num_bitmaps) { if (cache->free_space_ctl->free_extents != num_extents) { - test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n", + test_err( + "incorrect # of extent entries in the cache: %d, expected %d", cache->free_space_ctl->free_extents, num_extents); return -EINVAL; } if (cache->free_space_ctl->total_bitmaps != num_bitmaps) { - test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n", + test_err( + "incorrect # of extent entries in the cache: %d, expected %d", cache->free_space_ctl->total_bitmaps, num_bitmaps); return -EINVAL; } @@ -358,7 +360,7 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache) * allocate. */ if (cache->free_space_ctl->free_space != 0) { - test_msg("Cache free space is not 0\n"); + test_err("cache free space is not 0"); return -EINVAL; } @@ -366,7 +368,7 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache) offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0, &max_extent_size); if (offset != 0) { - test_msg("Space allocation did not fail, returned offset: %llu", + test_err("space allocation did not fail, returned offset: %llu", offset); return -EINVAL; } @@ -402,7 +404,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, }; const struct btrfs_free_space_op *orig_free_space_ops; - test_msg("Running space stealing from bitmap to extent\n"); + test_msg("running space stealing from bitmap to extent"); /* * For this test, we want to ensure we end up with an extent entry @@ -430,7 +432,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, */ ret = test_add_free_space_entry(cache, SZ_128M - SZ_256K, SZ_128K, 0); if (ret) { - test_msg("Couldn't add extent entry %d\n", ret); + test_err("couldn't add extent entry %d", ret); return ret; } @@ -438,7 +440,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, ret = test_add_free_space_entry(cache, SZ_128M + SZ_512K, SZ_128M - SZ_512K, 1); if (ret) { - test_msg("Couldn't add bitmap entry %d\n", ret); + test_err("couldn't add bitmap entry %d", ret); return ret; } @@ -457,17 +459,17 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, SZ_128M + 768 * SZ_1K, SZ_128M - 768 * SZ_1K); if (ret) { - test_msg("Failed to free part of bitmap space %d\n", ret); + test_err("failed to free part of bitmap space %d", ret); return ret; } /* Confirm that only those 2 ranges are marked as free. */ if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_128K)) { - test_msg("Free space range missing\n"); + test_err("free space range missing"); return -ENOENT; } if (!test_check_exists(cache, SZ_128M + SZ_512K, SZ_256K)) { - test_msg("Free space range missing\n"); + test_err("free space range missing"); return -ENOENT; } @@ -477,7 +479,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, */ if (test_check_exists(cache, SZ_128M + 768 * SZ_1K, SZ_128M - 768 * SZ_1K)) { - test_msg("Bitmap region not removed from space cache\n"); + test_err("bitmap region not removed from space cache"); return -EINVAL; } @@ -486,7 +488,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, * covered by the bitmap, isn't marked as free. */ if (test_check_exists(cache, SZ_128M + SZ_256K, SZ_256K)) { - test_msg("Invalid bitmap region marked as free\n"); + test_err("invalid bitmap region marked as free"); return -EINVAL; } @@ -495,7 +497,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, * by the bitmap too, isn't marked as free either. */ if (test_check_exists(cache, SZ_128M, SZ_256K)) { - test_msg("Invalid bitmap region marked as free\n"); + test_err("invalid bitmap region marked as free"); return -EINVAL; } @@ -506,12 +508,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, */ ret = btrfs_add_free_space(cache, SZ_128M, SZ_512K); if (ret) { - test_msg("Error adding free space: %d\n", ret); + test_err("error adding free space: %d", ret); return ret; } /* Confirm the region is marked as free. */ if (!test_check_exists(cache, SZ_128M, SZ_512K)) { - test_msg("Bitmap region not marked as free\n"); + test_err("bitmap region not marked as free"); return -ENOENT; } @@ -531,7 +533,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, */ ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, sectorsize); if (ret) { - test_msg("Error adding free space: %d\n", ret); + test_err("error adding free space: %d", ret); return ret; } @@ -550,12 +552,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, */ ret = btrfs_add_free_space(cache, SZ_128M - SZ_128K, SZ_128K); if (ret) { - test_msg("Error adding free space: %d\n", ret); + test_err("error adding free space: %d", ret); return ret; } /* Confirm the region is marked as free. */ if (!test_check_exists(cache, SZ_128M - SZ_128K, SZ_128K)) { - test_msg("Extent region not marked as free\n"); + test_err("extent region not marked as free"); return -ENOENT; } @@ -583,12 +585,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, * allocate the whole free space at once. */ if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_1M)) { - test_msg("Expected region not marked as free\n"); + test_err("expected region not marked as free"); return -ENOENT; } if (cache->free_space_ctl->free_space != (SZ_1M + sectorsize)) { - test_msg("Cache free space is not 1Mb + %u\n", sectorsize); + test_err("cache free space is not 1Mb + %u", sectorsize); return -EINVAL; } @@ -596,7 +598,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, 0, SZ_1M, 0, &max_extent_size); if (offset != (SZ_128M - SZ_256K)) { - test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n", + test_err( + "failed to allocate 1Mb from space cache, returned offset is: %llu", offset); return -EINVAL; } @@ -610,7 +613,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, return ret; if (cache->free_space_ctl->free_space != sectorsize) { - test_msg("Cache free space is not %u\n", sectorsize); + test_err("cache free space is not %u", sectorsize); return -EINVAL; } @@ -618,7 +621,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, 0, sectorsize, 0, &max_extent_size); if (offset != (SZ_128M + SZ_16M)) { - test_msg("Failed to allocate %u, returned offset : %llu\n", + test_err("failed to allocate %u, returned offset : %llu", sectorsize, offset); return -EINVAL; } @@ -640,14 +643,14 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, */ ret = test_add_free_space_entry(cache, SZ_128M + SZ_128K, SZ_128K, 0); if (ret) { - test_msg("Couldn't add extent entry %d\n", ret); + test_err("couldn't add extent entry %d", ret); return ret; } /* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */ ret = test_add_free_space_entry(cache, 0, SZ_128M - SZ_512K, 1); if (ret) { - test_msg("Couldn't add bitmap entry %d\n", ret); + test_err("couldn't add bitmap entry %d", ret); return ret; } @@ -664,17 +667,17 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, */ ret = btrfs_remove_free_space(cache, 0, SZ_128M - 768 * SZ_1K); if (ret) { - test_msg("Failed to free part of bitmap space %d\n", ret); + test_err("failed to free part of bitmap space %d", ret); return ret; } /* Confirm that only those 2 ranges are marked as free. */ if (!test_check_exists(cache, SZ_128M + SZ_128K, SZ_128K)) { - test_msg("Free space range missing\n"); + test_err("free space range missing"); return -ENOENT; } if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_256K)) { - test_msg("Free space range missing\n"); + test_err("free space range missing"); return -ENOENT; } @@ -683,7 +686,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, * as free anymore. */ if (test_check_exists(cache, 0, SZ_128M - 768 * SZ_1K)) { - test_msg("Bitmap region not removed from space cache\n"); + test_err("bitmap region not removed from space cache"); return -EINVAL; } @@ -692,7 +695,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, * covered by the bitmap, isn't marked as free. */ if (test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) { - test_msg("Invalid bitmap region marked as free\n"); + test_err("invalid bitmap region marked as free"); return -EINVAL; } @@ -703,12 +706,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, */ ret = btrfs_add_free_space(cache, SZ_128M - SZ_512K, SZ_512K); if (ret) { - test_msg("Error adding free space: %d\n", ret); + test_err("error adding free space: %d", ret); return ret; } /* Confirm the region is marked as free. */ if (!test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) { - test_msg("Bitmap region not marked as free\n"); + test_err("bitmap region not marked as free"); return -ENOENT; } @@ -728,7 +731,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, */ ret = btrfs_add_free_space(cache, SZ_32M, 2 * sectorsize); if (ret) { - test_msg("Error adding free space: %d\n", ret); + test_err("error adding free space: %d", ret); return ret; } @@ -739,12 +742,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, */ ret = btrfs_add_free_space(cache, SZ_128M, SZ_128K); if (ret) { - test_msg("Error adding free space: %d\n", ret); + test_err("error adding free space: %d", ret); return ret; } /* Confirm the region is marked as free. */ if (!test_check_exists(cache, SZ_128M, SZ_128K)) { - test_msg("Extent region not marked as free\n"); + test_err("extent region not marked as free"); return -ENOENT; } @@ -772,19 +775,20 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, * allocate the whole free space at once. */ if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_1M)) { - test_msg("Expected region not marked as free\n"); + test_err("expected region not marked as free"); return -ENOENT; } if (cache->free_space_ctl->free_space != (SZ_1M + 2 * sectorsize)) { - test_msg("Cache free space is not 1Mb + %u\n", 2 * sectorsize); + test_err("cache free space is not 1Mb + %u", 2 * sectorsize); return -EINVAL; } offset = btrfs_find_space_for_alloc(cache, 0, SZ_1M, 0, &max_extent_size); if (offset != (SZ_128M - 768 * SZ_1K)) { - test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n", + test_err( + "failed to allocate 1Mb from space cache, returned offset is: %llu", offset); return -EINVAL; } @@ -798,7 +802,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, return ret; if (cache->free_space_ctl->free_space != 2 * sectorsize) { - test_msg("Cache free space is not %u\n", 2 * sectorsize); + test_err("cache free space is not %u", 2 * sectorsize); return -EINVAL; } @@ -806,9 +810,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, 0, 2 * sectorsize, 0, &max_extent_size); if (offset != SZ_32M) { - test_msg("Failed to allocate %u, offset: %llu\n", - 2 * sectorsize, - offset); + test_err("failed to allocate %u, offset: %llu", + 2 * sectorsize, offset); return -EINVAL; } @@ -829,7 +832,7 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) struct btrfs_root *root = NULL; int ret = -ENOMEM; - test_msg("Running btrfs free space cache tests\n"); + test_msg("running btrfs free space cache tests"); fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) return -ENOMEM; @@ -843,7 +846,7 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) cache = btrfs_alloc_dummy_block_group(fs_info, BITS_PER_BITMAP * sectorsize + PAGE_SIZE); if (!cache) { - test_msg("Couldn't run the tests\n"); + test_err("couldn't run the tests"); btrfs_free_dummy_fs_info(fs_info); return 0; } @@ -871,6 +874,6 @@ out: btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); - test_msg("Free space cache tests finished\n"); + test_msg("free space cache tests finished"); return ret; } diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index e1f9666c4974..89346da890cf 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -32,7 +32,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans, info = search_free_space_info(trans, fs_info, cache, path, 0); if (IS_ERR(info)) { - test_msg("Could not find free space info\n"); + test_err("could not find free space info"); ret = PTR_ERR(info); goto out; } @@ -40,7 +40,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans, extent_count = btrfs_free_space_extent_count(path->nodes[0], info); if (extent_count != num_extents) { - test_msg("Extent count is wrong\n"); + test_err("extent count is wrong"); ret = -EINVAL; goto out; } @@ -99,7 +99,7 @@ out: btrfs_release_path(path); return ret; invalid: - test_msg("Free space tree is invalid\n"); + test_err("free space tree is invalid"); ret = -EINVAL; goto out; } @@ -117,7 +117,7 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans, info = search_free_space_info(trans, fs_info, cache, path, 0); if (IS_ERR(info)) { - test_msg("Could not find free space info\n"); + test_err("could not find free space info"); btrfs_release_path(path); return PTR_ERR(info); } @@ -131,15 +131,15 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans, /* Flip it to the other format and check that for good measure. */ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { - ret = convert_free_space_to_extents(trans, fs_info, cache, path); + ret = convert_free_space_to_extents(trans, cache, path); if (ret) { - test_msg("Could not convert to extents\n"); + test_err("could not convert to extents"); return ret; } } else { - ret = convert_free_space_to_bitmaps(trans, fs_info, cache, path); + ret = convert_free_space_to_bitmaps(trans, cache, path); if (ret) { - test_msg("Could not convert to bitmaps\n"); + test_err("could not convert to bitmaps"); return ret; } } @@ -170,11 +170,11 @@ static int test_remove_all(struct btrfs_trans_handle *trans, const struct free_space_extent extents[] = {}; int ret; - ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + ret = __remove_from_free_space_tree(trans, cache, path, cache->key.objectid, cache->key.offset); if (ret) { - test_msg("Could not remove free space\n"); + test_err("could not remove free space"); return ret; } @@ -194,10 +194,10 @@ static int test_remove_beginning(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + ret = __remove_from_free_space_tree(trans, cache, path, cache->key.objectid, alignment); if (ret) { - test_msg("Could not remove free space\n"); + test_err("could not remove free space"); return ret; } @@ -217,12 +217,12 @@ static int test_remove_end(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + ret = __remove_from_free_space_tree(trans, cache, path, cache->key.objectid + cache->key.offset - alignment, alignment); if (ret) { - test_msg("Could not remove free space\n"); + test_err("could not remove free space"); return ret; } @@ -243,11 +243,11 @@ static int test_remove_middle(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + ret = __remove_from_free_space_tree(trans, cache, path, cache->key.objectid + alignment, alignment); if (ret) { - test_msg("Could not remove free space\n"); + test_err("could not remove free space"); return ret; } @@ -266,26 +266,26 @@ static int test_merge_left(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + ret = __remove_from_free_space_tree(trans, cache, path, cache->key.objectid, cache->key.offset); if (ret) { - test_msg("Could not remove free space\n"); + test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid, alignment); + ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid, + alignment); if (ret) { - test_msg("Could not add free space\n"); + test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, fs_info, cache, path, + ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid + alignment, alignment); if (ret) { - test_msg("Could not add free space\n"); + test_err("could not add free space"); return ret; } @@ -304,27 +304,27 @@ static int test_merge_right(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + ret = __remove_from_free_space_tree(trans, cache, path, cache->key.objectid, cache->key.offset); if (ret) { - test_msg("Could not remove free space\n"); + test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, fs_info, cache, path, + ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid + 2 * alignment, alignment); if (ret) { - test_msg("Could not add free space\n"); + test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, fs_info, cache, path, + ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid + alignment, alignment); if (ret) { - test_msg("Could not add free space\n"); + test_err("could not add free space"); return ret; } @@ -343,34 +343,34 @@ static int test_merge_both(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + ret = __remove_from_free_space_tree(trans, cache, path, cache->key.objectid, cache->key.offset); if (ret) { - test_msg("Could not remove free space\n"); + test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid, alignment); + ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid, + alignment); if (ret) { - test_msg("Could not add free space\n"); + test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, fs_info, cache, path, + ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid + 2 * alignment, alignment); if (ret) { - test_msg("Could not add free space\n"); + test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, fs_info, cache, path, + ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid + alignment, alignment); if (ret) { - test_msg("Could not add free space\n"); + test_err("could not add free space"); return ret; } @@ -391,34 +391,34 @@ static int test_merge_none(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + ret = __remove_from_free_space_tree(trans, cache, path, cache->key.objectid, cache->key.offset); if (ret) { - test_msg("Could not remove free space\n"); + test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid, alignment); + ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid, + alignment); if (ret) { - test_msg("Could not add free space\n"); + test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, fs_info, cache, path, + ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid + 4 * alignment, alignment); if (ret) { - test_msg("Could not add free space\n"); + test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, fs_info, cache, path, + ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid + 2 * alignment, alignment); if (ret) { - test_msg("Could not add free space\n"); + test_err("could not add free space"); return ret; } @@ -444,14 +444,14 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + test_err("couldn't allocate dummy fs info"); ret = -ENOMEM; goto out; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_msg("Couldn't allocate dummy root\n"); + test_err("couldn't allocate dummy root"); ret = PTR_ERR(root); goto out; } @@ -463,7 +463,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, root->node = alloc_test_extent_buffer(root->fs_info, nodesize); if (!root->node) { - test_msg("Couldn't allocate dummy buffer\n"); + test_err("couldn't allocate dummy buffer"); ret = -ENOMEM; goto out; } @@ -473,7 +473,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, cache = btrfs_alloc_dummy_block_group(fs_info, 8 * alignment); if (!cache) { - test_msg("Couldn't allocate dummy block group cache\n"); + test_err("couldn't allocate dummy block group cache"); ret = -ENOMEM; goto out; } @@ -482,26 +482,25 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, cache->needs_free_space = 1; cache->fs_info = root->fs_info; - btrfs_init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans, root->fs_info); path = btrfs_alloc_path(); if (!path) { - test_msg("Couldn't allocate path\n"); + test_err("couldn't allocate path"); ret = -ENOMEM; goto out; } - ret = add_block_group_free_space(&trans, root->fs_info, cache); + ret = add_block_group_free_space(&trans, cache); if (ret) { - test_msg("Could not add block group free space\n"); + test_err("could not add block group free space"); goto out; } if (bitmaps) { - ret = convert_free_space_to_bitmaps(&trans, root->fs_info, - cache, path); + ret = convert_free_space_to_bitmaps(&trans, cache, path); if (ret) { - test_msg("Could not convert block group to bitmaps\n"); + test_err("could not convert block group to bitmaps"); goto out; } } @@ -510,14 +509,14 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, if (ret) goto out; - ret = remove_block_group_free_space(&trans, root->fs_info, cache); + ret = remove_block_group_free_space(&trans, cache); if (ret) { - test_msg("Could not remove block group free space\n"); + test_err("could not remove block group free space"); goto out; } if (btrfs_header_nritems(root->node) != 0) { - test_msg("Free space tree has leftover items\n"); + test_err("free space tree has leftover items"); ret = -EINVAL; goto out; } @@ -539,14 +538,16 @@ static int run_test_both_formats(test_func_t test_func, u32 sectorsize, ret = run_test(test_func, 0, sectorsize, nodesize, alignment); if (ret) { - test_msg("%pf failed with extents, sectorsize=%u, nodesize=%u, alignment=%u\n", + test_err( + "%pf failed with extents, sectorsize=%u, nodesize=%u, alignment=%u", test_func, sectorsize, nodesize, alignment); test_ret = ret; } ret = run_test(test_func, 1, sectorsize, nodesize, alignment); if (ret) { - test_msg("%pf failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u\n", + test_err( + "%pf failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u", test_func, sectorsize, nodesize, alignment); test_ret = ret; } @@ -577,7 +578,7 @@ int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize) */ bitmap_alignment = BTRFS_FREE_SPACE_BITMAP_BITS * PAGE_SIZE; - test_msg("Running free space tree tests\n"); + test_msg("running free space tree tests"); for (i = 0; i < ARRAY_SIZE(tests); i++) { int ret; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index e0ba799536b4..64043f028820 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -228,7 +228,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) inode = btrfs_new_test_inode(); if (!inode) { - test_msg("Couldn't allocate inode\n"); + test_err("couldn't allocate inode"); return ret; } @@ -238,19 +238,19 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + test_err("couldn't allocate dummy fs info"); goto out; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + test_err("couldn't allocate root"); goto out; } root->node = alloc_dummy_extent_buffer(fs_info, nodesize); if (!root->node) { - test_msg("Couldn't allocate dummy buffer\n"); + test_err("couldn't allocate dummy buffer"); goto out; } @@ -268,11 +268,11 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize, 0); if (IS_ERR(em)) { em = NULL; - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start != EXTENT_MAP_HOLE) { - test_msg("Expected a hole, got %llu\n", em->block_start); + test_err("expected a hole, got %llu", em->block_start); goto out; } free_extent_map(em); @@ -287,20 +287,21 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start != EXTENT_MAP_HOLE) { - test_msg("Expected a hole, got %llu\n", em->block_start); + test_err("expected a hole, got %llu", em->block_start); goto out; } if (em->start != 0 || em->len != 5) { - test_msg("Unexpected extent wanted start 0 len 5, got start " - "%llu len %llu\n", em->start, em->len); + test_err( + "unexpected extent wanted start 0 len 5, got start %llu len %llu", + em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } offset = em->start + em->len; @@ -308,21 +309,22 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start != EXTENT_MAP_INLINE) { - test_msg("Expected an inline, got %llu\n", em->block_start); + test_err("expected an inline, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != (sectorsize - 5)) { - test_msg("Unexpected extent wanted start %llu len 1, got start " - "%llu len %llu\n", offset, em->start, em->len); + test_err( + "unexpected extent wanted start %llu len 1, got start %llu len %llu", + offset, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } /* @@ -335,20 +337,21 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start != EXTENT_MAP_HOLE) { - test_msg("Expected a hole, got %llu\n", em->block_start); + test_err("expected a hole, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != 4) { - test_msg("Unexpected extent wanted start %llu len 4, got start " - "%llu len %llu\n", offset, em->start, em->len); + test_err( + "unexpected extent wanted start %llu len 4, got start %llu len %llu", + offset, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } offset = em->start + em->len; @@ -357,24 +360,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* Regular extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != sectorsize - 1) { - test_msg("Unexpected extent wanted start %llu len 4095, got " - "start %llu len %llu\n", offset, em->start, em->len); + test_err( + "unexpected extent wanted start %llu len 4095, got start %llu len %llu", + offset, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } if (em->orig_start != em->start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + test_err("wrong orig offset, want %llu, have %llu", em->start, em->orig_start); goto out; } @@ -384,25 +388,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* The next 3 are split extents */ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != sectorsize) { - test_msg("Unexpected extent start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent start %llu len %u, got start %llu len %llu", offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } if (em->orig_start != em->start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + test_err("wrong orig offset, want %llu, have %llu", em->start, em->orig_start); goto out; } @@ -413,21 +417,21 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start != EXTENT_MAP_HOLE) { - test_msg("Expected a hole, got %llu\n", em->block_start); + test_err("expected a hole, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } offset = em->start + em->len; @@ -435,31 +439,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != 2 * sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } if (em->orig_start != orig_start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", + test_err("wrong orig offset, want %llu, have %llu", orig_start, em->orig_start); goto out; } disk_bytenr += (em->start - orig_start); if (em->block_start != disk_bytenr) { - test_msg("Wrong block start, want %llu, have %llu\n", + test_err("wrong block start, want %llu, have %llu", disk_bytenr, em->block_start); goto out; } @@ -469,26 +473,26 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* Prealloc extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, sectorsize, em->start, em->len); goto out; } if (em->flags != prealloc_only) { - test_msg("Unexpected flags set, want %lu have %lu\n", + test_err("unexpected flags set, want %lu have %lu", prealloc_only, em->flags); goto out; } if (em->orig_start != em->start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + test_err("wrong orig offset, want %llu, have %llu", em->start, em->orig_start); goto out; } @@ -498,26 +502,26 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* The next 3 are a half written prealloc extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, sectorsize, em->start, em->len); goto out; } if (em->flags != prealloc_only) { - test_msg("Unexpected flags set, want %lu have %lu\n", + test_err("unexpected flags set, want %lu have %lu", prealloc_only, em->flags); goto out; } if (em->orig_start != em->start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + test_err("wrong orig offset, want %llu, have %llu", em->start, em->orig_start); goto out; } @@ -528,30 +532,30 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_HOLE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } if (em->orig_start != orig_start) { - test_msg("Unexpected orig offset, wanted %llu, have %llu\n", + test_err("unexpected orig offset, wanted %llu, have %llu", orig_start, em->orig_start); goto out; } if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) { - test_msg("Unexpected block start, wanted %llu, have %llu\n", + test_err("unexpected block start, wanted %llu, have %llu", disk_bytenr + (em->start - em->orig_start), em->block_start); goto out; @@ -561,31 +565,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != 2 * sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != prealloc_only) { - test_msg("Unexpected flags set, want %lu have %lu\n", + test_err("unexpected flags set, want %lu have %lu", prealloc_only, em->flags); goto out; } if (em->orig_start != orig_start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", orig_start, + test_err("wrong orig offset, want %llu, have %llu", orig_start, em->orig_start); goto out; } if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) { - test_msg("Unexpected block start, wanted %llu, have %llu\n", + test_err("unexpected block start, wanted %llu, have %llu", disk_bytenr + (em->start - em->orig_start), em->block_start); goto out; @@ -596,31 +600,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* Now for the compressed extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != 2 * sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u," - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != compressed_only) { - test_msg("Unexpected flags set, want %lu have %lu\n", + test_err("unexpected flags set, want %lu have %lu", compressed_only, em->flags); goto out; } if (em->orig_start != em->start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", + test_err("wrong orig offset, want %llu, have %llu", em->start, em->orig_start); goto out; } if (em->compress_type != BTRFS_COMPRESS_ZLIB) { - test_msg("Unexpected compress type, wanted %d, got %d\n", + test_err("unexpected compress type, wanted %d, got %d", BTRFS_COMPRESS_ZLIB, em->compress_type); goto out; } @@ -630,31 +634,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* Split compressed extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u," - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, sectorsize, em->start, em->len); goto out; } if (em->flags != compressed_only) { - test_msg("Unexpected flags set, want %lu have %lu\n", + test_err("unexpected flags set, want %lu have %lu", compressed_only, em->flags); goto out; } if (em->orig_start != em->start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", + test_err("wrong orig offset, want %llu, have %llu", em->start, em->orig_start); goto out; } if (em->compress_type != BTRFS_COMPRESS_ZLIB) { - test_msg("Unexpected compress type, wanted %d, got %d\n", + test_err("unexpected compress type, wanted %d, got %d", BTRFS_COMPRESS_ZLIB, em->compress_type); goto out; } @@ -665,25 +669,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } if (em->orig_start != em->start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + test_err("wrong orig offset, want %llu, have %llu", em->start, em->orig_start); goto out; } @@ -692,32 +696,32 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start != disk_bytenr) { - test_msg("Block start does not match, want %llu got %llu\n", + test_err("block start does not match, want %llu got %llu", disk_bytenr, em->block_start); goto out; } if (em->start != offset || em->len != 2 * sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != compressed_only) { - test_msg("Unexpected flags set, want %lu have %lu\n", + test_err("unexpected flags set, want %lu have %lu", compressed_only, em->flags); goto out; } if (em->orig_start != orig_start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", + test_err("wrong orig offset, want %llu, have %llu", em->start, orig_start); goto out; } if (em->compress_type != BTRFS_COMPRESS_ZLIB) { - test_msg("Unexpected compress type, wanted %d, got %d\n", + test_err("unexpected compress type, wanted %d, got %d", BTRFS_COMPRESS_ZLIB, em->compress_type); goto out; } @@ -728,25 +732,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } if (em->orig_start != em->start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + test_err("wrong orig offset, want %llu, have %llu", em->start, em->orig_start); goto out; } @@ -755,11 +759,11 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start != EXTENT_MAP_HOLE) { - test_msg("Expected a hole extent, got %llu\n", em->block_start); + test_err("expected a hole extent, got %llu", em->block_start); goto out; } /* @@ -768,18 +772,18 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) * test. */ if (em->start != offset || em->len != 3 * sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, 3 * sectorsize, em->start, em->len); goto out; } if (em->flags != vacancy_only) { - test_msg("Unexpected flags set, want %lu have %lu\n", + test_err("unexpected flags set, want %lu have %lu", vacancy_only, em->flags); goto out; } if (em->orig_start != em->start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + test_err("wrong orig offset, want %llu, have %llu", em->start, em->orig_start); goto out; } @@ -788,25 +792,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u," - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } if (em->orig_start != em->start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + test_err("wrong orig offset, want %llu, have %llu", em->start, em->orig_start); goto out; } @@ -830,7 +834,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) inode = btrfs_new_test_inode(); if (!inode) { - test_msg("Couldn't allocate inode\n"); + test_err("couldn't allocate inode"); return ret; } @@ -840,19 +844,19 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + test_err("couldn't allocate dummy fs info"); goto out; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + test_err("couldn't allocate root"); goto out; } root->node = alloc_dummy_extent_buffer(fs_info, nodesize); if (!root->node) { - test_msg("Couldn't allocate dummy buffer\n"); + test_err("couldn't allocate dummy buffer"); goto out; } @@ -871,21 +875,21 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1); em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start != EXTENT_MAP_HOLE) { - test_msg("Expected a hole, got %llu\n", em->block_start); + test_err("expected a hole, got %llu", em->block_start); goto out; } if (em->start != 0 || em->len != sectorsize) { - test_msg("Unexpected extent wanted start 0 len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start 0 len %u, got start %llu len %llu", sectorsize, em->start, em->len); goto out; } if (em->flags != vacancy_only) { - test_msg("Wrong flags, wanted %lu, have %lu\n", vacancy_only, + test_err("wrong flags, wanted %lu, have %lu", vacancy_only, em->flags); goto out; } @@ -894,21 +898,21 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start != sectorsize) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != sectorsize || em->len != sectorsize) { - test_msg("Unexpected extent wanted start %u len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %u len %u, got start %llu len %llu", sectorsize, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, wanted 0 got %lu\n", + test_err("unexpected flags set, wanted 0 got %lu", em->flags); goto out; } @@ -931,19 +935,19 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) inode = btrfs_new_test_inode(); if (!inode) { - test_msg("Couldn't allocate inode\n"); + test_err("couldn't allocate inode"); return ret; } fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + test_err("couldn't allocate dummy fs info"); goto out; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + test_err("couldn't allocate root"); goto out; } @@ -954,12 +958,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0, NULL, 0); if (ret) { - test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; } if (BTRFS_I(inode)->outstanding_extents != 1) { ret = -EINVAL; - test_msg("Miscount, wanted 1, got %u\n", + test_err("miscount, wanted 1, got %u", BTRFS_I(inode)->outstanding_extents); goto out; } @@ -969,12 +973,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, 0, NULL, 0); if (ret) { - test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; } if (BTRFS_I(inode)->outstanding_extents != 2) { ret = -EINVAL; - test_msg("Miscount, wanted 2, got %u\n", + test_err("miscount, wanted 2, got %u", BTRFS_I(inode)->outstanding_extents); goto out; } @@ -986,12 +990,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 0, 0, NULL); if (ret) { - test_msg("clear_extent_bit returned %d\n", ret); + test_err("clear_extent_bit returned %d", ret); goto out; } if (BTRFS_I(inode)->outstanding_extents != 2) { ret = -EINVAL; - test_msg("Miscount, wanted 2, got %u\n", + test_err("miscount, wanted 2, got %u", BTRFS_I(inode)->outstanding_extents); goto out; } @@ -1002,12 +1006,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) + sectorsize - 1, 0, NULL, 0); if (ret) { - test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; } if (BTRFS_I(inode)->outstanding_extents != 2) { ret = -EINVAL; - test_msg("Miscount, wanted 2, got %u\n", + test_err("miscount, wanted 2, got %u", BTRFS_I(inode)->outstanding_extents); goto out; } @@ -1020,12 +1024,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, 0, NULL, 0); if (ret) { - test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; } if (BTRFS_I(inode)->outstanding_extents != 4) { ret = -EINVAL; - test_msg("Miscount, wanted 4, got %u\n", + test_err("miscount, wanted 4, got %u", BTRFS_I(inode)->outstanding_extents); goto out; } @@ -1037,12 +1041,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0); if (ret) { - test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; } if (BTRFS_I(inode)->outstanding_extents != 3) { ret = -EINVAL; - test_msg("Miscount, wanted 3, got %u\n", + test_err("miscount, wanted 3, got %u", BTRFS_I(inode)->outstanding_extents); goto out; } @@ -1054,12 +1058,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); if (ret) { - test_msg("clear_extent_bit returned %d\n", ret); + test_err("clear_extent_bit returned %d", ret); goto out; } if (BTRFS_I(inode)->outstanding_extents != 4) { ret = -EINVAL; - test_msg("Miscount, wanted 4, got %u\n", + test_err("miscount, wanted 4, got %u", BTRFS_I(inode)->outstanding_extents); goto out; } @@ -1072,12 +1076,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0); if (ret) { - test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; } if (BTRFS_I(inode)->outstanding_extents != 3) { ret = -EINVAL; - test_msg("Miscount, wanted 3, got %u\n", + test_err("miscount, wanted 3, got %u", BTRFS_I(inode)->outstanding_extents); goto out; } @@ -1087,12 +1091,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); if (ret) { - test_msg("clear_extent_bit returned %d\n", ret); + test_err("clear_extent_bit returned %d", ret); goto out; } if (BTRFS_I(inode)->outstanding_extents) { ret = -EINVAL; - test_msg("Miscount, wanted 0, got %u\n", + test_err("miscount, wanted 0, got %u", BTRFS_I(inode)->outstanding_extents); goto out; } @@ -1115,14 +1119,14 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize) set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only); set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only); - test_msg("Running btrfs_get_extent tests\n"); + test_msg("running btrfs_get_extent tests"); ret = test_btrfs_get_extent(sectorsize, nodesize); if (ret) return ret; - test_msg("Running hole first btrfs_get_extent test\n"); + test_msg("running hole first btrfs_get_extent test"); ret = test_hole_first(sectorsize, nodesize); if (ret) return ret; - test_msg("Running outstanding_extents tests\n"); + test_msg("running outstanding_extents tests"); return test_extent_accounting(sectorsize, nodesize); } diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 39b95783f736..ace94db09d29 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -24,7 +24,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info); int ret; - btrfs_init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans, NULL); ins.objectid = bytenr; ins.type = BTRFS_EXTENT_ITEM_KEY; @@ -32,14 +32,14 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, path = btrfs_alloc_path(); if (!path) { - test_msg("Couldn't allocate path\n"); + test_err("couldn't allocate path"); return -ENOMEM; } path->leave_spinning = 1; ret = btrfs_insert_empty_item(&trans, root, path, &ins, size); if (ret) { - test_msg("Couldn't insert ref %d\n", ret); + test_err("couldn't insert ref %d", ret); btrfs_free_path(path); return ret; } @@ -74,7 +74,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 refs; int ret; - btrfs_init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans, NULL); key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -82,14 +82,14 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, path = btrfs_alloc_path(); if (!path) { - test_msg("Couldn't allocate path\n"); + test_err("couldn't allocate path"); return -ENOMEM; } path->leave_spinning = 1; ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); if (ret) { - test_msg("Couldn't find extent ref\n"); + test_err("couldn't find extent ref"); btrfs_free_path(path); return ret; } @@ -111,7 +111,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, ret = btrfs_insert_empty_item(&trans, root, path, &key, 0); if (ret) - test_msg("Failed to insert backref\n"); + test_err("failed to insert backref"); btrfs_free_path(path); return ret; } @@ -124,7 +124,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr, struct btrfs_path *path; int ret; - btrfs_init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans, NULL); key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -132,14 +132,14 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr, path = btrfs_alloc_path(); if (!path) { - test_msg("Couldn't allocate path\n"); + test_err("couldn't allocate path"); return -ENOMEM; } path->leave_spinning = 1; ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); if (ret) { - test_msg("Didn't find our key %d\n", ret); + test_err("didn't find our key %d", ret); btrfs_free_path(path); return ret; } @@ -158,7 +158,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, u64 refs; int ret; - btrfs_init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans, NULL); key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -166,14 +166,14 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, path = btrfs_alloc_path(); if (!path) { - test_msg("Couldn't allocate path\n"); + test_err("couldn't allocate path"); return -ENOMEM; } path->leave_spinning = 1; ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); if (ret) { - test_msg("Couldn't find extent ref\n"); + test_err("couldn't find extent ref"); btrfs_free_path(path); return ret; } @@ -195,7 +195,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); if (ret) { - test_msg("Couldn't find backref %d\n", ret); + test_err("couldn't find backref %d", ret); btrfs_free_path(path); return ret; } @@ -213,12 +213,12 @@ static int test_no_shared_qgroup(struct btrfs_root *root, struct ulist *new_roots = NULL; int ret; - btrfs_init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans, fs_info); - test_msg("Qgroup basic add\n"); + test_msg("qgroup basic add"); ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FS_TREE_OBJECTID); if (ret) { - test_msg("Couldn't create a qgroup %d\n", ret); + test_err("couldn't create a qgroup %d", ret); return ret; } @@ -231,7 +231,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root, false); if (ret) { ulist_free(old_roots); - test_msg("Couldn't find old roots: %d\n", ret); + test_err("couldn't find old roots: %d", ret); return ret; } @@ -245,20 +245,20 @@ static int test_no_shared_qgroup(struct btrfs_root *root, if (ret) { ulist_free(old_roots); ulist_free(new_roots); - test_msg("Couldn't find old roots: %d\n", ret); + test_err("couldn't find old roots: %d", ret); return ret; } ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, nodesize, old_roots, new_roots); if (ret) { - test_msg("Couldn't account space for a qgroup %d\n", ret); + test_err("couldn't account space for a qgroup %d", ret); return ret; } if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, nodesize, nodesize)) { - test_msg("Qgroup counts didn't match expected values\n"); + test_err("qgroup counts didn't match expected values"); return -EINVAL; } old_roots = NULL; @@ -268,7 +268,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root, false); if (ret) { ulist_free(old_roots); - test_msg("Couldn't find old roots: %d\n", ret); + test_err("couldn't find old roots: %d", ret); return ret; } @@ -281,19 +281,19 @@ static int test_no_shared_qgroup(struct btrfs_root *root, if (ret) { ulist_free(old_roots); ulist_free(new_roots); - test_msg("Couldn't find old roots: %d\n", ret); + test_err("couldn't find old roots: %d", ret); return ret; } ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, nodesize, old_roots, new_roots); if (ret) { - test_msg("Couldn't account space for a qgroup %d\n", ret); + test_err("couldn't account space for a qgroup %d", ret); return -EINVAL; } if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, 0, 0)) { - test_msg("Qgroup counts didn't match expected values\n"); + test_err("qgroup counts didn't match expected values"); return -EINVAL; } @@ -314,9 +314,9 @@ static int test_multiple_refs(struct btrfs_root *root, struct ulist *new_roots = NULL; int ret; - btrfs_init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans, fs_info); - test_msg("Qgroup multiple refs test\n"); + test_msg("qgroup multiple refs test"); /* * We have BTRFS_FS_TREE_OBJECTID created already from the @@ -324,7 +324,7 @@ static int test_multiple_refs(struct btrfs_root *root, */ ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FIRST_FREE_OBJECTID); if (ret) { - test_msg("Couldn't create a qgroup %d\n", ret); + test_err("couldn't create a qgroup %d", ret); return ret; } @@ -332,7 +332,7 @@ static int test_multiple_refs(struct btrfs_root *root, false); if (ret) { ulist_free(old_roots); - test_msg("Couldn't find old roots: %d\n", ret); + test_err("couldn't find old roots: %d", ret); return ret; } @@ -346,20 +346,20 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) { ulist_free(old_roots); ulist_free(new_roots); - test_msg("Couldn't find old roots: %d\n", ret); + test_err("couldn't find old roots: %d", ret); return ret; } ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, nodesize, old_roots, new_roots); if (ret) { - test_msg("Couldn't account space for a qgroup %d\n", ret); + test_err("couldn't account space for a qgroup %d", ret); return ret; } if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, nodesize, nodesize)) { - test_msg("Qgroup counts didn't match expected values\n"); + test_err("qgroup counts didn't match expected values"); return -EINVAL; } @@ -367,7 +367,7 @@ static int test_multiple_refs(struct btrfs_root *root, false); if (ret) { ulist_free(old_roots); - test_msg("Couldn't find old roots: %d\n", ret); + test_err("couldn't find old roots: %d", ret); return ret; } @@ -381,26 +381,26 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) { ulist_free(old_roots); ulist_free(new_roots); - test_msg("Couldn't find old roots: %d\n", ret); + test_err("couldn't find old roots: %d", ret); return ret; } ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, nodesize, old_roots, new_roots); if (ret) { - test_msg("Couldn't account space for a qgroup %d\n", ret); + test_err("couldn't account space for a qgroup %d", ret); return ret; } if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, nodesize, 0)) { - test_msg("Qgroup counts didn't match expected values\n"); + test_err("qgroup counts didn't match expected values"); return -EINVAL; } if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID, nodesize, 0)) { - test_msg("Qgroup counts didn't match expected values\n"); + test_err("qgroup counts didn't match expected values"); return -EINVAL; } @@ -408,7 +408,7 @@ static int test_multiple_refs(struct btrfs_root *root, false); if (ret) { ulist_free(old_roots); - test_msg("Couldn't find old roots: %d\n", ret); + test_err("couldn't find old roots: %d", ret); return ret; } @@ -422,26 +422,26 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) { ulist_free(old_roots); ulist_free(new_roots); - test_msg("Couldn't find old roots: %d\n", ret); + test_err("couldn't find old roots: %d", ret); return ret; } ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, nodesize, old_roots, new_roots); if (ret) { - test_msg("Couldn't account space for a qgroup %d\n", ret); + test_err("couldn't account space for a qgroup %d", ret); return ret; } if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID, 0, 0)) { - test_msg("Qgroup counts didn't match expected values\n"); + test_err("qgroup counts didn't match expected values"); return -EINVAL; } if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, nodesize, nodesize)) { - test_msg("Qgroup counts didn't match expected values\n"); + test_err("qgroup counts didn't match expected values"); return -EINVAL; } @@ -457,13 +457,13 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + test_err("couldn't allocate dummy fs info"); return -ENOMEM; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + test_err("couldn't allocate root"); ret = PTR_ERR(root); goto out; } @@ -485,7 +485,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) */ root->node = alloc_test_extent_buffer(root->fs_info, nodesize); if (!root->node) { - test_msg("Couldn't allocate dummy buffer\n"); + test_err("couldn't allocate dummy buffer"); ret = -ENOMEM; goto out; } @@ -495,7 +495,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) tmp_root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(tmp_root)) { - test_msg("Couldn't allocate a fs root\n"); + test_err("couldn't allocate a fs root"); ret = PTR_ERR(tmp_root); goto out; } @@ -504,13 +504,13 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) root->fs_info->fs_root = tmp_root; ret = btrfs_insert_fs_root(root->fs_info, tmp_root); if (ret) { - test_msg("Couldn't insert fs root %d\n", ret); + test_err("couldn't insert fs root %d", ret); goto out; } tmp_root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(tmp_root)) { - test_msg("Couldn't allocate a fs root\n"); + test_err("couldn't allocate a fs root"); ret = PTR_ERR(tmp_root); goto out; } @@ -518,11 +518,11 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) tmp_root->root_key.objectid = BTRFS_FIRST_FREE_OBJECTID; ret = btrfs_insert_fs_root(root->fs_info, tmp_root); if (ret) { - test_msg("Couldn't insert fs root %d\n", ret); + test_err("couldn't insert fs root %d", ret); goto out; } - test_msg("Running qgroup tests\n"); + test_msg("running qgroup tests"); ret = test_no_shared_qgroup(root, sectorsize, nodesize); if (ret) goto out; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c944b4769e3c..4485eae41e88 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -877,12 +877,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, atomic_dec(&cur_trans->num_writers); extwriter_counter_dec(cur_trans, trans->type); - /* - * Make sure counter is updated before we wake up waiters. - */ - smp_mb(); - if (waitqueue_active(&cur_trans->writer_wait)) - wake_up(&cur_trans->writer_wait); + cond_wake_up(&cur_trans->writer_wait); btrfs_put_transaction(cur_trans); if (current->journal_info == trans) @@ -1250,7 +1245,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) btrfs_free_log(trans, root); btrfs_update_reloc_root(trans, root); - btrfs_orphan_commit_root(trans, root); btrfs_save_ino_cache(root, trans); @@ -1640,15 +1634,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); goto fail; } - ret = btrfs_uuid_tree_add(trans, fs_info, new_uuid.b, - BTRFS_UUID_KEY_SUBVOL, objectid); + ret = btrfs_uuid_tree_add(trans, new_uuid.b, BTRFS_UUID_KEY_SUBVOL, + objectid); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) { - ret = btrfs_uuid_tree_add(trans, fs_info, - new_root_item->received_uuid, + ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, objectid); if (ret && ret != -EEXIST) { diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index d8c0826bc2c7..94439482a0ec 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -139,7 +139,6 @@ struct btrfs_pending_snapshot { struct btrfs_path *path; /* block reservation for the operation */ struct btrfs_block_rsv block_rsv; - u64 qgroup_reserved; /* extra metadata reservation for relocation */ int error; bool readonly; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 43758e30aa7a..f8220ec02036 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -222,11 +222,8 @@ int btrfs_pin_log_trans(struct btrfs_root *root) void btrfs_end_log_trans(struct btrfs_root *root) { if (atomic_dec_and_test(&root->log_writers)) { - /* - * Implicit memory barrier after atomic_dec_and_test - */ - if (waitqueue_active(&root->log_writer_wait)) - wake_up(&root->log_writer_wait); + /* atomic_dec_and_test implies a barrier */ + cond_wake_up_nomb(&root->log_writer_wait); } } @@ -2988,11 +2985,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_lock(&log_root_tree->log_mutex); if (atomic_dec_and_test(&log_root_tree->log_writers)) { - /* - * Implicit memory barrier after atomic_dec_and_test - */ - if (waitqueue_active(&log_root_tree->log_writer_wait)) - wake_up(&log_root_tree->log_writer_wait); + /* atomic_dec_and_test implies a barrier */ + cond_wake_up_nomb(&log_root_tree->log_writer_wait); } if (ret) { @@ -3116,10 +3110,11 @@ out_wake_log_root: mutex_unlock(&log_root_tree->log_mutex); /* - * The barrier before waitqueue_active is implied by mutex_unlock + * The barrier before waitqueue_active (in cond_wake_up) is needed so + * all the updates above are seen by the woken threads. It might not be + * necessary, but proving that seems to be hard. */ - if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) - wake_up(&log_root_tree->log_commit_wait[index2]); + cond_wake_up(&log_root_tree->log_commit_wait[index2]); out: mutex_lock(&root->log_mutex); btrfs_remove_all_log_ctxs(root, index1, ret); @@ -3128,10 +3123,11 @@ out: mutex_unlock(&root->log_mutex); /* - * The barrier before waitqueue_active is implied by mutex_unlock + * The barrier before waitqueue_active (in cond_wake_up) is needed so + * all the updates above are seen by the woken threads. It might not be + * necessary, but proving that seems to be hard. */ - if (waitqueue_active(&root->log_commit_wait[index1])) - wake_up(&root->log_commit_wait[index1]); + cond_wake_up(&root->log_commit_wait[index1]); return ret; } @@ -4320,6 +4316,110 @@ static int log_one_extent(struct btrfs_trans_handle *trans, return ret; } +/* + * Log all prealloc extents beyond the inode's i_size to make sure we do not + * lose them after doing a fast fsync and replaying the log. We scan the + * subvolume's root instead of iterating the inode's extent map tree because + * otherwise we can log incorrect extent items based on extent map conversion. + * That can happen due to the fact that extent maps are merged when they + * are not in the extent map tree's list of modified extents. + */ +static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path) +{ + struct btrfs_root *root = inode->root; + struct btrfs_key key; + const u64 i_size = i_size_read(&inode->vfs_inode); + const u64 ino = btrfs_ino(inode); + struct btrfs_path *dst_path = NULL; + u64 last_extent = (u64)-1; + int ins_nr = 0; + int start_slot; + int ret; + + if (!(inode->flags & BTRFS_INODE_PREALLOC)) + return 0; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = i_size; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(leaf)) { + if (ins_nr > 0) { + ret = copy_items(trans, inode, dst_path, path, + &last_extent, start_slot, + ins_nr, 1, 0); + if (ret < 0) + goto out; + ins_nr = 0; + } + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + break; + } + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid > ino) + break; + if (WARN_ON_ONCE(key.objectid < ino) || + key.type < BTRFS_EXTENT_DATA_KEY || + key.offset < i_size) { + path->slots[0]++; + continue; + } + if (last_extent == (u64)-1) { + last_extent = key.offset; + /* + * Avoid logging extent items logged in past fsync calls + * and leading to duplicate keys in the log tree. + */ + do { + ret = btrfs_truncate_inode_items(trans, + root->log_root, + &inode->vfs_inode, + i_size, + BTRFS_EXTENT_DATA_KEY); + } while (ret == -EAGAIN); + if (ret) + goto out; + } + if (ins_nr == 0) + start_slot = slot; + ins_nr++; + path->slots[0]++; + if (!dst_path) { + dst_path = btrfs_alloc_path(); + if (!dst_path) { + ret = -ENOMEM; + goto out; + } + } + } + if (ins_nr > 0) { + ret = copy_items(trans, inode, dst_path, path, &last_extent, + start_slot, ins_nr, 1, 0); + if (ret > 0) + ret = 0; + } +out: + btrfs_release_path(path); + btrfs_free_path(dst_path); + return ret; +} + static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, @@ -4362,6 +4462,11 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, if (em->generation <= test_gen) continue; + /* We log prealloc extents beyond eof later. */ + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && + em->start >= i_size_read(&inode->vfs_inode)) + continue; + if (em->start < logged_start) logged_start = em->start; if ((em->start + em->len - 1) > logged_end) @@ -4374,31 +4479,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, num++; } - /* - * Add all prealloc extents beyond the inode's i_size to make sure we - * don't lose them after doing a fast fsync and replaying the log. - */ - if (inode->flags & BTRFS_INODE_PREALLOC) { - struct rb_node *node; - - for (node = rb_last(&tree->map); node; node = rb_prev(node)) { - em = rb_entry(node, struct extent_map, rb_node); - if (em->start < i_size_read(&inode->vfs_inode)) - break; - if (!list_empty(&em->list)) - continue; - /* Same as above loop. */ - if (++num > 32768) { - list_del_init(&tree->modified_extents); - ret = -EFBIG; - goto process; - } - refcount_inc(&em->refs); - set_bit(EXTENT_FLAG_LOGGING, &em->flags); - list_add_tail(&em->list, &extents); - } - } - list_sort(NULL, &extents, extent_cmp); btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end); /* @@ -4443,6 +4523,9 @@ process: up_write(&inode->dio_sem); btrfs_release_path(path); + if (!ret) + ret = btrfs_log_prealloc_extents(trans, inode, path); + return ret; } @@ -4827,6 +4910,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct extent_map_tree *em_tree = &inode->extent_tree; u64 logged_isize = 0; bool need_log_inode_item = true; + bool xattrs_logged = false; path = btrfs_alloc_path(); if (!path) @@ -5128,6 +5212,7 @@ next_key: err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); if (err) goto out_unlock; + xattrs_logged = true; if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); @@ -5140,6 +5225,11 @@ log_extents: btrfs_release_path(dst_path); if (need_log_inode_item) { err = log_inode_item(trans, log, dst_path, inode); + if (!err && !xattrs_logged) { + err = btrfs_log_all_xattrs(trans, root, inode, path, + dst_path); + btrfs_release_path(path); + } if (err) goto out_unlock; } diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 1ba7ca2a4200..3b2ae342e649 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -79,10 +79,10 @@ out: return ret; } -int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u8 *uuid, u8 type, +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid_cpu) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *uuid_root = fs_info->uuid_root; int ret; struct btrfs_path *path = NULL; @@ -144,10 +144,10 @@ out: return ret; } -int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u8 *uuid, u8 type, +int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *uuid_root = fs_info->uuid_root; int ret; struct btrfs_path *path = NULL; @@ -239,7 +239,7 @@ static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type, goto out; } - ret = btrfs_uuid_tree_rem(trans, uuid_root->fs_info, uuid, type, subid); + ret = btrfs_uuid_tree_remove(trans, uuid, type, subid); btrfs_end_transaction(trans); out: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 292266f6ab9c..e034ad9e23b4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -40,6 +40,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, + .raid_name = "raid10", + .bg_flag = BTRFS_BLOCK_GROUP_RAID10, + .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, }, [BTRFS_RAID_RAID1] = { .sub_stripes = 1, @@ -49,6 +52,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, + .raid_name = "raid1", + .bg_flag = BTRFS_BLOCK_GROUP_RAID1, + .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, }, [BTRFS_RAID_DUP] = { .sub_stripes = 1, @@ -58,6 +64,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 0, .devs_increment = 1, .ncopies = 2, + .raid_name = "dup", + .bg_flag = BTRFS_BLOCK_GROUP_DUP, + .mindev_error = 0, }, [BTRFS_RAID_RAID0] = { .sub_stripes = 1, @@ -67,6 +76,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, + .raid_name = "raid0", + .bg_flag = BTRFS_BLOCK_GROUP_RAID0, + .mindev_error = 0, }, [BTRFS_RAID_SINGLE] = { .sub_stripes = 1, @@ -76,6 +88,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, + .raid_name = "single", + .bg_flag = 0, + .mindev_error = 0, }, [BTRFS_RAID_RAID5] = { .sub_stripes = 1, @@ -85,6 +100,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 1, .devs_increment = 1, .ncopies = 2, + .raid_name = "raid5", + .bg_flag = BTRFS_BLOCK_GROUP_RAID5, + .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, }, [BTRFS_RAID_RAID6] = { .sub_stripes = 1, @@ -94,33 +112,19 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 2, .devs_increment = 1, .ncopies = 3, + .raid_name = "raid6", + .bg_flag = BTRFS_BLOCK_GROUP_RAID6, + .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, }, }; -const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { - [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10, - [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1, - [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP, - [BTRFS_RAID_RAID0] = BTRFS_BLOCK_GROUP_RAID0, - [BTRFS_RAID_SINGLE] = 0, - [BTRFS_RAID_RAID5] = BTRFS_BLOCK_GROUP_RAID5, - [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6, -}; +const char *get_raid_name(enum btrfs_raid_types type) +{ + if (type >= BTRFS_NR_RAID_TYPES) + return NULL; -/* - * Table to convert BTRFS_RAID_* to the error code if minimum number of devices - * condition is not met. Zero means there's no corresponding - * BTRFS_ERROR_DEV_*_NOT_MET value. - */ -const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = { - [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, - [BTRFS_RAID_RAID1] = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, - [BTRFS_RAID_DUP] = 0, - [BTRFS_RAID_RAID0] = 0, - [BTRFS_RAID_SINGLE] = 0, - [BTRFS_RAID_RAID5] = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, - [BTRFS_RAID_RAID6] = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, -}; + return btrfs_raid_array[type].raid_name; +} static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); @@ -167,12 +171,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * may be used to exclude some operations from running concurrently without any * modifications to the list (see write_all_supers) * - * volume_mutex - * ------------ - * coarse lock owned by a mounted filesystem; used to exclude some operations - * that cannot run in parallel and affect the higher-level properties of the - * filesystem like: device add/deleting/resize/replace, or balance - * * balance_mutex * ------------- * protects balance structures (status, state) and context accessed from @@ -197,6 +195,41 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * device_list_mutex * chunk_mutex * balance_mutex + * + * + * Exclusive operations, BTRFS_FS_EXCL_OP + * ====================================== + * + * Maintains the exclusivity of the following operations that apply to the + * whole filesystem and cannot run in parallel. + * + * - Balance (*) + * - Device add + * - Device remove + * - Device replace (*) + * - Resize + * + * The device operations (as above) can be in one of the following states: + * + * - Running state + * - Paused state + * - Completed state + * + * Only device operations marked with (*) can go into the Paused state for the + * following reasons: + * + * - ioctl (only Balance can be Paused through ioctl) + * - filesystem remounted as read-only + * - filesystem unmounted and mounted as read-only + * - system power-cycle and filesystem mounted as read-only + * - filesystem or device errors leading to forced read-only + * + * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations. + * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set. + * A device operation in Paused or Running state can be canceled or resumed + * either by ioctl (Balance only) or when remounted as read-write. + * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or + * completed. */ DEFINE_MUTEX(uuid_mutex); @@ -227,14 +260,14 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) INIT_LIST_HEAD(&fs_devs->devices); INIT_LIST_HEAD(&fs_devs->resized_devices); INIT_LIST_HEAD(&fs_devs->alloc_list); - INIT_LIST_HEAD(&fs_devs->list); + INIT_LIST_HEAD(&fs_devs->fs_list); if (fsid) memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); return fs_devs; } -static void free_device(struct btrfs_device *device) +void btrfs_free_device(struct btrfs_device *device) { rcu_string_free(device->name); bio_put(device->flush_bio); @@ -249,7 +282,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) device = list_entry(fs_devices->devices.next, struct btrfs_device, dev_list); list_del(&device->dev_list); - free_device(device); + btrfs_free_device(device); } kfree(fs_devices); } @@ -273,8 +306,8 @@ void __exit btrfs_cleanup_fs_uuids(void) while (!list_empty(&fs_uuids)) { fs_devices = list_entry(fs_uuids.next, - struct btrfs_fs_devices, list); - list_del(&fs_devices->list); + struct btrfs_fs_devices, fs_list); + list_del(&fs_devices->fs_list); free_fs_devices(fs_devices); } } @@ -282,7 +315,7 @@ void __exit btrfs_cleanup_fs_uuids(void) /* * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error. * Returned struct is not linked onto any lists and must be destroyed using - * free_device. + * btrfs_free_device. */ static struct btrfs_device *__alloc_device(void) { @@ -327,10 +360,9 @@ static struct btrfs_device *__alloc_device(void) static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices, u64 devid, const u8 *uuid) { - struct list_head *head = &fs_devices->devices; struct btrfs_device *dev; - list_for_each_entry(dev, head, dev_list) { + list_for_each_entry(dev, &fs_devices->devices, dev_list) { if (dev->devid == devid && (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { return dev; @@ -343,7 +375,7 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) { struct btrfs_fs_devices *fs_devices; - list_for_each_entry(fs_devices, &fs_uuids, list) { + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) return fs_devices; } @@ -607,7 +639,7 @@ static void btrfs_free_stale_devices(const char *path, struct btrfs_fs_devices *fs_devs, *tmp_fs_devs; struct btrfs_device *dev, *tmp_dev; - list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) { + list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, fs_list) { if (fs_devs->opened) continue; @@ -632,13 +664,13 @@ static void btrfs_free_stale_devices(const char *path, /* delete the stale device */ if (fs_devs->num_devices == 1) { btrfs_sysfs_remove_fsid(fs_devs); - list_del(&fs_devs->list); + list_del(&fs_devs->fs_list); free_fs_devices(fs_devs); break; } else { fs_devs->num_devices--; list_del(&dev->dev_list); - free_device(dev); + btrfs_free_device(dev); } } } @@ -732,7 +764,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (IS_ERR(fs_devices)) return ERR_CAST(fs_devices); - list_add(&fs_devices->list, &fs_uuids); + list_add(&fs_devices->fs_list, &fs_uuids); device = NULL; } else { @@ -753,7 +785,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, name = rcu_string_strdup(path, GFP_NOFS); if (!name) { - free_device(device); + btrfs_free_device(device); return ERR_PTR(-ENOMEM); } rcu_assign_pointer(device->name, name); @@ -866,7 +898,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) name = rcu_string_strdup(orig_dev->name->str, GFP_KERNEL); if (!name) { - free_device(device); + btrfs_free_device(device); goto error; } rcu_assign_pointer(device->name, name); @@ -938,7 +970,7 @@ again: } list_del_init(&device->dev_list); fs_devices->num_devices--; - free_device(device); + btrfs_free_device(device); } if (fs_devices->seed) { @@ -956,7 +988,7 @@ static void free_device_rcu(struct rcu_head *head) struct btrfs_device *device; device = container_of(head, struct btrfs_device, rcu); - free_device(device); + btrfs_free_device(device); } static void btrfs_close_bdev(struct btrfs_device *device) @@ -1005,7 +1037,7 @@ static void btrfs_prepare_close_one_device(struct btrfs_device *device) new_device->fs_devices = device->fs_devices; } -static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +static int close_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *tmp; struct list_head pending_put; @@ -1050,7 +1082,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) int ret; mutex_lock(&uuid_mutex); - ret = __btrfs_close_devices(fs_devices); + ret = close_fs_devices(fs_devices); if (!fs_devices->opened) { seed_devices = fs_devices->seed; fs_devices->seed = NULL; @@ -1060,23 +1092,22 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) while (seed_devices) { fs_devices = seed_devices; seed_devices = fs_devices->seed; - __btrfs_close_devices(fs_devices); + close_fs_devices(fs_devices); free_fs_devices(fs_devices); } return ret; } -static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, +static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder) { - struct list_head *head = &fs_devices->devices; struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; int ret = 0; flags |= FMODE_EXCL; - list_for_each_entry(device, head, dev_list) { + list_for_each_entry(device, &fs_devices->devices, dev_list) { /* Just open everything we can; ignore failures here */ if (btrfs_open_one_device(fs_devices, device, flags, holder)) continue; @@ -1115,15 +1146,16 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, { int ret; - mutex_lock(&uuid_mutex); + mutex_lock(&fs_devices->device_list_mutex); if (fs_devices->opened) { fs_devices->opened++; ret = 0; } else { list_sort(NULL, &fs_devices->devices, devid_cmp); - ret = __btrfs_open_devices(fs_devices, flags, holder); + ret = open_fs_devices(fs_devices, flags, holder); } - mutex_unlock(&uuid_mutex); + mutex_unlock(&fs_devices->device_list_mutex); + return ret; } @@ -1201,31 +1233,29 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, */ bytenr = btrfs_sb_offset(0); flags |= FMODE_EXCL; - mutex_lock(&uuid_mutex); bdev = blkdev_get_by_path(path, flags, holder); - if (IS_ERR(bdev)) { - ret = PTR_ERR(bdev); - goto error; - } + if (IS_ERR(bdev)) + return PTR_ERR(bdev); if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) { ret = -EINVAL; goto error_bdev_put; } + mutex_lock(&uuid_mutex); device = device_list_add(path, disk_super); if (IS_ERR(device)) ret = PTR_ERR(device); else *fs_devices_ret = device->fs_devices; + mutex_unlock(&uuid_mutex); btrfs_release_disk_super(page); error_bdev_put: blkdev_put(bdev, flags); -error: - mutex_unlock(&uuid_mutex); + return ret; } @@ -1857,11 +1887,11 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, } while (read_seqretry(&fs_info->profiles_lock, seq)); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { - if (!(all_avail & btrfs_raid_group[i])) + if (!(all_avail & btrfs_raid_array[i].bg_flag)) continue; if (num_devices < btrfs_raid_array[i].devs_min) { - int ret = btrfs_raid_mindev_error[i]; + int ret = btrfs_raid_array[i].mindev_error; if (ret) return ret; @@ -1917,13 +1947,13 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u64 num_devices; int ret = 0; - mutex_lock(&fs_info->volume_mutex); mutex_lock(&uuid_mutex); - num_devices = fs_info->fs_devices->num_devices; + num_devices = fs_devices->num_devices; btrfs_dev_replace_read_lock(&fs_info->dev_replace); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { WARN_ON(num_devices < 1); @@ -1986,27 +2016,32 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, * (super_copy) should hold the device list mutex. */ + /* + * In normal cases the cur_devices == fs_devices. But in case + * of deleting a seed device, the cur_devices should point to + * its own fs_devices listed under the fs_devices->seed. + */ cur_devices = device->fs_devices; - mutex_lock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_devices->device_list_mutex); list_del_rcu(&device->dev_list); - device->fs_devices->num_devices--; - device->fs_devices->total_devices--; + cur_devices->num_devices--; + cur_devices->total_devices--; if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) - device->fs_devices->missing_devices--; + cur_devices->missing_devices--; btrfs_assign_next_active_device(fs_info, device, NULL); if (device->bdev) { - device->fs_devices->open_devices--; + cur_devices->open_devices--; /* remove sysfs entry */ - btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); + btrfs_sysfs_rm_device_link(fs_devices, device); } num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; btrfs_set_super_num_devices(fs_info->super_copy, num_devices); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); /* * at this point, the device is zero sized and detached from @@ -2020,8 +2055,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, call_rcu(&device->rcu, free_device_rcu); if (cur_devices->open_devices == 0) { - struct btrfs_fs_devices *fs_devices; - fs_devices = fs_info->fs_devices; while (fs_devices) { if (fs_devices->seed == cur_devices) { fs_devices->seed = cur_devices->seed; @@ -2030,20 +2063,19 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, fs_devices = fs_devices->seed; } cur_devices->seed = NULL; - __btrfs_close_devices(cur_devices); + close_fs_devices(cur_devices); free_fs_devices(cur_devices); } out: mutex_unlock(&uuid_mutex); - mutex_unlock(&fs_info->volume_mutex); return ret; error_undo: if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_add(&device->dev_alloc_list, - &fs_info->fs_devices->alloc_list); + &fs_devices->alloc_list); device->fs_devices->rw_devices++; mutex_unlock(&fs_info->chunk_mutex); } @@ -2112,7 +2144,7 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, tmp_fs_devices = tmp_fs_devices->seed; } fs_devices->seed = NULL; - __btrfs_close_devices(fs_devices); + close_fs_devices(fs_devices); free_fs_devices(fs_devices); } } @@ -2120,23 +2152,23 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev) { - mutex_lock(&uuid_mutex); + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + WARN_ON(!tgtdev); - mutex_lock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_devices->device_list_mutex); - btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev); + btrfs_sysfs_rm_device_link(fs_devices, tgtdev); if (tgtdev->bdev) - fs_info->fs_devices->open_devices--; + fs_devices->open_devices--; - fs_info->fs_devices->num_devices--; + fs_devices->num_devices--; btrfs_assign_next_active_device(fs_info, tgtdev, NULL); list_del_rcu(&tgtdev->dev_list); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - mutex_unlock(&uuid_mutex); + mutex_unlock(&fs_devices->device_list_mutex); /* * The update_dev_time() with in btrfs_scratch_superblocks() @@ -2188,10 +2220,6 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, struct btrfs_device *tmp; devices = &fs_info->fs_devices->devices; - /* - * It is safe to read the devices since the volume_mutex - * is held by the caller. - */ list_for_each_entry(tmp, devices, dev_list) { if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &tmp->dev_state) && !tmp->bdev) { @@ -2259,7 +2287,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) return PTR_ERR(old_devices); } - list_add(&old_devices->list, &fs_uuids); + list_add(&old_devices->fs_list, &fs_uuids); memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); seed_devices->opened = 1; @@ -2570,7 +2598,7 @@ error_trans: if (trans) btrfs_end_transaction(trans); error_free_device: - free_device(device); + btrfs_free_device(device); error: blkdev_put(bdev, FMODE_EXCL); if (seeding_dev && !unlocked) { @@ -2580,99 +2608,6 @@ error: return ret; } -int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, - const char *device_path, - struct btrfs_device *srcdev, - struct btrfs_device **device_out) -{ - struct btrfs_device *device; - struct block_device *bdev; - struct list_head *devices; - struct rcu_string *name; - u64 devid = BTRFS_DEV_REPLACE_DEVID; - int ret = 0; - - *device_out = NULL; - if (fs_info->fs_devices->seeding) { - btrfs_err(fs_info, "the filesystem is a seed filesystem!"); - return -EINVAL; - } - - bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, - fs_info->bdev_holder); - if (IS_ERR(bdev)) { - btrfs_err(fs_info, "target device %s is invalid!", device_path); - return PTR_ERR(bdev); - } - - filemap_write_and_wait(bdev->bd_inode->i_mapping); - - devices = &fs_info->fs_devices->devices; - list_for_each_entry(device, devices, dev_list) { - if (device->bdev == bdev) { - btrfs_err(fs_info, - "target device is in the filesystem!"); - ret = -EEXIST; - goto error; - } - } - - - if (i_size_read(bdev->bd_inode) < - btrfs_device_get_total_bytes(srcdev)) { - btrfs_err(fs_info, - "target device is smaller than source device!"); - ret = -EINVAL; - goto error; - } - - - device = btrfs_alloc_device(NULL, &devid, NULL); - if (IS_ERR(device)) { - ret = PTR_ERR(device); - goto error; - } - - name = rcu_string_strdup(device_path, GFP_KERNEL); - if (!name) { - free_device(device); - ret = -ENOMEM; - goto error; - } - rcu_assign_pointer(device->name, name); - - mutex_lock(&fs_info->fs_devices->device_list_mutex); - set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); - device->generation = 0; - device->io_width = fs_info->sectorsize; - device->io_align = fs_info->sectorsize; - device->sector_size = fs_info->sectorsize; - device->total_bytes = btrfs_device_get_total_bytes(srcdev); - device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); - device->bytes_used = btrfs_device_get_bytes_used(srcdev); - device->commit_total_bytes = srcdev->commit_total_bytes; - device->commit_bytes_used = device->bytes_used; - device->fs_info = fs_info; - device->bdev = bdev; - set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); - set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - device->mode = FMODE_EXCL; - device->dev_stats_valid = 1; - set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); - device->fs_devices = fs_info->fs_devices; - list_add(&device->dev_list, &fs_info->fs_devices->devices); - fs_info->fs_devices->num_devices++; - fs_info->fs_devices->open_devices++; - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - - *device_out = device; - return ret; - -error: - blkdev_put(bdev, FMODE_EXCL); - return ret; -} - static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) { @@ -3273,24 +3208,12 @@ static void update_balance_args(struct btrfs_balance_control *bctl) } /* - * Should be called with both balance and volume mutexes held to - * serialize other volume operations (add_dev/rm_dev/resize) with - * restriper. Same goes for unset_balance_control. + * Clear the balance status in fs_info and delete the balance item from disk. */ -static void set_balance_control(struct btrfs_balance_control *bctl) -{ - struct btrfs_fs_info *fs_info = bctl->fs_info; - - BUG_ON(fs_info->balance_ctl); - - spin_lock(&fs_info->balance_lock); - fs_info->balance_ctl = bctl; - spin_unlock(&fs_info->balance_lock); -} - -static void unset_balance_control(struct btrfs_fs_info *fs_info) +static void reset_balance_state(struct btrfs_fs_info *fs_info) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; + int ret; BUG_ON(!fs_info->balance_ctl); @@ -3299,6 +3222,9 @@ static void unset_balance_control(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->balance_lock); kfree(bctl); + ret = del_balance_item(fs_info); + if (ret) + btrfs_handle_fs_error(fs_info, ret, NULL); } /* @@ -3835,18 +3761,6 @@ static inline int balance_need_close(struct btrfs_fs_info *fs_info) atomic_read(&fs_info->balance_cancel_req) == 0); } -static void __cancel_balance(struct btrfs_fs_info *fs_info) -{ - int ret; - - unset_balance_control(fs_info); - ret = del_balance_item(fs_info); - if (ret) - btrfs_handle_fs_error(fs_info, ret, NULL); - - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); -} - /* Non-zero return value signifies invalidity */ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, u64 allowed) @@ -3857,12 +3771,12 @@ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, } /* - * Should be called with both balance and volume mutexes held + * Should be called with balance mutexe held */ -int btrfs_balance(struct btrfs_balance_control *bctl, +int btrfs_balance(struct btrfs_fs_info *fs_info, + struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs) { - struct btrfs_fs_info *fs_info = bctl->fs_info; u64 meta_target, data_target; u64 allowed; int mixed = 0; @@ -3891,7 +3805,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, !(bctl->flags & BTRFS_BALANCE_METADATA) || memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { btrfs_err(fs_info, - "with mixed groups data and metadata balance options must be the same"); + "balance: mixed groups data and metadata options must be the same"); ret = -EINVAL; goto out; } @@ -3913,23 +3827,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl, allowed |= (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID6); if (validate_convert_profile(&bctl->data, allowed)) { + int index = btrfs_bg_flags_to_raid_index(bctl->data.target); + btrfs_err(fs_info, - "unable to start balance with target data profile %llu", - bctl->data.target); + "balance: invalid convert data profile %s", + get_raid_name(index)); ret = -EINVAL; goto out; } if (validate_convert_profile(&bctl->meta, allowed)) { + int index = btrfs_bg_flags_to_raid_index(bctl->meta.target); + btrfs_err(fs_info, - "unable to start balance with target metadata profile %llu", - bctl->meta.target); + "balance: invalid convert metadata profile %s", + get_raid_name(index)); ret = -EINVAL; goto out; } if (validate_convert_profile(&bctl->sys, allowed)) { + int index = btrfs_bg_flags_to_raid_index(bctl->sys.target); + btrfs_err(fs_info, - "unable to start balance with target system profile %llu", - bctl->sys.target); + "balance: invalid convert system profile %s", + get_raid_name(index)); ret = -EINVAL; goto out; } @@ -3950,10 +3870,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, !(bctl->meta.target & allowed))) { if (bctl->flags & BTRFS_BALANCE_FORCE) { btrfs_info(fs_info, - "force reducing metadata integrity"); + "balance: force reducing metadata integrity"); } else { btrfs_err(fs_info, - "balance will reduce metadata integrity, use force if you want this"); + "balance: reduces metadata integrity, use --force if you want this"); ret = -EINVAL; goto out; } @@ -3967,9 +3887,12 @@ int btrfs_balance(struct btrfs_balance_control *bctl, bctl->data.target : fs_info->avail_data_alloc_bits; if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { + int meta_index = btrfs_bg_flags_to_raid_index(meta_target); + int data_index = btrfs_bg_flags_to_raid_index(data_target); + btrfs_warn(fs_info, - "metadata profile 0x%llx has lower redundancy than data profile 0x%llx", - meta_target, data_target); + "balance: metadata profile %s has lower redundancy than data profile %s", + get_raid_name(meta_index), get_raid_name(data_index)); } ret = insert_balance_item(fs_info, bctl); @@ -3978,7 +3901,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { BUG_ON(ret == -EEXIST); - set_balance_control(bctl); + BUG_ON(fs_info->balance_ctl); + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = bctl; + spin_unlock(&fs_info->balance_lock); } else { BUG_ON(ret != -EEXIST); spin_lock(&fs_info->balance_lock); @@ -3986,22 +3912,24 @@ int btrfs_balance(struct btrfs_balance_control *bctl, spin_unlock(&fs_info->balance_lock); } - atomic_inc(&fs_info->balance_running); + ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); + set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); mutex_unlock(&fs_info->balance_mutex); ret = __btrfs_balance(fs_info); mutex_lock(&fs_info->balance_mutex); - atomic_dec(&fs_info->balance_running); + clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); if (bargs) { memset(bargs, 0, sizeof(*bargs)); - update_ioctl_balance_args(fs_info, 0, bargs); + btrfs_update_ioctl_balance_args(fs_info, bargs); } if ((ret && ret != -ECANCELED && ret != -ENOSPC) || balance_need_close(fs_info)) { - __cancel_balance(fs_info); + reset_balance_state(fs_info); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); } wake_up(&fs_info->balance_wait_q); @@ -4009,11 +3937,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl, return ret; out: if (bctl->flags & BTRFS_BALANCE_RESUME) - __cancel_balance(fs_info); - else { + reset_balance_state(fs_info); + else kfree(bctl); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); - } + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + return ret; } @@ -4022,16 +3950,12 @@ static int balance_kthread(void *data) struct btrfs_fs_info *fs_info = data; int ret = 0; - mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); - if (fs_info->balance_ctl) { - btrfs_info(fs_info, "continuing balance"); - ret = btrfs_balance(fs_info->balance_ctl, NULL); + btrfs_info(fs_info, "balance: resuming"); + ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); } - mutex_unlock(&fs_info->balance_mutex); - mutex_unlock(&fs_info->volume_mutex); return ret; } @@ -4040,18 +3964,27 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) { struct task_struct *tsk; - spin_lock(&fs_info->balance_lock); + mutex_lock(&fs_info->balance_mutex); if (!fs_info->balance_ctl) { - spin_unlock(&fs_info->balance_lock); + mutex_unlock(&fs_info->balance_mutex); return 0; } - spin_unlock(&fs_info->balance_lock); + mutex_unlock(&fs_info->balance_mutex); if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { - btrfs_info(fs_info, "force skipping balance"); + btrfs_info(fs_info, "balance: resume skipped"); return 0; } + /* + * A ro->rw remount sequence should continue with the paused balance + * regardless of who pauses it, system or the user as of now, so set + * the resume flag. + */ + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; + spin_unlock(&fs_info->balance_lock); + tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); return PTR_ERR_OR_ZERO(tsk); } @@ -4091,7 +4024,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); - bctl->fs_info = fs_info; bctl->flags = btrfs_balance_flags(leaf, item); bctl->flags |= BTRFS_BALANCE_RESUME; @@ -4102,15 +4034,26 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) btrfs_balance_sys(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); - WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); + /* + * This should never happen, as the paused balance state is recovered + * during mount without any chance of other exclusive ops to collide. + * + * This gives the exclusive op status to balance and keeps in paused + * state until user intervention (cancel or umount). If the ownership + * cannot be assigned, show a message but do not fail. The balance + * is in a paused state and must have fs_info::balance_ctl properly + * set up. + */ + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) + btrfs_warn(fs_info, + "balance: cannot set exclusive op status, resume manually"); - mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); - - set_balance_control(bctl); - + BUG_ON(fs_info->balance_ctl); + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = bctl; + spin_unlock(&fs_info->balance_lock); mutex_unlock(&fs_info->balance_mutex); - mutex_unlock(&fs_info->volume_mutex); out: btrfs_free_path(path); return ret; @@ -4126,16 +4069,16 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info) return -ENOTCONN; } - if (atomic_read(&fs_info->balance_running)) { + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { atomic_inc(&fs_info->balance_pause_req); mutex_unlock(&fs_info->balance_mutex); wait_event(fs_info->balance_wait_q, - atomic_read(&fs_info->balance_running) == 0); + !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); mutex_lock(&fs_info->balance_mutex); /* we are good with balance_ctl ripped off from under us */ - BUG_ON(atomic_read(&fs_info->balance_running)); + BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); atomic_dec(&fs_info->balance_pause_req); } else { ret = -ENOTCONN; @@ -4147,38 +4090,49 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info) int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) { - if (sb_rdonly(fs_info->sb)) - return -EROFS; - mutex_lock(&fs_info->balance_mutex); if (!fs_info->balance_ctl) { mutex_unlock(&fs_info->balance_mutex); return -ENOTCONN; } + /* + * A paused balance with the item stored on disk can be resumed at + * mount time if the mount is read-write. Otherwise it's still paused + * and we must not allow cancelling as it deletes the item. + */ + if (sb_rdonly(fs_info->sb)) { + mutex_unlock(&fs_info->balance_mutex); + return -EROFS; + } + atomic_inc(&fs_info->balance_cancel_req); /* * if we are running just wait and return, balance item is * deleted in btrfs_balance in this case */ - if (atomic_read(&fs_info->balance_running)) { + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { mutex_unlock(&fs_info->balance_mutex); wait_event(fs_info->balance_wait_q, - atomic_read(&fs_info->balance_running) == 0); + !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); mutex_lock(&fs_info->balance_mutex); } else { - /* __cancel_balance needs volume_mutex */ mutex_unlock(&fs_info->balance_mutex); - mutex_lock(&fs_info->volume_mutex); + /* + * Lock released to allow other waiters to continue, we'll + * reexamine the status again. + */ mutex_lock(&fs_info->balance_mutex); - if (fs_info->balance_ctl) - __cancel_balance(fs_info); - - mutex_unlock(&fs_info->volume_mutex); + if (fs_info->balance_ctl) { + reset_balance_state(fs_info); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_info(fs_info, "balance: canceled"); + } } - BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); + BUG_ON(fs_info->balance_ctl || + test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); atomic_dec(&fs_info->balance_cancel_req); mutex_unlock(&fs_info->balance_mutex); return 0; @@ -4255,8 +4209,7 @@ static int btrfs_uuid_scan_kthread(void *data) } update_tree: if (!btrfs_is_empty_uuid(root_item.uuid)) { - ret = btrfs_uuid_tree_add(trans, fs_info, - root_item.uuid, + ret = btrfs_uuid_tree_add(trans, root_item.uuid, BTRFS_UUID_KEY_SUBVOL, key.objectid); if (ret < 0) { @@ -4267,7 +4220,7 @@ update_tree: } if (!btrfs_is_empty_uuid(root_item.received_uuid)) { - ret = btrfs_uuid_tree_add(trans, fs_info, + ret = btrfs_uuid_tree_add(trans, root_item.received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, key.objectid); @@ -4473,7 +4426,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) if (!path) return -ENOMEM; - path->reada = READA_FORWARD; + path->reada = READA_BACK; mutex_lock(&fs_info->chunk_mutex); @@ -6034,9 +5987,8 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); } -int btrfs_rmap_block(struct btrfs_fs_info *fs_info, - u64 chunk_start, u64 physical, u64 devid, - u64 **logical, int *naddrs, int *stripe_len) +int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, + u64 physical, u64 **logical, int *naddrs, int *stripe_len) { struct extent_map *em; struct map_lookup *map; @@ -6068,8 +6020,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, BUG_ON(!buf); /* -ENOMEM */ for (i = 0; i < map->num_stripes; i++) { - if (devid && map->stripes[i].dev->devid != devid) - continue; if (map->stripes[i].physical > physical || map->stripes[i].physical + length <= physical) continue; @@ -6401,7 +6351,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, * * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() * on error. Returned struct is not linked onto any lists and must be - * destroyed with free_device. + * destroyed with btrfs_free_device. */ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, @@ -6424,7 +6374,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, ret = find_next_devid(fs_info, &tmp); if (ret) { - free_device(dev); + btrfs_free_device(dev); return ERR_PTR(ret); } } @@ -6675,8 +6625,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, if (IS_ERR(fs_devices)) return fs_devices; - ret = __btrfs_open_devices(fs_devices, FMODE_READ, - fs_info->bdev_holder); + ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); if (ret) { free_fs_devices(fs_devices); fs_devices = ERR_PTR(ret); @@ -6684,7 +6633,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, } if (!fs_devices->seeding) { - __btrfs_close_devices(fs_devices); + close_fs_devices(fs_devices); free_fs_devices(fs_devices); fs_devices = ERR_PTR(-EINVAL); goto out; @@ -6993,6 +6942,10 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) if (!path) return -ENOMEM; + /* + * uuid_mutex is needed only if we are mounting a sprout FS + * otherwise we don't need it. + */ mutex_lock(&uuid_mutex); mutex_lock(&fs_info->chunk_mutex); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 79096884654f..5139ec8daf4c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -208,6 +208,7 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used); struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + struct list_head fs_list; u64 num_devices; u64 open_devices; @@ -229,7 +230,6 @@ struct btrfs_fs_devices { struct list_head resized_devices; /* devices not currently being allocated */ struct list_head alloc_list; - struct list_head list; struct btrfs_fs_devices *seed; int seeding; @@ -329,11 +329,12 @@ struct btrfs_raid_attr { int tolerated_failures; /* max tolerated fail devs */ int devs_increment; /* ndevs has to be a multiple of this */ int ncopies; /* how many copies to data has */ + int mindev_error; /* error code if min devs requisite is unmet */ + const char raid_name[8]; /* name of the raid */ + u64 bg_flag; /* block group flag of the raid */ }; extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES]; -extern const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES]; -extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES]; struct map_lookup { u64 type; @@ -351,8 +352,6 @@ struct map_lookup { struct btrfs_balance_args; struct btrfs_balance_progress; struct btrfs_balance_control { - struct btrfs_fs_info *fs_info; - struct btrfs_balance_args data; struct btrfs_balance_args meta; struct btrfs_balance_args sys; @@ -393,9 +392,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret); -int btrfs_rmap_block(struct btrfs_fs_info *fs_info, - u64 chunk_start, u64 physical, u64 devid, - u64 **logical, int *naddrs, int *stripe_len); +int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, + u64 physical, u64 **logical, int *naddrs, int *stripe_len); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, @@ -421,6 +419,7 @@ int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, const u8 *uuid); +void btrfs_free_device(struct btrfs_device *device); int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, u64 devid); void __exit btrfs_cleanup_fs_uuids(void); @@ -431,11 +430,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); -int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, - const char *device_path, - struct btrfs_device *srcdev, - struct btrfs_device **device_out); -int btrfs_balance(struct btrfs_balance_control *bctl, +int btrfs_balance(struct btrfs_fs_info *fs_info, + struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); int btrfs_recover_balance(struct btrfs_fs_info *fs_info); @@ -553,6 +549,8 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags) return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ } +const char *get_raid_name(enum btrfs_raid_types type); + void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 0daa1e3fe0df..ab0bbe93b398 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -572,6 +572,11 @@ lookup_again: if (ret < 0) goto create_error; + if (unlikely(d_unhashed(next))) { + dput(next); + inode_unlock(d_inode(dir)); + goto lookup_again; + } ASSERT(d_backing_inode(next)); _debug("mkdir -> %p{%p{ino=%lu}}", @@ -764,6 +769,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, /* search the current directory for the element name */ inode_lock(d_inode(dir)); +retry: start = jiffies; subdir = lookup_one_len(dirname, dir, strlen(dirname)); cachefiles_hist(cachefiles_lookup_histogram, start); @@ -793,6 +799,10 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, if (ret < 0) goto mkdir_error; + if (unlikely(d_unhashed(subdir))) { + dput(subdir); + goto retry; + } ASSERT(d_backing_inode(subdir)); _debug("mkdir -> %p{%p{ino=%lu}}", diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c index 125b90f6c796..0ce1aa56b67f 100644 --- a/fs/cachefiles/proc.c +++ b/fs/cachefiles/proc.c @@ -85,21 +85,6 @@ static const struct seq_operations cachefiles_histogram_ops = { }; /* - * open "/proc/fs/cachefiles/XXX" which provide statistics summaries - */ -static int cachefiles_histogram_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &cachefiles_histogram_ops); -} - -static const struct file_operations cachefiles_histogram_fops = { - .open = cachefiles_histogram_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -/* * initialise the /proc/fs/cachefiles/ directory */ int __init cachefiles_proc_init(void) @@ -109,8 +94,8 @@ int __init cachefiles_proc_init(void) if (!proc_mkdir("fs/cachefiles", NULL)) goto error_dir; - if (!proc_create("fs/cachefiles/histogram", S_IFREG | 0444, NULL, - &cachefiles_histogram_fops)) + if (!proc_create_seq("fs/cachefiles/histogram", S_IFREG | 0444, NULL, + &cachefiles_histogram_ops)) goto error_histogram; _leave(" = 0"); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index f85040d73e3d..cf0e45b10121 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -70,69 +70,104 @@ static __le32 ceph_flags_sys2wire(u32 flags) */ /* - * Calculate the length sum of direct io vectors that can - * be combined into one page vector. + * How many pages to get in one call to iov_iter_get_pages(). This + * determines the size of the on-stack array used as a buffer. */ -static size_t dio_get_pagev_size(const struct iov_iter *it) +#define ITER_GET_BVECS_PAGES 64 + +static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize, + struct bio_vec *bvecs) { - const struct iovec *iov = it->iov; - const struct iovec *iovend = iov + it->nr_segs; - size_t size; - - size = iov->iov_len - it->iov_offset; - /* - * An iov can be page vectored when both the current tail - * and the next base are page aligned. - */ - while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) && - (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) { - size += iov->iov_len; - } - dout("dio_get_pagevlen len = %zu\n", size); - return size; + size_t size = 0; + int bvec_idx = 0; + + if (maxsize > iov_iter_count(iter)) + maxsize = iov_iter_count(iter); + + while (size < maxsize) { + struct page *pages[ITER_GET_BVECS_PAGES]; + ssize_t bytes; + size_t start; + int idx = 0; + + bytes = iov_iter_get_pages(iter, pages, maxsize - size, + ITER_GET_BVECS_PAGES, &start); + if (bytes < 0) + return size ?: bytes; + + iov_iter_advance(iter, bytes); + size += bytes; + + for ( ; bytes; idx++, bvec_idx++) { + struct bio_vec bv = { + .bv_page = pages[idx], + .bv_len = min_t(int, bytes, PAGE_SIZE - start), + .bv_offset = start, + }; + + bvecs[bvec_idx] = bv; + bytes -= bv.bv_len; + start = 0; + } + } + + return size; } /* - * Allocate a page vector based on (@it, @nbytes). - * The return value is the tuple describing a page vector, - * that is (@pages, @page_align, @num_pages). + * iov_iter_get_pages() only considers one iov_iter segment, no matter + * what maxsize or maxpages are given. For ITER_BVEC that is a single + * page. + * + * Attempt to get up to @maxsize bytes worth of pages from @iter. + * Return the number of bytes in the created bio_vec array, or an error. */ -static struct page ** -dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes, - size_t *page_align, int *num_pages) +static ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize, + struct bio_vec **bvecs, int *num_bvecs) { - struct iov_iter tmp_it = *it; - size_t align; - struct page **pages; - int ret = 0, idx, npages; + struct bio_vec *bv; + size_t orig_count = iov_iter_count(iter); + ssize_t bytes; + int npages; - align = (unsigned long)(it->iov->iov_base + it->iov_offset) & - (PAGE_SIZE - 1); - npages = calc_pages_for(align, nbytes); - pages = kvmalloc(sizeof(*pages) * npages, GFP_KERNEL); - if (!pages) - return ERR_PTR(-ENOMEM); + iov_iter_truncate(iter, maxsize); + npages = iov_iter_npages(iter, INT_MAX); + iov_iter_reexpand(iter, orig_count); - for (idx = 0; idx < npages; ) { - size_t start; - ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes, - npages - idx, &start); - if (ret < 0) - goto fail; + /* + * __iter_get_bvecs() may populate only part of the array -- zero it + * out. + */ + bv = kvmalloc_array(npages, sizeof(*bv), GFP_KERNEL | __GFP_ZERO); + if (!bv) + return -ENOMEM; - iov_iter_advance(&tmp_it, ret); - nbytes -= ret; - idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE; + bytes = __iter_get_bvecs(iter, maxsize, bv); + if (bytes < 0) { + /* + * No pages were pinned -- just free the array. + */ + kvfree(bv); + return bytes; } - BUG_ON(nbytes != 0); - *num_pages = npages; - *page_align = align; - dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align); - return pages; -fail: - ceph_put_page_vector(pages, idx, false); - return ERR_PTR(ret); + *bvecs = bv; + *num_bvecs = npages; + return bytes; +} + +static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty) +{ + int i; + + for (i = 0; i < num_bvecs; i++) { + if (bvecs[i].bv_page) { + if (should_dirty) + set_page_dirty_lock(bvecs[i].bv_page); + put_page(bvecs[i].bv_page); + } + } + kvfree(bvecs); } /* @@ -746,11 +781,12 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) struct inode *inode = req->r_inode; struct ceph_aio_request *aio_req = req->r_priv; struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); - int num_pages = calc_pages_for((u64)osd_data->alignment, - osd_data->length); - dout("ceph_aio_complete_req %p rc %d bytes %llu\n", - inode, rc, osd_data->length); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS); + BUG_ON(!osd_data->num_bvecs); + + dout("ceph_aio_complete_req %p rc %d bytes %u\n", + inode, rc, osd_data->bvec_pos.iter.bi_size); if (rc == -EOLDSNAPC) { struct ceph_aio_work *aio_work; @@ -768,9 +804,10 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) } else if (!aio_req->write) { if (rc == -ENOENT) rc = 0; - if (rc >= 0 && osd_data->length > rc) { - int zoff = osd_data->alignment + rc; - int zlen = osd_data->length - rc; + if (rc >= 0 && osd_data->bvec_pos.iter.bi_size > rc) { + struct iov_iter i; + int zlen = osd_data->bvec_pos.iter.bi_size - rc; + /* * If read is satisfied by single OSD request, * it can pass EOF. Otherwise read is within @@ -785,13 +822,16 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) aio_req->total_len = rc + zlen; } - if (zlen > 0) - ceph_zero_page_vector_range(zoff, zlen, - osd_data->pages); + iov_iter_bvec(&i, ITER_BVEC, osd_data->bvec_pos.bvecs, + osd_data->num_bvecs, + osd_data->bvec_pos.iter.bi_size); + iov_iter_advance(&i, rc); + iov_iter_zero(zlen, &i); } } - ceph_put_page_vector(osd_data->pages, num_pages, aio_req->should_dirty); + put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs, + aio_req->should_dirty); ceph_osdc_put_request(req); if (rc < 0) @@ -879,7 +919,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_vino vino; struct ceph_osd_request *req; - struct page **pages; + struct bio_vec *bvecs; struct ceph_aio_request *aio_req = NULL; int num_pages = 0; int flags; @@ -914,10 +954,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, } while (iov_iter_count(iter) > 0) { - u64 size = dio_get_pagev_size(iter); - size_t start = 0; + u64 size = iov_iter_count(iter); ssize_t len; + if (write) + size = min_t(u64, size, fsc->mount_options->wsize); + else + size = min_t(u64, size, fsc->mount_options->rsize); + vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, pos, &size, 0, @@ -933,18 +977,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, break; } - if (write) - size = min_t(u64, size, fsc->mount_options->wsize); - else - size = min_t(u64, size, fsc->mount_options->rsize); - - len = size; - pages = dio_get_pages_alloc(iter, len, &start, &num_pages); - if (IS_ERR(pages)) { + len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages); + if (len < 0) { ceph_osdc_put_request(req); - ret = PTR_ERR(pages); + ret = len; break; } + if (len != size) + osd_req_op_extent_update(req, 0, len); /* * To simplify error handling, allow AIO when IO within i_size @@ -977,8 +1017,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, req->r_mtime = mtime; } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, - false, false); + osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); if (aio_req) { aio_req->total_len += len; @@ -991,7 +1030,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs); pos += len; - iov_iter_advance(iter, len); continue; } @@ -1004,25 +1042,26 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (ret == -ENOENT) ret = 0; if (ret >= 0 && ret < len && pos + ret < size) { + struct iov_iter i; int zlen = min_t(size_t, len - ret, size - pos - ret); - ceph_zero_page_vector_range(start + ret, zlen, - pages); + + iov_iter_bvec(&i, ITER_BVEC, bvecs, num_pages, + len); + iov_iter_advance(&i, ret); + iov_iter_zero(zlen, &i); ret += zlen; } if (ret >= 0) len = ret; } - ceph_put_page_vector(pages, num_pages, should_dirty); - + put_bvecs(bvecs, num_pages, should_dirty); ceph_osdc_put_request(req); if (ret < 0) break; pos += len; - iov_iter_advance(iter, len); - if (!write && pos >= size) break; diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 7e4a1e2f0696..85817991ee68 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -1,11 +1,12 @@ # SPDX-License-Identifier: GPL-2.0 # -# Makefile for Linux CIFS VFS client +# Makefile for Linux CIFS/SMB2/SMB3 VFS client # +ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_CIFS) += cifs.o -cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ - link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \ +cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \ + inode.o link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \ cifs_unicode.o nterr.o cifsencrypt.o \ readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o \ smb2ops.o smb2maperror.o smb2transport.o \ diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 9d69ea433330..116146022aa1 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -42,7 +42,7 @@ cifs_dump_mem(char *label, void *data, int length) data, length, true); } -void cifs_dump_detail(void *buf) +void cifs_dump_detail(void *buf, struct TCP_Server_Info *server) { #ifdef CONFIG_CIFS_DEBUG2 struct smb_hdr *smb = (struct smb_hdr *)buf; @@ -50,7 +50,8 @@ void cifs_dump_detail(void *buf) cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d\n", smb->Command, smb->Status.CifsError, smb->Flags, smb->Flags2, smb->Mid, smb->Pid); - cifs_dbg(VFS, "smb buf %p len %u\n", smb, smbCalcSize(smb)); + cifs_dbg(VFS, "smb buf %p len %u\n", smb, + server->ops->calc_smb_size(smb, server)); #endif /* CONFIG_CIFS_DEBUG2 */ } @@ -83,7 +84,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server) cifs_dbg(VFS, "IsMult: %d IsEnd: %d\n", mid_entry->multiRsp, mid_entry->multiEnd); if (mid_entry->resp_buf) { - cifs_dump_detail(mid_entry->resp_buf); + cifs_dump_detail(mid_entry->resp_buf, server); cifs_dump_mem("existing buf: ", mid_entry->resp_buf, 62); } @@ -113,6 +114,8 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) seq_printf(m, " type: %d ", dev_type); if (tcon->seal) seq_printf(m, " Encrypted"); + if (tcon->nocase) + seq_printf(m, " nocase"); if (tcon->unix_ext) seq_printf(m, " POSIX Extensions"); if (tcon->ses->server->ops->dump_share_caps) @@ -237,6 +240,10 @@ skip_rdma: server->credits, server->dialect); if (server->sign) seq_printf(m, " signed"); +#ifdef CONFIG_CIFS_SMB311 + if (server->posix_ext_supported) + seq_printf(m, " posix"); +#endif /* 3.1.1 */ i++; list_for_each(tmp2, &server->smb_ses_list) { ses = list_entry(tmp2, struct cifs_ses, @@ -314,18 +321,6 @@ skip_rdma: return 0; } -static int cifs_debug_data_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, cifs_debug_data_proc_show, NULL); -} - -static const struct file_operations cifs_debug_data_proc_fops = { - .open = cifs_debug_data_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - #ifdef CONFIG_CIFS_STATS static ssize_t cifs_stats_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) @@ -497,35 +492,36 @@ cifs_proc_init(void) if (proc_fs_cifs == NULL) return; - proc_create("DebugData", 0, proc_fs_cifs, &cifs_debug_data_proc_fops); + proc_create_single("DebugData", 0, proc_fs_cifs, + cifs_debug_data_proc_show); #ifdef CONFIG_CIFS_STATS - proc_create("Stats", 0, proc_fs_cifs, &cifs_stats_proc_fops); + proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_fops); #endif /* STATS */ - proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops); - proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops); - proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs, + proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_fops); + proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_fops); + proc_create("LinuxExtensionsEnabled", 0644, proc_fs_cifs, &cifs_linux_ext_proc_fops); - proc_create("SecurityFlags", 0, proc_fs_cifs, + proc_create("SecurityFlags", 0644, proc_fs_cifs, &cifs_security_flags_proc_fops); - proc_create("LookupCacheEnabled", 0, proc_fs_cifs, + proc_create("LookupCacheEnabled", 0644, proc_fs_cifs, &cifs_lookup_cache_proc_fops); #ifdef CONFIG_CIFS_SMB_DIRECT - proc_create("rdma_readwrite_threshold", 0, proc_fs_cifs, + proc_create("rdma_readwrite_threshold", 0644, proc_fs_cifs, &cifs_rdma_readwrite_threshold_proc_fops); - proc_create("smbd_max_frmr_depth", 0, proc_fs_cifs, + proc_create("smbd_max_frmr_depth", 0644, proc_fs_cifs, &cifs_smbd_max_frmr_depth_proc_fops); - proc_create("smbd_keep_alive_interval", 0, proc_fs_cifs, + proc_create("smbd_keep_alive_interval", 0644, proc_fs_cifs, &cifs_smbd_keep_alive_interval_proc_fops); - proc_create("smbd_max_receive_size", 0, proc_fs_cifs, + proc_create("smbd_max_receive_size", 0644, proc_fs_cifs, &cifs_smbd_max_receive_size_proc_fops); - proc_create("smbd_max_fragmented_recv_size", 0, proc_fs_cifs, + proc_create("smbd_max_fragmented_recv_size", 0644, proc_fs_cifs, &cifs_smbd_max_fragmented_recv_size_proc_fops); - proc_create("smbd_max_send_size", 0, proc_fs_cifs, + proc_create("smbd_max_send_size", 0644, proc_fs_cifs, &cifs_smbd_max_send_size_proc_fops); - proc_create("smbd_send_credit_target", 0, proc_fs_cifs, + proc_create("smbd_send_credit_target", 0644, proc_fs_cifs, &cifs_smbd_send_credit_target_proc_fops); - proc_create("smbd_receive_credit_max", 0, proc_fs_cifs, + proc_create("smbd_receive_credit_max", 0644, proc_fs_cifs, &cifs_smbd_receive_credit_max_proc_fops); #endif } @@ -583,6 +579,8 @@ static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, cifsFYI = bv; else if ((c[0] > '1') && (c[0] <= '9')) cifsFYI = (int) (c[0] - '0'); /* see cifs_debug.h for meanings */ + else + return -EINVAL; return count; } diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index 0e74690d11bc..f4f3f0853c6e 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -23,7 +23,7 @@ #define _H_CIFS_DEBUG void cifs_dump_mem(char *label, void *data, int length); -void cifs_dump_detail(void *); +void cifs_dump_detail(void *buf, struct TCP_Server_Info *ptcp_info); void cifs_dump_mids(struct TCP_Server_Info *); extern bool traceSMB; /* flag which enables the function below */ void dump_smb(void *, int); diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 350fa55a1bf7..9731d0d891e7 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -50,6 +50,7 @@ * root mountable */ #define CIFS_MOUNT_UID_FROM_ACL 0x2000000 /* try to get UID via special SID */ +#define CIFS_MOUNT_NO_HANDLE_CACHE 0x4000000 /* disable caching dir handles */ struct cifs_sb_info { struct rb_root tlink_tree; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index f715609b13f3..eb7b6573f322 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -58,13 +58,15 @@ bool traceSMB; bool enable_oplocks = true; bool linuxExtEnabled = true; bool lookupCacheEnabled = true; +bool disable_legacy_dialects; /* false by default */ unsigned int global_secflags = CIFSSEC_DEF; /* unsigned int ntlmv2_support = 0; */ unsigned int sign_CIFS_PDUs = 1; static const struct super_operations cifs_super_ops; unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; module_param(CIFSMaxBufSize, uint, 0444); -MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). " +MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header) " + "for CIFS requests. " "Default: 16384 Range: 8192 to 130048"); unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL; module_param(cifs_min_rcv, uint, 0444); @@ -76,11 +78,21 @@ MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " "Range: 2 to 256"); unsigned int cifs_max_pending = CIFS_MAX_REQ; module_param(cifs_max_pending, uint, 0444); -MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " +MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server for " + "CIFS/SMB1 dialect (N/A for SMB3) " "Default: 32767 Range: 2 to 32767."); module_param(enable_oplocks, bool, 0644); MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1"); +module_param(disable_legacy_dialects, bool, 0644); +MODULE_PARM_DESC(disable_legacy_dialects, "To improve security it may be " + "helpful to restrict the ability to " + "override the default dialects (SMB2.1, " + "SMB3 and SMB3.02) on mount with old " + "dialects (CIFS/SMB1 and SMB2) since " + "vers=1.0 (CIFS/SMB1) and vers=2.0 are weaker" + " and less secure. Default: n/N/0"); + extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; @@ -469,10 +481,20 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",persistenthandles"); else if (tcon->use_resilient) seq_puts(s, ",resilienthandles"); + +#ifdef CONFIG_CIFS_SMB311 + if (tcon->posix_extensions) + seq_puts(s, ",posix"); + else if (tcon->unix_ext) + seq_puts(s, ",unix"); + else + seq_puts(s, ",nounix"); +#else if (tcon->unix_ext) seq_puts(s, ",unix"); else seq_puts(s, ",nounix"); +#endif /* SMB311 */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) seq_puts(s, ",posixpaths"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) @@ -495,6 +517,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",sfu"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) seq_puts(s, ",nobrl"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_HANDLE_CACHE) + seq_puts(s, ",nohandlecache"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) seq_puts(s, ",cifsacl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) @@ -897,6 +921,17 @@ struct file_system_type cifs_fs_type = { /* .fs_flags */ }; MODULE_ALIAS_FS("cifs"); + +static struct file_system_type smb3_fs_type = { + .owner = THIS_MODULE, + .name = "smb3", + .mount = cifs_do_mount, + .kill_sb = cifs_kill_sb, + /* .fs_flags */ +}; +MODULE_ALIAS_FS("smb3"); +MODULE_ALIAS("smb3"); + const struct inode_operations cifs_dir_inode_ops = { .create = cifs_create, .atomic_open = cifs_atomic_open, @@ -1047,6 +1082,18 @@ out: return rc; } +/* + * Directory operations under CIFS/SMB2/SMB3 are synchronous, so fsync() + * is a dummy operation. + */ +static int cifs_dir_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + cifs_dbg(FYI, "Sync directory - name: %pD datasync: 0x%x\n", + file, datasync); + + return 0; +} + static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, size_t len, unsigned int flags) @@ -1181,6 +1228,7 @@ const struct file_operations cifs_dir_ops = { .copy_file_range = cifs_copy_file_range, .clone_file_range = cifs_clone_file_range, .llseek = generic_file_llseek, + .fsync = cifs_dir_fsync, }; static void @@ -1422,6 +1470,12 @@ init_cifs(void) if (rc) goto out_init_cifs_idmap; + rc = register_filesystem(&smb3_fs_type); + if (rc) { + unregister_filesystem(&cifs_fs_type); + goto out_init_cifs_idmap; + } + return 0; out_init_cifs_idmap: @@ -1452,8 +1506,9 @@ out_clean_proc: static void __exit exit_cifs(void) { - cifs_dbg(NOISY, "exit_cifs\n"); + cifs_dbg(NOISY, "exit_smb3\n"); unregister_filesystem(&cifs_fs_type); + unregister_filesystem(&smb3_fs_type); cifs_dfs_release_automount_timer(); #ifdef CONFIG_CIFS_ACL exit_cifs_idmap(); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 013ba2aed8d9..5f0231803431 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -149,5 +149,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.11" +#define CIFS_VERSION "2.12" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index cb950a5fa078..08d1cdd96701 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -176,6 +176,7 @@ struct smb_rqst { struct kvec *rq_iov; /* array of kvecs */ unsigned int rq_nvec; /* number of kvecs in array */ struct page **rq_pages; /* pointer to array of page ptrs */ + unsigned int rq_offset; /* the offset to the 1st page */ unsigned int rq_npages; /* number pages in array */ unsigned int rq_pagesz; /* page size to use */ unsigned int rq_tailsz; /* length of last page */ @@ -244,7 +245,7 @@ struct smb_version_operations { int (*map_error)(char *, bool); /* find mid corresponding to the response message */ struct mid_q_entry * (*find_mid)(struct TCP_Server_Info *, char *); - void (*dump_detail)(void *); + void (*dump_detail)(void *buf, struct TCP_Server_Info *ptcp_info); void (*clear_stats)(struct cifs_tcon *); void (*print_stats)(struct seq_file *m, struct cifs_tcon *); void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *); @@ -372,7 +373,7 @@ struct smb_version_operations { int (*close_dir)(const unsigned int, struct cifs_tcon *, struct cifs_fid *); /* calculate a size of SMB message */ - unsigned int (*calc_smb_size)(void *); + unsigned int (*calc_smb_size)(void *buf, struct TCP_Server_Info *ptcpi); /* check for STATUS_PENDING and process it in a positive case */ bool (*is_status_pending)(char *, struct TCP_Server_Info *, int); /* check for STATUS_NETWORK_SESSION_EXPIRED */ @@ -417,7 +418,7 @@ struct smb_version_operations { /* create lease context buffer for CREATE request */ char * (*create_lease_buf)(u8 *, u8); /* parse lease context buffer and return oplock/epoch info */ - __u8 (*parse_lease_buf)(void *, unsigned int *); + __u8 (*parse_lease_buf)(void *buf, unsigned int *epoch, char *lkey); ssize_t (*copychunk_range)(const unsigned int, struct cifsFileInfo *src_file, struct cifsFileInfo *target_file, @@ -457,7 +458,7 @@ struct smb_version_operations { struct mid_q_entry **); enum securityEnum (*select_sectype)(struct TCP_Server_Info *, enum securityEnum); - + int (*next_header)(char *); }; struct smb_version_values { @@ -521,10 +522,12 @@ struct smb_vol { bool sfu_remap:1; /* remap seven reserved chars ala SFU */ bool posix_paths:1; /* unset to not ask for posix pathnames. */ bool no_linux_ext:1; + bool linux_ext:1; bool sfu_emul:1; bool nullauth:1; /* attempt to authenticate with null user */ bool nocase:1; /* request case insensitive filenames */ bool nobrl:1; /* disable sending byte range locks to srv */ + bool nohandlecache:1; /* disable caching dir handles if srvr probs */ bool mand_lock:1; /* send mandatory not posix byte range lock reqs */ bool seal:1; /* request transport encryption on share */ bool nodfs:1; /* Do not request DFS, even if available */ @@ -630,7 +633,7 @@ struct TCP_Server_Info { bool oplocks:1; /* enable oplocks */ unsigned int maxReq; /* Clients should submit no more */ /* than maxReq distinct unanswered SMBs to the server when using */ - /* multiplexed reads or writes */ + /* multiplexed reads or writes (for SMB1/CIFS only, not SMB2/SMB3) */ unsigned int maxBuf; /* maxBuf specifies the maximum */ /* message size the server can send or receive for non-raw SMBs */ /* maxBuf is returned by SMB NegotiateProtocol so maxBuf is only 0 */ @@ -681,6 +684,7 @@ struct TCP_Server_Info { __le16 cipher_type; /* save initital negprot hash */ __u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE]; + bool posix_ext_supported; #endif /* 3.1.1 */ struct delayed_work reconnect; /* reconnect workqueue job */ struct mutex reconnect_mutex; /* prevent simultaneous reconnects */ @@ -953,9 +957,13 @@ struct cifs_tcon { bool print:1; /* set if connection to printer share */ bool retry:1; bool nocase:1; + bool nohandlecache:1; /* if strange server resource prob can turn off */ bool seal:1; /* transport encryption for this mounted share */ bool unix_ext:1; /* if false disable Linux extensions to CIFS protocol for this mount even if server would support */ +#ifdef CONFIG_CIFS_SMB311 + bool posix_extensions; /* if true SMB3.11 posix extensions enabled */ +#endif /* CIFS_311 */ bool local_lease:1; /* check leases (only) on local system not remote */ bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */ bool broken_sparse_sup; /* if server or share does not support sparse */ @@ -979,6 +987,9 @@ struct cifs_tcon { struct fscache_cookie *fscache; /* cookie for share */ #endif struct list_head pending_opens; /* list of incomplete opens */ + bool valid_root_fid:1; /* Do we have a useable root fid */ + struct mutex prfid_mutex; /* prevents reopen race after dead ses*/ + struct cifs_fid *prfid; /* handle to the directory at top of share */ /* BB add field for back pointer to sb struct(s)? */ }; @@ -1071,6 +1082,7 @@ struct cifs_open_parms { int create_options; const char *path; struct cifs_fid *fid; + umode_t mode; bool reconnect:1; }; @@ -1169,10 +1181,11 @@ struct cifs_readdata { struct smbd_mr *mr; #endif unsigned int pagesz; + unsigned int page_offset; unsigned int tailsz; unsigned int credits; unsigned int nr_pages; - struct page *pages[]; + struct page **pages; }; struct cifs_writedata; @@ -1194,10 +1207,11 @@ struct cifs_writedata { struct smbd_mr *mr; #endif unsigned int pagesz; + unsigned int page_offset; unsigned int tailsz; unsigned int credits; unsigned int nr_pages; - struct page *pages[]; + struct page **pages; }; /* @@ -1692,16 +1706,17 @@ GLOBAL_EXTERN atomic_t smBufAllocCount; GLOBAL_EXTERN atomic_t midCount; /* Misc globals */ -GLOBAL_EXTERN bool enable_oplocks; /* enable or disable oplocks */ -GLOBAL_EXTERN bool lookupCacheEnabled; -GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent +extern bool enable_oplocks; /* enable or disable oplocks */ +extern bool lookupCacheEnabled; +extern unsigned int global_secflags; /* if on, session setup sent with more secure ntlmssp2 challenge/resp */ -GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */ -GLOBAL_EXTERN bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ -GLOBAL_EXTERN unsigned int CIFSMaxBufSize; /* max size not including hdr */ -GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ -GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ -GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ +extern unsigned int sign_CIFS_PDUs; /* enable smb packet signing */ +extern bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ +extern unsigned int CIFSMaxBufSize; /* max size not including hdr */ +extern unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ +extern unsigned int cifs_min_small; /* min size of small buf pool */ +extern unsigned int cifs_max_pending; /* MAX requests at once to server*/ +extern bool disable_legacy_dialects; /* forbid vers=1.0 and vers=2.0 mounts */ #ifdef CONFIG_CIFS_ACL GLOBAL_EXTERN struct rb_root uidtree; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 365a414a75e9..7933c5f9c076 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -21,6 +21,7 @@ #ifndef _CIFSPROTO_H #define _CIFSPROTO_H #include <linux/nls.h> +#include "trace.h" struct statfs; struct smb_vol; @@ -47,6 +48,7 @@ extern void _free_xid(unsigned int); cifs_dbg(FYI, "CIFS VFS: in %s as Xid: %u with uid: %d\n", \ __func__, __xid, \ from_kuid(&init_user_ns, current_fsuid())); \ + trace_smb3_enter(__xid, __func__); \ __xid; \ }) @@ -54,7 +56,11 @@ extern void _free_xid(unsigned int); do { \ _free_xid(curr_xid); \ cifs_dbg(FYI, "CIFS VFS: leaving %s (xid = %u) rc = %d\n", \ - __func__, curr_xid, (int)rc); \ + __func__, curr_xid, (int)rc); \ + if (rc) \ + trace_smb3_exit_err(curr_xid, __func__, (int)rc); \ + else \ + trace_smb3_exit_done(curr_xid, __func__); \ } while (0) extern int init_cifs_idmap(void); extern void exit_cifs_idmap(void); @@ -124,7 +130,7 @@ extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, unsigned int bytes_written); extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); -extern unsigned int smbCalcSize(void *buf); +extern unsigned int smbCalcSize(void *buf, struct TCP_Server_Info *server); extern int decode_negTokenInit(unsigned char *security_blob, int length, struct TCP_Server_Info *server); extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); @@ -197,7 +203,9 @@ extern void dequeue_mid(struct mid_q_entry *mid, bool malformed); extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, unsigned int to_read); extern int cifs_read_page_from_socket(struct TCP_Server_Info *server, - struct page *page, unsigned int to_read); + struct page *page, + unsigned int page_offset, + unsigned int to_read); extern int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb); extern int cifs_match_super(struct super_block *, void *); @@ -525,6 +533,8 @@ int cifs_async_writev(struct cifs_writedata *wdata, void cifs_writev_complete(struct work_struct *work); struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete); +struct cifs_writedata *cifs_writedata_direct_alloc(struct page **pages, + work_func_t complete); void cifs_writedata_release(struct kref *refcount); int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 1529a088383d..5aca336642c0 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -106,6 +106,12 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) open_file->oplock_break_cancelled = true; } spin_unlock(&tcon->open_file_lock); + + mutex_lock(&tcon->prfid_mutex); + tcon->valid_root_fid = false; + memset(tcon->prfid, 0, sizeof(struct cifs_fid)); + mutex_unlock(&tcon->prfid_mutex); + /* * BB Add call to invalidate_inodes(sb) for all superblocks mounted * to this tcon. @@ -1946,6 +1952,7 @@ cifs_writedata_release(struct kref *refcount) if (wdata->cfile) cifsFileInfo_put(wdata->cfile); + kvfree(wdata->pages); kfree(wdata); } @@ -2069,12 +2076,22 @@ cifs_writev_complete(struct work_struct *work) struct cifs_writedata * cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete) { + struct page **pages = + kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + if (pages) + return cifs_writedata_direct_alloc(pages, complete); + + return NULL; +} + +struct cifs_writedata * +cifs_writedata_direct_alloc(struct page **pages, work_func_t complete) +{ struct cifs_writedata *wdata; - /* writedata + number of page pointers */ - wdata = kzalloc(sizeof(*wdata) + - sizeof(struct page *) * nr_pages, GFP_NOFS); + wdata = kzalloc(sizeof(*wdata), GFP_NOFS); if (wdata != NULL) { + wdata->pages = pages; kref_init(&wdata->refcount); INIT_LIST_HEAD(&wdata->list); init_completion(&wdata->done); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a5aa158d535a..e5a2fe7f0dd4 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -61,6 +61,7 @@ #define RFC1001_PORT 139 extern mempool_t *cifs_req_poolp; +extern bool disable_legacy_dialects; /* FIXME: should these be tunable? */ #define TLINK_ERROR_EXPIRE (1 * HZ) @@ -76,9 +77,10 @@ enum { Opt_mapposix, Opt_nomapposix, Opt_mapchars, Opt_nomapchars, Opt_sfu, Opt_nosfu, Opt_nodfs, Opt_posixpaths, - Opt_noposixpaths, Opt_nounix, + Opt_noposixpaths, Opt_nounix, Opt_unix, Opt_nocase, Opt_brl, Opt_nobrl, + Opt_handlecache, Opt_nohandlecache, Opt_forcemandatorylock, Opt_setuidfromacl, Opt_setuids, Opt_nosetuids, Opt_dynperm, Opt_nodynperm, Opt_nohard, Opt_nosoft, @@ -144,10 +146,16 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_noposixpaths, "noposixpaths" }, { Opt_nounix, "nounix" }, { Opt_nounix, "nolinux" }, + { Opt_nounix, "noposix" }, + { Opt_unix, "unix" }, + { Opt_unix, "linux" }, + { Opt_unix, "posix" }, { Opt_nocase, "nocase" }, { Opt_nocase, "ignorecase" }, { Opt_brl, "brl" }, { Opt_nobrl, "nobrl" }, + { Opt_handlecache, "handlecache" }, + { Opt_nohandlecache, "nohandlecache" }, { Opt_nobrl, "nolock" }, { Opt_forcemandatorylock, "forcemandatorylock" }, { Opt_forcemandatorylock, "forcemand" }, @@ -591,10 +599,11 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, int cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, - unsigned int to_read) + unsigned int page_offset, unsigned int to_read) { struct msghdr smb_msg; - struct bio_vec bv = {.bv_page = page, .bv_len = to_read}; + struct bio_vec bv = { + .bv_page = page, .bv_len = to_read, .bv_offset = page_offset}; iov_iter_bvec(&smb_msg.msg_iter, READ | ITER_BVEC, &bv, 1, to_read); return cifs_readv_from_socket(server, &smb_msg); } @@ -848,6 +857,7 @@ cifs_demultiplex_thread(void *p) int length; struct TCP_Server_Info *server = p; unsigned int pdu_length; + unsigned int next_offset; char *buf = NULL; struct task_struct *task_to_wake = NULL; struct mid_q_entry *mid_entry; @@ -874,24 +884,29 @@ cifs_demultiplex_thread(void *p) length = cifs_read_from_socket(server, buf, pdu_length); if (length < 0) continue; - server->total_read = length; + + if (server->vals->header_preamble_size == 0) + server->total_read = 0; + else + server->total_read = length; /* * The right amount was read from socket - 4 bytes, * so we can now interpret the length field. */ pdu_length = get_rfc1002_length(buf); - server->pdu_size = pdu_length; cifs_dbg(FYI, "RFC1002 header 0x%x\n", pdu_length); if (!is_smb_response(server, buf[0])) continue; +next_pdu: + server->pdu_size = pdu_length; /* make sure we have enough to get to the MID */ - if (pdu_length < HEADER_SIZE(server) - 1 - + if (server->pdu_size < HEADER_SIZE(server) - 1 - server->vals->header_preamble_size) { cifs_dbg(VFS, "SMB response too short (%u bytes)\n", - pdu_length); + server->pdu_size); cifs_reconnect(server); wake_up(&server->response_q); continue; @@ -906,6 +921,12 @@ cifs_demultiplex_thread(void *p) continue; server->total_read += length; + if (server->ops->next_header) { + next_offset = server->ops->next_header(buf); + if (next_offset) + server->pdu_size = next_offset; + } + if (server->ops->is_transform_hdr && server->ops->receive_transform && server->ops->is_transform_hdr(buf)) { @@ -948,10 +969,18 @@ cifs_demultiplex_thread(void *p) HEADER_SIZE(server)); #ifdef CONFIG_CIFS_DEBUG2 if (server->ops->dump_detail) - server->ops->dump_detail(buf); + server->ops->dump_detail(buf, server); cifs_dump_mids(server); #endif /* CIFS_DEBUG2 */ - + } + if (pdu_length > server->pdu_size) { + if (!allocate_buffers(server)) + continue; + pdu_length -= server->pdu_size; + server->total_read = 0; + server->large_buf = false; + buf = server->smallbuf; + goto next_pdu; } } /* end while !EXITING */ @@ -1143,10 +1172,18 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol) switch (match_token(value, cifs_smb_version_tokens, args)) { case Smb_1: + if (disable_legacy_dialects) { + cifs_dbg(VFS, "mount with legacy dialect disabled\n"); + return 1; + } vol->ops = &smb1_operations; vol->vals = &smb1_values; break; case Smb_20: + if (disable_legacy_dialects) { + cifs_dbg(VFS, "mount with legacy dialect disabled\n"); + return 1; + } vol->ops = &smb20_operations; vol->vals = &smb20_values; break; @@ -1426,8 +1463,17 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->posix_paths = 0; break; case Opt_nounix: + if (vol->linux_ext) + cifs_dbg(VFS, + "conflicting unix mount options\n"); vol->no_linux_ext = 1; break; + case Opt_unix: + if (vol->no_linux_ext) + cifs_dbg(VFS, + "conflicting unix mount options\n"); + vol->linux_ext = 1; + break; case Opt_nocase: vol->nocase = 1; break; @@ -1445,6 +1491,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, (S_IALLUGO & ~(S_ISUID | S_IXGRP))) vol->file_mode = S_IALLUGO; break; + case Opt_nohandlecache: + vol->nohandlecache = 1; + break; + case Opt_handlecache: + vol->nohandlecache = 0; + break; case Opt_forcemandatorylock: vol->mand_lock = 1; break; @@ -1977,14 +2029,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } -#ifdef CONFIG_CIFS_SMB_DIRECT - if (vol->rdma && vol->sign) { - cifs_dbg(VFS, "Currently SMB direct doesn't support signing." - " This is being fixed\n"); - goto cifs_parse_mount_err; - } -#endif - #ifndef CONFIG_KEYS /* Muliuser mounts require CONFIG_KEYS support */ if (vol->multiuser) { @@ -2975,6 +3019,13 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) } } +#ifdef CONFIG_CIFS_SMB311 + if ((volume_info->linux_ext) && (ses->server->posix_ext_supported)) { + if (ses->server->vals->protocol_id == SMB311_PROT_ID) + tcon->posix_extensions = true; + } +#endif /* 311 */ + /* * BB Do we need to wrap session_mutex around this TCon call and Unix * SetFS as we do on SessSetup and reconnect? @@ -3030,6 +3081,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) */ tcon->retry = volume_info->retry; tcon->nocase = volume_info->nocase; + tcon->nohandlecache = volume_info->nohandlecache; tcon->local_lease = volume_info->local_lease; INIT_LIST_HEAD(&tcon->pending_opens); @@ -3588,6 +3640,8 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL; if (pvolume_info->nobrl) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL; + if (pvolume_info->nohandlecache) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_HANDLE_CACHE; if (pvolume_info->nostrictsync) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOSSYNC; if (pvolume_info->mand_lock) @@ -3930,6 +3984,12 @@ try_mount_again: goto remote_path_check; } +#ifdef CONFIG_CIFS_SMB311 + /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */ + if (tcon->posix_extensions) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS; +#endif /* SMB3.11 */ + /* tell server which Unix caps we support */ if (cap_unix(tcon->ses)) { /* reset of caps checks mount to see if unix extensions @@ -4361,6 +4421,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) vol_info->UNC = master_tcon->treeName; vol_info->retry = master_tcon->retry; vol_info->nocase = master_tcon->nocase; + vol_info->nohandlecache = master_tcon->nohandlecache; vol_info->local_lease = master_tcon->local_lease; vol_info->no_linux_ext = !master_tcon->unix_ext; vol_info->sectype = master_tcon->ses->sectype; @@ -4390,8 +4451,14 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) goto out; } +#ifdef CONFIG_CIFS_SMB311 + /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */ + if (tcon->posix_extensions) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS; +#endif /* SMB3.11 */ if (cap_unix(ses)) reset_cifs_unix_caps(0, tcon, NULL, vol_info); + out: kfree(vol_info->username); kzfree(vol_info->password); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 925844343038..ddae52bd1993 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -369,7 +369,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, oparms.path = full_path; oparms.fid = fid; oparms.reconnect = false; - + oparms.mode = mode; rc = server->ops->open(xid, &oparms, oplock, buf); if (rc) { cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc); @@ -780,21 +780,25 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { free_xid(xid); - return (struct dentry *)tlink; + return ERR_CAST(tlink); } pTcon = tlink_tcon(tlink); rc = check_name(direntry, pTcon); - if (rc) - goto lookup_out; + if (unlikely(rc)) { + cifs_put_tlink(tlink); + free_xid(xid); + return ERR_PTR(rc); + } /* can not grab the rename sem here since it would deadlock in the cases (beginning of sys_rename itself) in which we already have the sb rename sem */ full_path = build_path_from_dentry(direntry); if (full_path == NULL) { - rc = -ENOMEM; - goto lookup_out; + cifs_put_tlink(tlink); + free_xid(xid); + return ERR_PTR(-ENOMEM); } if (d_really_is_positive(direntry)) { @@ -813,29 +817,25 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, parent_dir_inode->i_sb, xid, NULL); } - if ((rc == 0) && (newInode != NULL)) { - d_add(direntry, newInode); + if (rc == 0) { /* since paths are not looked up by component - the parent directories are presumed to be good here */ renew_parental_timestamps(direntry); - } else if (rc == -ENOENT) { - rc = 0; cifs_set_time(direntry, jiffies); - d_add(direntry, NULL); - /* if it was once a directory (but how can we tell?) we could do - shrink_dcache_parent(direntry); */ - } else if (rc != -EACCES) { - cifs_dbg(FYI, "Unexpected lookup error %d\n", rc); - /* We special case check for Access Denied - since that - is a common return code */ + newInode = NULL; + } else { + if (rc != -EACCES) { + cifs_dbg(FYI, "Unexpected lookup error %d\n", rc); + /* We special case check for Access Denied - since that + is a common return code */ + } + newInode = ERR_PTR(rc); } - -lookup_out: kfree(full_path); cifs_put_tlink(tlink); free_xid(xid); - return ERR_PTR(rc); + return d_splice_alias(newInode, direntry); } static int diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 23fd430fe74a..87eece6fbd48 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2880,13 +2880,13 @@ out: } static struct cifs_readdata * -cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete) +cifs_readdata_direct_alloc(struct page **pages, work_func_t complete) { struct cifs_readdata *rdata; - rdata = kzalloc(sizeof(*rdata) + (sizeof(struct page *) * nr_pages), - GFP_KERNEL); + rdata = kzalloc(sizeof(*rdata), GFP_KERNEL); if (rdata != NULL) { + rdata->pages = pages; kref_init(&rdata->refcount); INIT_LIST_HEAD(&rdata->list); init_completion(&rdata->done); @@ -2896,6 +2896,22 @@ cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete) return rdata; } +static struct cifs_readdata * +cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete) +{ + struct page **pages = + kzalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL); + struct cifs_readdata *ret = NULL; + + if (pages) { + ret = cifs_readdata_direct_alloc(pages, complete); + if (!ret) + kfree(pages); + } + + return ret; +} + void cifs_readdata_release(struct kref *refcount) { @@ -2910,6 +2926,7 @@ cifs_readdata_release(struct kref *refcount) if (rdata->cfile) cifsFileInfo_put(rdata->cfile); + kvfree(rdata->pages); kfree(rdata); } @@ -3009,12 +3026,20 @@ uncached_fill_pages(struct TCP_Server_Info *server, int result = 0; unsigned int i; unsigned int nr_pages = rdata->nr_pages; + unsigned int page_offset = rdata->page_offset; rdata->got_bytes = 0; rdata->tailsz = PAGE_SIZE; for (i = 0; i < nr_pages; i++) { struct page *page = rdata->pages[i]; size_t n; + unsigned int segment_size = rdata->pagesz; + + if (i == 0) + segment_size -= page_offset; + else + page_offset = 0; + if (len <= 0) { /* no need to hold page hostage */ @@ -3023,24 +3048,25 @@ uncached_fill_pages(struct TCP_Server_Info *server, put_page(page); continue; } + n = len; - if (len >= PAGE_SIZE) { + if (len >= segment_size) /* enough data to fill the page */ - n = PAGE_SIZE; - len -= n; - } else { - zero_user(page, len, PAGE_SIZE - len); + n = segment_size; + else rdata->tailsz = len; - len = 0; - } + len -= n; + if (iter) - result = copy_page_from_iter(page, 0, n, iter); + result = copy_page_from_iter( + page, page_offset, n, iter); #ifdef CONFIG_CIFS_SMB_DIRECT else if (rdata->mr) result = n; #endif else - result = cifs_read_page_from_socket(server, page, n); + result = cifs_read_page_from_socket( + server, page, page_offset, n); if (result < 0) break; @@ -3113,6 +3139,7 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, rdata->bytes = cur_len; rdata->pid = pid; rdata->pagesz = PAGE_SIZE; + rdata->tailsz = PAGE_SIZE; rdata->read_into_pages = cifs_uncached_read_into_pages; rdata->copy_into_pages = cifs_uncached_copy_into_pages; rdata->credits = credits; @@ -3557,6 +3584,7 @@ readpages_fill_pages(struct TCP_Server_Info *server, u64 eof; pgoff_t eof_index; unsigned int nr_pages = rdata->nr_pages; + unsigned int page_offset = rdata->page_offset; /* determine the eof that the server (probably) has */ eof = CIFS_I(rdata->mapping->host)->server_eof; @@ -3567,13 +3595,21 @@ readpages_fill_pages(struct TCP_Server_Info *server, rdata->tailsz = PAGE_SIZE; for (i = 0; i < nr_pages; i++) { struct page *page = rdata->pages[i]; - size_t n = PAGE_SIZE; + unsigned int to_read = rdata->pagesz; + size_t n; + + if (i == 0) + to_read -= page_offset; + else + page_offset = 0; + + n = to_read; - if (len >= PAGE_SIZE) { - len -= PAGE_SIZE; + if (len >= to_read) { + len -= to_read; } else if (len > 0) { /* enough for partial page, fill and zero the rest */ - zero_user(page, len, PAGE_SIZE - len); + zero_user(page, len + page_offset, to_read - len); n = rdata->tailsz = len; len = 0; } else if (page->index > eof_index) { @@ -3605,13 +3641,15 @@ readpages_fill_pages(struct TCP_Server_Info *server, } if (iter) - result = copy_page_from_iter(page, 0, n, iter); + result = copy_page_from_iter( + page, page_offset, n, iter); #ifdef CONFIG_CIFS_SMB_DIRECT else if (rdata->mr) result = n; #endif else - result = cifs_read_page_from_socket(server, page, n); + result = cifs_read_page_from_socket( + server, page, page_offset, n); if (result < 0) break; @@ -3790,6 +3828,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, rdata->bytes = bytes; rdata->pid = pid; rdata->pagesz = PAGE_SIZE; + rdata->tailsz = PAGE_SIZE; rdata->read_into_pages = cifs_readpages_read_into_pages; rdata->copy_into_pages = cifs_readpages_copy_into_pages; rdata->credits = credits; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 3c371f7f5963..745fd7fe8d0e 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -746,7 +746,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, cifs_dbg(FYI, "Getting info on %s\n", full_path); if ((data == NULL) && (*inode != NULL)) { - if (CIFS_CACHE_READ(CIFS_I(*inode))) { + if (CIFS_CACHE_READ(CIFS_I(*inode)) && + CIFS_I(*inode)->time != 0) { cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); goto cgii_exit; } @@ -1857,15 +1858,15 @@ cifs_inode_needs_reval(struct inode *inode) struct cifsInodeInfo *cifs_i = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + if (cifs_i->time == 0) + return true; + if (CIFS_CACHE_READ(cifs_i)) return false; if (!lookupCacheEnabled) return true; - if (cifs_i->time == 0) - return true; - if (!cifs_sb->actimeo) return true; @@ -2104,10 +2105,14 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from) static void cifs_setsize(struct inode *inode, loff_t offset) { + struct cifsInodeInfo *cifs_i = CIFS_I(inode); + spin_lock(&inode->i_lock); i_size_write(inode, offset); spin_unlock(&inode->i_lock); + /* Cached inode must be refreshed on truncate */ + cifs_i->time = 0; truncate_pagecache(inode, offset); } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 460084a8eac5..aba3fc3058da 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -117,6 +117,8 @@ tconInfoAlloc(void) INIT_LIST_HEAD(&ret_buf->openFileList); INIT_LIST_HEAD(&ret_buf->tcon_list); spin_lock_init(&ret_buf->open_file_lock); + mutex_init(&ret_buf->prfid_mutex); + ret_buf->prfid = kzalloc(sizeof(struct cifs_fid), GFP_KERNEL); #ifdef CONFIG_CIFS_STATS spin_lock_init(&ret_buf->stat_lock); #endif @@ -134,6 +136,7 @@ tconInfoFree(struct cifs_tcon *buf_to_free) atomic_dec(&tconInfoAllocCount); kfree(buf_to_free->nativeFileSystem); kzfree(buf_to_free->password); + kfree(buf_to_free->prfid); kfree(buf_to_free); } @@ -145,7 +148,7 @@ cifs_buf_get(void) * SMB2 header is bigger than CIFS one - no problems to clean some * more bytes for CIFS. */ - size_t buf_size = sizeof(struct smb2_hdr); + size_t buf_size = sizeof(struct smb2_sync_hdr); /* * We could use negotiated size instead of max_msgsize - @@ -339,7 +342,7 @@ checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server) /* otherwise, there is enough to get to the BCC */ if (check_smb_hdr(smb)) return -EIO; - clc_len = smbCalcSize(smb); + clc_len = smbCalcSize(smb, server); if (4 + rfclen != total_read) { cifs_dbg(VFS, "Length read does not match RFC1001 length %d\n", diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index cc88f4f0325e..d7ad0dfe4e68 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -903,7 +903,7 @@ map_smb_to_linux_error(char *buf, bool logErr) * portion, the number of word parameters and the data portion of the message */ unsigned int -smbCalcSize(void *buf) +smbCalcSize(void *buf, struct TCP_Server_Info *server) { struct smb_hdr *ptr = (struct smb_hdr *)buf; return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index a27fc8791551..eeab81c9452f 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -650,7 +650,8 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, char *cur_ent; char *end_of_smb = cfile->srch_inf.ntwrk_buf_start + server->ops->calc_smb_size( - cfile->srch_inf.ntwrk_buf_start); + cfile->srch_inf.ntwrk_buf_start, + server); cur_ent = cfile->srch_inf.srch_entries_start; first_entry_in_buffer = cfile->srch_inf.index_of_last_entry @@ -831,7 +832,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n", num_to_fill, cifsFile->srch_inf.ntwrk_buf_start); max_len = tcon->ses->server->ops->calc_smb_size( - cifsFile->srch_inf.ntwrk_buf_start); + cifsFile->srch_inf.ntwrk_buf_start, + tcon->ses->server); end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL); diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index 401a5d856636..0ffa18094335 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -61,9 +61,4 @@ /* Maximum buffer size value we can send with 1 credit */ #define SMB2_MAX_BUFFER_SIZE 65536 -static inline struct smb2_sync_hdr *get_sync_hdr(void *buf) -{ - return &(((struct smb2_hdr *)buf)->sync_hdr); -} - #endif /* _SMB2_GLOB_H */ diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 1238cd3552f9..a6e786e39248 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -44,26 +44,38 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, __u32 create_options, void *data, int command) { int rc, tmprc = 0; - __le16 *utf16_path; + __le16 *utf16_path = NULL; __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_open_parms oparms; struct cifs_fid fid; + bool use_cached_root_handle = false; - utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); - if (!utf16_path) - return -ENOMEM; + if ((strcmp(full_path, "") == 0) && (create_options == 0) && + (desired_access == FILE_READ_ATTRIBUTES) && + (create_disposition == FILE_OPEN) && + (tcon->nohandlecache == false)) { + rc = open_shroot(xid, tcon, &fid); + if (rc == 0) + use_cached_root_handle = true; + } + + if (use_cached_root_handle == false) { + utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); + if (!utf16_path) + return -ENOMEM; - oparms.tcon = tcon; - oparms.desired_access = desired_access; - oparms.disposition = create_disposition; - oparms.create_options = create_options; - oparms.fid = &fid; - oparms.reconnect = false; + oparms.tcon = tcon; + oparms.desired_access = desired_access; + oparms.disposition = create_disposition; + oparms.create_options = create_options; + oparms.fid = &fid; + oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); - if (rc) { - kfree(utf16_path); - return rc; + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); + if (rc) { + kfree(utf16_path); + return rc; + } } switch (command) { @@ -107,7 +119,8 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, break; } - rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + if (use_cached_root_handle == false) + rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); if (tmprc) rc = tmprc; kfree(utf16_path); diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 3bfc9c990724..20a2d304c603 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -27,6 +27,7 @@ #include "smb2proto.h" #include "smb2status.h" #include "smb2glob.h" +#include "trace.h" struct status_to_posix_error { __le32 smb2_status; @@ -2450,13 +2451,16 @@ smb2_print_status(__le32 status) int map_smb2_to_linux_error(char *buf, bool log_err) { - struct smb2_sync_hdr *shdr = get_sync_hdr(buf); + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; unsigned int i; int rc = -EIO; __le32 smb2err = shdr->Status; - if (smb2err == 0) + if (smb2err == 0) { + trace_smb3_cmd_done(shdr->TreeId, shdr->SessionId, + le16_to_cpu(shdr->Command), le64_to_cpu(shdr->MessageId)); return 0; + } /* mask facility */ if (log_err && (smb2err != STATUS_MORE_PROCESSING_REQUIRED) && @@ -2478,5 +2482,8 @@ map_smb2_to_linux_error(char *buf, bool log_err) cifs_dbg(FYI, "Mapping SMB2 status code 0x%08x to POSIX err %d\n", __le32_to_cpu(smb2err), rc); + trace_smb3_cmd_err(shdr->TreeId, shdr->SessionId, + le16_to_cpu(shdr->Command), + le64_to_cpu(shdr->MessageId), le32_to_cpu(smb2err), rc); return rc; } diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 68ea8491c160..cb5728e3d87d 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -94,8 +94,8 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { }; #ifdef CONFIG_CIFS_SMB311 -static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, __u32 non_ctxlen, - size_t hdr_preamble_size) +static __u32 get_neg_ctxt_len(struct smb2_sync_hdr *hdr, __u32 len, + __u32 non_ctxlen) { __u16 neg_count; __u32 nc_offset, size_of_pad_before_neg_ctxts; @@ -109,12 +109,11 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, __u32 non_ctxlen, /* Make sure that negotiate contexts start after gss security blob */ nc_offset = le32_to_cpu(pneg_rsp->NegotiateContextOffset); - if (nc_offset < non_ctxlen - hdr_preamble_size /* RFC1001 len */) { + if (nc_offset < non_ctxlen) { printk_once(KERN_WARNING "invalid negotiate context offset\n"); return 0; } - size_of_pad_before_neg_ctxts = nc_offset - - (non_ctxlen - hdr_preamble_size); + size_of_pad_before_neg_ctxts = nc_offset - non_ctxlen; /* Verify that at least minimal negotiate contexts fit within frame */ if (len < nc_offset + (neg_count * sizeof(struct smb2_neg_context))) { @@ -131,25 +130,20 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, __u32 non_ctxlen, #endif /* CIFS_SMB311 */ int -smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) +smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) { - struct smb2_pdu *pdu = (struct smb2_pdu *)buf; - struct smb2_hdr *hdr = &pdu->hdr; - struct smb2_sync_hdr *shdr = get_sync_hdr(buf); + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)shdr; __u64 mid; - __u32 len = get_rfc1002_length(buf); __u32 clc_len; /* calculated length */ int command; - - /* BB disable following printk later */ - cifs_dbg(FYI, "%s length: 0x%x, smb_buf_length: 0x%x\n", - __func__, length, len); + int pdu_size = sizeof(struct smb2_sync_pdu); + int hdr_size = sizeof(struct smb2_sync_hdr); /* * Add function to do table lookup of StructureSize by command * ie Validate the wct via smb2_struct_sizes table above */ - if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { struct smb2_transform_hdr *thdr = (struct smb2_transform_hdr *)buf; @@ -173,8 +167,8 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) } mid = le64_to_cpu(shdr->MessageId); - if (length < sizeof(struct smb2_pdu)) { - if ((length >= sizeof(struct smb2_hdr)) + if (len < pdu_size) { + if ((len >= hdr_size) && (shdr->Status != 0)) { pdu->StructureSize2 = 0; /* @@ -187,8 +181,7 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) } return 1; } - if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE - - srvr->vals->header_preamble_size) { + if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE) { cifs_dbg(VFS, "SMB length greater than maximum, mid=%llu\n", mid); return 1; @@ -227,44 +220,38 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) } } - if (srvr->vals->header_preamble_size + len != length) { - cifs_dbg(VFS, "Total length %u RFC1002 length %zu mismatch mid %llu\n", - length, srvr->vals->header_preamble_size + len, mid); - return 1; - } - - clc_len = smb2_calc_size(hdr); + clc_len = smb2_calc_size(buf, srvr); #ifdef CONFIG_CIFS_SMB311 if (shdr->Command == SMB2_NEGOTIATE) - clc_len += get_neg_ctxt_len(hdr, len, clc_len, - srvr->vals->header_preamble_size); + clc_len += get_neg_ctxt_len(shdr, len, clc_len); #endif /* SMB311 */ - if (srvr->vals->header_preamble_size + len != clc_len) { - cifs_dbg(FYI, "Calculated size %u length %zu mismatch mid %llu\n", - clc_len, srvr->vals->header_preamble_size + len, mid); + if (len != clc_len) { + cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n", + clc_len, len, mid); /* create failed on symlink */ if (command == SMB2_CREATE_HE && shdr->Status == STATUS_STOPPED_ON_SYMLINK) return 0; /* Windows 7 server returns 24 bytes more */ - if (clc_len + 24 - srvr->vals->header_preamble_size == len && command == SMB2_OPLOCK_BREAK_HE) + if (clc_len + 24 == len && command == SMB2_OPLOCK_BREAK_HE) return 0; /* server can return one byte more due to implied bcc[0] */ - if (clc_len == srvr->vals->header_preamble_size + len + 1) + if (clc_len == len + 1) return 0; /* * MacOS server pads after SMB2.1 write response with 3 bytes * of junk. Other servers match RFC1001 len to actual * SMB2/SMB3 frame length (header + smb2 response specific data) + * Some windows servers do too when compounding is used. * Log the server error (once), but allow it and continue * since the frame is parseable. */ - if (clc_len < srvr->vals->header_preamble_size /* RFC1001 header size */ + len) { + if (clc_len < len) { printk_once(KERN_WARNING - "SMB2 server sent bad RFC1001 len %d not %zu\n", - len, clc_len - srvr->vals->header_preamble_size); + "SMB2 server sent bad RFC1001 len %d not %d\n", + len, clc_len); return 0; } @@ -305,15 +292,14 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = { * area and the offset to it (from the beginning of the smb are also returned. */ char * -smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) +smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr) { - struct smb2_sync_hdr *shdr = get_sync_hdr(hdr); *off = 0; *len = 0; /* error responses do not have data area */ if (shdr->Status && shdr->Status != STATUS_MORE_PROCESSING_REQUIRED && - (((struct smb2_err_rsp *)hdr)->StructureSize) == + (((struct smb2_err_rsp *)shdr)->StructureSize) == SMB2_ERROR_STRUCTURE_SIZE2) return NULL; @@ -325,42 +311,44 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) switch (shdr->Command) { case SMB2_NEGOTIATE: *off = le16_to_cpu( - ((struct smb2_negotiate_rsp *)hdr)->SecurityBufferOffset); + ((struct smb2_negotiate_rsp *)shdr)->SecurityBufferOffset); *len = le16_to_cpu( - ((struct smb2_negotiate_rsp *)hdr)->SecurityBufferLength); + ((struct smb2_negotiate_rsp *)shdr)->SecurityBufferLength); break; case SMB2_SESSION_SETUP: *off = le16_to_cpu( - ((struct smb2_sess_setup_rsp *)hdr)->SecurityBufferOffset); + ((struct smb2_sess_setup_rsp *)shdr)->SecurityBufferOffset); *len = le16_to_cpu( - ((struct smb2_sess_setup_rsp *)hdr)->SecurityBufferLength); + ((struct smb2_sess_setup_rsp *)shdr)->SecurityBufferLength); break; case SMB2_CREATE: *off = le32_to_cpu( - ((struct smb2_create_rsp *)hdr)->CreateContextsOffset); + ((struct smb2_create_rsp *)shdr)->CreateContextsOffset); *len = le32_to_cpu( - ((struct smb2_create_rsp *)hdr)->CreateContextsLength); + ((struct smb2_create_rsp *)shdr)->CreateContextsLength); break; case SMB2_QUERY_INFO: *off = le16_to_cpu( - ((struct smb2_query_info_rsp *)hdr)->OutputBufferOffset); + ((struct smb2_query_info_rsp *)shdr)->OutputBufferOffset); *len = le32_to_cpu( - ((struct smb2_query_info_rsp *)hdr)->OutputBufferLength); + ((struct smb2_query_info_rsp *)shdr)->OutputBufferLength); break; case SMB2_READ: - *off = ((struct smb2_read_rsp *)hdr)->DataOffset; - *len = le32_to_cpu(((struct smb2_read_rsp *)hdr)->DataLength); + /* TODO: is this a bug ? */ + *off = ((struct smb2_read_rsp *)shdr)->DataOffset; + *len = le32_to_cpu(((struct smb2_read_rsp *)shdr)->DataLength); break; case SMB2_QUERY_DIRECTORY: *off = le16_to_cpu( - ((struct smb2_query_directory_rsp *)hdr)->OutputBufferOffset); + ((struct smb2_query_directory_rsp *)shdr)->OutputBufferOffset); *len = le32_to_cpu( - ((struct smb2_query_directory_rsp *)hdr)->OutputBufferLength); + ((struct smb2_query_directory_rsp *)shdr)->OutputBufferLength); break; case SMB2_IOCTL: *off = le32_to_cpu( - ((struct smb2_ioctl_rsp *)hdr)->OutputOffset); - *len = le32_to_cpu(((struct smb2_ioctl_rsp *)hdr)->OutputCount); + ((struct smb2_ioctl_rsp *)shdr)->OutputOffset); + *len = le32_to_cpu( + ((struct smb2_ioctl_rsp *)shdr)->OutputCount); break; case SMB2_CHANGE_NOTIFY: default: @@ -403,15 +391,14 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) * portion, the number of word parameters and the data portion of the message. */ unsigned int -smb2_calc_size(void *buf) +smb2_calc_size(void *buf, struct TCP_Server_Info *srvr) { - struct smb2_pdu *pdu = (struct smb2_pdu *)buf; - struct smb2_hdr *hdr = &pdu->hdr; - struct smb2_sync_hdr *shdr = get_sync_hdr(hdr); + struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)buf; + struct smb2_sync_hdr *shdr = &pdu->sync_hdr; int offset; /* the offset from the beginning of SMB to data area */ int data_length; /* the length of the variable length data area */ /* Structure Size has already been checked to make sure it is 64 */ - int len = 4 + le16_to_cpu(shdr->StructureSize); + int len = le16_to_cpu(shdr->StructureSize); /* * StructureSize2, ie length of fixed parameter area has already @@ -422,7 +409,7 @@ smb2_calc_size(void *buf) if (has_smb2_data_area[le16_to_cpu(shdr->Command)] == false) goto calc_size_exit; - smb2_get_data_area_len(&offset, &data_length, hdr); + smb2_get_data_area_len(&offset, &data_length, shdr); cifs_dbg(FYI, "SMB2 data length %d offset %d\n", data_length, offset); if (data_length > 0) { @@ -430,15 +417,14 @@ smb2_calc_size(void *buf) * Check to make sure that data area begins after fixed area, * Note that last byte of the fixed area is part of data area * for some commands, typically those with odd StructureSize, - * so we must add one to the calculation (and 4 to account for - * the size of the RFC1001 hdr. + * so we must add one to the calculation. */ - if (offset + 4 + 1 < len) { + if (offset + 1 < len) { cifs_dbg(VFS, "data area offset %d overlaps SMB2 header %d\n", - offset + 4 + 1, len); + offset + 1, len); data_length = 0; } else { - len = 4 + offset + data_length; + len = offset + data_length; } } calc_size_exit: @@ -465,8 +451,14 @@ cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb) /* Windows doesn't allow paths beginning with \ */ if (from[0] == '\\') start_of_path = from + 1; +#ifdef CONFIG_CIFS_SMB311 + /* SMB311 POSIX extensions paths do not include leading slash */ + else if (cifs_sb_master_tcon(cifs_sb)->posix_extensions) + start_of_path = from + 1; +#endif /* 311 */ else start_of_path = from; + to = cifs_strndup_to_utf16(start_of_path, PATH_MAX, &len, cifs_sb->local_nls, map_type); return to; @@ -621,7 +613,7 @@ smb2_is_valid_lease_break(char *buffer) bool smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) { - struct smb2_oplock_break_rsp *rsp = (struct smb2_oplock_break_rsp *)buffer; + struct smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer; struct list_head *tmp, *tmp1, *tmp2; struct cifs_ses *ses; struct cifs_tcon *tcon; @@ -630,7 +622,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) cifs_dbg(FYI, "Checking for oplock break\n"); - if (rsp->hdr.sync_hdr.Command != SMB2_OPLOCK_BREAK) + if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK) return false; if (rsp->StructureSize != @@ -721,7 +713,7 @@ smb2_cancelled_close_fid(struct work_struct *work) int smb2_handle_cancelled_mid(char *buffer, struct TCP_Server_Info *server) { - struct smb2_sync_hdr *sync_hdr = get_sync_hdr(buffer); + struct smb2_sync_hdr *sync_hdr = (struct smb2_sync_hdr *)buffer; struct smb2_create_rsp *rsp = (struct smb2_create_rsp *)buffer; struct cifs_tcon *tcon; struct close_cancelled_open *cancelled; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index b76b85881dcc..950d0ab2cc61 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -123,7 +123,7 @@ smb2_get_credits_field(struct TCP_Server_Info *server, const int optype) static unsigned int smb2_get_credits(struct mid_q_entry *mid) { - struct smb2_sync_hdr *shdr = get_sync_hdr(mid->resp_buf); + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)mid->resp_buf; return le16_to_cpu(shdr->CreditRequest); } @@ -190,7 +190,7 @@ static struct mid_q_entry * smb2_find_mid(struct TCP_Server_Info *server, char *buf) { struct mid_q_entry *mid; - struct smb2_sync_hdr *shdr = get_sync_hdr(buf); + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; __u64 wire_mid = le64_to_cpu(shdr->MessageId); if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { @@ -212,15 +212,16 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf) } static void -smb2_dump_detail(void *buf) +smb2_dump_detail(void *buf, struct TCP_Server_Info *server) { #ifdef CONFIG_CIFS_DEBUG2 - struct smb2_sync_hdr *shdr = get_sync_hdr(buf); + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n", shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId, shdr->ProcessId); - cifs_dbg(VFS, "smb buf %p len %u\n", buf, smb2_calc_size(buf)); + cifs_dbg(VFS, "smb buf %p len %u\n", buf, + server->ops->calc_smb_size(buf, server)); #endif } @@ -322,6 +323,40 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) } #endif /* STATS2 */ +/* + * Open the directory at the root of a share + */ +int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) +{ + struct cifs_open_parms oparams; + int rc; + __le16 srch_path = 0; /* Null - since an open of top of share */ + u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + + mutex_lock(&tcon->prfid_mutex); + if (tcon->valid_root_fid) { + cifs_dbg(FYI, "found a cached root file handle\n"); + memcpy(pfid, tcon->prfid, sizeof(struct cifs_fid)); + mutex_unlock(&tcon->prfid_mutex); + return 0; + } + + oparams.tcon = tcon; + oparams.create_options = 0; + oparams.desired_access = FILE_READ_ATTRIBUTES; + oparams.disposition = FILE_OPEN; + oparams.fid = pfid; + oparams.reconnect = false; + + rc = SMB2_open(xid, &oparams, &srch_path, &oplock, NULL, NULL); + if (rc == 0) { + memcpy(tcon->prfid, pfid, sizeof(struct cifs_fid)); + tcon->valid_root_fid = true; + } + mutex_unlock(&tcon->prfid_mutex); + return rc; +} + static void smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) { @@ -330,6 +365,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_open_parms oparms; struct cifs_fid fid; + bool no_cached_open = tcon->nohandlecache; oparms.tcon = tcon; oparms.desired_access = FILE_READ_ATTRIBUTES; @@ -338,7 +374,11 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL); + if (no_cached_open) + rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL); + else + rc = open_shroot(xid, tcon, &fid); + if (rc) return; @@ -352,7 +392,8 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) FS_DEVICE_INFORMATION); SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, FS_SECTOR_SIZE_INFORMATION); /* SMB3 specific */ - SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + if (no_cached_open) + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); return; } @@ -394,6 +435,9 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_fid fid; + if ((*full_path == 0) && tcon->valid_root_fid) + return 0; + utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); if (!utf16_path) return -ENOMEM; @@ -589,9 +633,15 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + /* + * If ea_name is NULL (listxattr) and there are no EAs, return 0 as it's + * not an error. Otherwise, the specified ea_name was not found. + */ if (!rc) rc = move_smb2_ea_to_cifs(ea_data, buf_size, smb2_data, SMB2_MAX_EA_BUF, ea_name); + else if (!ea_name && rc == -ENODATA) + rc = 0; kfree(smb2_data); return rc; @@ -698,9 +748,11 @@ smb2_dump_share_caps(struct seq_file *m, struct cifs_tcon *tcon) seq_puts(m, " TRIM-support,"); seq_printf(m, "\tShare Flags: 0x%x", tcon->share_flags); + seq_printf(m, "\n\ttid: 0x%x", tcon->tid); if (tcon->perf_sector_size) seq_printf(m, "\tOptimal sector size: 0x%x", tcon->perf_sector_size); + seq_printf(m, "\tMaximal Access: 0x%x", tcon->maximal_access); } static void @@ -1251,7 +1303,7 @@ smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon, static bool smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length) { - struct smb2_sync_hdr *shdr = get_sync_hdr(buf); + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; if (shdr->Status != STATUS_PENDING) return false; @@ -1269,12 +1321,13 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length) static bool smb2_is_session_expired(char *buf) { - struct smb2_sync_hdr *shdr = get_sync_hdr(buf); + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; - if (shdr->Status != STATUS_NETWORK_SESSION_EXPIRED) + if (shdr->Status != STATUS_NETWORK_SESSION_EXPIRED && + shdr->Status != STATUS_USER_SESSION_DELETED) return false; - cifs_dbg(FYI, "Session expired\n"); + cifs_dbg(FYI, "Session expired or deleted\n"); return true; } @@ -1468,8 +1521,6 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, unsigned int sub_offset; unsigned int print_len; unsigned int print_offset; - struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = ses->server; cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); @@ -1493,7 +1544,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, err_buf = err_iov.iov_base; if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) || - err_iov.iov_len + server->vals->header_preamble_size < SMB2_SYMLINK_STRUCT_SIZE) { + err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE) { kfree(utf16_path); return -ENOENT; } @@ -1506,14 +1557,13 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, print_len = le16_to_cpu(symlink->PrintNameLength); print_offset = le16_to_cpu(symlink->PrintNameOffset); - if (err_iov.iov_len + server->vals->header_preamble_size < - SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) { + if (err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) { kfree(utf16_path); return -ENOENT; } - if (err_iov.iov_len + server->vals->header_preamble_size < - SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) { + if (err_iov.iov_len < + SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) { kfree(utf16_path); return -ENOENT; } @@ -1587,8 +1637,11 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb, oparms.create_options = 0; utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); - if (!utf16_path) - return ERR_PTR(-ENOMEM); + if (!utf16_path) { + rc = -ENOMEM; + free_xid(xid); + return ERR_PTR(rc); + } oparms.tcon = tcon; oparms.desired_access = READ_CONTROL; @@ -1646,8 +1699,11 @@ set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen, access_flags = WRITE_DAC; utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); - if (!utf16_path) - return -ENOMEM; + if (!utf16_path) { + rc = -ENOMEM; + free_xid(xid); + return rc; + } oparms.tcon = tcon; oparms.desired_access = access_flags; @@ -1707,15 +1763,21 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, /* if file not oplocked can't be sure whether asking to extend size */ if (!CIFS_CACHE_READ(cifsi)) - if (keep_size == false) - return -EOPNOTSUPP; + if (keep_size == false) { + rc = -EOPNOTSUPP; + free_xid(xid); + return rc; + } /* * Must check if file sparse since fallocate -z (zero range) assumes * non-sparse allocation */ - if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)) - return -EOPNOTSUPP; + if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)) { + rc = -EOPNOTSUPP; + free_xid(xid); + return rc; + } /* * need to make sure we are not asked to extend the file since the SMB3 @@ -1724,8 +1786,11 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, * which for a non sparse file would zero the newly extended range */ if (keep_size == false) - if (i_size_read(inode) < offset + len) - return -EOPNOTSUPP; + if (i_size_read(inode) < offset + len) { + rc = -EOPNOTSUPP; + free_xid(xid); + return rc; + } cifs_dbg(FYI, "offset %lld len %lld", offset, len); @@ -1758,8 +1823,11 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, /* Need to make file sparse, if not already, before freeing range. */ /* Consider adding equivalent for compressed since it could also work */ - if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse)) - return -EOPNOTSUPP; + if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse)) { + rc = -EOPNOTSUPP; + free_xid(xid); + return rc; + } cifs_dbg(FYI, "offset %lld len %lld", offset, len); @@ -1790,8 +1858,10 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, /* if file not oplocked can't be sure whether asking to extend size */ if (!CIFS_CACHE_READ(cifsi)) - if (keep_size == false) - return -EOPNOTSUPP; + if (keep_size == false) { + free_xid(xid); + return rc; + } /* * Files are non-sparse by default so falloc may be a no-op @@ -1800,14 +1870,16 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, */ if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0) { if (keep_size == true) - return 0; + rc = 0; /* check if extending file */ else if (i_size_read(inode) >= off + len) /* not extending file and already not sparse */ - return 0; + rc = 0; /* BB: in future add else clause to extend file */ else - return -EOPNOTSUPP; + rc = -EOPNOTSUPP; + free_xid(xid); + return rc; } if ((keep_size == true) || (i_size_read(inode) >= off + len)) { @@ -1819,8 +1891,11 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, * ie potentially making a few extra pages at the beginning * or end of the file non-sparse via set_sparse is harmless. */ - if ((off > 8192) || (off + len + 8192 < i_size_read(inode))) - return -EOPNOTSUPP; + if ((off > 8192) || (off + len + 8192 < i_size_read(inode))) { + rc = -EOPNOTSUPP; + free_xid(xid); + return rc; + } rc = smb2_set_sparse(xid, tcon, cfile, inode, false); } @@ -2029,7 +2104,7 @@ smb3_create_lease_buf(u8 *lease_key, u8 oplock) } static __u8 -smb2_parse_lease_buf(void *buf, unsigned int *epoch) +smb2_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key) { struct create_lease *lc = (struct create_lease *)buf; @@ -2040,13 +2115,16 @@ smb2_parse_lease_buf(void *buf, unsigned int *epoch) } static __u8 -smb3_parse_lease_buf(void *buf, unsigned int *epoch) +smb3_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key) { struct create_lease_v2 *lc = (struct create_lease_v2 *)buf; *epoch = le16_to_cpu(lc->lcontext.Epoch); if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS) return SMB2_OPLOCK_LEVEL_NOCHANGE; + if (lease_key) + memcpy(lease_key, &lc->lcontext.LeaseKeyLow, + SMB2_LEASE_KEY_SIZE); return le32_to_cpu(lc->lcontext.LeaseState); } @@ -2064,12 +2142,11 @@ smb2_dir_needs_close(struct cifsFileInfo *cfile) } static void -fill_transform_hdr(struct TCP_Server_Info *server, - struct smb2_transform_hdr *tr_hdr, struct smb_rqst *old_rq) +fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, + struct smb_rqst *old_rq) { struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)old_rq->rq_iov[1].iov_base; - unsigned int orig_len = get_rfc1002_length(old_rq->rq_iov[0].iov_base); memset(tr_hdr, 0, sizeof(struct smb2_transform_hdr)); tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM; @@ -2077,8 +2154,6 @@ fill_transform_hdr(struct TCP_Server_Info *server, tr_hdr->Flags = cpu_to_le16(0x01); get_random_bytes(&tr_hdr->Nonce, SMB3_AES128CMM_NONCE); memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8); - inc_rfc1001_len(tr_hdr, sizeof(struct smb2_transform_hdr) - server->vals->header_preamble_size); - inc_rfc1001_len(tr_hdr, orig_len); } /* We can not use the normal sg_set_buf() as we will sometimes pass a @@ -2090,11 +2165,16 @@ static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf, sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); } +/* Assumes: + * rqst->rq_iov[0] is rfc1002 length + * rqst->rq_iov[1] is tranform header + * rqst->rq_iov[2+] data to be encrypted/decrypted + */ static struct scatterlist * init_sg(struct smb_rqst *rqst, u8 *sign) { - unsigned int sg_len = rqst->rq_nvec + rqst->rq_npages + 1; - unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 24; + unsigned int sg_len = rqst->rq_nvec + rqst->rq_npages; + unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20; struct scatterlist *sg; unsigned int i; unsigned int j; @@ -2104,10 +2184,10 @@ init_sg(struct smb_rqst *rqst, u8 *sign) return NULL; sg_init_table(sg, sg_len); - smb2_sg_set_buf(&sg[0], rqst->rq_iov[0].iov_base + 24, assoc_data_len); - for (i = 1; i < rqst->rq_nvec; i++) - smb2_sg_set_buf(&sg[i], rqst->rq_iov[i].iov_base, - rqst->rq_iov[i].iov_len); + smb2_sg_set_buf(&sg[0], rqst->rq_iov[1].iov_base + 20, assoc_data_len); + for (i = 1; i < rqst->rq_nvec - 1; i++) + smb2_sg_set_buf(&sg[i], rqst->rq_iov[i+1].iov_base, + rqst->rq_iov[i+1].iov_len); for (j = 0; i < sg_len - 1; i++, j++) { unsigned int len = (j < rqst->rq_npages - 1) ? rqst->rq_pagesz : rqst->rq_tailsz; @@ -2139,9 +2219,10 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) } /* * Encrypt or decrypt @rqst message. @rqst has the following format: - * iov[0] - transform header (associate data), - * iov[1-N] and pages - data to encrypt. - * On success return encrypted data in iov[1-N] and pages, leave iov[0] + * iov[0] - rfc1002 length + * iov[1] - transform header (associate data), + * iov[2-N] and pages - data to encrypt. + * On success return encrypted data in iov[2-N] and pages, leave iov[0-1] * untouched. */ static int @@ -2149,7 +2230,7 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc) { struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)rqst->rq_iov[0].iov_base; - unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20 - server->vals->header_preamble_size; + unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20; int rc = 0; struct scatterlist *sg; u8 sign[SMB2_SIGNATURE_SIZE] = {}; @@ -2236,6 +2317,10 @@ free_req: return rc; } +/* + * This is called from smb_send_rqst. At this point we have the rfc1002 + * header as the first element in the vector. + */ static int smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq, struct smb_rqst *old_rq) @@ -2244,6 +2329,7 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq, struct page **pages; struct smb2_transform_hdr *tr_hdr; unsigned int npages = old_rq->rq_npages; + unsigned int orig_len = get_rfc1002_length(old_rq->rq_iov[0].iov_base); int i; int rc = -ENOMEM; @@ -2262,24 +2348,34 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq, goto err_free_pages; } - iov = kmalloc_array(old_rq->rq_nvec, sizeof(struct kvec), GFP_KERNEL); + /* Make space for one extra iov to hold the transform header */ + iov = kmalloc_array(old_rq->rq_nvec + 1, sizeof(struct kvec), + GFP_KERNEL); if (!iov) goto err_free_pages; /* copy all iovs from the old except the 1st one (rfc1002 length) */ - memcpy(&iov[1], &old_rq->rq_iov[1], + memcpy(&iov[2], &old_rq->rq_iov[1], sizeof(struct kvec) * (old_rq->rq_nvec - 1)); + /* copy the rfc1002 iov */ + iov[0].iov_base = old_rq->rq_iov[0].iov_base; + iov[0].iov_len = old_rq->rq_iov[0].iov_len; + new_rq->rq_iov = iov; - new_rq->rq_nvec = old_rq->rq_nvec; + new_rq->rq_nvec = old_rq->rq_nvec + 1; tr_hdr = kmalloc(sizeof(struct smb2_transform_hdr), GFP_KERNEL); if (!tr_hdr) goto err_free_iov; - /* fill the 1st iov with a transform header */ - fill_transform_hdr(server, tr_hdr, old_rq); - new_rq->rq_iov[0].iov_base = tr_hdr; - new_rq->rq_iov[0].iov_len = sizeof(struct smb2_transform_hdr); + /* fill the 2nd iov with a transform header */ + fill_transform_hdr(tr_hdr, orig_len, old_rq); + new_rq->rq_iov[1].iov_base = tr_hdr; + new_rq->rq_iov[1].iov_len = sizeof(struct smb2_transform_hdr); + + /* Update rfc1002 header */ + inc_rfc1001_len(new_rq->rq_iov[0].iov_base, + sizeof(struct smb2_transform_hdr)); /* copy pages form the old */ for (i = 0; i < npages; i++) { @@ -2319,7 +2415,7 @@ smb3_free_transform_rq(struct smb_rqst *rqst) put_page(rqst->rq_pages[i]); kfree(rqst->rq_pages); /* free transform header */ - kfree(rqst->rq_iov[0].iov_base); + kfree(rqst->rq_iov[1].iov_base); kfree(rqst->rq_iov); } @@ -2336,18 +2432,19 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, unsigned int buf_data_size, struct page **pages, unsigned int npages, unsigned int page_data_size) { - struct kvec iov[2]; + struct kvec iov[3]; struct smb_rqst rqst = {NULL}; - struct smb2_hdr *hdr; int rc; - iov[0].iov_base = buf; - iov[0].iov_len = sizeof(struct smb2_transform_hdr); - iov[1].iov_base = buf + sizeof(struct smb2_transform_hdr); - iov[1].iov_len = buf_data_size; + iov[0].iov_base = NULL; + iov[0].iov_len = 0; + iov[1].iov_base = buf; + iov[1].iov_len = sizeof(struct smb2_transform_hdr); + iov[2].iov_base = buf + sizeof(struct smb2_transform_hdr); + iov[2].iov_len = buf_data_size; rqst.rq_iov = iov; - rqst.rq_nvec = 2; + rqst.rq_nvec = 3; rqst.rq_pages = pages; rqst.rq_npages = npages; rqst.rq_pagesz = PAGE_SIZE; @@ -2359,10 +2456,9 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, if (rc) return rc; - memmove(buf + server->vals->header_preamble_size, iov[1].iov_base, buf_data_size); - hdr = (struct smb2_hdr *)buf; - hdr->smb2_buf_length = cpu_to_be32(buf_data_size + page_data_size); - server->total_read = buf_data_size + page_data_size + server->vals->header_preamble_size; + memmove(buf, iov[2].iov_base, buf_data_size); + + server->total_read = buf_data_size + page_data_size; return rc; } @@ -2387,7 +2483,7 @@ read_data_into_pages(struct TCP_Server_Info *server, struct page **pages, zero_user(page, len, PAGE_SIZE - len); len = 0; } - length = cifs_read_page_from_socket(server, page, n); + length = cifs_read_page_from_socket(server, page, 0, n); if (length < 0) return length; server->total_read += length; @@ -2435,7 +2531,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, unsigned int cur_page_idx; unsigned int pad_len; struct cifs_readdata *rdata = mid->callback_data; - struct smb2_sync_hdr *shdr = get_sync_hdr(buf); + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; struct bio_vec *bvec = NULL; struct iov_iter iter; struct kvec iov; @@ -2466,7 +2562,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - data_offset = server->ops->read_data_offset(buf) + server->vals->header_preamble_size; + data_offset = server->ops->read_data_offset(buf); #ifdef CONFIG_CIFS_SMB_DIRECT use_rdma_mr = rdata->mr; #endif @@ -2562,12 +2658,11 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid) unsigned int npages; struct page **pages; unsigned int len; - unsigned int buflen = server->pdu_size + server->vals->header_preamble_size; + unsigned int buflen = server->pdu_size; int rc; int i = 0; - len = min_t(unsigned int, buflen, server->vals->read_rsp_size - - server->vals->header_preamble_size + + len = min_t(unsigned int, buflen, server->vals->read_rsp_size + sizeof(struct smb2_transform_hdr)) - HEADER_SIZE(server) + 1; rc = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1, len); @@ -2575,8 +2670,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid) return rc; server->total_read += rc; - len = le32_to_cpu(tr_hdr->OriginalMessageSize) + - server->vals->header_preamble_size - + len = le32_to_cpu(tr_hdr->OriginalMessageSize) - server->vals->read_rsp_size; npages = DIV_ROUND_UP(len, PAGE_SIZE); @@ -2603,8 +2697,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid) if (rc) goto free_pages; - rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size - - server->vals->header_preamble_size, + rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size, pages, npages, len); if (rc) goto free_pages; @@ -2641,7 +2734,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid_entry; /* switch to large buffer if too big for a small one */ - if (pdu_length + server->vals->header_preamble_size > MAX_CIFS_SMALL_BUFFER_SIZE) { + if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE) { server->large_buf = true; memcpy(server->bigbuf, buf, server->total_read); buf = server->bigbuf; @@ -2649,13 +2742,12 @@ receive_encrypted_standard(struct TCP_Server_Info *server, /* now read the rest */ length = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1, - pdu_length - HEADER_SIZE(server) + 1 + - server->vals->header_preamble_size); + pdu_length - HEADER_SIZE(server) + 1); if (length < 0) return length; server->total_read += length; - buf_size = pdu_length + server->vals->header_preamble_size - sizeof(struct smb2_transform_hdr); + buf_size = pdu_length - sizeof(struct smb2_transform_hdr); length = decrypt_raw_data(server, buf, buf_size, NULL, 0, 0); if (length) return length; @@ -2684,7 +2776,7 @@ smb3_receive_transform(struct TCP_Server_Info *server, struct mid_q_entry **mid) struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf; unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize); - if (pdu_length + server->vals->header_preamble_size < sizeof(struct smb2_transform_hdr) + + if (pdu_length < sizeof(struct smb2_transform_hdr) + sizeof(struct smb2_sync_hdr)) { cifs_dbg(VFS, "Transform message is too small (%u)\n", pdu_length); @@ -2693,14 +2785,14 @@ smb3_receive_transform(struct TCP_Server_Info *server, struct mid_q_entry **mid) return -ECONNABORTED; } - if (pdu_length + server->vals->header_preamble_size < orig_len + sizeof(struct smb2_transform_hdr)) { + if (pdu_length < orig_len + sizeof(struct smb2_transform_hdr)) { cifs_dbg(VFS, "Transform message is broken\n"); cifs_reconnect(server); wake_up(&server->response_q); return -ECONNABORTED; } - if (pdu_length + server->vals->header_preamble_size > CIFSMaxBufSize + MAX_HEADER_SIZE(server)) + if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server)) return receive_encrypted_read(server, mid); return receive_encrypted_standard(server, mid); @@ -2711,11 +2803,23 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid) { char *buf = server->large_buf ? server->bigbuf : server->smallbuf; - return handle_read_data(server, mid, buf, server->pdu_size + - server->vals->header_preamble_size, + return handle_read_data(server, mid, buf, server->pdu_size, NULL, 0, 0); } +static int +smb2_next_header(char *buf) +{ + struct smb2_sync_hdr *hdr = (struct smb2_sync_hdr *)buf; + struct smb2_transform_hdr *t_hdr = (struct smb2_transform_hdr *)buf; + + if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) + return sizeof(struct smb2_transform_hdr) + + le32_to_cpu(t_hdr->OriginalMessageSize); + + return le32_to_cpu(hdr->NextCommand); +} + struct smb_version_operations smb20_operations = { .compare_fids = smb2_compare_fids, .setup_request = smb2_setup_request, @@ -2807,6 +2911,7 @@ struct smb_version_operations smb20_operations = { .get_acl_by_fid = get_smb2_acl_by_fid, .set_acl = set_smb2_acl, #endif /* CIFS_ACL */ + .next_header = smb2_next_header, }; struct smb_version_operations smb21_operations = { @@ -2901,6 +3006,7 @@ struct smb_version_operations smb21_operations = { .get_acl_by_fid = get_smb2_acl_by_fid, .set_acl = set_smb2_acl, #endif /* CIFS_ACL */ + .next_header = smb2_next_header, }; struct smb_version_operations smb30_operations = { @@ -3005,6 +3111,7 @@ struct smb_version_operations smb30_operations = { .get_acl_by_fid = get_smb2_acl_by_fid, .set_acl = set_smb2_acl, #endif /* CIFS_ACL */ + .next_header = smb2_next_header, }; #ifdef CONFIG_CIFS_SMB311 @@ -3105,6 +3212,7 @@ struct smb_version_operations smb311_operations = { .query_all_EAs = smb2_query_eas, .set_EA = smb2_set_ea, #endif /* CIFS_XATTR */ + .next_header = smb2_next_header, }; #endif /* CIFS_SMB311 */ @@ -3116,8 +3224,8 @@ struct smb_version_values smb20_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_hdr), - .header_preamble_size = 4, + .header_size = sizeof(struct smb2_sync_hdr), + .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, @@ -3137,8 +3245,8 @@ struct smb_version_values smb21_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_hdr), - .header_preamble_size = 4, + .header_size = sizeof(struct smb2_sync_hdr), + .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, @@ -3158,8 +3266,8 @@ struct smb_version_values smb3any_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_hdr), - .header_preamble_size = 4, + .header_size = sizeof(struct smb2_sync_hdr), + .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, @@ -3179,8 +3287,8 @@ struct smb_version_values smbdefault_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_hdr), - .header_preamble_size = 4, + .header_size = sizeof(struct smb2_sync_hdr), + .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, @@ -3200,8 +3308,8 @@ struct smb_version_values smb30_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_hdr), - .header_preamble_size = 4, + .header_size = sizeof(struct smb2_sync_hdr), + .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, @@ -3221,8 +3329,8 @@ struct smb_version_values smb302_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_hdr), - .header_preamble_size = 4, + .header_size = sizeof(struct smb2_sync_hdr), + .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, @@ -3243,8 +3351,8 @@ struct smb_version_values smb311_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_hdr), - .header_preamble_size = 4, + .header_size = sizeof(struct smb2_sync_hdr), + .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 60db51bae0e3..281fbc1dc720 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -49,6 +49,7 @@ #include "cifspdu.h" #include "cifs_spnego.h" #include "smbdirect.h" +#include "trace.h" /* * The following table defines the expected "StructureSize" of SMB2 requests @@ -79,7 +80,7 @@ static const int smb2_req_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { /* SMB2_OPLOCK_BREAK */ 24 /* BB this is 36 for LEASE_BREAK variant */ }; -static int encryption_required(const struct cifs_tcon *tcon) +static int smb3_encryption_required(const struct cifs_tcon *tcon) { if (!tcon) return 0; @@ -145,7 +146,7 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd, shdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; */ if (tcon->ses && tcon->ses->server && tcon->ses->server->sign && - !encryption_required(tcon)) + !smb3_encryption_required(tcon)) shdr->Flags |= SMB2_FLAGS_SIGNED; out: return; @@ -367,6 +368,7 @@ smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, #define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1) #define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2) +#define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100) static void build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt) @@ -390,21 +392,35 @@ build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt) } static void +build_posix_ctxt(struct smb2_posix_neg_context *pneg_ctxt) +{ + pneg_ctxt->ContextType = SMB2_POSIX_EXTENSIONS_AVAILABLE; + pneg_ctxt->DataLength = cpu_to_le16(POSIX_CTXT_DATA_LEN); +} + +static void assemble_neg_contexts(struct smb2_negotiate_req *req, unsigned int *total_len) { char *pneg_ctxt = (char *)req + OFFSET_OF_NEG_CONTEXT; + unsigned int ctxt_len; + *total_len += 2; /* Add 2 due to round to 8 byte boundary for 1st ctxt */ build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt); - /* Add 2 to size to round to 8 byte boundary */ + ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_preauth_neg_context), 8) * 8; + *total_len += ctxt_len; + pneg_ctxt += ctxt_len; - pneg_ctxt += 2 + sizeof(struct smb2_preauth_neg_context); build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt); - req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT); - req->NegotiateContextCount = cpu_to_le16(2); + ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_encryption_neg_context), 8) * 8; + *total_len += ctxt_len; + pneg_ctxt += ctxt_len; + + build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt); + *total_len += sizeof(struct smb2_posix_neg_context); - *total_len += 4 + sizeof(struct smb2_preauth_neg_context) - + sizeof(struct smb2_encryption_neg_context); + req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT); + req->NegotiateContextCount = cpu_to_le16(3); } static void decode_preauth_context(struct smb2_preauth_neg_context *ctxt) @@ -449,12 +465,12 @@ static int decode_encrypt_ctx(struct TCP_Server_Info *server, } static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp, - struct TCP_Server_Info *server) + struct TCP_Server_Info *server, + unsigned int len_of_smb) { struct smb2_neg_context *pctx; unsigned int offset = le32_to_cpu(rsp->NegotiateContextOffset); unsigned int ctxt_cnt = le16_to_cpu(rsp->NegotiateContextCount); - unsigned int len_of_smb = be32_to_cpu(rsp->hdr.smb2_buf_length); unsigned int len_of_ctxts, i; int rc = 0; @@ -475,8 +491,7 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp, if (len_of_ctxts < sizeof(struct smb2_neg_context)) break; - pctx = (struct smb2_neg_context *)(offset + - server->vals->header_preamble_size + (char *)rsp); + pctx = (struct smb2_neg_context *)(offset + (char *)rsp); clen = le16_to_cpu(pctx->DataLength); if (clen > len_of_ctxts) break; @@ -487,6 +502,8 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp, else if (pctx->ContextType == SMB2_ENCRYPTION_CAPABILITIES) rc = decode_encrypt_ctx(server, (struct smb2_encryption_neg_context *)pctx); + else if (pctx->ContextType == SMB2_POSIX_EXTENSIONS_AVAILABLE) + server->posix_ext_supported = true; else cifs_dbg(VFS, "unknown negcontext of type %d ignored\n", le16_to_cpu(pctx->ContextType)); @@ -501,6 +518,64 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp, return rc; } +static struct create_posix * +create_posix_buf(umode_t mode) +{ + struct create_posix *buf; + + buf = kzalloc(sizeof(struct create_posix), + GFP_KERNEL); + if (!buf) + return NULL; + + buf->ccontext.DataOffset = + cpu_to_le16(offsetof(struct create_posix, Mode)); + buf->ccontext.DataLength = cpu_to_le32(4); + buf->ccontext.NameOffset = + cpu_to_le16(offsetof(struct create_posix, Name)); + buf->ccontext.NameLength = cpu_to_le16(16); + + /* SMB2_CREATE_TAG_POSIX is "0x93AD25509CB411E7B42383DE968BCD7C" */ + buf->Name[0] = 0x93; + buf->Name[1] = 0xAD; + buf->Name[2] = 0x25; + buf->Name[3] = 0x50; + buf->Name[4] = 0x9C; + buf->Name[5] = 0xB4; + buf->Name[6] = 0x11; + buf->Name[7] = 0xE7; + buf->Name[8] = 0xB4; + buf->Name[9] = 0x23; + buf->Name[10] = 0x83; + buf->Name[11] = 0xDE; + buf->Name[12] = 0x96; + buf->Name[13] = 0x8B; + buf->Name[14] = 0xCD; + buf->Name[15] = 0x7C; + buf->Mode = cpu_to_le32(mode); + cifs_dbg(FYI, "mode on posix create 0%o", mode); + return buf; +} + +static int +add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode) +{ + struct smb2_create_req *req = iov[0].iov_base; + unsigned int num = *num_iovec; + + iov[num].iov_base = create_posix_buf(mode); + if (iov[num].iov_base == NULL) + return -ENOMEM; + iov[num].iov_len = sizeof(struct create_posix); + if (!req->CreateContextsOffset) + req->CreateContextsOffset = cpu_to_le32( + sizeof(struct smb2_create_req) + + iov[num - 1].iov_len); + le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_posix)); + *num_iovec = num + 1; + return 0; +} + #else static void assemble_neg_contexts(struct smb2_negotiate_req *req, unsigned int *total_len) @@ -691,7 +766,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) server->capabilities |= SMB2_NT_FIND | SMB2_LARGE_FILES; security_blob = smb2_get_data_area_len(&blob_offset, &blob_length, - &rsp->hdr); + (struct smb2_sync_hdr *)rsp); /* * See MS-SMB2 section 2.2.4: if no blob, client picks default which * for us will be @@ -718,7 +793,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) #ifdef CONFIG_CIFS_SMB311 if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) { if (rsp->NegotiateContextCount) - rc = smb311_decode_neg_context(rsp, server); + rc = smb311_decode_neg_context(rsp, server, + rsp_iov.iov_len); else cifs_dbg(VFS, "Missing expected negotiate contexts\n"); } @@ -730,19 +806,14 @@ neg_exit: int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) { - int rc = 0; - struct validate_negotiate_info_req vneg_inbuf; + int rc; + struct validate_negotiate_info_req *pneg_inbuf; struct validate_negotiate_info_rsp *pneg_rsp = NULL; u32 rsplen; u32 inbuflen; /* max of 4 dialects */ cifs_dbg(FYI, "validate negotiate\n"); -#ifdef CONFIG_CIFS_SMB_DIRECT - if (tcon->ses->server->rdma) - return 0; -#endif - /* In SMB3.11 preauth integrity supersedes validate negotiate */ if (tcon->ses->server->dialect == SMB311_PROT_ID) return 0; @@ -765,63 +836,69 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) if (tcon->ses->session_flags & SMB2_SESSION_FLAG_IS_NULL) cifs_dbg(VFS, "Unexpected null user (anonymous) auth flag sent by server\n"); - vneg_inbuf.Capabilities = + pneg_inbuf = kmalloc(sizeof(*pneg_inbuf), GFP_NOFS); + if (!pneg_inbuf) + return -ENOMEM; + + pneg_inbuf->Capabilities = cpu_to_le32(tcon->ses->server->vals->req_capabilities); - memcpy(vneg_inbuf.Guid, tcon->ses->server->client_guid, + memcpy(pneg_inbuf->Guid, tcon->ses->server->client_guid, SMB2_CLIENT_GUID_SIZE); if (tcon->ses->sign) - vneg_inbuf.SecurityMode = + pneg_inbuf->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED); else if (global_secflags & CIFSSEC_MAY_SIGN) - vneg_inbuf.SecurityMode = + pneg_inbuf->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED); else - vneg_inbuf.SecurityMode = 0; + pneg_inbuf->SecurityMode = 0; if (strcmp(tcon->ses->server->vals->version_string, SMB3ANY_VERSION_STRING) == 0) { - vneg_inbuf.Dialects[0] = cpu_to_le16(SMB30_PROT_ID); - vneg_inbuf.Dialects[1] = cpu_to_le16(SMB302_PROT_ID); - vneg_inbuf.DialectCount = cpu_to_le16(2); + pneg_inbuf->Dialects[0] = cpu_to_le16(SMB30_PROT_ID); + pneg_inbuf->Dialects[1] = cpu_to_le16(SMB302_PROT_ID); + pneg_inbuf->DialectCount = cpu_to_le16(2); /* structure is big enough for 3 dialects, sending only 2 */ - inbuflen = sizeof(struct validate_negotiate_info_req) - 2; + inbuflen = sizeof(*pneg_inbuf) - + sizeof(pneg_inbuf->Dialects[0]); } else if (strcmp(tcon->ses->server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0) { - vneg_inbuf.Dialects[0] = cpu_to_le16(SMB21_PROT_ID); - vneg_inbuf.Dialects[1] = cpu_to_le16(SMB30_PROT_ID); - vneg_inbuf.Dialects[2] = cpu_to_le16(SMB302_PROT_ID); - vneg_inbuf.DialectCount = cpu_to_le16(3); + pneg_inbuf->Dialects[0] = cpu_to_le16(SMB21_PROT_ID); + pneg_inbuf->Dialects[1] = cpu_to_le16(SMB30_PROT_ID); + pneg_inbuf->Dialects[2] = cpu_to_le16(SMB302_PROT_ID); + pneg_inbuf->DialectCount = cpu_to_le16(3); /* structure is big enough for 3 dialects */ - inbuflen = sizeof(struct validate_negotiate_info_req); + inbuflen = sizeof(*pneg_inbuf); } else { /* otherwise specific dialect was requested */ - vneg_inbuf.Dialects[0] = + pneg_inbuf->Dialects[0] = cpu_to_le16(tcon->ses->server->vals->protocol_id); - vneg_inbuf.DialectCount = cpu_to_le16(1); + pneg_inbuf->DialectCount = cpu_to_le16(1); /* structure is big enough for 3 dialects, sending only 1 */ - inbuflen = sizeof(struct validate_negotiate_info_req) - 4; + inbuflen = sizeof(*pneg_inbuf) - + sizeof(pneg_inbuf->Dialects[0]) * 2; } rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */, - (char *)&vneg_inbuf, sizeof(struct validate_negotiate_info_req), - (char **)&pneg_rsp, &rsplen); + (char *)pneg_inbuf, inbuflen, (char **)&pneg_rsp, &rsplen); if (rc != 0) { cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc); - return -EIO; + rc = -EIO; + goto out_free_inbuf; } - if (rsplen != sizeof(struct validate_negotiate_info_rsp)) { + rc = -EIO; + if (rsplen != sizeof(*pneg_rsp)) { cifs_dbg(VFS, "invalid protocol negotiate response size: %d\n", rsplen); /* relax check since Mac returns max bufsize allowed on ioctl */ - if ((rsplen > CIFSMaxBufSize) - || (rsplen < sizeof(struct validate_negotiate_info_rsp))) - goto err_rsp_free; + if (rsplen > CIFSMaxBufSize || rsplen < sizeof(*pneg_rsp)) + goto out_free_rsp; } /* check validate negotiate info response matches what we got earlier */ @@ -838,15 +915,17 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) goto vneg_out; /* validate negotiate successful */ + rc = 0; cifs_dbg(FYI, "validate negotiate info successful\n"); - kfree(pneg_rsp); - return 0; + goto out_free_rsp; vneg_out: cifs_dbg(VFS, "protocol revalidation - security settings mismatch\n"); -err_rsp_free: +out_free_rsp: kfree(pneg_rsp); - return -EIO; +out_free_inbuf: + kfree(pneg_inbuf); + return rc; } enum securityEnum @@ -1051,7 +1130,7 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) goto out_put_spnego_key; rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; - ses->Suid = rsp->hdr.sync_hdr.SessionId; + ses->Suid = rsp->sync_hdr.SessionId; ses->session_flags = le16_to_cpu(rsp->SessionFlags); @@ -1127,13 +1206,13 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) /* If true, rc here is expected and not an error */ if (sess_data->buf0_type != CIFS_NO_BUFFER && - rsp->hdr.sync_hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) + rsp->sync_hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) rc = 0; if (rc) goto out; - if (offsetof(struct smb2_sess_setup_rsp, Buffer) - ses->server->vals->header_preamble_size != + if (offsetof(struct smb2_sess_setup_rsp, Buffer) != le16_to_cpu(rsp->SecurityBufferOffset)) { cifs_dbg(VFS, "Invalid security buffer offset %d\n", le16_to_cpu(rsp->SecurityBufferOffset)); @@ -1148,7 +1227,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); - ses->Suid = rsp->hdr.sync_hdr.SessionId; + ses->Suid = rsp->sync_hdr.SessionId; ses->session_flags = le16_to_cpu(rsp->SessionFlags); out: @@ -1206,7 +1285,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; - ses->Suid = rsp->hdr.sync_hdr.SessionId; + ses->Suid = rsp->sync_hdr.SessionId; ses->session_flags = le16_to_cpu(rsp->SessionFlags); rc = SMB2_sess_establish_session(sess_data); @@ -1273,6 +1352,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, sess_data->ses = ses; sess_data->buf0_type = CIFS_NO_BUFFER; sess_data->nls_cp = (struct nls_table *) nls_cp; + sess_data->previous_session = ses->Suid; #ifdef CONFIG_CIFS_SMB311 /* @@ -1400,7 +1480,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, return rc; } - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; iov[0].iov_base = (char *)req; @@ -1416,7 +1496,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, /* 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 */ if ((ses->server->dialect == SMB311_PROT_ID) && - !encryption_required(tcon)) + !smb3_encryption_required(tcon)) req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; rc = smb2_send_recv(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov); @@ -1454,7 +1534,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess); tcon->tidStatus = CifsGood; tcon->need_reconnect = false; - tcon->tid = rsp->hdr.sync_hdr.TreeId; + tcon->tid = rsp->sync_hdr.TreeId; strlcpy(tcon->treeName, tree, sizeof(tcon->treeName)); if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) && @@ -1474,7 +1554,7 @@ tcon_exit: return rc; tcon_error_exit: - if (rsp && rsp->hdr.sync_hdr.Status == STATUS_BAD_NETWORK_NAME) { + if (rsp && rsp->sync_hdr.Status == STATUS_BAD_NETWORK_NAME) { cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); } goto tcon_exit; @@ -1505,7 +1585,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; flags |= CIFS_NO_RESP; @@ -1572,7 +1652,7 @@ create_reconnect_durable_buf(struct cifs_fid *fid) static __u8 parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp, - unsigned int *epoch) + unsigned int *epoch, char *lease_key) { char *data_offset; struct create_context *cc; @@ -1580,14 +1660,15 @@ parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp, unsigned int remaining; char *name; - data_offset = (char *)rsp + server->vals->header_preamble_size + le32_to_cpu(rsp->CreateContextsOffset); + data_offset = (char *)rsp + le32_to_cpu(rsp->CreateContextsOffset); remaining = le32_to_cpu(rsp->CreateContextsLength); cc = (struct create_context *)data_offset; while (remaining >= sizeof(struct create_context)) { name = le16_to_cpu(cc->NameOffset) + (char *)cc; if (le16_to_cpu(cc->NameLength) == 4 && strncmp(name, "RqLs", 4) == 0) - return server->ops->parse_lease_buf(cc, epoch); + return server->ops->parse_lease_buf(cc, epoch, + lease_key); next = le32_to_cpu(cc->Next); if (!next) @@ -1815,7 +1896,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, struct TCP_Server_Info *server; struct cifs_tcon *tcon = oparms->tcon; struct cifs_ses *ses = tcon->ses; - struct kvec iov[4]; + struct kvec iov[5]; /* make sure at least one for each open context */ struct kvec rsp_iov = {NULL, 0}; int resp_buftype; int uni_path_len; @@ -1824,7 +1905,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, int rc = 0; unsigned int n_iov = 2; __u32 file_attributes = 0; - char *dhc_buf = NULL, *lc_buf = NULL; + char *dhc_buf = NULL, *lc_buf = NULL, *pc_buf = NULL; int flags = 0; unsigned int total_len; @@ -1840,7 +1921,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; if (oparms->create_options & CREATE_OPTION_READONLY) @@ -1941,6 +2022,27 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, dhc_buf = iov[n_iov-1].iov_base; } +#ifdef CONFIG_CIFS_SMB311 + if (tcon->posix_extensions) { + if (n_iov > 2) { + struct create_context *ccontext = + (struct create_context *)iov[n_iov-1].iov_base; + ccontext->Next = + cpu_to_le32(iov[n_iov-1].iov_len); + } + + rc = add_posix_context(iov, &n_iov, oparms->mode); + if (rc) { + cifs_small_buf_release(req); + kfree(copy_path); + kfree(lc_buf); + kfree(dhc_buf); + return rc; + } + pc_buf = iov[n_iov-1].iov_base; + } +#endif /* SMB311 */ + rc = smb2_send_recv(xid, ses, iov, n_iov, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); @@ -1953,8 +2055,13 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, resp_buftype = CIFS_NO_BUFFER; rsp = NULL; } + trace_smb3_open_err(xid, tcon->tid, ses->Suid, + oparms->create_options, oparms->desired_access, rc); goto creat_exit; - } + } else + trace_smb3_open_done(xid, rsp->PersistentFileId, tcon->tid, + ses->Suid, oparms->create_options, + oparms->desired_access); oparms->fid->persistent_fid = rsp->PersistentFileId; oparms->fid->volatile_fid = rsp->VolatileFileId; @@ -1969,13 +2076,15 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, } if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) - *oplock = parse_lease_state(server, rsp, &oparms->fid->epoch); + *oplock = parse_lease_state(server, rsp, &oparms->fid->epoch, + oparms->fid->lease_key); else *oplock = rsp->OplockLevel; creat_exit: kfree(copy_path); kfree(lc_buf); kfree(dhc_buf); + kfree(pc_buf); free_rsp_buf(resp_buftype, rsp); return rc; } @@ -1991,7 +2100,6 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, { struct smb2_ioctl_req *req; struct smb2_ioctl_rsp *rsp; - struct smb2_sync_hdr *shdr; struct cifs_ses *ses; struct kvec iov[2]; struct kvec rsp_iov; @@ -2022,7 +2130,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; req->CtlCode = cpu_to_le32(opcode); @@ -2085,6 +2193,10 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, cifs_small_buf_release(req); rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base; + if (rc != 0) + trace_smb3_fsctl_err(xid, persistent_fid, tcon->tid, + ses->Suid, 0, opcode, rc); + if ((rc != 0) && (rc != -EINVAL)) { cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE); goto ioctl_exit; @@ -2112,7 +2224,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, goto ioctl_exit; } - if (get_rfc1002_length(rsp) < le32_to_cpu(rsp->OutputOffset) + *plen) { + if (rsp_iov.iov_len < le32_to_cpu(rsp->OutputOffset) + *plen) { cifs_dbg(VFS, "Malformed ioctl resp: len %d offset %d\n", *plen, le32_to_cpu(rsp->OutputOffset)); *plen = 0; @@ -2126,8 +2238,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, goto ioctl_exit; } - shdr = get_sync_hdr(rsp); - memcpy(*out_data, (char *)shdr + le32_to_cpu(rsp->OutputOffset), *plen); + memcpy(*out_data, (char *)rsp + le32_to_cpu(rsp->OutputOffset), *plen); ioctl_exit: free_rsp_buf(resp_buftype, rsp); return rc; @@ -2159,8 +2270,8 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, } int -SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid) +SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, int flags) { struct smb2_close_req *req; struct smb2_close_rsp *rsp; @@ -2169,7 +2280,6 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, struct kvec rsp_iov; int resp_buftype; int rc = 0; - int flags = 0; unsigned int total_len; cifs_dbg(FYI, "Close\n"); @@ -2181,7 +2291,7 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; req->PersistentFileId = persistent_fid; @@ -2196,6 +2306,8 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, if (rc != 0) { cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE); + trace_smb3_close_err(xid, persistent_fid, tcon->tid, ses->Suid, + rc); goto close_exit; } @@ -2206,14 +2318,20 @@ close_exit: return rc; } +int +SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid) +{ + return SMB2_close_flags(xid, tcon, persistent_fid, volatile_fid, 0); +} + static int -validate_iov(struct TCP_Server_Info *server, - unsigned int offset, unsigned int buffer_length, +validate_iov(unsigned int offset, unsigned int buffer_length, struct kvec *iov, unsigned int min_buf_size) { unsigned int smb_len = iov->iov_len; - char *end_of_smb = smb_len + server->vals->header_preamble_size + (char *)iov->iov_base; - char *begin_of_buf = server->vals->header_preamble_size + offset + (char *)iov->iov_base; + char *end_of_smb = smb_len + (char *)iov->iov_base; + char *begin_of_buf = offset + (char *)iov->iov_base; char *end_of_buf = begin_of_buf + buffer_length; @@ -2243,18 +2361,17 @@ validate_iov(struct TCP_Server_Info *server, * Caller must free buffer. */ static int -validate_and_copy_iov(struct TCP_Server_Info *server, - unsigned int offset, unsigned int buffer_length, +validate_and_copy_iov(unsigned int offset, unsigned int buffer_length, struct kvec *iov, unsigned int minbufsize, char *data) { - char *begin_of_buf = server->vals->header_preamble_size + offset + (char *)(iov->iov_base); + char *begin_of_buf = offset + (char *)iov->iov_base; int rc; if (!data) return -EINVAL; - rc = validate_iov(server, offset, buffer_length, iov, minbufsize); + rc = validate_iov(offset, buffer_length, iov, minbufsize); if (rc) return rc; @@ -2289,7 +2406,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; req->InfoType = info_type; @@ -2315,6 +2432,8 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); + trace_smb3_query_info_err(xid, persistent_fid, tcon->tid, + ses->Suid, info_class, (__u32)info_type, rc); goto qinf_exit; } @@ -2332,8 +2451,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, } } - rc = validate_and_copy_iov(ses->server, - le16_to_cpu(rsp->OutputBufferOffset), + rc = validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), &rsp_iov, min_len, *data); @@ -2404,7 +2522,7 @@ smb2_echo_callback(struct mid_q_entry *mid) unsigned int credits_received = 1; if (mid->mid_state == MID_RESPONSE_RECEIVED) - credits_received = le16_to_cpu(rsp->hdr.sync_hdr.CreditRequest); + credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest); DeleteMidQEntry(mid); add_credits(server, credits_received, CIFS_ECHO_OP); @@ -2533,7 +2651,7 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; req->PersistentFileId = persistent_fid; @@ -2545,8 +2663,11 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); - if (rc != 0) + if (rc != 0) { cifs_stats_fail_inc(tcon, SMB2_FLUSH_HE); + trace_smb3_flush_err(xid, persistent_fid, tcon->tid, ses->Suid, + rc); + } free_rsp_buf(resp_buftype, rsp_iov.iov_base); return rc; @@ -2655,11 +2776,12 @@ smb2_readv_callback(struct mid_q_entry *mid) struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rdata->iov[1].iov_base; + (struct smb2_sync_hdr *)rdata->iov[0].iov_base; unsigned int credits_received = 1; struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 2, .rq_pages = rdata->pages, + .rq_offset = rdata->page_offset, .rq_npages = rdata->nr_pages, .rq_pagesz = rdata->pagesz, .rq_tailsz = rdata->tailsz }; @@ -2757,7 +2879,7 @@ smb2_async_readv(struct cifs_readdata *rdata) return rc; } - if (encryption_required(io_parms.tcon)) + if (smb3_encryption_required(io_parms.tcon)) flags |= CIFS_TRANSFORM_REQ; req_len = cpu_to_be32(total_len); @@ -2788,7 +2910,13 @@ smb2_async_readv(struct cifs_readdata *rdata) if (rc) { kref_put(&rdata->refcount, cifs_readdata_release); cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); - } + trace_smb3_read_err(rc, 0 /* xid */, io_parms.persistent_fid, + io_parms.tcon->tid, io_parms.tcon->ses->Suid, + io_parms.offset, io_parms.length); + } else + trace_smb3_read_done(0 /* xid */, io_parms.persistent_fid, + io_parms.tcon->tid, io_parms.tcon->ses->Suid, + io_parms.offset, io_parms.length); cifs_small_buf_release(buf); return rc; @@ -2801,7 +2929,6 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, int resp_buftype, rc = -EACCES; struct smb2_read_plain_req *req = NULL; struct smb2_read_rsp *rsp = NULL; - struct smb2_sync_hdr *shdr; struct kvec iov[1]; struct kvec rsp_iov; unsigned int total_len; @@ -2813,7 +2940,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, if (rc) return rc; - if (encryption_required(io_parms->tcon)) + if (smb3_encryption_required(io_parms->tcon)) flags |= CIFS_TRANSFORM_REQ; iov[0].iov_base = (char *)req; @@ -2829,9 +2956,15 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); cifs_dbg(VFS, "Send error in read = %d\n", rc); } + trace_smb3_read_err(rc, xid, req->PersistentFileId, + io_parms->tcon->tid, ses->Suid, + io_parms->offset, io_parms->length); free_rsp_buf(resp_buftype, rsp_iov.iov_base); return rc == -ENODATA ? 0 : rc; - } + } else + trace_smb3_read_done(xid, req->PersistentFileId, + io_parms->tcon->tid, ses->Suid, + io_parms->offset, io_parms->length); *nbytes = le32_to_cpu(rsp->DataLength); if ((*nbytes > CIFS_MAX_MSGSIZE) || @@ -2842,10 +2975,8 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, *nbytes = 0; } - shdr = get_sync_hdr(rsp); - if (*buf) { - memcpy(*buf, (char *)shdr + rsp->DataOffset, *nbytes); + memcpy(*buf, (char *)rsp + rsp->DataOffset, *nbytes); free_rsp_buf(resp_buftype, rsp_iov.iov_base); } else if (resp_buftype != CIFS_NO_BUFFER) { *buf = rsp_iov.iov_base; @@ -2872,7 +3003,7 @@ smb2_writev_callback(struct mid_q_entry *mid) switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: - credits_received = le16_to_cpu(rsp->hdr.sync_hdr.CreditRequest); + credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest); wdata->result = smb2_check_receive(mid, tcon->ses->server, 0); if (wdata->result != 0) break; @@ -2949,7 +3080,7 @@ smb2_async_writev(struct cifs_writedata *wdata, goto async_writev_out; } - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; shdr = (struct smb2_sync_hdr *)req; @@ -3010,6 +3141,7 @@ smb2_async_writev(struct cifs_writedata *wdata, rqst.rq_iov = iov; rqst.rq_nvec = 2; rqst.rq_pages = wdata->pages; + rqst.rq_offset = wdata->page_offset; rqst.rq_npages = wdata->nr_pages; rqst.rq_pagesz = wdata->pagesz; rqst.rq_tailsz = wdata->tailsz; @@ -3047,9 +3179,15 @@ smb2_async_writev(struct cifs_writedata *wdata, wdata, flags); if (rc) { + trace_smb3_write_err(0 /* no xid */, req->PersistentFileId, + tcon->tid, tcon->ses->Suid, wdata->offset, + wdata->bytes, rc); kref_put(&wdata->refcount, release); cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); - } + } else + trace_smb3_write_done(0 /* no xid */, req->PersistentFileId, + tcon->tid, tcon->ses->Suid, wdata->offset, + wdata->bytes); async_writev_out: cifs_small_buf_release(req); @@ -3087,7 +3225,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, if (io_parms->tcon->ses->server == NULL) return -ECONNABORTED; - if (encryption_required(io_parms->tcon)) + if (smb3_encryption_required(io_parms->tcon)) flags |= CIFS_TRANSFORM_REQ; req->sync_hdr.ProcessId = cpu_to_le32(io_parms->pid); @@ -3113,10 +3251,19 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, rsp = (struct smb2_write_rsp *)rsp_iov.iov_base; if (rc) { + trace_smb3_write_err(xid, req->PersistentFileId, + io_parms->tcon->tid, + io_parms->tcon->ses->Suid, + io_parms->offset, io_parms->length, rc); cifs_stats_fail_inc(io_parms->tcon, SMB2_WRITE_HE); cifs_dbg(VFS, "Send error in write = %d\n", rc); - } else + } else { *nbytes = le32_to_cpu(rsp->DataLength); + trace_smb3_write_done(xid, req->PersistentFileId, + io_parms->tcon->tid, + io_parms->tcon->ses->Suid, + io_parms->offset, *nbytes); + } free_rsp_buf(resp_buftype, rsp); return rc; @@ -3197,7 +3344,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; switch (srch_inf->info_level) { @@ -3248,7 +3395,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, if (rc) { if (rc == -ENODATA && - rsp->hdr.sync_hdr.Status == STATUS_NO_MORE_FILES) { + rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { srch_inf->endOfSearch = true; rc = 0; } @@ -3256,8 +3403,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, goto qdir_exit; } - rc = validate_iov(server, - le16_to_cpu(rsp->OutputBufferOffset), + rc = validate_iov(le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), &rsp_iov, info_buf_size); if (rc) @@ -3272,10 +3418,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, cifs_buf_release(srch_inf->ntwrk_buf_start); } srch_inf->ntwrk_buf_start = (char *)rsp; - srch_inf->srch_entries_start = srch_inf->last_entry = 4 /* rfclen */ + - (char *)&rsp->hdr + le16_to_cpu(rsp->OutputBufferOffset); - /* 4 for rfc1002 length field */ - end_of_smb = get_rfc1002_length(rsp) + 4 + (char *)&rsp->hdr; + srch_inf->srch_entries_start = srch_inf->last_entry = + (char *)rsp + le16_to_cpu(rsp->OutputBufferOffset); + end_of_smb = rsp_iov.iov_len + (char *)rsp; srch_inf->entries_in_buffer = num_entries(srch_inf->srch_entries_start, end_of_smb, &srch_inf->last_entry, info_buf_size); @@ -3330,7 +3475,7 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, return rc; } - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; req->sync_hdr.ProcessId = cpu_to_le32(pid); @@ -3363,8 +3508,11 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_small_buf_release(req); rsp = (struct smb2_set_info_rsp *)rsp_iov.iov_base; - if (rc != 0) + if (rc != 0) { cifs_stats_fail_inc(tcon, SMB2_SET_INFO_HE); + trace_smb3_set_info_err(xid, persistent_fid, tcon->tid, + ses->Suid, info_class, (__u32)info_type, rc); + } free_rsp_buf(resp_buftype, rsp); kfree(iov); @@ -3511,7 +3659,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, __u8 oplock_level) { int rc; - struct smb2_oplock_break_req *req = NULL; + struct smb2_oplock_break *req = NULL; struct cifs_ses *ses = tcon->ses; int flags = CIFS_OBREAK_OP; unsigned int total_len; @@ -3525,7 +3673,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; req->VolatileFid = volatile_fid; @@ -3590,7 +3738,7 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level, req->InputBufferOffset = cpu_to_le16(sizeof(struct smb2_query_info_req) - 1); req->OutputBufferLength = cpu_to_le32( - outbuf_len + sizeof(struct smb2_query_info_rsp) - 1 - server->vals->header_preamble_size); + outbuf_len + sizeof(struct smb2_query_info_rsp) - 1); iov->iov_base = (char *)req; iov->iov_len = total_len; @@ -3607,7 +3755,6 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = ses->server; struct smb2_fs_full_size_info *info = NULL; int flags = 0; @@ -3617,7 +3764,7 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; rc = smb2_send_recv(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov); @@ -3628,10 +3775,9 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, } rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; - info = (struct smb2_fs_full_size_info *)(server->vals->header_preamble_size + - le16_to_cpu(rsp->OutputBufferOffset) + (char *)&rsp->hdr); - rc = validate_iov(server, - le16_to_cpu(rsp->OutputBufferOffset), + info = (struct smb2_fs_full_size_info *)( + le16_to_cpu(rsp->OutputBufferOffset) + (char *)rsp); + rc = validate_iov(le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), &rsp_iov, sizeof(struct smb2_fs_full_size_info)); if (!rc) @@ -3652,7 +3798,6 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype, max_len, min_len; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = ses->server; unsigned int rsp_len, offset; int flags = 0; @@ -3675,7 +3820,7 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; rc = smb2_send_recv(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov); @@ -3688,20 +3833,20 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, rsp_len = le32_to_cpu(rsp->OutputBufferLength); offset = le16_to_cpu(rsp->OutputBufferOffset); - rc = validate_iov(server, offset, rsp_len, &rsp_iov, min_len); + rc = validate_iov(offset, rsp_len, &rsp_iov, min_len); if (rc) goto qfsattr_exit; if (level == FS_ATTRIBUTE_INFORMATION) - memcpy(&tcon->fsAttrInfo, server->vals->header_preamble_size + offset - + (char *)&rsp->hdr, min_t(unsigned int, + memcpy(&tcon->fsAttrInfo, offset + + (char *)rsp, min_t(unsigned int, rsp_len, max_len)); else if (level == FS_DEVICE_INFORMATION) - memcpy(&tcon->fsDevInfo, server->vals->header_preamble_size + offset - + (char *)&rsp->hdr, sizeof(FILE_SYSTEM_DEVICE_INFO)); + memcpy(&tcon->fsDevInfo, offset + + (char *)rsp, sizeof(FILE_SYSTEM_DEVICE_INFO)); else if (level == FS_SECTOR_SIZE_INFORMATION) { struct smb3_fs_ss_info *ss_info = (struct smb3_fs_ss_info *) - (server->vals->header_preamble_size + offset + (char *)&rsp->hdr); + (offset + (char *)rsp); tcon->ss_flags = le32_to_cpu(ss_info->Flags); tcon->perf_sector_size = le32_to_cpu(ss_info->PhysicalBytesPerSectorForPerf); @@ -3732,7 +3877,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; req->sync_hdr.ProcessId = cpu_to_le32(pid); @@ -3755,6 +3900,8 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, if (rc) { cifs_dbg(FYI, "Send error in smb2_lockv = %d\n", rc); cifs_stats_fail_inc(tcon, SMB2_LOCK_HE); + trace_smb3_lock_err(xid, persist_fid, tcon->tid, + tcon->ses->Suid, rc); } return rc; @@ -3796,7 +3943,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; req->sync_hdr.CreditRequest = cpu_to_le16(1); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index d28f358022c5..a345560001ce 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -122,25 +122,10 @@ struct smb2_sync_pdu { __le16 StructureSize2; /* size of wct area (varies, request specific) */ } __packed; -struct smb2_hdr { - __be32 smb2_buf_length; /* big endian on wire */ - /* length is only two or three bytes - with */ - /* one or two byte type preceding it that MBZ */ - struct smb2_sync_hdr sync_hdr; -} __packed; - -struct smb2_pdu { - struct smb2_hdr hdr; - __le16 StructureSize2; /* size of wct area (varies, request specific) */ -} __packed; - #define SMB3_AES128CMM_NONCE 11 #define SMB3_AES128GCM_NONCE 12 struct smb2_transform_hdr { - __be32 smb2_buf_length; /* big endian on wire */ - /* length is only two or three bytes - with - one or two byte type preceding it that MBZ */ __le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */ __u8 Signature[16]; __u8 Nonce[16]; @@ -171,7 +156,7 @@ struct smb2_transform_hdr { #define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9) struct smb2_err_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; __le16 Reserved; /* MBZ */ __le32 ByteCount; /* even if zero, at least one byte follows */ @@ -300,8 +285,16 @@ struct smb2_encryption_neg_context { __le16 Ciphers[1]; /* Ciphers[0] since only one used now */ } __packed; +#define POSIX_CTXT_DATA_LEN 8 +struct smb2_posix_neg_context { + __le16 ContextType; /* 0x100 */ + __le16 DataLength; + __le32 Reserved; + __le64 Reserved1; /* In case needed for future (eg version or caps) */ +} __packed; + struct smb2_negotiate_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 65 */ __le16 SecurityMode; __le16 DialectRevision; @@ -341,7 +334,7 @@ struct smb2_sess_setup_req { #define SMB2_SESSION_FLAG_IS_NULL 0x0002 #define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004 struct smb2_sess_setup_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 9 */ __le16 SessionFlags; __le16 SecurityBufferOffset; @@ -356,7 +349,7 @@ struct smb2_logoff_req { } __packed; struct smb2_logoff_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 4 */ __le16 Reserved; } __packed; @@ -452,7 +445,7 @@ struct smb2_tree_connect_req_extension { } __packed; struct smb2_tree_connect_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 16 */ __u8 ShareType; /* see below */ __u8 Reserved; @@ -503,7 +496,7 @@ struct smb2_tree_disconnect_req { } __packed; struct smb2_tree_disconnect_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 4 */ __le16 Reserved; } __packed; @@ -615,7 +608,9 @@ struct smb2_tree_disconnect_rsp { #define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q" #define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C" #define SMB2_CREATE_APP_INSTANCE_ID 0x45BCA66AEFA7F74A9008FA462E144D74 -#define SVHDX_OPEN_DEVICE_CONTEXT 0x83CE6F1AD851E0986E34401CC9BCFCE9 +#define SVHDX_OPEN_DEVICE_CONTEX 0x9CCBCF9E04C1E643980E158DA1F6EC83 +#define SMB2_CREATE_TAG_POSIX 0x93AD25509CB411E7B42383DE968BCD7C + struct smb2_create_req { struct smb2_sync_hdr sync_hdr; @@ -638,7 +633,7 @@ struct smb2_create_req { } __packed; struct smb2_create_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 89 */ __u8 OplockLevel; __u8 Reserved; @@ -727,6 +722,13 @@ struct create_durable { } Data; } __packed; +struct create_posix { + struct create_context ccontext; + __u8 Name[16]; + __le32 Mode; + __u32 Reserved; +} __packed; + /* See MS-SMB2 2.2.13.2.11 */ /* Flags */ #define SMB2_DHANDLE_FLAG_PERSISTENT 0x00000002 @@ -894,7 +896,7 @@ struct smb2_ioctl_req { } __packed; struct smb2_ioctl_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 57 */ __u16 Reserved; __le32 CtlCode; @@ -921,7 +923,7 @@ struct smb2_close_req { } __packed; struct smb2_close_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* 60 */ __le16 Flags; __le32 Reserved; @@ -944,7 +946,7 @@ struct smb2_flush_req { } __packed; struct smb2_flush_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; __le16 Reserved; } __packed; @@ -976,7 +978,7 @@ struct smb2_read_plain_req { } __packed; struct smb2_read_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 17 */ __u8 DataOffset; __u8 Reserved; @@ -1007,7 +1009,7 @@ struct smb2_write_req { } __packed; struct smb2_write_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 17 */ __u8 DataOffset; __u8 Reserved; @@ -1041,7 +1043,7 @@ struct smb2_lock_req { } __packed; struct smb2_lock_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 4 */ __le16 Reserved; } __packed; @@ -1053,7 +1055,7 @@ struct smb2_echo_req { } __packed; struct smb2_echo_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 4 */ __u16 Reserved; } __packed; @@ -1079,7 +1081,7 @@ struct smb2_query_directory_req { } __packed; struct smb2_query_directory_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 9 */ __le16 OutputBufferOffset; __le32 OutputBufferLength; @@ -1128,7 +1130,7 @@ struct smb2_query_info_req { } __packed; struct smb2_query_info_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 9 */ __le16 OutputBufferOffset; __le32 OutputBufferLength; @@ -1150,12 +1152,11 @@ struct smb2_set_info_req { } __packed; struct smb2_set_info_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 2 */ } __packed; -/* oplock break without an rfc1002 header */ -struct smb2_oplock_break_req { +struct smb2_oplock_break { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 24 */ __u8 OplockLevel; @@ -1165,21 +1166,10 @@ struct smb2_oplock_break_req { __u64 VolatileFid; } __packed; -/* oplock break with an rfc1002 header */ -struct smb2_oplock_break_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 24 */ - __u8 OplockLevel; - __u8 Reserved; - __le32 Reserved2; - __u64 PersistentFid; - __u64 VolatileFid; -} __packed; - #define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01) struct smb2_lease_break { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 44 */ __le16 Reserved; __le32 Flags; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 8ba24a95db71..908555b1c6b5 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -36,8 +36,9 @@ struct smb_rqst; extern int map_smb2_to_linux_error(char *buf, bool log_err); extern int smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *server); -extern unsigned int smb2_calc_size(void *buf); -extern char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr); +extern unsigned int smb2_calc_size(void *buf, struct TCP_Server_Info *server); +extern char *smb2_get_data_area_len(int *off, int *len, + struct smb2_sync_hdr *shdr); extern __le16 *cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb); @@ -65,6 +66,8 @@ extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server, extern int smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid); +extern int open_shroot(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *pfid); extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src); extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, @@ -129,6 +132,8 @@ extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, char **out_data, u32 *plen /* returned data len */); extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); +extern int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, int flags); extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); extern int SMB2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 8806f3f76c1d..2c671123a6bf 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -480,7 +480,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) unsigned int rc; char server_response_sig[16]; struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base; + (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; if ((shdr->Command == SMB2_NEGOTIATE) || (shdr->Command == SMB2_SESSION_SETUP) || @@ -605,14 +605,12 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, bool log_error) { unsigned int len = mid->resp_buf_size; - struct kvec iov[2]; + struct kvec iov[1]; struct smb_rqst rqst = { .rq_iov = iov, - .rq_nvec = 2 }; + .rq_nvec = 1 }; iov[0].iov_base = (char *)mid->resp_buf; - iov[0].iov_len = 4; - iov[1].iov_base = (char *)mid->resp_buf + 4; - iov[1].iov_len = len; + iov[0].iov_len = len; dump_smb(mid->resp_buf, min_t(u32, 80, len)); /* convert the length into a more usable form */ diff --git a/fs/cifs/trace.c b/fs/cifs/trace.c new file mode 100644 index 000000000000..bd4a546feec1 --- /dev/null +++ b/fs/cifs/trace.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2018, Microsoft Corporation. + * + * Author(s): Steve French <stfrench@microsoft.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + */ +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h new file mode 100644 index 000000000000..61e74d455d90 --- /dev/null +++ b/fs/cifs/trace.h @@ -0,0 +1,429 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018, Microsoft Corporation. + * + * Author(s): Steve French <stfrench@microsoft.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cifs + +#if !defined(_CIFS_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _CIFS_TRACE_H + +#include <linux/tracepoint.h> + +/* For logging errors in read or write */ +DECLARE_EVENT_CLASS(smb3_rw_err_class, + TP_PROTO(unsigned int xid, + __u64 fid, + __u32 tid, + __u64 sesid, + __u64 offset, + __u32 len, + int rc), + TP_ARGS(xid, fid, tid, sesid, offset, len, rc), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, fid) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u64, offset) + __field(__u32, len) + __field(int, rc) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->fid = fid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->offset = offset; + __entry->len = len; + __entry->rc = rc; + ), + TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d", + __entry->xid, __entry->sesid, __entry->tid, __entry->fid, + __entry->offset, __entry->len, __entry->rc) +) + +#define DEFINE_SMB3_RW_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_rw_err_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 fid, \ + __u32 tid, \ + __u64 sesid, \ + __u64 offset, \ + __u32 len, \ + int rc), \ + TP_ARGS(xid, fid, tid, sesid, offset, len, rc)) + +DEFINE_SMB3_RW_ERR_EVENT(write_err); +DEFINE_SMB3_RW_ERR_EVENT(read_err); + + +/* For logging successful read or write */ +DECLARE_EVENT_CLASS(smb3_rw_done_class, + TP_PROTO(unsigned int xid, + __u64 fid, + __u32 tid, + __u64 sesid, + __u64 offset, + __u32 len), + TP_ARGS(xid, fid, tid, sesid, offset, len), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, fid) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u64, offset) + __field(__u32, len) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->fid = fid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->offset = offset; + __entry->len = len; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x", + __entry->xid, __entry->sesid, __entry->tid, __entry->fid, + __entry->offset, __entry->len) +) + +#define DEFINE_SMB3_RW_DONE_EVENT(name) \ +DEFINE_EVENT(smb3_rw_done_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 fid, \ + __u32 tid, \ + __u64 sesid, \ + __u64 offset, \ + __u32 len), \ + TP_ARGS(xid, fid, tid, sesid, offset, len)) + +DEFINE_SMB3_RW_DONE_EVENT(write_done); +DEFINE_SMB3_RW_DONE_EVENT(read_done); + +/* + * For handle based calls other than read and write, and get/set info + */ +DECLARE_EVENT_CLASS(smb3_fd_err_class, + TP_PROTO(unsigned int xid, + __u64 fid, + __u32 tid, + __u64 sesid, + int rc), + TP_ARGS(xid, fid, tid, sesid, rc), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, fid) + __field(__u32, tid) + __field(__u64, sesid) + __field(int, rc) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->fid = fid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->rc = rc; + ), + TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx rc=%d", + __entry->xid, __entry->sesid, __entry->tid, __entry->fid, + __entry->rc) +) + +#define DEFINE_SMB3_FD_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_fd_err_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 fid, \ + __u32 tid, \ + __u64 sesid, \ + int rc), \ + TP_ARGS(xid, fid, tid, sesid, rc)) + +DEFINE_SMB3_FD_ERR_EVENT(flush_err); +DEFINE_SMB3_FD_ERR_EVENT(lock_err); +DEFINE_SMB3_FD_ERR_EVENT(close_err); + +/* + * For handle based query/set info calls + */ +DECLARE_EVENT_CLASS(smb3_inf_err_class, + TP_PROTO(unsigned int xid, + __u64 fid, + __u32 tid, + __u64 sesid, + __u8 infclass, + __u32 type, + int rc), + TP_ARGS(xid, fid, tid, sesid, infclass, type, rc), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, fid) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u8, infclass) + __field(__u32, type) + __field(int, rc) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->fid = fid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->infclass = infclass; + __entry->type = type; + __entry->rc = rc; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx class=%u type=0x%x rc=%d", + __entry->xid, __entry->sesid, __entry->tid, __entry->fid, + __entry->infclass, __entry->type, __entry->rc) +) + +#define DEFINE_SMB3_INF_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_inf_err_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 fid, \ + __u32 tid, \ + __u64 sesid, \ + __u8 infclass, \ + __u32 type, \ + int rc), \ + TP_ARGS(xid, fid, tid, sesid, infclass, type, rc)) + +DEFINE_SMB3_INF_ERR_EVENT(query_info_err); +DEFINE_SMB3_INF_ERR_EVENT(set_info_err); +DEFINE_SMB3_INF_ERR_EVENT(fsctl_err); + +/* + * For logging SMB3 Status code and Command for responses which return errors + */ +DECLARE_EVENT_CLASS(smb3_cmd_err_class, + TP_PROTO(__u32 tid, + __u64 sesid, + __u16 cmd, + __u64 mid, + __u32 status, + int rc), + TP_ARGS(tid, sesid, cmd, mid, status, rc), + TP_STRUCT__entry( + __field(__u32, tid) + __field(__u64, sesid) + __field(__u16, cmd) + __field(__u64, mid) + __field(__u32, status) + __field(int, rc) + ), + TP_fast_assign( + __entry->tid = tid; + __entry->sesid = sesid; + __entry->cmd = cmd; + __entry->mid = mid; + __entry->status = status; + __entry->rc = rc; + ), + TP_printk("\tsid=0x%llx tid=0x%x cmd=%u mid=%llu status=0x%x rc=%d", + __entry->sesid, __entry->tid, __entry->cmd, __entry->mid, + __entry->status, __entry->rc) +) + +#define DEFINE_SMB3_CMD_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_cmd_err_class, smb3_##name, \ + TP_PROTO(__u32 tid, \ + __u64 sesid, \ + __u16 cmd, \ + __u64 mid, \ + __u32 status, \ + int rc), \ + TP_ARGS(tid, sesid, cmd, mid, status, rc)) + +DEFINE_SMB3_CMD_ERR_EVENT(cmd_err); + +DECLARE_EVENT_CLASS(smb3_cmd_done_class, + TP_PROTO(__u32 tid, + __u64 sesid, + __u16 cmd, + __u64 mid), + TP_ARGS(tid, sesid, cmd, mid), + TP_STRUCT__entry( + __field(__u32, tid) + __field(__u64, sesid) + __field(__u16, cmd) + __field(__u64, mid) + ), + TP_fast_assign( + __entry->tid = tid; + __entry->sesid = sesid; + __entry->cmd = cmd; + __entry->mid = mid; + ), + TP_printk("\tsid=0x%llx tid=0x%x cmd=%u mid=%llu", + __entry->sesid, __entry->tid, + __entry->cmd, __entry->mid) +) + +#define DEFINE_SMB3_CMD_DONE_EVENT(name) \ +DEFINE_EVENT(smb3_cmd_done_class, smb3_##name, \ + TP_PROTO(__u32 tid, \ + __u64 sesid, \ + __u16 cmd, \ + __u64 mid), \ + TP_ARGS(tid, sesid, cmd, mid)) + +DEFINE_SMB3_CMD_DONE_EVENT(cmd_done); + +DECLARE_EVENT_CLASS(smb3_exit_err_class, + TP_PROTO(unsigned int xid, + const char *func_name, + int rc), + TP_ARGS(xid, func_name, rc), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(const char *, func_name) + __field(int, rc) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->func_name = func_name; + __entry->rc = rc; + ), + TP_printk("\t%s: xid=%u rc=%d", + __entry->func_name, __entry->xid, __entry->rc) +) + +#define DEFINE_SMB3_EXIT_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_exit_err_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + const char *func_name, \ + int rc), \ + TP_ARGS(xid, func_name, rc)) + +DEFINE_SMB3_EXIT_ERR_EVENT(exit_err); + +DECLARE_EVENT_CLASS(smb3_enter_exit_class, + TP_PROTO(unsigned int xid, + const char *func_name), + TP_ARGS(xid, func_name), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(const char *, func_name) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->func_name = func_name; + ), + TP_printk("\t%s: xid=%u", + __entry->func_name, __entry->xid) +) + +#define DEFINE_SMB3_ENTER_EXIT_EVENT(name) \ +DEFINE_EVENT(smb3_enter_exit_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + const char *func_name), \ + TP_ARGS(xid, func_name)) + +DEFINE_SMB3_ENTER_EXIT_EVENT(enter); +DEFINE_SMB3_ENTER_EXIT_EVENT(exit_done); + +/* + * For smb2/smb3 open call + */ +DECLARE_EVENT_CLASS(smb3_open_err_class, + TP_PROTO(unsigned int xid, + __u32 tid, + __u64 sesid, + int create_options, + int desired_access, + int rc), + TP_ARGS(xid, tid, sesid, create_options, desired_access, rc), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u32, tid) + __field(__u64, sesid) + __field(int, create_options) + __field(int, desired_access) + __field(int, rc) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->create_options = create_options; + __entry->desired_access = desired_access; + __entry->rc = rc; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x cr_opts=0x%x des_access=0x%x rc=%d", + __entry->xid, __entry->sesid, __entry->tid, + __entry->create_options, __entry->desired_access, __entry->rc) +) + +#define DEFINE_SMB3_OPEN_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_open_err_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u32 tid, \ + __u64 sesid, \ + int create_options, \ + int desired_access, \ + int rc), \ + TP_ARGS(xid, tid, sesid, create_options, desired_access, rc)) + +DEFINE_SMB3_OPEN_ERR_EVENT(open_err); + + +DECLARE_EVENT_CLASS(smb3_open_done_class, + TP_PROTO(unsigned int xid, + __u64 fid, + __u32 tid, + __u64 sesid, + int create_options, + int desired_access), + TP_ARGS(xid, fid, tid, sesid, create_options, desired_access), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, fid) + __field(__u32, tid) + __field(__u64, sesid) + __field(int, create_options) + __field(int, desired_access) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->fid = fid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->create_options = create_options; + __entry->desired_access = desired_access; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx cr_opts=0x%x des_access=0x%x", + __entry->xid, __entry->sesid, __entry->tid, __entry->fid, + __entry->create_options, __entry->desired_access) +) + +#define DEFINE_SMB3_OPEN_DONE_EVENT(name) \ +DEFINE_EVENT(smb3_open_done_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 fid, \ + __u32 tid, \ + __u64 sesid, \ + int create_options, \ + int desired_access), \ + TP_ARGS(xid, fid, tid, sesid, create_options, desired_access)) + +DEFINE_SMB3_OPEN_DONE_EVENT(open_done); + +#endif /* _CIFS_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace +#include <trace/define_trace.h> diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 927226a2122f..e7254e386b79 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -800,8 +800,8 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, #ifdef CONFIG_CIFS_SMB311 if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) { struct kvec iov = { - .iov_base = buf + 4, - .iov_len = get_rfc1002_length(buf) + .iov_base = buf, + .iov_len = midQ->resp_buf_size }; smb311_update_preauth_hash(ses, &iov, 1); } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 017b0ab19bc4..c4fb9ad7c808 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -492,7 +492,7 @@ static void cramfs_kill_sb(struct super_block *sb) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); - if (IS_ENABLED(CCONFIG_CRAMFS_MTD) && sb->s_mtd) { + if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sb->s_mtd) { if (sbi && sbi->mtd_point_size) mtd_unpoint(sb->s_mtd, 0, sbi->mtd_point_size); kill_mtd_super(sb); @@ -808,10 +808,7 @@ static struct dentry *cramfs_lookup(struct inode *dir, struct dentry *dentry, un } out: mutex_unlock(&read_mutex); - if (IS_ERR(inode)) - return ERR_CAST(inode); - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } static int cramfs_readpage(struct file *file, struct page *page) @@ -677,7 +677,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, * downgrading page table protection not changing it to point * to a new page. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ if (pmdp) { #ifdef CONFIG_FS_DAX_PMD diff --git a/fs/dcache.c b/fs/dcache.c index 86d2de63461e..0e8e5de3c48a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -580,6 +580,7 @@ static void __dentry_kill(struct dentry *dentry) spin_unlock(&dentry->d_lock); if (likely(can_free)) dentry_free(dentry); + cond_resched(); } static struct dentry *__lock_parent(struct dentry *dentry) @@ -827,30 +828,24 @@ static inline bool fast_dput(struct dentry *dentry) */ void dput(struct dentry *dentry) { - if (unlikely(!dentry)) - return; + while (dentry) { + might_sleep(); -repeat: - might_sleep(); + rcu_read_lock(); + if (likely(fast_dput(dentry))) { + rcu_read_unlock(); + return; + } - rcu_read_lock(); - if (likely(fast_dput(dentry))) { + /* Slow case: now with the dentry lock held */ rcu_read_unlock(); - return; - } - - /* Slow case: now with the dentry lock held */ - rcu_read_unlock(); - if (likely(retain_dentry(dentry))) { - spin_unlock(&dentry->d_lock); - return; - } + if (likely(retain_dentry(dentry))) { + spin_unlock(&dentry->d_lock); + return; + } - dentry = dentry_kill(dentry); - if (dentry) { - cond_resched(); - goto repeat; + dentry = dentry_kill(dentry); } } EXPORT_SYMBOL(dput); @@ -907,6 +902,35 @@ repeat: } EXPORT_SYMBOL(dget_parent); +static struct dentry * __d_find_any_alias(struct inode *inode) +{ + struct dentry *alias; + + if (hlist_empty(&inode->i_dentry)) + return NULL; + alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); + __dget(alias); + return alias; +} + +/** + * d_find_any_alias - find any alias for a given inode + * @inode: inode to find an alias for + * + * If any aliases exist for the given inode, take and return a + * reference for one of them. If no aliases exist, return %NULL. + */ +struct dentry *d_find_any_alias(struct inode *inode) +{ + struct dentry *de; + + spin_lock(&inode->i_lock); + de = __d_find_any_alias(inode); + spin_unlock(&inode->i_lock); + return de; +} +EXPORT_SYMBOL(d_find_any_alias); + /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question @@ -923,34 +947,19 @@ EXPORT_SYMBOL(dget_parent); */ static struct dentry *__d_find_alias(struct inode *inode) { - struct dentry *alias, *discon_alias; + struct dentry *alias; + + if (S_ISDIR(inode->i_mode)) + return __d_find_any_alias(inode); -again: - discon_alias = NULL; hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { spin_lock(&alias->d_lock); - if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { - if (IS_ROOT(alias) && - (alias->d_flags & DCACHE_DISCONNECTED)) { - discon_alias = alias; - } else { - __dget_dlock(alias); - spin_unlock(&alias->d_lock); - return alias; - } - } - spin_unlock(&alias->d_lock); - } - if (discon_alias) { - alias = discon_alias; - spin_lock(&alias->d_lock); - if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { + if (!d_unhashed(alias)) { __dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; } spin_unlock(&alias->d_lock); - goto again; } return NULL; } @@ -1052,8 +1061,6 @@ static void shrink_dentry_list(struct list_head *list) while (!list_empty(list)) { struct dentry *dentry, *parent; - cond_resched(); - dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); rcu_read_lock(); @@ -1230,13 +1237,11 @@ enum d_walk_ret { * @parent: start of walk * @data: data passed to @enter() and @finish() * @enter: callback when first entering the dentry - * @finish: callback when successfully finished the walk * - * The @enter() and @finish() callbacks are called with d_lock held. + * The @enter() callbacks are called with d_lock held. */ static void d_walk(struct dentry *parent, void *data, - enum d_walk_ret (*enter)(void *, struct dentry *), - void (*finish)(void *)) + enum d_walk_ret (*enter)(void *, struct dentry *)) { struct dentry *this_parent; struct list_head *next; @@ -1325,8 +1330,6 @@ ascend: if (need_seqretry(&rename_lock, seq)) goto rename_retry; rcu_read_unlock(); - if (finish) - finish(data); out_unlock: spin_unlock(&this_parent->d_lock); @@ -1375,7 +1378,7 @@ int path_has_submounts(const struct path *parent) struct check_mount data = { .mnt = parent->mnt, .mounted = 0 }; read_seqlock_excl(&mount_lock); - d_walk(parent->dentry, &data, path_check_mount, NULL); + d_walk(parent->dentry, &data, path_check_mount); read_sequnlock_excl(&mount_lock); return data.mounted; @@ -1483,11 +1486,16 @@ void shrink_dcache_parent(struct dentry *parent) data.start = parent; data.found = 0; - d_walk(parent, &data, select_collect, NULL); + d_walk(parent, &data, select_collect); + + if (!list_empty(&data.dispose)) { + shrink_dentry_list(&data.dispose); + continue; + } + + cond_resched(); if (!data.found) break; - - shrink_dentry_list(&data.dispose); } } EXPORT_SYMBOL(shrink_dcache_parent); @@ -1518,7 +1526,7 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) static void do_one_tree(struct dentry *dentry) { shrink_dcache_parent(dentry); - d_walk(dentry, dentry, umount_check, NULL); + d_walk(dentry, dentry, umount_check); d_drop(dentry); dput(dentry); } @@ -1542,78 +1550,48 @@ void shrink_dcache_for_umount(struct super_block *sb) } } -struct detach_data { - struct select_data select; - struct dentry *mountpoint; -}; -static enum d_walk_ret detach_and_collect(void *_data, struct dentry *dentry) +static enum d_walk_ret find_submount(void *_data, struct dentry *dentry) { - struct detach_data *data = _data; - + struct dentry **victim = _data; if (d_mountpoint(dentry)) { __dget_dlock(dentry); - data->mountpoint = dentry; + *victim = dentry; return D_WALK_QUIT; } - - return select_collect(&data->select, dentry); -} - -static void check_and_drop(void *_data) -{ - struct detach_data *data = _data; - - if (!data->mountpoint && list_empty(&data->select.dispose)) - __d_drop(data->select.start); + return D_WALK_CONTINUE; } /** * d_invalidate - detach submounts, prune dcache, and drop * @dentry: dentry to invalidate (aka detach, prune and drop) - * - * no dcache lock. - * - * The final d_drop is done as an atomic operation relative to - * rename_lock ensuring there are no races with d_set_mounted. This - * ensures there are no unhashed dentries on the path to a mountpoint. */ void d_invalidate(struct dentry *dentry) { - /* - * If it's already been dropped, return OK. - */ + bool had_submounts = false; spin_lock(&dentry->d_lock); if (d_unhashed(dentry)) { spin_unlock(&dentry->d_lock); return; } + __d_drop(dentry); spin_unlock(&dentry->d_lock); /* Negative dentries can be dropped without further checks */ - if (!dentry->d_inode) { - d_drop(dentry); + if (!dentry->d_inode) return; - } + shrink_dcache_parent(dentry); for (;;) { - struct detach_data data; - - data.mountpoint = NULL; - INIT_LIST_HEAD(&data.select.dispose); - data.select.start = dentry; - data.select.found = 0; - - d_walk(dentry, &data, detach_and_collect, check_and_drop); - - if (!list_empty(&data.select.dispose)) - shrink_dentry_list(&data.select.dispose); - else if (!data.mountpoint) + struct dentry *victim = NULL; + d_walk(dentry, &victim, find_submount); + if (!victim) { + if (had_submounts) + shrink_dcache_parent(dentry); return; - - if (data.mountpoint) { - detach_mounts(data.mountpoint); - dput(data.mountpoint); } + had_submounts = true; + detach_mounts(victim); + dput(victim); } } EXPORT_SYMBOL(d_invalidate); @@ -1899,6 +1877,28 @@ void d_instantiate(struct dentry *entry, struct inode * inode) } EXPORT_SYMBOL(d_instantiate); +/* + * This should be equivalent to d_instantiate() + unlock_new_inode(), + * with lockdep-related part of unlock_new_inode() done before + * anything else. Use that instead of open-coding d_instantiate()/ + * unlock_new_inode() combinations. + */ +void d_instantiate_new(struct dentry *entry, struct inode *inode) +{ + BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); + BUG_ON(!inode); + lockdep_annotate_inode_mutex_key(inode); + security_d_instantiate(entry, inode); + spin_lock(&inode->i_lock); + __d_instantiate(entry, inode); + WARN_ON(!(inode->i_state & I_NEW)); + inode->i_state &= ~I_NEW; + smp_mb(); + wake_up_bit(&inode->i_state, __I_NEW); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(d_instantiate_new); + /** * d_instantiate_no_diralias - instantiate a non-aliased dentry * @entry: dentry to complete @@ -1941,35 +1941,6 @@ struct dentry *d_make_root(struct inode *root_inode) } EXPORT_SYMBOL(d_make_root); -static struct dentry * __d_find_any_alias(struct inode *inode) -{ - struct dentry *alias; - - if (hlist_empty(&inode->i_dentry)) - return NULL; - alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); - __dget(alias); - return alias; -} - -/** - * d_find_any_alias - find any alias for a given inode - * @inode: inode to find an alias for - * - * If any aliases exist for the given inode, take and return a - * reference for one of them. If no aliases exist, return %NULL. - */ -struct dentry *d_find_any_alias(struct inode *inode) -{ - struct dentry *de; - - spin_lock(&inode->i_lock); - de = __d_find_any_alias(inode); - spin_unlock(&inode->i_lock); - return de; -} -EXPORT_SYMBOL(d_find_any_alias); - static struct dentry *__d_instantiate_anon(struct dentry *dentry, struct inode *inode, bool disconnected) @@ -3112,7 +3083,7 @@ static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) void d_genocide(struct dentry *parent) { - d_walk(parent, parent, d_genocide_kill, NULL); + d_walk(parent, parent, d_genocide_kill); } EXPORT_SYMBOL(d_genocide); diff --git a/fs/direct-io.c b/fs/direct-io.c index 874607bb6e02..093fb54cd316 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -432,8 +432,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, struct bio *bio; /* - * bio_alloc() is guaranteed to return a bio when called with - * __GFP_RECLAIM and we request a valid number of vectors. + * bio_alloc() is guaranteed to return a bio when allowed to sleep and + * we request a valid number of vectors. */ bio = bio_alloc(GFP_KERNEL, nr_vecs); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 5243989a60cc..a5e4a221435c 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1037,6 +1037,7 @@ static void sctp_connect_to_sock(struct connection *con) int result; int addr_len; struct socket *sock; + struct timeval tv = { .tv_sec = 5, .tv_usec = 0 }; if (con->nodeid == 0) { log_print("attempt to connect sock 0 foiled"); @@ -1080,11 +1081,22 @@ static void sctp_connect_to_sock(struct connection *con) log_print("connecting to %d", con->nodeid); /* Turn off Nagle's algorithm */ - kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, + kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one, sizeof(one)); + /* + * Make sock->ops->connect() function return in specified time, + * since O_NONBLOCK argument in connect() function does not work here, + * then, we should restore the default value of this attribute. + */ + kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, + sizeof(tv)); result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len, - O_NONBLOCK); + 0); + memset(&tv, 0, sizeof(tv)); + kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, + sizeof(tv)); + if (result == -EINPROGRESS) result = 0; if (result == 0) diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 97d17eaeba07..49121e5a8de2 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -283,8 +283,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, iget_failed(ecryptfs_inode); goto out; } - unlock_new_inode(ecryptfs_inode); - d_instantiate(ecryptfs_dentry, ecryptfs_inode); + d_instantiate_new(ecryptfs_dentry, ecryptfs_inode); out: return rc; } diff --git a/fs/eventfd.c b/fs/eventfd.c index 08d3bd602f73..61c9514da5e9 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -101,14 +101,20 @@ static int eventfd_release(struct inode *inode, struct file *file) return 0; } -static __poll_t eventfd_poll(struct file *file, poll_table *wait) +static struct wait_queue_head * +eventfd_get_poll_head(struct file *file, __poll_t events) +{ + struct eventfd_ctx *ctx = file->private_data; + + return &ctx->wqh; +} + +static __poll_t eventfd_poll_mask(struct file *file, __poll_t eventmask) { struct eventfd_ctx *ctx = file->private_data; __poll_t events = 0; u64 count; - poll_wait(file, &ctx->wqh, wait); - /* * All writes to ctx->count occur within ctx->wqh.lock. This read * can be done outside ctx->wqh.lock because we know that poll_wait @@ -305,7 +311,8 @@ static const struct file_operations eventfd_fops = { .show_fdinfo = eventfd_show_fdinfo, #endif .release = eventfd_release, - .poll = eventfd_poll, + .get_poll_head = eventfd_get_poll_head, + .poll_mask = eventfd_poll_mask, .read = eventfd_read, .write = eventfd_write, .llseek = noop_llseek, diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 602ca4285b2e..67db22fe99c5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -884,8 +884,7 @@ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, pt->_key = epi->event.events; if (!is_file_epoll(epi->ffd.file)) - return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & - epi->event.events; + return vfs_poll(epi->ffd.file, pt) & epi->event.events; ep = epi->ffd.file->private_data; poll_wait(epi->ffd.file, &ep->poll_wait, pt); @@ -2025,7 +2024,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, /* The target file descriptor must support poll */ error = -EPERM; - if (!tf.file->f_op->poll) + if (!file_can_poll(tf.file)) goto error_tgt_fput; /* Check if EPOLLWAKEUP is allowed */ diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 3c6a9c156b7a..ddbf87246898 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -790,7 +790,7 @@ int ore_create(struct ore_io_state *ios) for (i = 0; i < ios->oc->numdevs; i++) { struct osd_request *or; - or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, i)); if (unlikely(!or)) { ORE_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -815,7 +815,7 @@ int ore_remove(struct ore_io_state *ios) for (i = 0; i < ios->oc->numdevs; i++) { struct osd_request *or; - or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, i)); if (unlikely(!or)) { ORE_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -847,7 +847,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; struct osd_request *or; - or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, dev)); if (unlikely(!or)) { ORE_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -966,7 +966,7 @@ int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp) return 0; /* Just an empty slot */ first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; - or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, first_dev)); if (unlikely(!or)) { ORE_ERR("%s: osd_start_request failed\n", __func__); return -ENOMEM; @@ -1060,7 +1060,7 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp, struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; struct osd_request *or; - or = osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, cur_comp)); if (unlikely(!or)) { ORE_ERR("%s: osd_start_request failed\n", __func__); return -ENOMEM; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 179cd5c2f52a..719a3152da80 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -229,7 +229,7 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, u64 offset, void *p, unsigned length) { - struct osd_request *or = osd_start_request(od, GFP_KERNEL); + struct osd_request *or = osd_start_request(od); /* struct osd_sense_info osi = {.key = 0};*/ int ret; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 1e01fabef130..71635909df3b 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1264,21 +1264,11 @@ do_indirects: static void ext2_truncate_blocks(struct inode *inode, loff_t offset) { - /* - * XXX: it seems like a bug here that we don't allow - * IS_APPEND inode to have blocks-past-i_size trimmed off. - * review and fix this. - * - * Also would be nice to be able to handle IO errors and such, - * but that's probably too much to ask. - */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) return; if (ext2_inode_is_fast_symlink(inode)) return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; dax_sem_down_write(EXT2_I(inode)); __ext2_truncate_blocks(inode, offset); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 55f7caadb093..152453a91877 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -41,8 +41,7 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode) { int err = ext2_add_link(dentry, inode); if (!err) { - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; } inode_dec_link_count(inode); @@ -255,8 +254,7 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) if (err) goto out_fail; - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); out: return err; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 508b905d744d..b00481c475cb 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -185,25 +185,15 @@ static int ext4_init_block_bitmap(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start, tmp; int flex_bg = 0; - struct ext4_group_info *grp; J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { - grp = ext4_get_group_info(sb, block_group); - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - grp->bb_free); - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); - if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - int count; - count = ext4_free_inodes_count(sb, gdp); - percpu_counter_sub(&sbi->s_freeinodes_counter, - count); - } - set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT | + EXT4_GROUP_INFO_IBITMAP_CORRUPT); return -EFSBADCRC; } memset(bh->b_data, 0, sb->s_blocksize); @@ -375,7 +365,6 @@ static int ext4_validate_block_bitmap(struct super_block *sb, { ext4_fsblk_t blk; struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); - struct ext4_sb_info *sbi = EXT4_SB(sb); if (buffer_verified(bh)) return 0; @@ -387,10 +376,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb, desc, bh))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - grp->bb_free); - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSBADCRC; } blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); @@ -398,10 +385,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb, ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: invalid block bitmap", block_group, blk); - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - grp->bb_free); - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSCORRUPTED; } set_buffer_verified(bh); @@ -436,6 +421,8 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { ext4_error(sb, "Invalid block bitmap block %llu in " "block_group %u", bitmap_blk, block_group); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); return ERR_PTR(-EFSCORRUPTED); } bh = sb_getblk(sb, bitmap_blk); @@ -514,6 +501,8 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, ext4_error(sb, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, (unsigned long long) bh->b_blocknr); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EIO; } clear_buffer_new(bh); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a42e71203e53..df95412915ea 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2390,7 +2390,7 @@ extern int ext4_init_inode_table(struct super_block *sb, extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); /* mballoc.c */ -extern const struct file_operations ext4_seq_mb_groups_fops; +extern const struct seq_operations ext4_mb_seq_groups_ops; extern long ext4_mb_stats; extern long ext4_mb_max_to_scan; extern int ext4_mb_init(struct super_block *); @@ -2530,6 +2530,9 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); extern const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]); +extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, + ext4_group_t block_group, + unsigned int flags); extern __printf(4, 5) void __ext4_error(struct super_block *, const char *, unsigned int, @@ -2857,6 +2860,10 @@ struct ext4_group_info { #define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 #define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 #define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 763ef185dd17..c4e6fb15101b 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -162,8 +162,7 @@ int __init ext4_init_es(void) void ext4_exit_es(void) { - if (ext4_es_cachep) - kmem_cache_destroy(ext4_es_cachep); + kmem_cache_destroy(ext4_es_cachep); } void ext4_es_init_tree(struct ext4_es_tree *tree) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index fb6f023622fe..7f8023340eb8 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -277,10 +277,11 @@ out: } #ifdef CONFIG_FS_DAX -static int ext4_dax_huge_fault(struct vm_fault *vmf, +static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size) { - int result, error = 0; + int error = 0; + vm_fault_t result; int retries = 0; handle_t *handle = NULL; struct inode *inode = file_inode(vmf->vma->vm_file); @@ -335,7 +336,7 @@ retry: return result; } -static int ext4_dax_fault(struct vm_fault *vmf) +static vm_fault_t ext4_dax_fault(struct vm_fault *vmf) { return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); } @@ -380,50 +381,64 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static int ext4_file_open(struct inode * inode, struct file * filp) +static int ext4_sample_last_mounted(struct super_block *sb, + struct vfsmount *mnt) { - struct super_block *sb = inode->i_sb; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct vfsmount *mnt = filp->f_path.mnt; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct path path; char buf[64], *cp; + handle_t *handle; + int err; + + if (likely(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED)) + return 0; + + if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb)) + return 0; + + sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; + /* + * Sample where the filesystem has been mounted and + * store it in the superblock for sysadmin convenience + * when trying to sort through large numbers of block + * devices or filesystem images. + */ + memset(buf, 0, sizeof(buf)); + path.mnt = mnt; + path.dentry = mnt->mnt_root; + cp = d_path(&path, buf, sizeof(buf)); + err = 0; + if (IS_ERR(cp)) + goto out; + + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + err = PTR_ERR(handle); + if (IS_ERR(handle)) + goto out; + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto out_journal; + strlcpy(sbi->s_es->s_last_mounted, cp, + sizeof(sbi->s_es->s_last_mounted)); + ext4_handle_dirty_super(handle, sb); +out_journal: + ext4_journal_stop(handle); +out: + sb_end_intwrite(sb); + return err; +} + +static int ext4_file_open(struct inode * inode, struct file * filp) +{ int ret; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; - if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && - !sb_rdonly(sb))) { - sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; - /* - * Sample where the filesystem has been mounted and - * store it in the superblock for sysadmin convenience - * when trying to sort through large numbers of block - * devices or filesystem images. - */ - memset(buf, 0, sizeof(buf)); - path.mnt = mnt; - path.dentry = mnt->mnt_root; - cp = d_path(&path, buf, sizeof(buf)); - if (!IS_ERR(cp)) { - handle_t *handle; - int err; - - handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); - if (IS_ERR(handle)) - return PTR_ERR(handle); - BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sbi->s_sbh); - if (err) { - ext4_journal_stop(handle); - return err; - } - strlcpy(sbi->s_es->s_last_mounted, cp, - sizeof(sbi->s_es->s_last_mounted)); - ext4_handle_dirty_super(handle, sb); - ext4_journal_stop(handle); - } - } + ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); + if (ret) + return ret; ret = fscrypt_file_open(inode, filp); if (ret) diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index e871c4bf18e9..4b99e2db95b8 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -402,8 +402,8 @@ static void ext4_getfsmap_free_fixed_metadata(struct list_head *meta_list) } /* Find all the fixed metadata in the filesystem. */ -int ext4_getfsmap_find_fixed_metadata(struct super_block *sb, - struct list_head *meta_list) +static int ext4_getfsmap_find_fixed_metadata(struct super_block *sb, + struct list_head *meta_list) { struct ext4_group_desc *gdp; ext4_group_t agno; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index df92e3ec9913..4d6e007f3569 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -83,7 +83,6 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, { ext4_fsblk_t blk; struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); - struct ext4_sb_info *sbi = EXT4_SB(sb); if (buffer_verified(bh)) return 0; @@ -97,14 +96,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, ext4_unlock_group(sb, block_group); ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, blk); - grp = ext4_get_group_info(sb, block_group); - if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - int count; - count = ext4_free_inodes_count(sb, desc); - percpu_counter_sub(&sbi->s_freeinodes_counter, - count); - } - set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); return -EFSBADCRC; } set_buffer_verified(bh); @@ -136,6 +129,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { ext4_error(sb, "Invalid inode bitmap blk %llu in " "block_group %u", bitmap_blk, block_group); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); return ERR_PTR(-EFSCORRUPTED); } bh = sb_getblk(sb, bitmap_blk); @@ -143,7 +138,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); - return ERR_PTR(-EIO); + return ERR_PTR(-ENOMEM); } if (bitmap_uptodate(bh)) goto verify; @@ -190,6 +185,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); return ERR_PTR(-EIO); } @@ -337,13 +334,8 @@ out: fatal = err; } else { ext4_error(sb, "bit already cleared for inode %lu", ino); - if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - int count; - count = ext4_free_inodes_count(sb, gdp); - percpu_counter_sub(&sbi->s_freeinodes_counter, - count); - } - set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); } error_return: @@ -914,6 +906,8 @@ repeat_in_this_group: if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) { ext4_error(sb, "reserved inode found cleared - " "inode=%lu", ino + 1); + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto next_group; } @@ -1105,6 +1099,8 @@ got: err = -EIO; ext4_error(sb, "failed to insert inode %lu: doubly allocated?", inode->i_ino); + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto out; } inode->i_generation = prandom_u32(); @@ -1206,11 +1202,8 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); - if (IS_ERR(bitmap_bh)) { - ext4_error(sb, "inode bitmap error %ld for orphan %lu", - ino, PTR_ERR(bitmap_bh)); + if (IS_ERR(bitmap_bh)) return (struct inode *) bitmap_bh; - } /* Having the inode bit set should be a 100% indicator that this * is a valid orphan (no e2fsck run on fs). Orphans also include diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index c32802c956d5..bf7fa1507e81 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -561,10 +561,16 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, unsigned epb = inode->i_sb->s_blocksize / sizeof(u32); int i; - /* Count number blocks in a subtree under 'partial' */ - count = 1; - for (i = 0; partial + i != chain + depth - 1; i++) - count *= epb; + /* + * Count number blocks in a subtree under 'partial'. At each + * level we count number of complete empty subtrees beyond + * current offset and then descend into the subtree only + * partially beyond current offset. + */ + count = 0; + for (i = partial - chain + 1; i < depth; i++) + count = count * epb + (epb - offsets[i] - 1); + count++; /* Fill in size of a hole we found */ map->m_pblk = 0; map->m_len = min_t(unsigned int, map->m_len, count); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index e1f00891ef95..285ed1588730 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -144,6 +144,12 @@ int ext4_find_inline_data_nolock(struct inode *inode) goto out; if (!is.s.not_found) { + if (is.s.here->e_value_inum) { + EXT4_ERROR_INODE(inode, "inline data xattr refers " + "to an external xattr inode"); + error = -EFSCORRUPTED; + goto out; + } EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1e50c5efae67..2ea07efbe016 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4298,28 +4298,28 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) EXT4_BLOCK_SIZE_BITS(sb); stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - /* If there are no blocks to remove, return now */ - if (first_block >= stop_block) - goto out_stop; + /* If there are blocks to remove, do it */ + if (stop_block > first_block) { - down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); - ret = ext4_es_remove_extent(inode, first_block, - stop_block - first_block); - if (ret) { - up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; - } + ret = ext4_es_remove_extent(inode, first_block, + stop_block - first_block); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_remove_space(inode, first_block, - stop_block - 1); - else - ret = ext4_ind_remove_space(handle, inode, first_block, - stop_block); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ret = ext4_ext_remove_space(inode, first_block, + stop_block - 1); + else + ret = ext4_ind_remove_space(handle, inode, first_block, + stop_block); - up_write(&EXT4_I(inode)->i_data_sem); + up_write(&EXT4_I(inode)->i_data_sem); + } if (IS_SYNC(inode)) ext4_handle_sync(handle); @@ -4701,19 +4701,21 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, } } -static inline void ext4_iget_extra_inode(struct inode *inode, +static inline int ext4_iget_extra_inode(struct inode *inode, struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { __le32 *magic = (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; + if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <= EXT4_INODE_SIZE(inode->i_sb) && *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { ext4_set_inode_state(inode, EXT4_STATE_XATTR); - ext4_find_inline_data_nolock(inode); + return ext4_find_inline_data_nolock(inode); } else EXT4_I(inode)->i_inline_off = 0; + return 0; } int ext4_get_projid(struct inode *inode, kprojid_t *projid) @@ -4724,6 +4726,26 @@ int ext4_get_projid(struct inode *inode, kprojid_t *projid) return 0; } +/* + * ext4 has self-managed i_version for ea inodes, it stores the lower 32bit of + * refcount in i_version, so use raw values if inode has EXT4_EA_INODE_FL flag + * set. + */ +static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) +{ + if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) + inode_set_iversion_raw(inode, val); + else + inode_set_iversion_queried(inode, val); +} +static inline u64 ext4_inode_peek_iversion(const struct inode *inode) +{ + if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) + return inode_peek_iversion_raw(inode); + else + return inode_peek_iversion(inode); +} + struct inode *ext4_iget(struct super_block *sb, unsigned long ino) { struct ext4_iloc iloc; @@ -4893,7 +4915,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { - ext4_iget_extra_inode(inode, raw_inode, ei); + ret = ext4_iget_extra_inode(inode, raw_inode, ei); + if (ret) + goto bad_inode; } } @@ -4910,7 +4934,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ivers |= (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } - inode_set_iversion_queried(inode, ivers); + ext4_inode_set_iversion_queried(inode, ivers); } ret = 0; @@ -4945,6 +4969,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { + /* VFS does not allow setting these so must be corruption */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { + EXT4_ERROR_INODE(inode, + "immutable or append flags not allowed on symlinks"); + ret = -EFSCORRUPTED; + goto bad_inode; + } if (ext4_encrypted_inode(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; ext4_set_aops(inode); @@ -5196,7 +5227,7 @@ static int ext4_do_update_inode(handle_t *handle, } if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { - u64 ivers = inode_peek_iversion(inode); + u64 ivers = ext4_inode_peek_iversion(inode); raw_inode->i_disk_version = cpu_to_le32(ivers); if (ei->i_extra_isize) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 769a62708b1c..6eae2b91aafa 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -470,6 +470,8 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, "freeing block already freed " "(bit %u)", first + i); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } @@ -747,10 +749,8 @@ void ext4_mb_generate_buddy(struct super_block *sb, * corrupt and update bb_free using bitmap value */ grp->bb_free = free; - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - grp->bb_free); - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_set_largest_free_order(sb, grp); @@ -1454,12 +1454,8 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, "freeing already freed block " "(bit %u); block bitmap corrupt.", block); - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - e4b->bd_info->bb_free); - /* Mark the block group as corrupt. */ - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, - &e4b->bd_info->bb_state); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); mb_regenerate_buddy(e4b); goto done; } @@ -1956,6 +1952,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, "%d free clusters as per " "group info. But bitmap says 0", free); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); break; } @@ -1966,6 +1964,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); /* * The number of free blocks differs. This mostly * indicate that the bitmap is corrupt. So exit @@ -2254,7 +2254,7 @@ out: static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { - struct super_block *sb = seq->private; + struct super_block *sb = PDE_DATA(file_inode(seq->file)); ext4_group_t group; if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) @@ -2265,7 +2265,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) { - struct super_block *sb = seq->private; + struct super_block *sb = PDE_DATA(file_inode(seq->file)); ext4_group_t group; ++*pos; @@ -2277,7 +2277,7 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { - struct super_block *sb = seq->private; + struct super_block *sb = PDE_DATA(file_inode(seq->file)); ext4_group_t group = (ext4_group_t) ((unsigned long) v); int i; int err, buddy_loaded = 0; @@ -2330,34 +2330,13 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) { } -static const struct seq_operations ext4_mb_seq_groups_ops = { +const struct seq_operations ext4_mb_seq_groups_ops = { .start = ext4_mb_seq_groups_start, .next = ext4_mb_seq_groups_next, .stop = ext4_mb_seq_groups_stop, .show = ext4_mb_seq_groups_show, }; -static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) -{ - struct super_block *sb = PDE_DATA(inode); - int rc; - - rc = seq_open(file, &ext4_mb_seq_groups_ops); - if (rc == 0) { - struct seq_file *m = file->private_data; - m->private = sb; - } - return rc; - -} - -const struct file_operations ext4_seq_mb_groups_fops = { - .open = ext4_mb_seq_groups_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) { int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; @@ -2537,8 +2516,7 @@ static void ext4_groupinfo_destroy_slabs(void) int i; for (i = 0; i < NR_GRPINFO_CACHES; i++) { - if (ext4_groupinfo_caches[i]) - kmem_cache_destroy(ext4_groupinfo_caches[i]); + kmem_cache_destroy(ext4_groupinfo_caches[i]); ext4_groupinfo_caches[i] = NULL; } } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b1f21e3a0763..4a09063ce1d2 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2411,8 +2411,7 @@ static int ext4_add_nondir(handle_t *handle, int err = ext4_add_entry(handle, dentry, inode); if (!err) { ext4_mark_inode_dirty(handle, inode); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; } drop_nlink(inode); @@ -2651,8 +2650,7 @@ out_clear_inode: err = ext4_mark_inode_dirty(handle, dir); if (err) goto out_clear_inode; - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b6bec270a8e4..d792b7689d92 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1933,7 +1933,7 @@ retry: return 0; n_group = ext4_get_group_number(sb, n_blocks_count - 1); - if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) { + if (n_group >= (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) { ext4_warning(sb, "resize would cause inodes_count overflow"); return -EINVAL; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2e1622907f4a..7022d843c667 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -763,6 +763,36 @@ __acquires(bitlock) return; } +void ext4_mark_group_bitmap_corrupted(struct super_block *sb, + ext4_group_t group, + unsigned int flags) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); + + if ((flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) && + !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) { + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, + &grp->bb_state); + } + + if ((flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) && + !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + if (gdp) { + int count; + + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, + &grp->bb_state); + } +} + void ext4_update_dynamic_rev(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; @@ -2116,12 +2146,12 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int read_only) { struct ext4_sb_info *sbi = EXT4_SB(sb); - int res = 0; + int err = 0; if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { ext4_msg(sb, KERN_ERR, "revision level too high, " "forcing read-only mode"); - res = SB_RDONLY; + err = -EROFS; } if (read_only) goto done; @@ -2154,7 +2184,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, if (sbi->s_journal) ext4_set_feature_journal_needs_recovery(sb); - ext4_commit_super(sb, 1); + err = ext4_commit_super(sb, 1); done: if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " @@ -2166,7 +2196,7 @@ done: sbi->s_mount_opt, sbi->s_mount_opt2); cleancache_init_fs(sb); - return res; + return err; } int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) @@ -4223,8 +4253,12 @@ no_journal: goto failed_mount4; } - if (ext4_setup_super(sb, es, sb_rdonly(sb))) + ret = ext4_setup_super(sb, es, sb_rdonly(sb)); + if (ret == -EROFS) { sb->s_flags |= SB_RDONLY; + ret = 0; + } else if (ret) + goto failed_mount4a; /* determine the minimum size of new large inodes, if present */ if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE && @@ -4759,11 +4793,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) unlock_buffer(sbh); error = __sync_dirty_buffer(sbh, REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0)); - if (error) - return error; - - error = buffer_write_io_error(sbh); - if (error) { + if (buffer_write_io_error(sbh)) { ext4_msg(sb, KERN_ERR, "I/O error while writing " "superblock"); clear_buffer_write_io_error(sbh); @@ -5164,8 +5194,12 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal) ext4_clear_journal_err(sb, es); sbi->s_mount_state = le16_to_cpu(es->s_state); - if (!ext4_setup_super(sb, es, 0)) - sb->s_flags &= ~SB_RDONLY; + + err = ext4_setup_super(sb, es, 0); + if (err) + goto restore_opts; + + sb->s_flags &= ~SB_RDONLY; if (ext4_has_feature_mmp(sb)) if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) { @@ -5189,8 +5223,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } ext4_setup_system_zone(sb); - if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) - ext4_commit_super(sb, 1); + if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) { + err = ext4_commit_super(sb, 1); + if (err) + goto restore_opts; + } #ifdef CONFIG_QUOTA /* Release old quota file names */ @@ -5251,7 +5288,8 @@ static int ext4_statfs_project(struct super_block *sb, dquot->dq_dqb.dqb_bsoftlimit : dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; if (limit && buf->f_blocks > limit) { - curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + curblock = (dquot->dq_dqb.dqb_curspace + + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; buf->f_blocks = limit; buf->f_bfree = buf->f_bavail = (buf->f_blocks > curblock) ? diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 9ebd26c957c2..f34da0bb8f17 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -346,39 +346,9 @@ static struct kobject *ext4_root; static struct kobject *ext4_feat; -#define PROC_FILE_SHOW_DEFN(name) \ -static int name##_open(struct inode *inode, struct file *file) \ -{ \ - return single_open(file, ext4_seq_##name##_show, PDE_DATA(inode)); \ -} \ -\ -static const struct file_operations ext4_seq_##name##_fops = { \ - .open = name##_open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -} - -#define PROC_FILE_LIST(name) \ - { __stringify(name), &ext4_seq_##name##_fops } - -PROC_FILE_SHOW_DEFN(es_shrinker_info); -PROC_FILE_SHOW_DEFN(options); - -static const struct ext4_proc_files { - const char *name; - const struct file_operations *fops; -} proc_files[] = { - PROC_FILE_LIST(options), - PROC_FILE_LIST(es_shrinker_info), - PROC_FILE_LIST(mb_groups), - { NULL, NULL }, -}; - int ext4_register_sysfs(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - const struct ext4_proc_files *p; int err; init_completion(&sbi->s_kobj_unregister); @@ -392,11 +362,14 @@ int ext4_register_sysfs(struct super_block *sb) if (ext4_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); - if (sbi->s_proc) { - for (p = proc_files; p->name; p++) - proc_create_data(p->name, S_IRUGO, sbi->s_proc, - p->fops, sb); + proc_create_single_data("options", S_IRUGO, sbi->s_proc, + ext4_seq_options_show, sb); + proc_create_single_data("es_shrinker_info", S_IRUGO, + sbi->s_proc, ext4_seq_es_shrinker_info_show, + sb); + proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_ops, sb); } return 0; } @@ -404,13 +377,9 @@ int ext4_register_sysfs(struct super_block *sb) void ext4_unregister_sysfs(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - const struct ext4_proc_files *p; - if (sbi->s_proc) { - for (p = proc_files; p->name; p++) - remove_proc_entry(p->name, sbi->s_proc); - remove_proc_entry(sb->s_id, ext4_proc_root); - } + if (sbi->s_proc) + remove_proc_subtree(sb->s_id, ext4_proc_root); kobject_del(&sbi->s_kobj); } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 499cb4b1fbd2..fc4ced59c565 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1688,7 +1688,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, /* No failures allowed past this point. */ - if (!s->not_found && here->e_value_offs) { + if (!s->not_found && here->e_value_size && here->e_value_offs) { /* Remove the old value. */ void *first_val = s->base + min_offs; size_t offs = le16_to_cpu(here->e_value_offs); diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 629001b28632..197a9d8a15ef 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -43,7 +43,7 @@ ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY, xattr->name, xattr->value, - xattr->value_len, 0); + xattr->value_len, XATTR_CREATE); if (err < 0) break; } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index d5098efe577c..75e37fd720b2 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -294,8 +294,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, alloc_nid_done(sbi, ino); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); @@ -597,8 +596,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, err = page_symlink(inode, disk_link.name, disk_link.len); err_out: - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); /* * Let's flush symlink data in order to avoid broken symlink as much as @@ -661,8 +659,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) alloc_nid_done(sbi, inode->i_ino); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); @@ -713,8 +710,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, alloc_nid_done(sbi, inode->i_ino); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index f33a56d6e6dd..4b47ca6296a7 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -572,23 +572,6 @@ static int iostat_info_seq_show(struct seq_file *seq, void *offset) return 0; } -#define F2FS_PROC_FILE_DEF(_name) \ -static int _name##_open_fs(struct inode *inode, struct file *file) \ -{ \ - return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ -} \ - \ -static const struct file_operations f2fs_seq_##_name##_fops = { \ - .open = _name##_open_fs, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -}; - -F2FS_PROC_FILE_DEF(segment_info); -F2FS_PROC_FILE_DEF(segment_bits); -F2FS_PROC_FILE_DEF(iostat_info); - int __init f2fs_init_sysfs(void) { int ret; @@ -632,12 +615,12 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); if (sbi->s_proc) { - proc_create_data("segment_info", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_info_fops, sb); - proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_bits_fops, sb); - proc_create_data("iostat_info", S_IRUGO, sbi->s_proc, - &f2fs_seq_iostat_info_fops, sb); + proc_create_single_data("segment_info", S_IRUGO, sbi->s_proc, + segment_info_seq_show, sb); + proc_create_single_data("segment_bits", S_IRUGO, sbi->s_proc, + segment_bits_seq_show, sb); + proc_create_single_data("iostat_info", S_IRUGO, sbi->s_proc, + iostat_info_seq_show, sb); } return 0; } diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 582ca731a6c9..484ce674e0cd 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -314,10 +314,6 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry) int err; mutex_lock(&MSDOS_SB(sb)->s_lock); - /* - * Check whether the directory is not in use, then check - * whether it is empty. - */ err = fat_dir_empty(inode); if (err) goto out; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 2649759c478a..4f4362d5a04c 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -697,15 +697,6 @@ static int vfat_find(struct inode *dir, const struct qstr *qname, return fat_search_long(dir, qname->name, len, sinfo); } -/* - * (nfsd's) anonymous disconnected dentry? - * NOTE: !IS_ROOT() is not anonymous (I.e. d_splice_alias() did the job). - */ -static int vfat_d_anon_disconn(struct dentry *dentry) -{ - return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED); -} - static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { @@ -738,8 +729,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, * Checking "alias->d_parent == dentry->d_parent" to make sure * FS is not corrupted (especially double linked dir). */ - if (alias && alias->d_parent == dentry->d_parent && - !vfat_d_anon_disconn(alias)) { + if (alias && alias->d_parent == dentry->d_parent) { /* * This inode has non anonymous-DCACHE_DISCONNECTED * dentry. This means, the user did ->lookup() by an @@ -747,7 +737,6 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, * * Switch to new one for reason of locality if possible. */ - BUG_ON(d_unhashed(alias)); if (!S_ISDIR(inode->i_mode)) d_move(alias, dentry); iput(inode); diff --git a/fs/fcntl.c b/fs/fcntl.c index d737ff082472..c42169459298 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -871,9 +871,9 @@ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) if (fa->fa_file != filp) continue; - spin_lock_irq(&fa->fa_lock); + write_lock_irq(&fa->fa_lock); fa->fa_file = NULL; - spin_unlock_irq(&fa->fa_lock); + write_unlock_irq(&fa->fa_lock); *fp = fa->fa_next; call_rcu(&fa->fa_rcu, fasync_free_rcu); @@ -918,13 +918,13 @@ struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasy if (fa->fa_file != filp) continue; - spin_lock_irq(&fa->fa_lock); + write_lock_irq(&fa->fa_lock); fa->fa_fd = fd; - spin_unlock_irq(&fa->fa_lock); + write_unlock_irq(&fa->fa_lock); goto out; } - spin_lock_init(&new->fa_lock); + rwlock_init(&new->fa_lock); new->magic = FASYNC_MAGIC; new->fa_file = filp; new->fa_fd = fd; @@ -987,14 +987,13 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) { while (fa) { struct fown_struct *fown; - unsigned long flags; if (fa->magic != FASYNC_MAGIC) { printk(KERN_ERR "kill_fasync: bad magic number in " "fasync_struct!\n"); return; } - spin_lock_irqsave(&fa->fa_lock, flags); + read_lock(&fa->fa_lock); if (fa->fa_file) { fown = &fa->fa_file->f_owner; /* Don't send SIGURG to processes which have not set a @@ -1003,7 +1002,7 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band); } - spin_unlock_irqrestore(&fa->fa_lock, flags); + read_unlock(&fa->fa_lock); fa = rcu_dereference(fa->fa_next); } } diff --git a/fs/filesystems.c b/fs/filesystems.c index f2728a4a03a1..b03f57b1105b 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -238,21 +238,9 @@ static int filesystems_proc_show(struct seq_file *m, void *v) return 0; } -static int filesystems_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, filesystems_proc_show, NULL); -} - -static const struct file_operations filesystems_proc_fops = { - .open = filesystems_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_filesystems_init(void) { - proc_create("filesystems", 0, NULL, &filesystems_proc_fops); + proc_create_single("filesystems", 0, NULL, filesystems_proc_show); return 0; } module_init(proc_filesystems_init); diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index ce4785fd81c6..a51425634f65 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -193,13 +193,9 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags) return ERR_PTR(-ENAMETOOLONG); ino = vxfs_inode_by_name(dip, dp); - if (ino) { + if (ino) ip = vxfs_iget(dip->i_sb, ino); - if (IS_ERR(ip)) - return ERR_CAST(ip); - } - d_add(dp, ip); - return NULL; + return d_splice_alias(ip, dp); } /** diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c index 15a3d042247e..9a13e9e15b69 100644 --- a/fs/fscache/histogram.c +++ b/fs/fscache/histogram.c @@ -83,24 +83,9 @@ static void fscache_histogram_stop(struct seq_file *m, void *v) { } -static const struct seq_operations fscache_histogram_ops = { +const struct seq_operations fscache_histogram_ops = { .start = fscache_histogram_start, .stop = fscache_histogram_stop, .next = fscache_histogram_next, .show = fscache_histogram_show, }; - -/* - * open "/proc/fs/fscache/histogram" to provide latency data - */ -static int fscache_histogram_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &fscache_histogram_ops); -} - -const struct file_operations fscache_histogram_fops = { - .open = fscache_histogram_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 500650f938fe..f83328a7f048 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -31,6 +31,7 @@ #include <linux/fscache-cache.h> #include <trace/events/fscache.h> #include <linux/sched.h> +#include <linux/seq_file.h> #define FSCACHE_MIN_THREADS 4 #define FSCACHE_MAX_THREADS 32 @@ -84,7 +85,7 @@ static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif) atomic_inc(&histogram[jif]); } -extern const struct file_operations fscache_histogram_fops; +extern const struct seq_operations fscache_histogram_ops; #else #define fscache_hist(hist, start_jif) do {} while (0) @@ -294,7 +295,7 @@ static inline void fscache_stat_d(atomic_t *stat) #define __fscache_stat(stat) (stat) -extern const struct file_operations fscache_stats_fops; +int fscache_stats_show(struct seq_file *m, void *v); #else #define __fscache_stat(stat) (NULL) diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c index 1d9e4951a597..49a8c90414bc 100644 --- a/fs/fscache/proc.c +++ b/fs/fscache/proc.c @@ -26,14 +26,14 @@ int __init fscache_proc_init(void) goto error_dir; #ifdef CONFIG_FSCACHE_STATS - if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL, - &fscache_stats_fops)) + if (!proc_create_single("fs/fscache/stats", S_IFREG | 0444, NULL, + fscache_stats_show)) goto error_stats; #endif #ifdef CONFIG_FSCACHE_HISTOGRAM - if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL, - &fscache_histogram_fops)) + if (!proc_create_seq("fs/fscache/histogram", S_IFREG | 0444, NULL, + &fscache_histogram_ops)) goto error_histogram; #endif diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index fcc8c2f2690e..00564a1dfd76 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -138,7 +138,7 @@ atomic_t fscache_n_cache_culled_objects; /* * display the general statistics */ -static int fscache_stats_show(struct seq_file *m, void *v) +int fscache_stats_show(struct seq_file *m, void *v) { seq_puts(m, "FS-Cache statistics\n"); @@ -284,18 +284,3 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_cache_culled_objects)); return 0; } - -/* - * open "/proc/fs/fscache/stats" allowing provision of a statistical summary - */ -static int fscache_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, fscache_stats_show, NULL); -} - -const struct file_operations fscache_stats_fops = { - .open = fscache_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index f58716567972..35f5ee23566d 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -54,8 +54,7 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, continue; if (start >= to) break; - if (gfs2_is_jdata(ip)) - set_buffer_uptodate(bh); + set_buffer_uptodate(bh); gfs2_trans_add_data(ip->i_gl, bh); } } @@ -747,18 +746,21 @@ out: put_page(page); gfs2_trans_end(sdp); - if (pos + len > ip->i_inode.i_size) - gfs2_trim_blocks(&ip->i_inode); - goto out_trans_fail; + if (alloc_required) { + gfs2_inplace_release(ip); + if (pos + len > ip->i_inode.i_size) + gfs2_trim_blocks(&ip->i_inode); + } + goto out_qunlock; out_endtrans: gfs2_trans_end(sdp); out_trans_fail: - if (alloc_required) { + if (alloc_required) gfs2_inplace_release(ip); out_qunlock: + if (alloc_required) gfs2_quota_unlock(ip); - } out_unlock: if (&ip->i_inode == sdp->sd_rindex) { gfs2_glock_dq(&m_ip->i_gh); @@ -814,7 +816,6 @@ out: * @inode: The inode * @dibh: The buffer_head containing the on-disk inode * @pos: The file position - * @len: The length of the write * @copied: How much was actually copied by the VFS * @page: The page * @@ -824,17 +825,15 @@ out: * Returns: errno */ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, - loff_t pos, unsigned len, unsigned copied, + loff_t pos, unsigned copied, struct page *page) { struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); u64 to = pos + copied; void *kaddr; unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); - BUG_ON(pos + len > gfs2_max_stuffed_size(ip)); + BUG_ON(pos + copied > gfs2_max_stuffed_size(ip)); kaddr = kmap_atomic(page); memcpy(buf + pos, kaddr + pos, copied); @@ -850,20 +849,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, i_size_write(inode, to); mark_inode_dirty(inode); } - - if (inode == sdp->sd_rindex) { - adjust_fs_space(inode); - sdp->sd_rindex_uptodate = 0; - } - - brelse(dibh); - gfs2_trans_end(sdp); - if (inode == sdp->sd_rindex) { - gfs2_glock_dq(&m_ip->i_gh); - gfs2_holder_uninit(&m_ip->i_gh); - } - gfs2_glock_dq(&ip->i_gh); - gfs2_holder_uninit(&ip->i_gh); return copied; } @@ -877,9 +862,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, * @page: The page that has been written * @fsdata: The fsdata (unused in GFS2) * - * The main write_end function for GFS2. We have a separate one for - * stuffed files as they are slightly different, otherwise we just - * put our locking around the VFS provided functions. + * The main write_end function for GFS2. We just put our locking around the VFS + * provided functions. * * Returns: errno */ @@ -900,32 +884,39 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL); ret = gfs2_meta_inode_buffer(ip, &dibh); - if (unlikely(ret)) { - unlock_page(page); - put_page(page); - goto failed; - } + if (unlikely(ret)) + goto out; - if (gfs2_is_stuffed(ip)) - return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); + if (gfs2_is_stuffed(ip)) { + ret = gfs2_stuffed_write_end(inode, dibh, pos, copied, page); + page = NULL; + goto out2; + } - if (!gfs2_is_writeback(ip)) + if (gfs2_is_jdata(ip)) gfs2_page_add_databufs(ip, page, pos & ~PAGE_MASK, len); + else + gfs2_ordered_add_inode(ip); ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + page = NULL; if (tr->tr_num_buf_new) __mark_inode_dirty(inode, I_DIRTY_DATASYNC); else gfs2_trans_add_meta(ip->i_gl, dibh); - +out2: if (inode == sdp->sd_rindex) { adjust_fs_space(inode); sdp->sd_rindex_uptodate = 0; } brelse(dibh); -failed: +out: + if (page) { + unlock_page(page); + put_page(page); + } gfs2_trans_end(sdp); gfs2_inplace_release(ip); if (ip->i_qadata && ip->i_qadata->qa_qd_num) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 8efa6297e19c..ed6699705c13 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -89,10 +89,12 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, map_bh(bh, inode->i_sb, block); set_buffer_uptodate(bh); - if (!gfs2_is_jdata(ip)) - mark_buffer_dirty(bh); - if (!gfs2_is_writeback(ip)) + if (gfs2_is_jdata(ip)) gfs2_trans_add_data(ip->i_gl, bh); + else { + mark_buffer_dirty(bh); + gfs2_ordered_add_inode(ip); + } if (release) { unlock_page(page); @@ -176,8 +178,8 @@ out: /** * find_metapath - Find path through the metadata tree * @sdp: The superblock - * @mp: The metapath to return the result in * @block: The disk block to look up + * @mp: The metapath to return the result in * @height: The pre-calculated height of the metadata tree * * This routine returns a struct metapath structure that defines a path @@ -188,8 +190,7 @@ out: * filesystem with a blocksize of 4096. * * find_metapath() would return a struct metapath structure set to: - * mp_offset = 101342453, mp_height = 3, mp_list[0] = 0, mp_list[1] = 48, - * and mp_list[2] = 165. + * mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165. * * That means that in order to get to the block containing the byte at * offset 101342453, we would load the indirect block pointed to by pointer @@ -279,6 +280,21 @@ static inline __be64 *metapointer(unsigned int height, const struct metapath *mp return p + mp->mp_list[height]; } +static inline const __be64 *metaend(unsigned int height, const struct metapath *mp) +{ + const struct buffer_head *bh = mp->mp_bh[height]; + return (const __be64 *)(bh->b_data + bh->b_size); +} + +static void clone_metapath(struct metapath *clone, struct metapath *mp) +{ + unsigned int hgt; + + *clone = *mp; + for (hgt = 0; hgt < mp->mp_aheight; hgt++) + get_bh(clone->mp_bh[hgt]); +} + static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end) { const __be64 *t; @@ -420,20 +436,140 @@ static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __b return (ptr - first); } -static inline void bmap_lock(struct gfs2_inode *ip, int create) +typedef const __be64 *(*gfs2_metadata_walker)( + struct metapath *mp, + const __be64 *start, const __be64 *end, + u64 factor, void *data); + +#define WALK_STOP ((__be64 *)0) +#define WALK_NEXT ((__be64 *)1) + +static int gfs2_walk_metadata(struct inode *inode, sector_t lblock, + u64 len, struct metapath *mp, gfs2_metadata_walker walker, + void *data) { - if (create) - down_write(&ip->i_rw_mutex); - else - down_read(&ip->i_rw_mutex); + struct metapath clone; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + const __be64 *start, *end, *ptr; + u64 factor = 1; + unsigned int hgt; + int ret = 0; + + for (hgt = ip->i_height - 1; hgt >= mp->mp_aheight; hgt--) + factor *= sdp->sd_inptrs; + + for (;;) { + u64 step; + + /* Walk indirect block. */ + start = metapointer(hgt, mp); + end = metaend(hgt, mp); + + step = (end - start) * factor; + if (step > len) + end = start + DIV_ROUND_UP_ULL(len, factor); + + ptr = walker(mp, start, end, factor, data); + if (ptr == WALK_STOP) + break; + if (step >= len) + break; + len -= step; + if (ptr != WALK_NEXT) { + BUG_ON(!*ptr); + mp->mp_list[hgt] += ptr - start; + goto fill_up_metapath; + } + +lower_metapath: + /* Decrease height of metapath. */ + if (mp != &clone) { + clone_metapath(&clone, mp); + mp = &clone; + } + brelse(mp->mp_bh[hgt]); + mp->mp_bh[hgt] = NULL; + if (!hgt) + break; + hgt--; + factor *= sdp->sd_inptrs; + + /* Advance in metadata tree. */ + (mp->mp_list[hgt])++; + start = metapointer(hgt, mp); + end = metaend(hgt, mp); + if (start >= end) { + mp->mp_list[hgt] = 0; + if (!hgt) + break; + goto lower_metapath; + } + +fill_up_metapath: + /* Increase height of metapath. */ + if (mp != &clone) { + clone_metapath(&clone, mp); + mp = &clone; + } + ret = fillup_metapath(ip, mp, ip->i_height - 1); + if (ret < 0) + break; + hgt += ret; + for (; ret; ret--) + do_div(factor, sdp->sd_inptrs); + mp->mp_aheight = hgt + 1; + } + if (mp == &clone) + release_metapath(mp); + return ret; } -static inline void bmap_unlock(struct gfs2_inode *ip, int create) +struct gfs2_hole_walker_args { + u64 blocks; +}; + +static const __be64 *gfs2_hole_walker(struct metapath *mp, + const __be64 *start, const __be64 *end, + u64 factor, void *data) { - if (create) - up_write(&ip->i_rw_mutex); - else - up_read(&ip->i_rw_mutex); + struct gfs2_hole_walker_args *args = data; + const __be64 *ptr; + + for (ptr = start; ptr < end; ptr++) { + if (*ptr) { + args->blocks += (ptr - start) * factor; + if (mp->mp_aheight == mp->mp_fheight) + return WALK_STOP; + return ptr; /* increase height */ + } + } + args->blocks += (end - start) * factor; + return WALK_NEXT; +} + +/** + * gfs2_hole_size - figure out the size of a hole + * @inode: The inode + * @lblock: The logical starting block number + * @len: How far to look (in blocks) + * @mp: The metapath at lblock + * @iomap: The iomap to store the hole size in + * + * This function modifies @mp. + * + * Returns: errno on error + */ +static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len, + struct metapath *mp, struct iomap *iomap) +{ + struct gfs2_hole_walker_args args = { }; + int ret = 0; + + ret = gfs2_walk_metadata(inode, lblock, len, mp, gfs2_hole_walker, &args); + if (!ret) + iomap->length = args.blocks << inode->i_blkbits; + return ret; } static inline __be64 *gfs2_indirect_init(struct metapath *mp, @@ -462,15 +598,11 @@ enum alloc_state { }; /** - * gfs2_bmap_alloc - Build a metadata tree of the requested height + * gfs2_iomap_alloc - Build a metadata tree of the requested height * @inode: The GFS2 inode - * @lblock: The logical starting block of the extent - * @bh_map: This is used to return the mapping details - * @zero_new: True if newly allocated blocks should be zeroed + * @iomap: The iomap structure + * @flags: iomap flags * @mp: The metapath, with proper height information calculated - * @maxlen: The max number of data blocks to alloc - * @dblock: Pointer to return the resulting new block - * @dblks: Pointer to return the number of blocks allocated * * In this routine we may have to alloc: * i) Indirect blocks to grow the metadata tree height @@ -483,6 +615,13 @@ enum alloc_state { * blocks are available, there will only be one request per bmap call) * and uses the state machine to initialise the blocks in order. * + * Right now, this function will allocate at most one indirect block + * worth of data -- with a default block size of 4K, that's slightly + * less than 2M. If this limitation is ever removed to allow huge + * allocations, we would probably still want to limit the iomap size we + * return to avoid stalling other tasks during huge writes; the next + * iomap iteration would then find the blocks already allocated. + * * Returns: errno on error */ @@ -497,6 +636,7 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, unsigned dblks = 0; unsigned ptrs_per_blk; const unsigned end_of_metadata = mp->mp_fheight - 1; + int ret; enum alloc_state state; __be64 *ptr; __be64 zero_bn = 0; @@ -507,6 +647,8 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, gfs2_trans_add_meta(ip->i_gl, dibh); + down_write(&ip->i_rw_mutex); + if (mp->mp_fheight == mp->mp_aheight) { struct buffer_head *bh; int eob; @@ -542,11 +684,10 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, blks = dblks + iblks; i = mp->mp_aheight; do { - int error; n = blks - alloced; - error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); - if (error) - return error; + ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); + if (ret) + goto out; alloced += n; if (state != ALLOC_DATA || gfs2_is_jdata(ip)) gfs2_trans_add_unrevoke(sdp, bn, n); @@ -602,7 +743,7 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, dblks = n; ptr = metapointer(end_of_metadata, mp); iomap->addr = bn << inode->i_blkbits; - iomap->flags |= IOMAP_F_NEW; + iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW; while (n-- > 0) *ptr++ = cpu_to_be64(bn++); break; @@ -612,64 +753,10 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, iomap->length = (u64)dblks << inode->i_blkbits; ip->i_height = mp->mp_fheight; gfs2_add_inode_blocks(&ip->i_inode, alloced); - gfs2_dinode_out(ip, mp->mp_bh[0]->b_data); - return 0; -} - -/** - * hole_size - figure out the size of a hole - * @inode: The inode - * @lblock: The logical starting block number - * @mp: The metapath - * - * Returns: The hole size in bytes - * - */ -static u64 hole_size(struct inode *inode, sector_t lblock, struct metapath *mp) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct metapath mp_eof; - u64 factor = 1; - int hgt; - u64 holesz = 0; - const __be64 *first, *end, *ptr; - const struct buffer_head *bh; - u64 lblock_stop = (i_size_read(inode) - 1) >> inode->i_blkbits; - int zeroptrs; - bool done = false; - - /* Get another metapath, to the very last byte */ - find_metapath(sdp, lblock_stop, &mp_eof, ip->i_height); - for (hgt = ip->i_height - 1; hgt >= 0 && !done; hgt--) { - bh = mp->mp_bh[hgt]; - if (bh) { - zeroptrs = 0; - first = metapointer(hgt, mp); - end = (const __be64 *)(bh->b_data + bh->b_size); - - for (ptr = first; ptr < end; ptr++) { - if (*ptr) { - done = true; - break; - } else { - zeroptrs++; - } - } - } else { - zeroptrs = sdp->sd_inptrs; - } - if (factor * zeroptrs >= lblock_stop - lblock + 1) { - holesz = lblock_stop - lblock + 1; - break; - } - holesz += factor * zeroptrs; - - factor *= sdp->sd_inptrs; - if (hgt && (mp->mp_list[hgt - 1] < mp_eof.mp_list[hgt - 1])) - (mp->mp_list[hgt - 1])++; - } - return holesz << inode->i_blkbits; + gfs2_dinode_out(ip, dibh->b_data); +out: + up_write(&ip->i_rw_mutex); + return ret; } static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap) @@ -686,121 +773,130 @@ static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap) #define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE /** - * gfs2_iomap_begin - Map blocks from an inode to disk blocks + * gfs2_iomap_get - Map blocks from an inode to disk blocks * @inode: The inode * @pos: Starting position in bytes * @length: Length to map, in bytes * @flags: iomap flags * @iomap: The iomap structure + * @mp: The metapath * * Returns: errno */ -int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, - unsigned flags, struct iomap *iomap) +static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, + unsigned flags, struct iomap *iomap, + struct metapath *mp) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - struct metapath mp = { .mp_aheight = 1, }; - unsigned int factor = sdp->sd_sb.sb_bsize; - const u64 *arr = sdp->sd_heightsize; __be64 *ptr; sector_t lblock; - sector_t lend; - int ret = 0; + sector_t lblock_stop; + int ret; int eob; - unsigned int len; + u64 len; struct buffer_head *bh; u8 height; - trace_gfs2_iomap_start(ip, pos, length, flags); - if (!length) { - ret = -EINVAL; - goto out; - } + if (!length) + return -EINVAL; if (gfs2_is_stuffed(ip)) { if (flags & IOMAP_REPORT) { + if (pos >= i_size_read(inode)) + return -ENOENT; gfs2_stuffed_iomap(inode, iomap); - if (pos >= iomap->length) - ret = -ENOENT; - goto out; + return 0; } BUG_ON(!(flags & IOMAP_WRITE)); } - lblock = pos >> inode->i_blkbits; - lend = (pos + length + sdp->sd_sb.sb_bsize - 1) >> inode->i_blkbits; - iomap->offset = lblock << inode->i_blkbits; - iomap->addr = IOMAP_NULL_ADDR; - iomap->type = IOMAP_HOLE; - iomap->length = (u64)(lend - lblock) << inode->i_blkbits; - iomap->flags = IOMAP_F_MERGED; - bmap_lock(ip, flags & IOMAP_WRITE); + lblock_stop = (pos + length - 1) >> inode->i_blkbits; + len = lblock_stop - lblock + 1; - /* - * Directory data blocks have a struct gfs2_meta_header header, so the - * remaining size is smaller than the filesystem block size. Logical - * block numbers for directories are in units of this remaining size! - */ - if (gfs2_is_dir(ip)) { - factor = sdp->sd_jbsize; - arr = sdp->sd_jheightsize; - } + down_read(&ip->i_rw_mutex); - ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]); + ret = gfs2_meta_inode_buffer(ip, &mp->mp_bh[0]); if (ret) - goto out_release; + goto unlock; height = ip->i_height; - while ((lblock + 1) * factor > arr[height]) + while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height]) height++; - find_metapath(sdp, lblock, &mp, height); + find_metapath(sdp, lblock, mp, height); if (height > ip->i_height || gfs2_is_stuffed(ip)) goto do_alloc; - ret = lookup_metapath(ip, &mp); + ret = lookup_metapath(ip, mp); if (ret) - goto out_release; + goto unlock; - if (mp.mp_aheight != ip->i_height) + if (mp->mp_aheight != ip->i_height) goto do_alloc; - ptr = metapointer(ip->i_height - 1, &mp); + ptr = metapointer(ip->i_height - 1, mp); if (*ptr == 0) goto do_alloc; - iomap->type = IOMAP_MAPPED; - iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits; + bh = mp->mp_bh[ip->i_height - 1]; + len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, len, &eob); - bh = mp.mp_bh[ip->i_height - 1]; - len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, lend - lblock, &eob); + iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits; + iomap->length = len << inode->i_blkbits; + iomap->type = IOMAP_MAPPED; + iomap->flags = IOMAP_F_MERGED; if (eob) iomap->flags |= IOMAP_F_GFS2_BOUNDARY; - iomap->length = (u64)len << inode->i_blkbits; -out_release: - release_metapath(&mp); - bmap_unlock(ip, flags & IOMAP_WRITE); out: - trace_gfs2_iomap_end(ip, iomap, ret); + iomap->bdev = inode->i_sb->s_bdev; +unlock: + up_read(&ip->i_rw_mutex); return ret; do_alloc: - if (flags & IOMAP_WRITE) { - ret = gfs2_iomap_alloc(inode, iomap, flags, &mp); - } else if (flags & IOMAP_REPORT) { + iomap->addr = IOMAP_NULL_ADDR; + iomap->length = len << inode->i_blkbits; + iomap->type = IOMAP_HOLE; + iomap->flags = 0; + if (flags & IOMAP_REPORT) { loff_t size = i_size_read(inode); if (pos >= size) ret = -ENOENT; - else if (height <= ip->i_height) - iomap->length = hole_size(inode, lblock, &mp); + else if (height == ip->i_height) + ret = gfs2_hole_size(inode, lblock, len, mp, iomap); else iomap->length = size - pos; } - goto out_release; + goto out; +} + +static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, + unsigned flags, struct iomap *iomap) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct metapath mp = { .mp_aheight = 1, }; + int ret; + + trace_gfs2_iomap_start(ip, pos, length, flags); + if (flags & IOMAP_WRITE) { + ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp); + if (!ret && iomap->type == IOMAP_HOLE) + ret = gfs2_iomap_alloc(inode, iomap, flags, &mp); + release_metapath(&mp); + } else { + ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp); + release_metapath(&mp); + } + trace_gfs2_iomap_end(ip, iomap, ret); + return ret; } +const struct iomap_ops gfs2_iomap_ops = { + .iomap_begin = gfs2_iomap_begin, +}; + /** * gfs2_block_map - Map one or more blocks of an inode to a disk block * @inode: The inode @@ -826,25 +922,34 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh_map, int create) { struct gfs2_inode *ip = GFS2_I(inode); - struct iomap iomap; - int ret, flags = 0; + loff_t pos = (loff_t)lblock << inode->i_blkbits; + loff_t length = bh_map->b_size; + struct metapath mp = { .mp_aheight = 1, }; + struct iomap iomap = { }; + int ret; clear_buffer_mapped(bh_map); clear_buffer_new(bh_map); clear_buffer_boundary(bh_map); trace_gfs2_bmap(ip, bh_map, lblock, create, 1); - if (create) - flags |= IOMAP_WRITE; - ret = gfs2_iomap_begin(inode, (loff_t)lblock << inode->i_blkbits, - bh_map->b_size, flags, &iomap); - if (ret) { - if (!create && ret == -ENOENT) { - /* Return unmapped buffer beyond the end of file. */ + if (create) { + ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, &iomap, &mp); + if (!ret && iomap.type == IOMAP_HOLE) + ret = gfs2_iomap_alloc(inode, &iomap, IOMAP_WRITE, &mp); + release_metapath(&mp); + } else { + ret = gfs2_iomap_get(inode, pos, length, 0, &iomap, &mp); + release_metapath(&mp); + + /* Return unmapped buffer beyond the end of file. */ + if (ret == -ENOENT) { ret = 0; + goto out; } - goto out; } + if (ret) + goto out; if (iomap.length > bh_map->b_size) { iomap.length = bh_map->b_size; @@ -946,8 +1051,10 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from, err = 0; } - if (!gfs2_is_writeback(ip)) + if (gfs2_is_jdata(ip)) gfs2_trans_add_data(ip->i_gl, bh); + else + gfs2_ordered_add_inode(ip); zero_user(page, offset, length); mark_buffer_dirty(bh); @@ -1057,6 +1164,19 @@ out: return error; } +int gfs2_iomap_get_alloc(struct inode *inode, loff_t pos, loff_t length, + struct iomap *iomap) +{ + struct metapath mp = { .mp_aheight = 1, }; + int ret; + + ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp); + if (!ret && iomap->type == IOMAP_HOLE) + ret = gfs2_iomap_alloc(inode, iomap, IOMAP_WRITE, &mp); + release_metapath(&mp); + return ret; +} + /** * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein * @ip: inode diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index c3402fe00653..6b18fb323f0a 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -46,11 +46,13 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, } } +extern const struct iomap_ops gfs2_iomap_ops; + extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); extern int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); -extern int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, - unsigned flags, struct iomap *iomap); +extern int gfs2_iomap_get_alloc(struct inode *inode, loff_t pos, loff_t length, + struct iomap *iomap); extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); extern int gfs2_setattr_size(struct inode *inode, u64 size); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 4b71f021a9e2..7137db7b0119 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -733,7 +733,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, struct gfs2_inode *ip = GFS2_I(inode); loff_t end = offset + len; struct buffer_head *dibh; - struct iomap iomap; + struct iomap iomap = { }; int error; error = gfs2_meta_inode_buffer(ip, &dibh); @@ -749,8 +749,8 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, } while (offset < end) { - error = gfs2_iomap_begin(inode, offset, end - offset, - IOMAP_WRITE, &iomap); + error = gfs2_iomap_get_alloc(inode, offset, end - offset, + &iomap); if (error) goto out; offset = iomap.offset + iomap.length; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 1b6b1e3f5caf..d2ad817e089f 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -116,6 +116,7 @@ static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm) static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm) { + BUG_ON(rbm->offset >= rbm->rgd->rd_data); return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) + rbm->offset; } @@ -696,8 +697,6 @@ struct gfs2_sbd { u32 sd_max_dirres; /* Max blocks needed to add a directory entry */ u32 sd_max_height; /* Max height of a file's metadata tree */ u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1]; - u32 sd_max_jheight; /* Max height of journaled file's meta tree */ - u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1]; u32 sd_max_dents_per_leaf; /* Max number of dirents in a leaf block */ struct gfs2_args sd_args; /* Mount arguments */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 8700eb815638..feda55f67050 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -2006,10 +2006,6 @@ static int gfs2_getattr(const struct path *path, struct kstat *stat, return 0; } -const struct iomap_ops gfs2_iomap_ops = { - .iomap_begin = gfs2_iomap_begin, -}; - static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 1862e310a067..20241436126d 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -14,6 +14,7 @@ #include <linux/spinlock.h> #include <linux/writeback.h> #include "incore.h" +#include "inode.h" /** * gfs2_log_lock - acquire the right to mess with the log manager @@ -50,8 +51,12 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_sbd *sdp; + if (!gfs2_is_ordered(ip)) + return; + + sdp = GFS2_SB(&ip->i_inode); if (!test_bit(GIF_ORDERED, &ip->i_flags)) { spin_lock(&sdp->sd_ordered_lock); if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags)) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 3ba3f167641c..c2469833b4fb 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -335,25 +335,6 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) sdp->sd_heightsize[x] = ~0; gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT); - sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize - - sizeof(struct gfs2_dinode); - sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs; - for (x = 2;; x++) { - u64 space, d; - u32 m; - - space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs; - d = space; - m = do_div(d, sdp->sd_inptrs); - - if (d != sdp->sd_jheightsize[x - 1] || m) - break; - sdp->sd_jheightsize[x] = space; - } - sdp->sd_max_jheight = x; - sdp->sd_jheightsize[x] = ~0; - gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT); - sdp->sd_max_dents_per_leaf = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_leaf)) / GFS2_MIN_DIRENT_SIZE; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 7a98abd340ee..e8585dfd209f 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -735,7 +735,10 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, if (!buffer_uptodate(bh)) goto unlock_out; } - gfs2_trans_add_data(ip->i_gl, bh); + if (gfs2_is_jdata(ip)) + gfs2_trans_add_data(ip->i_gl, bh); + else + gfs2_ordered_add_inode(ip); /* If we need to write to the next block as well */ if (to_write > (bsize - boff)) { diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 8b683917a27e..6bc5cfe710d1 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -372,8 +372,8 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len) start = bi->bi_bh->b_data; if (bi->bi_clone) start = bi->bi_clone; - end = start + bi->bi_bh->b_size; start += bi->bi_offset; + end = start + bi->bi_len; BUG_ON(rbm.offset & 3); start += (rbm.offset / GFS2_NBBY); bytes = min_t(u32, len / GFS2_NBBY, (end - start)); diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index c75cacaa349b..064c9a0ef046 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -143,32 +143,21 @@ static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl, * @gl: The inode glock associated with the buffer * @bh: The buffer to add * - * This is used in two distinct cases: - * i) In ordered write mode - * We put the data buffer on a list so that we can ensure that it's - * synced to disk at the right time - * ii) In journaled data mode - * We need to journal the data block in the same way as metadata in - * the functions above. The difference is that here we have a tag - * which is two __be64's being the block number (as per meta data) - * and a flag which says whether the data block needs escaping or - * not. This means we need a new log entry for each 251 or so data - * blocks, which isn't an enormous overhead but twice as much as - * for normal metadata blocks. + * This is used in journaled data mode. + * We need to journal the data block in the same way as metadata in + * the functions above. The difference is that here we have a tag + * which is two __be64's being the block number (as per meta data) + * and a flag which says whether the data block needs escaping or + * not. This means we need a new log entry for each 251 or so data + * blocks, which isn't an enormous overhead but twice as much as + * for normal metadata blocks. */ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh) { struct gfs2_trans *tr = current->journal_info; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct address_space *mapping = bh->b_page->mapping; - struct gfs2_inode *ip = GFS2_I(mapping->host); struct gfs2_bufdata *bd; - if (!gfs2_is_jdata(ip)) { - gfs2_ordered_add_inode(ip); - return; - } - lock_buffer(bh); if (buffer_pinned(bh)) { set_bit(TR_TOUCHED, &tr->tr_flags); diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 75b254280ff6..3bf2ae0e467c 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -31,21 +31,15 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry, hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name); res = hfs_brec_read(&fd, &rec, sizeof(rec)); if (res) { - hfs_find_exit(&fd); - if (res == -ENOENT) { - /* No such entry */ - inode = NULL; - goto done; - } - return ERR_PTR(res); + if (res != -ENOENT) + inode = ERR_PTR(res); + } else { + inode = hfs_iget(dir->i_sb, &fd.search_key->cat, &rec); + if (!inode) + inode = ERR_PTR(-EACCES); } - inode = hfs_iget(dir->i_sb, &fd.search_key->cat, &rec); hfs_find_exit(&fd); - if (!inode) - return ERR_PTR(-EACCES); -done: - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } /* diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 2538b49cc349..b3309b83371a 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -543,9 +543,9 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry, igrab(dir); hlist_add_fake(&inode->i_hash); mark_inode_dirty(inode); + dont_mount(dentry); out: - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } void hfs_evict_inode(struct inode *inode) diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 15e06fb552da..b5254378f011 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -122,8 +122,7 @@ again: if (S_ISREG(inode->i_mode)) HFSPLUS_I(inode)->linkid = linkid; out: - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); fail: hfs_find_exit(&fd); return ERR_PTR(err); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 513c357c734b..a6c0f54c48c3 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -588,6 +588,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) return 0; out_put_hidden_dir: + cancel_delayed_work_sync(&sbi->sync_work); iput(sbi->hidden_dir); out_put_root: dput(sb->s_root); diff --git a/fs/internal.h b/fs/internal.h index e08972db0303..980d005b21b4 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -125,6 +125,7 @@ int do_fchmodat(int dfd, const char __user *filename, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); +extern int open_check_o_direct(struct file *f); extern int vfs_open(const struct path *, struct file *, const struct cred *); extern struct file *filp_clone_open(struct file *); diff --git a/fs/ioctl.c b/fs/ioctl.c index 4823431d1c9d..b445b13fc59b 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -549,7 +549,7 @@ static int ioctl_fsfreeze(struct file *filp) { struct super_block *sb = file_inode(filp)->i_sb; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; /* If filesystem doesn't support freeze feature, return. */ @@ -566,7 +566,7 @@ static int ioctl_fsthaw(struct file *filp) { struct super_block *sb = file_inode(filp)->i_sb; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; /* Thaw */ diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index dfb057900e79..8ef6b6daaa7a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -114,7 +114,7 @@ void __jbd2_debug(int level, const char *file, const char *func, va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf); + printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf); va_end(args); } EXPORT_SYMBOL(__jbd2_debug); @@ -2302,8 +2302,7 @@ static void jbd2_journal_destroy_slabs(void) int i; for (i = 0; i < JBD2_MAX_SLABS; i++) { - if (jbd2_slab[i]) - kmem_cache_destroy(jbd2_slab[i]); + kmem_cache_destroy(jbd2_slab[i]); jbd2_slab[i] = NULL; } } @@ -2404,10 +2403,8 @@ static int jbd2_journal_init_journal_head_cache(void) static void jbd2_journal_destroy_journal_head_cache(void) { - if (jbd2_journal_head_cache) { - kmem_cache_destroy(jbd2_journal_head_cache); - jbd2_journal_head_cache = NULL; - } + kmem_cache_destroy(jbd2_journal_head_cache); + jbd2_journal_head_cache = NULL; } /* @@ -2665,11 +2662,10 @@ static int __init jbd2_journal_init_handle_cache(void) static void jbd2_journal_destroy_handle_cache(void) { - if (jbd2_handle_cache) - kmem_cache_destroy(jbd2_handle_cache); - if (jbd2_inode_cache) - kmem_cache_destroy(jbd2_inode_cache); - + kmem_cache_destroy(jbd2_handle_cache); + jbd2_handle_cache = NULL; + kmem_cache_destroy(jbd2_inode_cache); + jbd2_inode_cache = NULL; } /* diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 696ef15ec942..240779e4689c 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -180,14 +180,10 @@ static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal, void jbd2_journal_destroy_revoke_caches(void) { - if (jbd2_revoke_record_cache) { - kmem_cache_destroy(jbd2_revoke_record_cache); - jbd2_revoke_record_cache = NULL; - } - if (jbd2_revoke_table_cache) { - kmem_cache_destroy(jbd2_revoke_table_cache); - jbd2_revoke_table_cache = NULL; - } + kmem_cache_destroy(jbd2_revoke_record_cache); + jbd2_revoke_record_cache = NULL; + kmem_cache_destroy(jbd2_revoke_table_cache); + jbd2_revoke_table_cache = NULL; } int __init jbd2_journal_init_revoke_caches(void) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 8aa453784402..51dd68e67b0f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -49,10 +49,8 @@ int __init jbd2_journal_init_transaction_cache(void) void jbd2_journal_destroy_transaction_cache(void) { - if (transaction_cache) { - kmem_cache_destroy(transaction_cache); - transaction_cache = NULL; - } + kmem_cache_destroy(transaction_cache); + transaction_cache = NULL; } void jbd2_journal_free_transaction(transaction_t *transaction) diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 0a754f38462e..e5a6deb38e1e 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -209,8 +209,7 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, __func__, inode->i_ino, inode->i_mode, inode->i_nlink, f->inocache->pino_nlink, inode->i_mapping->nrpages); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; fail: @@ -430,8 +429,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char mutex_unlock(&dir_f->sem); jffs2_complete_reservation(c); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; fail: @@ -575,8 +573,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, umode_t mode mutex_unlock(&dir_f->sem); jffs2_complete_reservation(c); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; fail: @@ -747,8 +744,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode mutex_unlock(&dir_f->sem); jffs2_complete_reservation(c); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; fail: diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c index a70907606025..35a5b2a81ae0 100644 --- a/fs/jfs/jfs_debug.c +++ b/fs/jfs/jfs_debug.c @@ -29,7 +29,6 @@ #ifdef PROC_FS_JFS /* see jfs_debug.h */ -static struct proc_dir_entry *base; #ifdef CONFIG_JFS_DEBUG static int jfs_loglevel_proc_show(struct seq_file *m, void *v) { @@ -66,43 +65,29 @@ static const struct file_operations jfs_loglevel_proc_fops = { }; #endif -static struct { - const char *name; - const struct file_operations *proc_fops; -} Entries[] = { -#ifdef CONFIG_JFS_STATISTICS - { "lmstats", &jfs_lmstats_proc_fops, }, - { "txstats", &jfs_txstats_proc_fops, }, - { "xtstat", &jfs_xtstat_proc_fops, }, - { "mpstat", &jfs_mpstat_proc_fops, }, -#endif -#ifdef CONFIG_JFS_DEBUG - { "TxAnchor", &jfs_txanchor_proc_fops, }, - { "loglevel", &jfs_loglevel_proc_fops } -#endif -}; -#define NPROCENT ARRAY_SIZE(Entries) - void jfs_proc_init(void) { - int i; + struct proc_dir_entry *base; - if (!(base = proc_mkdir("fs/jfs", NULL))) + base = proc_mkdir("fs/jfs", NULL); + if (!base) return; - for (i = 0; i < NPROCENT; i++) - proc_create(Entries[i].name, 0, base, Entries[i].proc_fops); +#ifdef CONFIG_JFS_STATISTICS + proc_create_single("lmstats", 0, base, jfs_lmstats_proc_show); + proc_create_single("txstats", 0, base, jfs_txstats_proc_show); + proc_create_single("xtstat", 0, base, jfs_xtstat_proc_show); + proc_create_single("mpstat", 0, base, jfs_mpstat_proc_show); +#endif +#ifdef CONFIG_JFS_DEBUG + proc_create_single("TxAnchor", 0, base, jfs_txanchor_proc_show); + proc_create("loglevel", 0, base, &jfs_loglevel_proc_fops); +#endif } void jfs_proc_clean(void) { - int i; - - if (base) { - for (i = 0; i < NPROCENT; i++) - remove_proc_entry(Entries[i].name, base); - remove_proc_entry("fs/jfs", NULL); - } + remove_proc_subtree("fs/jfs", NULL); } #endif /* PROC_FS_JFS */ diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h index eafd1300a00b..0d9e35da8462 100644 --- a/fs/jfs/jfs_debug.h +++ b/fs/jfs/jfs_debug.h @@ -62,7 +62,7 @@ extern void jfs_proc_clean(void); extern int jfsloglevel; -extern const struct file_operations jfs_txanchor_proc_fops; +int jfs_txanchor_proc_show(struct seq_file *m, void *v); /* information message: e.g., configuration, major event */ #define jfs_info(fmt, arg...) do { \ @@ -105,10 +105,10 @@ extern const struct file_operations jfs_txanchor_proc_fops; * ---------- */ #ifdef CONFIG_JFS_STATISTICS -extern const struct file_operations jfs_lmstats_proc_fops; -extern const struct file_operations jfs_txstats_proc_fops; -extern const struct file_operations jfs_mpstat_proc_fops; -extern const struct file_operations jfs_xtstat_proc_fops; +int jfs_lmstats_proc_show(struct seq_file *m, void *v); +int jfs_txstats_proc_show(struct seq_file *m, void *v); +int jfs_mpstat_proc_show(struct seq_file *m, void *v); +int jfs_xtstat_proc_show(struct seq_file *m, void *v); #define INCREMENT(x) ((x)++) #define DECREMENT(x) ((x)--) diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 0e5d412c0b01..6b68df395892 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2493,7 +2493,7 @@ exit: } #ifdef CONFIG_JFS_STATISTICS -static int jfs_lmstats_proc_show(struct seq_file *m, void *v) +int jfs_lmstats_proc_show(struct seq_file *m, void *v) { seq_printf(m, "JFS Logmgr stats\n" @@ -2510,16 +2510,4 @@ static int jfs_lmstats_proc_show(struct seq_file *m, void *v) lmStat.partial_page); return 0; } - -static int jfs_lmstats_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, jfs_lmstats_proc_show, NULL); -} - -const struct file_operations jfs_lmstats_proc_fops = { - .open = jfs_lmstats_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif /* CONFIG_JFS_STATISTICS */ diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 1a3b0cc22ad3..fa2c6824c7f2 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -815,7 +815,7 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len) } #ifdef CONFIG_JFS_STATISTICS -static int jfs_mpstat_proc_show(struct seq_file *m, void *v) +int jfs_mpstat_proc_show(struct seq_file *m, void *v) { seq_printf(m, "JFS Metapage statistics\n" @@ -828,16 +828,4 @@ static int jfs_mpstat_proc_show(struct seq_file *m, void *v) mpStat.lockwait); return 0; } - -static int jfs_mpstat_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, jfs_mpstat_proc_show, NULL); -} - -const struct file_operations jfs_mpstat_proc_fops = { - .open = jfs_mpstat_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 4d973524c887..a5663cb621d8 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -2998,7 +2998,7 @@ int jfs_sync(void *arg) } #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG) -static int jfs_txanchor_proc_show(struct seq_file *m, void *v) +int jfs_txanchor_proc_show(struct seq_file *m, void *v) { char *freewait; char *freelockwait; @@ -3032,22 +3032,10 @@ static int jfs_txanchor_proc_show(struct seq_file *m, void *v) list_empty(&TxAnchor.unlock_queue) ? "" : "not "); return 0; } - -static int jfs_txanchor_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, jfs_txanchor_proc_show, NULL); -} - -const struct file_operations jfs_txanchor_proc_fops = { - .open = jfs_txanchor_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS) -static int jfs_txstats_proc_show(struct seq_file *m, void *v) +int jfs_txstats_proc_show(struct seq_file *m, void *v) { seq_printf(m, "JFS TxStats\n" @@ -3072,16 +3060,4 @@ static int jfs_txstats_proc_show(struct seq_file *m, void *v) TxStat.txLockAlloc_freelock); return 0; } - -static int jfs_txstats_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, jfs_txstats_proc_show, NULL); -} - -const struct file_operations jfs_txstats_proc_fops = { - .open = jfs_txstats_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index 5cde6d2fcfca..2c200b5256a6 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -3874,7 +3874,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) } #ifdef CONFIG_JFS_STATISTICS -static int jfs_xtstat_proc_show(struct seq_file *m, void *v) +int jfs_xtstat_proc_show(struct seq_file *m, void *v) { seq_printf(m, "JFS Xtree statistics\n" @@ -3887,16 +3887,4 @@ static int jfs_xtstat_proc_show(struct seq_file *m, void *v) xtStat.split); return 0; } - -static int jfs_xtstat_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, jfs_xtstat_proc_show, NULL); -} - -const struct file_operations jfs_xtstat_proc_fops = { - .open = jfs_xtstat_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index b41596d71858..56c3fcbfe80e 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -178,8 +178,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode, unlock_new_inode(ip); iput(ip); } else { - unlock_new_inode(ip); - d_instantiate(dentry, ip); + d_instantiate_new(dentry, ip); } out2: @@ -313,8 +312,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) unlock_new_inode(ip); iput(ip); } else { - unlock_new_inode(ip); - d_instantiate(dentry, ip); + d_instantiate_new(dentry, ip); } out2: @@ -1059,8 +1057,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, unlock_new_inode(ip); iput(ip); } else { - unlock_new_inode(ip); - d_instantiate(dentry, ip); + d_instantiate_new(dentry, ip); } out2: @@ -1447,8 +1444,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, unlock_new_inode(ip); iput(ip); } else { - unlock_new_inode(ip); - d_instantiate(dentry, ip); + d_instantiate_new(dentry, ip); } out1: diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 26dd9a50f383..ff2716f9322e 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -316,6 +316,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, info->root = root; info->ns = ns; + INIT_LIST_HEAD(&info->node); sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags, &init_user_ns, info); diff --git a/fs/locks.c b/fs/locks.c index 62bbe8b31f26..05e211be8684 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2788,22 +2788,10 @@ static const struct seq_operations locks_seq_operations = { .show = locks_show, }; -static int locks_open(struct inode *inode, struct file *filp) -{ - return seq_open_private(filp, &locks_seq_operations, - sizeof(struct locks_iterator)); -} - -static const struct file_operations proc_locks_operations = { - .open = locks_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - static int __init proc_locks_init(void) { - proc_create("locks", 0, NULL, &proc_locks_operations); + proc_create_seq_private("locks", 0, NULL, &locks_seq_operations, + sizeof(struct locks_iterator), NULL); return 0; } fs_initcall(proc_locks_init); diff --git a/fs/minix/namei.c b/fs/minix/namei.c index ccf0f00030bf..1a6084d2b02e 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -28,13 +28,9 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, un return ERR_PTR(-ENAMETOOLONG); ino = minix_inode_by_name(dentry); - if (ino) { + if (ino) inode = minix_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); - } - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev) diff --git a/fs/namei.c b/fs/namei.c index 186bd2464fd5..6df1f61855d6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -984,13 +984,15 @@ static bool safe_hardlink_source(struct inode *inode) */ static int may_linkat(struct path *link) { - struct inode *inode; + struct inode *inode = link->dentry->d_inode; + + /* Inode writeback is not safe when the uid or gid are invalid. */ + if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) + return -EOVERFLOW; if (!sysctl_protected_hardlinks) return 0; - inode = link->dentry->d_inode; - /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ @@ -1438,10 +1440,8 @@ static int path_parent_directory(struct path *path) static int follow_dotdot(struct nameidata *nd) { while(1) { - if (nd->path.dentry == nd->root.dentry && - nd->path.mnt == nd->root.mnt) { + if (path_equal(&nd->path, &nd->root)) break; - } if (nd->path.dentry != nd->path.mnt->mnt_root) { int ret = path_parent_directory(&nd->path); if (ret) @@ -2749,6 +2749,11 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) BUG_ON(!inode); BUG_ON(victim->d_parent->d_inode != dir); + + /* Inode writeback is not safe when the uid or gid are invalid. */ + if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) + return -EOVERFLOW; + audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(dir, MAY_WRITE | MAY_EXEC); @@ -3367,7 +3372,9 @@ finish_open_created: goto out; *opened |= FILE_OPENED; opened: - error = ima_file_check(file, op->acc_mode, *opened); + error = open_check_o_direct(file); + if (!error) + error = ima_file_check(file, op->acc_mode, *opened); if (!error && will_truncate) error = handle_truncate(file); out: @@ -3447,6 +3454,9 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, error = finish_open(file, child, NULL, opened); if (error) goto out2; + error = open_check_o_direct(file); + if (error) + fput(file); out2: mnt_drop_write(path.mnt); out: @@ -3672,7 +3682,8 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) if (error) return error; - if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) + if ((S_ISCHR(mode) || S_ISBLK(mode)) && + !ns_capable(dentry->d_sb->s_user_ns, CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) @@ -3847,11 +3858,11 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) if (error) goto out; - shrink_dcache_parent(dentry); error = dir->i_op->rmdir(dir, dentry); if (error) goto out; + shrink_dcache_parent(dentry); dentry->d_inode->i_flags |= S_DEAD; dont_mount(dentry); detach_mounts(dentry); @@ -4434,8 +4445,6 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_dir->i_nlink >= max_links) goto out; } - if (is_dir && !(flags & RENAME_EXCHANGE) && target) - shrink_dcache_parent(new_dentry); if (!is_dir) { error = try_break_deleg(source, delegated_inode); if (error) @@ -4452,8 +4461,10 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; if (!(flags & RENAME_EXCHANGE) && target) { - if (is_dir) + if (is_dir) { + shrink_dcache_parent(new_dentry); target->i_flags |= S_DEAD; + } dont_mount(new_dentry); detach_mounts(new_dentry); } diff --git a/fs/namespace.c b/fs/namespace.c index 5f75969adff1..8ddd14806799 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1590,7 +1590,7 @@ static int do_umount(struct mount *mnt, int flags) * Special case for "unmounting" root ... * we just try to remount it readonly. */ - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; down_write(&sb->s_umount); if (!sb_rdonly(sb)) @@ -2333,7 +2333,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, down_write(&sb->s_umount); if (ms_flags & MS_BIND) err = change_mount_flags(path->mnt, ms_flags); - else if (!capable(CAP_SYS_ADMIN)) + else if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) err = -EPERM; else err = do_remount_sb(sb, sb_flags, data, 0); diff --git a/fs/nfs/client.c b/fs/nfs/client.c index b9129e2befea..bbc91d7ca1bd 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1067,7 +1067,6 @@ void nfs_clients_init(struct net *net) } #ifdef CONFIG_PROC_FS -static int nfs_server_list_open(struct inode *inode, struct file *file); static void *nfs_server_list_start(struct seq_file *p, loff_t *pos); static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_server_list_stop(struct seq_file *p, void *v); @@ -1080,14 +1079,6 @@ static const struct seq_operations nfs_server_list_ops = { .show = nfs_server_list_show, }; -static const struct file_operations nfs_server_list_fops = { - .open = nfs_server_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -static int nfs_volume_list_open(struct inode *inode, struct file *file); static void *nfs_volume_list_start(struct seq_file *p, loff_t *pos); static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_volume_list_stop(struct seq_file *p, void *v); @@ -1100,23 +1091,6 @@ static const struct seq_operations nfs_volume_list_ops = { .show = nfs_volume_list_show, }; -static const struct file_operations nfs_volume_list_fops = { - .open = nfs_volume_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -/* - * open "/proc/fs/nfsfs/servers" which provides a summary of servers with which - * we're dealing - */ -static int nfs_server_list_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &nfs_server_list_ops, - sizeof(struct seq_net_private)); -} - /* * set up the iterator to start reading from the server list and return the first item */ @@ -1185,15 +1159,6 @@ static int nfs_server_list_show(struct seq_file *m, void *v) } /* - * open "/proc/fs/nfsfs/volumes" which provides a summary of extant volumes - */ -static int nfs_volume_list_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &nfs_volume_list_ops, - sizeof(struct seq_net_private)); -} - -/* * set up the iterator to start reading from the volume list and return the first item */ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) @@ -1278,14 +1243,14 @@ int nfs_fs_proc_net_init(struct net *net) goto error_0; /* a file of servers with which we're dealing */ - p = proc_create("servers", S_IFREG|S_IRUGO, - nn->proc_nfsfs, &nfs_server_list_fops); + p = proc_create_net("servers", S_IFREG|S_IRUGO, nn->proc_nfsfs, + &nfs_server_list_ops, sizeof(struct seq_net_private)); if (!p) goto error_1; /* a file of volumes that we have mounted */ - p = proc_create("volumes", S_IFREG|S_IRUGO, - nn->proc_nfsfs, &nfs_volume_list_fops); + p = proc_create_net("volumes", S_IFREG|S_IRUGO, nn->proc_nfsfs, + &nfs_volume_list_ops, sizeof(struct seq_net_private)); if (!p) goto error_1; return 0; diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 70b8bf781fce..a43dfedd69ec 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -227,7 +227,7 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, if (!buf) return -ENOMEM; - rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL); + rq = blk_get_request(q, REQ_OP_SCSI_IN, 0); if (IS_ERR(rq)) { error = -ENOMEM; goto out_free_buf; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2410b093a2e6..b0555d7d8200 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1201,6 +1201,28 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, break; case S_IFDIR: host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); + if (!host_err && unlikely(d_unhashed(dchild))) { + struct dentry *d; + d = lookup_one_len(dchild->d_name.name, + dchild->d_parent, + dchild->d_name.len); + if (IS_ERR(d)) { + host_err = PTR_ERR(d); + break; + } + if (unlikely(d_is_negative(d))) { + dput(d); + err = nfserr_serverfault; + goto out; + } + dput(resfhp->fh_dentry); + resfhp->fh_dentry = dget(d); + err = fh_update(resfhp); + dput(dchild); + dchild = d; + if (err) + goto out; + } break; case S_IFCHR: case S_IFBLK: diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 1a2894aa0194..dd52d3f82e8d 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -46,8 +46,7 @@ static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode) int err = nilfs_add_link(dentry, inode); if (!err) { - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); return 0; } inode_dec_link_count(inode); @@ -243,8 +242,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; nilfs_mark_inode_dirty(inode); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); out: if (!err) err = nilfs_transaction_commit(dir->i_sb); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 91a8889abf9b..ea8c551bcd7e 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -570,16 +570,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, current_page, vec_len, vec_start); len = bio_add_page(bio, page, vec_len, vec_start); - if (len != vec_len) { - mlog(ML_ERROR, "Adding page[%d] to bio failed, " - "page %p, len %d, vec_len %u, vec_start %u, " - "bi_sector %llu\n", current_page, page, len, - vec_len, vec_start, - (unsigned long long)bio->bi_iter.bi_sector); - bio_put(bio); - bio = ERR_PTR(-EIO); - return bio; - } + if (len != vec_len) break; cs += vec_len / (PAGE_SIZE/spp); vec_start = 0; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 01c6b3894406..7869622af22a 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4250,10 +4250,11 @@ out: static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, bool preserve) { - int error; + int error, had_lock; struct inode *inode = d_inode(old_dentry); struct buffer_head *old_bh = NULL; struct inode *new_orphan_inode = NULL; + struct ocfs2_lock_holder oh; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) return -EOPNOTSUPP; @@ -4295,6 +4296,14 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, goto out; } + had_lock = ocfs2_inode_lock_tracker(new_orphan_inode, NULL, 1, + &oh); + if (had_lock < 0) { + error = had_lock; + mlog_errno(error); + goto out; + } + /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { error = ocfs2_init_security_and_acl(dir, new_orphan_inode, @@ -4302,14 +4311,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, if (error) mlog_errno(error); } -out: if (!error) { error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, new_dentry); if (error) mlog_errno(error); } + ocfs2_inode_unlock_tracker(new_orphan_inode, 1, &oh, had_lock); +out: if (new_orphan_inode) { /* * We need to open_unlock the inode no matter whether we diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index b7146526afff..4bee3a72b9f3 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -305,11 +305,10 @@ static struct dentry *omfs_lookup(struct inode *dir, struct dentry *dentry, ino_t ino = be64_to_cpu(oi->i_head.h_self); brelse(bh); inode = omfs_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); + } else if (bh != ERR_PTR(-ENOENT)) { + inode = ERR_CAST(bh); } - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } /* sanity check block's self pointer */ diff --git a/fs/open.c b/fs/open.c index c5ee7cd60424..d0e955b558ad 100644 --- a/fs/open.c +++ b/fs/open.c @@ -724,6 +724,16 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) return ksys_fchown(fd, user, group); } +int open_check_o_direct(struct file *f) +{ + /* NB: we're sure to have correct a_ops only after f_op->open */ + if (f->f_flags & O_DIRECT) { + if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) + return -EINVAL; + } + return 0; +} + static int do_dentry_open(struct file *f, struct inode *inode, int (*open)(struct inode *, struct file *), @@ -745,7 +755,7 @@ static int do_dentry_open(struct file *f, if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH; f->f_op = &empty_fops; - goto done; + return 0; } if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { @@ -798,12 +808,7 @@ static int do_dentry_open(struct file *f, f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); -done: - /* NB: we're sure to have correct a_ops only after f_op->open */ - error = -EINVAL; - if ((f->f_flags & O_DIRECT) && - (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)) - goto out_fput; + return 0; cleanup_all: @@ -818,9 +823,6 @@ cleanup_file: f->f_path.dentry = NULL; f->f_inode = NULL; return error; -out_fput: - fput(f); - return error; } /** @@ -918,14 +920,20 @@ struct file *dentry_open(const struct path *path, int flags, BUG_ON(!path->mnt); f = get_empty_filp(); - if (IS_ERR(f)) - return f; - - f->f_flags = flags; - error = vfs_open(path, f, cred); - if (error) { - put_filp(f); - return ERR_PTR(error); + if (!IS_ERR(f)) { + f->f_flags = flags; + error = vfs_open(path, f, cred); + if (!error) { + /* from now on we need fput() to dispose of f */ + error = open_check_o_direct(f); + if (error) { + fput(f); + f = ERR_PTR(error); + } + } else { + put_filp(f); + f = ERR_PTR(error); + } } return f; } diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 2200662a9bf1..607092f367ad 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -256,8 +256,7 @@ found: break; } - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } static int openpromfs_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 6e3134e6d98a..365cd73d9109 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -75,8 +75,7 @@ static int orangefs_create(struct inode *dir, get_khandle_from_ino(inode), dentry); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; @@ -111,7 +110,6 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, struct orangefs_inode_s *parent = ORANGEFS_I(dir); struct orangefs_kernel_op_s *new_op; struct inode *inode; - struct dentry *res; int ret = -EINVAL; /* @@ -159,65 +157,18 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, new_op->downcall.resp.lookup.refn.fs_id, ret); - if (ret < 0) { - if (ret == -ENOENT) { - /* - * if no inode was found, add a negative dentry to - * dcache anyway; if we don't, we don't hold expected - * lookup semantics and we most noticeably break - * during directory renames. - * - * however, if the operation failed or exited, do not - * add the dentry (e.g. in the case that a touch is - * issued on a file that already exists that was - * interrupted during this lookup -- no need to add - * another negative dentry for an existing file) - */ - - gossip_debug(GOSSIP_NAME_DEBUG, - "orangefs_lookup: Adding *negative* dentry " - "%p for %pd\n", - dentry, - dentry); - - d_add(dentry, NULL); - res = NULL; - goto out; - } - + if (ret >= 0) { + orangefs_set_timeout(dentry); + inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn); + } else if (ret == -ENOENT) { + inode = NULL; + } else { /* must be a non-recoverable error */ - res = ERR_PTR(ret); - goto out; - } - - orangefs_set_timeout(dentry); - - inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn); - if (IS_ERR(inode)) { - gossip_debug(GOSSIP_NAME_DEBUG, - "error %ld from iget\n", PTR_ERR(inode)); - res = ERR_CAST(inode); - goto out; + inode = ERR_PTR(ret); } - gossip_debug(GOSSIP_NAME_DEBUG, - "%s:%s:%d " - "Found good inode [%lu] with count [%d]\n", - __FILE__, - __func__, - __LINE__, - inode->i_ino, - (int)atomic_read(&inode->i_count)); - - /* update dentry/inode pair into dcache */ - res = d_splice_alias(inode, dentry); - - gossip_debug(GOSSIP_NAME_DEBUG, - "Lookup success (inode ct = %d)\n", - (int)atomic_read(&inode->i_count)); -out: op_release(new_op); - return res; + return d_splice_alias(inode, dentry); } /* return 0 on success; non-zero otherwise */ @@ -332,8 +283,7 @@ static int orangefs_symlink(struct inode *dir, "Assigned symlink inode new number of %pU\n", get_khandle_from_ino(inode)); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; @@ -402,8 +352,7 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode "Assigned dir inode new number of %pU\n", get_khandle_from_ino(inode)); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; diff --git a/fs/pipe.c b/fs/pipe.c index 39d6f431da83..bb0840e234f3 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -509,19 +509,22 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } } -/* No kernel lock held - fine */ -static __poll_t -pipe_poll(struct file *filp, poll_table *wait) +static struct wait_queue_head * +pipe_get_poll_head(struct file *filp, __poll_t events) { - __poll_t mask; struct pipe_inode_info *pipe = filp->private_data; - int nrbufs; - poll_wait(filp, &pipe->wait, wait); + return &pipe->wait; +} + +/* No kernel lock held - fine */ +static __poll_t pipe_poll_mask(struct file *filp, __poll_t events) +{ + struct pipe_inode_info *pipe = filp->private_data; + int nrbufs = pipe->nrbufs; + __poll_t mask = 0; /* Reading only -- no need for acquiring the semaphore. */ - nrbufs = pipe->nrbufs; - mask = 0; if (filp->f_mode & FMODE_READ) { mask = (nrbufs > 0) ? EPOLLIN | EPOLLRDNORM : 0; if (!pipe->writers && filp->f_version != pipe->w_counter) @@ -1020,7 +1023,8 @@ const struct file_operations pipefifo_fops = { .llseek = no_llseek, .read_iter = pipe_read, .write_iter = pipe_write, - .poll = pipe_poll, + .get_poll_head = pipe_get_poll_head, + .poll_mask = pipe_poll_mask, .unlocked_ioctl = pipe_ioctl, .release = pipe_release, .fasync = pipe_fasync, diff --git a/fs/proc/array.c b/fs/proc/array.c index ae2c807fd719..e6d7f41b6684 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -85,6 +85,7 @@ #include <linux/delayacct.h> #include <linux/seq_file.h> #include <linux/pid_namespace.h> +#include <linux/prctl.h> #include <linux/ptrace.h> #include <linux/tracehook.h> #include <linux/string_helpers.h> @@ -335,6 +336,30 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) #ifdef CONFIG_SECCOMP seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); #endif + seq_printf(m, "\nSpeculation_Store_Bypass:\t"); + switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) { + case -EINVAL: + seq_printf(m, "unknown"); + break; + case PR_SPEC_NOT_AFFECTED: + seq_printf(m, "not vulnerable"); + break; + case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE: + seq_printf(m, "thread force mitigated"); + break; + case PR_SPEC_PRCTL | PR_SPEC_DISABLE: + seq_printf(m, "thread mitigated"); + break; + case PR_SPEC_PRCTL | PR_SPEC_ENABLE: + seq_printf(m, "thread vulnerable"); + break; + case PR_SPEC_DISABLE: + seq_printf(m, "globally mitigated"); + break; + default: + seq_printf(m, "vulnerable"); + break; + } seq_putc(m, '\n'); } @@ -677,25 +702,22 @@ out: static int children_seq_show(struct seq_file *seq, void *v) { - struct inode *inode = seq->private; - pid_t pid; - - pid = pid_nr_ns(v, inode->i_sb->s_fs_info); - seq_printf(seq, "%d ", pid); + struct inode *inode = file_inode(seq->file); + seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(inode))); return 0; } static void *children_seq_start(struct seq_file *seq, loff_t *pos) { - return get_children_pid(seq->private, NULL, *pos); + return get_children_pid(file_inode(seq->file), NULL, *pos); } static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct pid *pid; - pid = get_children_pid(seq->private, v, *pos + 1); + pid = get_children_pid(file_inode(seq->file), v, *pos + 1); put_pid(v); ++*pos; @@ -716,17 +738,7 @@ static const struct seq_operations children_seq_ops = { static int children_seq_open(struct inode *inode, struct file *file) { - struct seq_file *m; - int ret; - - ret = seq_open(file, &children_seq_ops); - if (ret) - return ret; - - m = file->private_data; - m->private = inode; - - return ret; + return seq_open(file, &children_seq_ops); } const struct file_operations proc_tid_children_operations = { diff --git a/fs/proc/base.c b/fs/proc/base.c index 1b2ede6abcdf..33ed1746927a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -261,7 +261,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, * Inherently racy -- command line shares address space * with code and data. */ - rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0); + rv = access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_ANON); if (rv <= 0) goto out_free_page; @@ -279,7 +279,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, int nr_read; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); + nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -325,7 +325,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, bool final; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); + nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -698,7 +698,7 @@ static bool has_pid_permissions(struct pid_namespace *pid, static int proc_pid_permission(struct inode *inode, int mask) { - struct pid_namespace *pid = inode->i_sb->s_fs_info; + struct pid_namespace *pid = proc_pid_ns(inode); struct task_struct *task; bool has_perms; @@ -733,13 +733,11 @@ static const struct inode_operations proc_def_inode_operations = { static int proc_single_show(struct seq_file *m, void *v) { struct inode *inode = m->private; - struct pid_namespace *ns; - struct pid *pid; + struct pid_namespace *ns = proc_pid_ns(inode); + struct pid *pid = proc_pid(inode); struct task_struct *task; int ret; - ns = inode->i_sb->s_fs_info; - pid = proc_pid(inode); task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; @@ -946,7 +944,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, max_len = min_t(size_t, PAGE_SIZE, count); this_len = min(max_len, this_len); - retval = access_remote_vm(mm, (env_start + src), page, this_len, 0); + retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON); if (retval <= 0) { ret = retval; @@ -1410,7 +1408,7 @@ static const struct file_operations proc_fail_nth_operations = { static int sched_show(struct seq_file *m, void *v) { struct inode *inode = m->private; - struct pid_namespace *ns = inode->i_sb->s_fs_info; + struct pid_namespace *ns = proc_pid_ns(inode); struct task_struct *p; p = get_proc_task(inode); @@ -1782,8 +1780,8 @@ int pid_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); + struct pid_namespace *pid = proc_pid_ns(inode); struct task_struct *task; - struct pid_namespace *pid = path->dentry->d_sb->s_fs_info; generic_fillattr(inode, stat); @@ -1809,15 +1807,22 @@ int pid_getattr(const struct path *path, struct kstat *stat, /* dentry stuff */ /* - * Exceptional case: normally we are not allowed to unhash a busy - * directory. In this case, however, we can do it - no aliasing problems - * due to the way we treat inodes. - * + * Set <pid>/... inode ownership (can change due to setuid(), etc.) + */ +void pid_update_inode(struct task_struct *task, struct inode *inode) +{ + task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid); + + inode->i_mode &= ~(S_ISUID | S_ISGID); + security_task_to_inode(task, inode); +} + +/* * Rewrite the inode's ownerships here because the owning task may have * performed a setuid(), etc. * */ -int pid_revalidate(struct dentry *dentry, unsigned int flags) +static int pid_revalidate(struct dentry *dentry, unsigned int flags) { struct inode *inode; struct task_struct *task; @@ -1829,10 +1834,7 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags) task = get_proc_task(inode); if (task) { - task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid); - - inode->i_mode &= ~(S_ISUID | S_ISGID); - security_task_to_inode(task, inode); + pid_update_inode(task, inode); put_task_struct(task); return 1; } @@ -1880,8 +1882,8 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, struct dentry *child, *dir = file->f_path.dentry; struct qstr qname = QSTR_INIT(name, len); struct inode *inode; - unsigned type; - ino_t ino; + unsigned type = DT_UNKNOWN; + ino_t ino = 1; child = d_hash_and_lookup(dir, &qname); if (!child) { @@ -1890,22 +1892,23 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, if (IS_ERR(child)) goto end_instantiate; if (d_in_lookup(child)) { - int err = instantiate(d_inode(dir), child, task, ptr); + struct dentry *res; + res = instantiate(child, task, ptr); d_lookup_done(child); - if (err < 0) { - dput(child); + if (IS_ERR(res)) goto end_instantiate; + if (unlikely(res)) { + dput(child); + child = res; } } } inode = d_inode(child); ino = inode->i_ino; type = inode->i_mode >> 12; +end_instantiate: dput(child); return dir_emit(ctx, name, len, ino, type); - -end_instantiate: - return dir_emit(ctx, name, len, 1, DT_UNKNOWN); } /* @@ -2067,19 +2070,19 @@ static const struct inode_operations proc_map_files_link_inode_operations = { .setattr = proc_setattr, }; -static int -proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, +static struct dentry * +proc_map_files_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { fmode_t mode = (fmode_t)(unsigned long)ptr; struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK | + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | ((mode & FMODE_READ ) ? S_IRUSR : 0) | ((mode & FMODE_WRITE) ? S_IWUSR : 0)); if (!inode) - return -ENOENT; + return ERR_PTR(-ENOENT); ei = PROC_I(inode); ei->op.proc_get_link = map_files_get_link; @@ -2088,9 +2091,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, inode->i_size = 64; d_set_d_op(dentry, &tid_map_files_dentry_operations); - d_add(dentry, inode); - - return 0; + return d_splice_alias(inode, dentry); } static struct dentry *proc_map_files_lookup(struct inode *dir, @@ -2099,19 +2100,19 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, unsigned long vm_start, vm_end; struct vm_area_struct *vma; struct task_struct *task; - int result; + struct dentry *result; struct mm_struct *mm; - result = -ENOENT; + result = ERR_PTR(-ENOENT); task = get_proc_task(dir); if (!task) goto out; - result = -EACCES; + result = ERR_PTR(-EACCES); if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; - result = -ENOENT; + result = ERR_PTR(-ENOENT); if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) goto out_put_task; @@ -2125,7 +2126,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, goto out_no_vma; if (vma->vm_file) - result = proc_map_files_instantiate(dir, dentry, task, + result = proc_map_files_instantiate(dentry, task, (void *)(unsigned long)vma->vm_file->f_mode); out_no_vma: @@ -2134,7 +2135,7 @@ out_no_vma: out_put_task: put_task_struct(task); out: - return ERR_PTR(result); + return result; } static const struct inode_operations proc_map_files_inode_operations = { @@ -2337,7 +2338,7 @@ static int proc_timers_open(struct inode *inode, struct file *file) return -ENOMEM; tp->pid = proc_pid(inode); - tp->ns = inode->i_sb->s_fs_info; + tp->ns = proc_pid_ns(inode); return 0; } @@ -2435,16 +2436,16 @@ static const struct file_operations proc_pid_set_timerslack_ns_operations = { .release = single_release, }; -static int proc_pident_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) +static struct dentry *proc_pident_instantiate(struct dentry *dentry, + struct task_struct *task, const void *ptr) { const struct pid_entry *p = ptr; struct inode *inode; struct proc_inode *ei; - inode = proc_pid_make_inode(dir->i_sb, task, p->mode); + inode = proc_pid_make_inode(dentry->d_sb, task, p->mode); if (!inode) - goto out; + return ERR_PTR(-ENOENT); ei = PROC_I(inode); if (S_ISDIR(inode->i_mode)) @@ -2454,13 +2455,9 @@ static int proc_pident_instantiate(struct inode *dir, if (p->fop) inode->i_fop = p->fop; ei->op = p->op; + pid_update_inode(task, inode); d_set_d_op(dentry, &pid_dentry_operations); - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, 0)) - return 0; -out: - return -ENOENT; + return d_splice_alias(inode, dentry); } static struct dentry *proc_pident_lookup(struct inode *dir, @@ -2468,11 +2465,9 @@ static struct dentry *proc_pident_lookup(struct inode *dir, const struct pid_entry *ents, unsigned int nents) { - int error; struct task_struct *task = get_proc_task(dir); const struct pid_entry *p, *last; - - error = -ENOENT; + struct dentry *res = ERR_PTR(-ENOENT); if (!task) goto out_no_task; @@ -2491,11 +2486,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir, if (p >= last) goto out; - error = proc_pident_instantiate(dir, dentry, task, p); + res = proc_pident_instantiate(dentry, task, p); out: put_task_struct(task); out_no_task: - return ERR_PTR(error); + return res; } static int proc_pident_readdir(struct file *file, struct dir_context *ctx, @@ -3138,38 +3133,32 @@ void proc_flush_task(struct task_struct *task) } } -static int proc_pid_instantiate(struct inode *dir, - struct dentry * dentry, +static struct dentry *proc_pid_instantiate(struct dentry * dentry, struct task_struct *task, const void *ptr) { struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) - goto out; + return ERR_PTR(-ENOENT); inode->i_op = &proc_tgid_base_inode_operations; inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; set_nlink(inode, nlink_tgid); + pid_update_inode(task, inode); d_set_d_op(dentry, &pid_dentry_operations); - - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, 0)) - return 0; -out: - return -ENOENT; + return d_splice_alias(inode, dentry); } struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - int result = -ENOENT; struct task_struct *task; unsigned tgid; struct pid_namespace *ns; + struct dentry *result = ERR_PTR(-ENOENT); tgid = name_to_int(&dentry->d_name); if (tgid == ~0U) @@ -3184,10 +3173,10 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsign if (!task) goto out; - result = proc_pid_instantiate(dir, dentry, task, NULL); + result = proc_pid_instantiate(dentry, task, NULL); put_task_struct(task); out: - return ERR_PTR(result); + return result; } /* @@ -3239,7 +3228,7 @@ retry: int proc_pid_readdir(struct file *file, struct dir_context *ctx) { struct tgid_iter iter; - struct pid_namespace *ns = file_inode(file)->i_sb->s_fs_info; + struct pid_namespace *ns = proc_pid_ns(file_inode(file)); loff_t pos = ctx->pos; if (pos >= PID_MAX_LIMIT + TGID_OFFSET) @@ -3435,37 +3424,32 @@ static const struct inode_operations proc_tid_base_inode_operations = { .setattr = proc_setattr, }; -static int proc_task_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) +static struct dentry *proc_task_instantiate(struct dentry *dentry, + struct task_struct *task, const void *ptr) { struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); - + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) - goto out; + return ERR_PTR(-ENOENT); + inode->i_op = &proc_tid_base_inode_operations; inode->i_fop = &proc_tid_base_operations; - inode->i_flags|=S_IMMUTABLE; + inode->i_flags |= S_IMMUTABLE; set_nlink(inode, nlink_tid); + pid_update_inode(task, inode); d_set_d_op(dentry, &pid_dentry_operations); - - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, 0)) - return 0; -out: - return -ENOENT; + return d_splice_alias(inode, dentry); } static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - int result = -ENOENT; struct task_struct *task; struct task_struct *leader = get_proc_task(dir); unsigned tid; struct pid_namespace *ns; + struct dentry *result = ERR_PTR(-ENOENT); if (!leader) goto out_no_task; @@ -3485,13 +3469,13 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry if (!same_thread_group(leader, task)) goto out_drop_task; - result = proc_task_instantiate(dir, dentry, task, NULL); + result = proc_task_instantiate(dentry, task, NULL); out_drop_task: put_task_struct(task); out: put_task_struct(leader); out_no_task: - return ERR_PTR(result); + return result; } /* @@ -3588,7 +3572,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) /* f_version caches the tgid value that the last readdir call couldn't * return. lseek aka telldir automagically resets f_version to 0. */ - ns = inode->i_sb->s_fs_info; + ns = proc_pid_ns(inode); tid = (int)file->f_version; file->f_version = 0; for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index 8233e7af9389..fa762c5fbcb2 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -11,21 +11,9 @@ static int cmdline_proc_show(struct seq_file *m, void *v) return 0; } -static int cmdline_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, cmdline_proc_show, NULL); -} - -static const struct file_operations cmdline_proc_fops = { - .open = cmdline_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_cmdline_init(void) { - proc_create("cmdline", 0, NULL, &cmdline_proc_fops); + proc_create_single("cmdline", 0, NULL, cmdline_proc_show); return 0; } fs_initcall(proc_cmdline_init); diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index a8ac48aebd59..954caf0b7fee 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -91,21 +91,9 @@ static const struct seq_operations consoles_op = { .show = show_console_dev }; -static int consoles_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &consoles_op); -} - -static const struct file_operations proc_consoles_operations = { - .open = consoles_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init proc_consoles_init(void) { - proc_create("consoles", 0, NULL, &proc_consoles_operations); + proc_create_seq("consoles", 0, NULL, &consoles_op); return 0; } fs_initcall(proc_consoles_init); diff --git a/fs/proc/devices.c b/fs/proc/devices.c index 2c7f22b14489..37d38697eaf8 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -51,21 +51,9 @@ static const struct seq_operations devinfo_ops = { .show = devinfo_show }; -static int devinfo_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &devinfo_ops); -} - -static const struct file_operations proc_devinfo_operations = { - .open = devinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init proc_devices_init(void) { - proc_create("devices", 0, NULL, &proc_devinfo_operations); + proc_create_seq("devices", 0, NULL, &devinfo_ops); return 0; } fs_initcall(proc_devices_init); diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 6b80cd1e419a..05b9893e9a22 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -81,9 +81,41 @@ static const struct file_operations proc_fdinfo_file_operations = { .release = single_release, }; +static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode) +{ + struct files_struct *files = get_files_struct(task); + struct file *file; + + if (!files) + return false; + + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) + *mode = file->f_mode; + rcu_read_unlock(); + put_files_struct(files); + return !!file; +} + +static void tid_fd_update_inode(struct task_struct *task, struct inode *inode, + fmode_t f_mode) +{ + task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); + + if (S_ISLNK(inode->i_mode)) { + unsigned i_mode = S_IFLNK; + if (f_mode & FMODE_READ) + i_mode |= S_IRUSR | S_IXUSR; + if (f_mode & FMODE_WRITE) + i_mode |= S_IWUSR | S_IXUSR; + inode->i_mode = i_mode; + } + security_task_to_inode(task, inode); +} + static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) { - struct files_struct *files; struct task_struct *task; struct inode *inode; unsigned int fd; @@ -96,35 +128,11 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) fd = proc_fd(inode); if (task) { - files = get_files_struct(task); - if (files) { - struct file *file; - - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - unsigned f_mode = file->f_mode; - - rcu_read_unlock(); - put_files_struct(files); - - task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); - - if (S_ISLNK(inode->i_mode)) { - unsigned i_mode = S_IFLNK; - if (f_mode & FMODE_READ) - i_mode |= S_IRUSR | S_IXUSR; - if (f_mode & FMODE_WRITE) - i_mode |= S_IWUSR | S_IXUSR; - inode->i_mode = i_mode; - } - - security_task_to_inode(task, inode); - put_task_struct(task); - return 1; - } - rcu_read_unlock(); - put_files_struct(files); + fmode_t f_mode; + if (tid_fd_mode(task, fd, &f_mode)) { + tid_fd_update_inode(task, inode, f_mode); + put_task_struct(task); + return 1; } put_task_struct(task); } @@ -166,34 +174,33 @@ static int proc_fd_link(struct dentry *dentry, struct path *path) return ret; } -static int -proc_fd_instantiate(struct inode *dir, struct dentry *dentry, - struct task_struct *task, const void *ptr) +struct fd_data { + fmode_t mode; + unsigned fd; +}; + +static struct dentry *proc_fd_instantiate(struct dentry *dentry, + struct task_struct *task, const void *ptr) { - unsigned fd = (unsigned long)ptr; + const struct fd_data *data = ptr; struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK); + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK); if (!inode) - goto out; + return ERR_PTR(-ENOENT); ei = PROC_I(inode); - ei->fd = fd; + ei->fd = data->fd; inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; ei->op.proc_get_link = proc_fd_link; + tid_fd_update_inode(task, inode, data->mode); d_set_d_op(dentry, &tid_fd_dentry_operations); - d_add(dentry, inode); - - /* Close the race of the process dying before we return the dentry */ - if (tid_fd_revalidate(dentry, 0)) - return 0; - out: - return -ENOENT; + return d_splice_alias(inode, dentry); } static struct dentry *proc_lookupfd_common(struct inode *dir, @@ -201,19 +208,21 @@ static struct dentry *proc_lookupfd_common(struct inode *dir, instantiate_t instantiate) { struct task_struct *task = get_proc_task(dir); - int result = -ENOENT; - unsigned fd = name_to_int(&dentry->d_name); + struct fd_data data = {.fd = name_to_int(&dentry->d_name)}; + struct dentry *result = ERR_PTR(-ENOENT); if (!task) goto out_no_task; - if (fd == ~0U) + if (data.fd == ~0U) + goto out; + if (!tid_fd_mode(task, data.fd, &data.mode)) goto out; - result = instantiate(dir, dentry, task, (void *)(unsigned long)fd); + result = instantiate(dentry, task, &data); out: put_task_struct(task); out_no_task: - return ERR_PTR(result); + return result; } static int proc_readfd_common(struct file *file, struct dir_context *ctx, @@ -236,17 +245,22 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, for (fd = ctx->pos - 2; fd < files_fdtable(files)->max_fds; fd++, ctx->pos++) { + struct file *f; + struct fd_data data; char name[10 + 1]; int len; - if (!fcheck_files(files, fd)) + f = fcheck_files(files, fd); + if (!f) continue; + data.mode = f->f_mode; rcu_read_unlock(); + data.fd = fd; len = snprintf(name, sizeof(name), "%u", fd); if (!proc_fill_cache(file, ctx, name, len, instantiate, p, - (void *)(unsigned long)fd)) + &data)) goto out_fd_loop; cond_resched(); rcu_read_lock(); @@ -304,31 +318,25 @@ const struct inode_operations proc_fd_inode_operations = { .setattr = proc_setattr, }; -static int -proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, - struct task_struct *task, const void *ptr) +static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry, + struct task_struct *task, const void *ptr) { - unsigned fd = (unsigned long)ptr; + const struct fd_data *data = ptr; struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task, S_IFREG | S_IRUSR); + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUSR); if (!inode) - goto out; + return ERR_PTR(-ENOENT); ei = PROC_I(inode); - ei->fd = fd; + ei->fd = data->fd; inode->i_fop = &proc_fdinfo_file_operations; + tid_fd_update_inode(task, inode, 0); d_set_d_op(dentry, &tid_fd_dentry_operations); - d_add(dentry, inode); - - /* Close the race of the process dying before we return the dentry */ - if (tid_fd_revalidate(dentry, 0)) - return 0; - out: - return -ENOENT; + return d_splice_alias(inode, dentry); } static struct dentry * diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 2078e70e1595..7b4d9714f248 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -25,6 +25,7 @@ #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/uaccess.h> +#include <linux/seq_file.h> #include "internal.h" @@ -256,8 +257,7 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, if (!inode) return ERR_PTR(-ENOMEM); d_set_d_op(dentry, &proc_misc_dentry_ops); - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } read_unlock(&proc_subdir_lock); return ERR_PTR(-ENOENT); @@ -346,13 +346,12 @@ static const struct inode_operations proc_dir_inode_operations = { .setattr = proc_notify_change, }; -static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) +/* returns the registered entry, or frees dp and returns NULL on failure */ +struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, + struct proc_dir_entry *dp) { - int ret; - - ret = proc_alloc_inum(&dp->low_ino); - if (ret) - return ret; + if (proc_alloc_inum(&dp->low_ino)) + goto out_free_entry; write_lock(&proc_subdir_lock); dp->parent = dir; @@ -360,12 +359,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp WARN(1, "proc_dir_entry '%s/%s' already registered\n", dir->name, dp->name); write_unlock(&proc_subdir_lock); - proc_free_inum(dp->low_ino); - return -EEXIST; + goto out_free_inum; } write_unlock(&proc_subdir_lock); - return 0; + return dp; +out_free_inum: + proc_free_inum(dp->low_ino); +out_free_entry: + pde_free(dp); + return NULL; } static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, @@ -443,10 +446,7 @@ struct proc_dir_entry *proc_symlink(const char *name, if (ent->data) { strcpy((char*)ent->data,dest); ent->proc_iops = &proc_link_inode_operations; - if (proc_register(parent, ent) < 0) { - pde_free(ent); - ent = NULL; - } + ent = proc_register(parent, ent); } else { pde_free(ent); ent = NULL; @@ -470,11 +470,9 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, ent->proc_fops = &proc_dir_operations; ent->proc_iops = &proc_dir_inode_operations; parent->nlink++; - if (proc_register(parent, ent) < 0) { - pde_free(ent); + ent = proc_register(parent, ent); + if (!ent) parent->nlink--; - ent = NULL; - } } return ent; } @@ -505,47 +503,47 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) ent->proc_fops = NULL; ent->proc_iops = NULL; parent->nlink++; - if (proc_register(parent, ent) < 0) { - pde_free(ent); + ent = proc_register(parent, ent); + if (!ent) parent->nlink--; - ent = NULL; - } } return ent; } EXPORT_SYMBOL(proc_create_mount_point); -struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, - struct proc_dir_entry *parent, - const struct file_operations *proc_fops, - void *data) +struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, + struct proc_dir_entry **parent, void *data) { - struct proc_dir_entry *pde; + struct proc_dir_entry *p; + if ((mode & S_IFMT) == 0) mode |= S_IFREG; - - if (!S_ISREG(mode)) { - WARN_ON(1); /* use proc_mkdir() */ + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO; + if (WARN_ON_ONCE(!S_ISREG(mode))) return NULL; + + p = __proc_create(parent, name, mode, 1); + if (p) { + p->proc_iops = &proc_file_inode_operations; + p->data = data; } + return p; +} + +struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, + const struct file_operations *proc_fops, void *data) +{ + struct proc_dir_entry *p; BUG_ON(proc_fops == NULL); - if ((mode & S_IALLUGO) == 0) - mode |= S_IRUGO; - pde = __proc_create(&parent, name, mode, 1); - if (!pde) - goto out; - pde->proc_fops = proc_fops; - pde->data = data; - pde->proc_iops = &proc_file_inode_operations; - if (proc_register(parent, pde) < 0) - goto out_free; - return pde; -out_free: - pde_free(pde); -out: - return NULL; + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_fops = proc_fops; + return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_data); @@ -557,6 +555,67 @@ struct proc_dir_entry *proc_create(const char *name, umode_t mode, } EXPORT_SYMBOL(proc_create); +static int proc_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *de = PDE(inode); + + if (de->state_size) + return seq_open_private(file, de->seq_ops, de->state_size); + return seq_open(file, de->seq_ops); +} + +static const struct file_operations proc_seq_fops = { + .open = proc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode, + struct proc_dir_entry *parent, const struct seq_operations *ops, + unsigned int state_size, void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_fops = &proc_seq_fops; + p->seq_ops = ops; + p->state_size = state_size; + return proc_register(parent, p); +} +EXPORT_SYMBOL(proc_create_seq_private); + +static int proc_single_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *de = PDE(inode); + + return single_open(file, de->single_show, de->data); +} + +static const struct file_operations proc_single_fops = { + .open = proc_single_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, + int (*show)(struct seq_file *, void *), void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_fops = &proc_single_fops; + p->single_show = show; + return proc_register(parent, p); +} +EXPORT_SYMBOL(proc_create_single_data); + void proc_set_size(struct proc_dir_entry *de, loff_t size) { de->size = size; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 0f1692e63cb6..43c70c9e6b62 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -44,7 +44,12 @@ struct proc_dir_entry { struct completion *pde_unload_completion; const struct inode_operations *proc_iops; const struct file_operations *proc_fops; + union { + const struct seq_operations *seq_ops; + int (*single_show)(struct seq_file *, void *); + }; void *data; + unsigned int state_size; unsigned int low_ino; nlink_t nlink; kuid_t uid; @@ -57,9 +62,9 @@ struct proc_dir_entry { umode_t mode; u8 namelen; #ifdef CONFIG_64BIT -#define SIZEOF_PDE_INLINE_NAME (192-139) +#define SIZEOF_PDE_INLINE_NAME (192-155) #else -#define SIZEOF_PDE_INLINE_NAME (128-87) +#define SIZEOF_PDE_INLINE_NAME (128-95) #endif char inline_name[SIZEOF_PDE_INLINE_NAME]; } __randomize_layout; @@ -147,14 +152,14 @@ extern const struct dentry_operations pid_dentry_operations; extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int); extern int proc_setattr(struct dentry *, struct iattr *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); -extern int pid_revalidate(struct dentry *, unsigned int); +extern void pid_update_inode(struct task_struct *, struct inode *); extern int pid_delete_dentry(const struct dentry *); extern int proc_pid_readdir(struct file *, struct dir_context *); extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int); extern loff_t mem_lseek(struct file *, loff_t, int); /* Lookups */ -typedef int instantiate_t(struct inode *, struct dentry *, +typedef struct dentry *instantiate_t(struct dentry *, struct task_struct *, const void *); extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int, instantiate_t, struct task_struct *, const void *); @@ -162,6 +167,10 @@ extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, i /* * generic.c */ +struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, + struct proc_dir_entry **parent, void *data); +struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, + struct proc_dir_entry *dp); extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *); extern int proc_readdir(struct file *, struct dir_context *); diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c index 6a6bee9c603c..cb0edc7cbf09 100644 --- a/fs/proc/interrupts.c +++ b/fs/proc/interrupts.c @@ -34,21 +34,9 @@ static const struct seq_operations int_seq_ops = { .show = show_interrupts }; -static int interrupts_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &int_seq_ops); -} - -static const struct file_operations proc_interrupts_operations = { - .open = interrupts_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init proc_interrupts_init(void) { - proc_create("interrupts", 0, NULL, &proc_interrupts_operations); + proc_create_seq("interrupts", 0, NULL, &int_seq_ops); return 0; } fs_initcall(proc_interrupts_init); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index d1e82761de81..e64ecb9f2720 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -209,25 +209,34 @@ kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg) { struct list_head *head = (struct list_head *)arg; struct kcore_list *ent; + struct page *p; + + if (!pfn_valid(pfn)) + return 1; + + p = pfn_to_page(pfn); + if (!memmap_valid_within(pfn, p, page_zone(p))) + return 1; ent = kmalloc(sizeof(*ent), GFP_KERNEL); if (!ent) return -ENOMEM; - ent->addr = (unsigned long)__va((pfn << PAGE_SHIFT)); + ent->addr = (unsigned long)page_to_virt(p); ent->size = nr_pages << PAGE_SHIFT; - /* Sanity check: Can happen in 32bit arch...maybe */ - if (ent->addr < (unsigned long) __va(0)) + if (!virt_addr_valid(ent->addr)) goto free_out; /* cut not-mapped area. ....from ppc-32 code. */ if (ULONG_MAX - ent->addr < ent->size) ent->size = ULONG_MAX - ent->addr; - /* cut when vmalloc() area is higher than direct-map area */ - if (VMALLOC_START > (unsigned long)__va(0)) { - if (ent->addr > VMALLOC_START) - goto free_out; + /* + * We've already checked virt_addr_valid so we know this address + * is a valid pointer, therefore we can check against it to determine + * if we need to trim + */ + if (VMALLOC_START > ent->addr) { if (VMALLOC_START - ent->addr < ent->size) ent->size = VMALLOC_START - ent->addr; } diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index b572cc865b92..d06694757201 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -28,21 +28,9 @@ static int loadavg_proc_show(struct seq_file *m, void *v) return 0; } -static int loadavg_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, loadavg_proc_show, NULL); -} - -static const struct file_operations loadavg_proc_fops = { - .open = loadavg_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_loadavg_init(void) { - proc_create("loadavg", 0, NULL, &loadavg_proc_fops); + proc_create_single("loadavg", 0, NULL, loadavg_proc_show); return 0; } fs_initcall(proc_loadavg_init); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 65a72ab57471..2fb04846ed11 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -149,21 +149,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) return 0; } -static int meminfo_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, meminfo_proc_show, NULL); -} - -static const struct file_operations meminfo_proc_fops = { - .open = meminfo_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_meminfo_init(void) { - proc_create("meminfo", 0, NULL, &meminfo_proc_fops); + proc_create_single("meminfo", 0, NULL, meminfo_proc_show); return 0; } fs_initcall(proc_meminfo_init); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 59b17e509f46..dd2b35f78b09 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -87,28 +87,24 @@ static const struct inode_operations proc_ns_link_inode_operations = { .setattr = proc_setattr, }; -static int proc_ns_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) +static struct dentry *proc_ns_instantiate(struct dentry *dentry, + struct task_struct *task, const void *ptr) { const struct proc_ns_operations *ns_ops = ptr; struct inode *inode; struct proc_inode *ei; - inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK | S_IRWXUGO); + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | S_IRWXUGO); if (!inode) - goto out; + return ERR_PTR(-ENOENT); ei = PROC_I(inode); inode->i_op = &proc_ns_link_inode_operations; ei->ns_ops = ns_ops; + pid_update_inode(task, inode); d_set_d_op(dentry, &pid_dentry_operations); - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, 0)) - return 0; -out: - return -ENOENT; + return d_splice_alias(inode, dentry); } static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) @@ -147,12 +143,10 @@ const struct file_operations proc_ns_dir_operations = { static struct dentry *proc_ns_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - int error; struct task_struct *task = get_proc_task(dir); const struct proc_ns_operations **entry, **last; unsigned int len = dentry->d_name.len; - - error = -ENOENT; + struct dentry *res = ERR_PTR(-ENOENT); if (!task) goto out_no_task; @@ -167,11 +161,11 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, if (entry == last) goto out; - error = proc_ns_instantiate(dir, dentry, task, *entry); + res = proc_ns_instantiate(dentry, task, *entry); out: put_task_struct(task); out_no_task: - return ERR_PTR(error); + return res; } const struct inode_operations proc_ns_dir_inode_operations = { diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 75634379f82e..3b63be64e436 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -113,21 +113,9 @@ static const struct seq_operations proc_nommu_region_list_seqop = { .show = nommu_region_list_show }; -static int proc_nommu_region_list_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &proc_nommu_region_list_seqop); -} - -static const struct file_operations proc_nommu_region_list_operations = { - .open = proc_nommu_region_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init proc_nommu_init(void) { - proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations); + proc_create_seq("maps", S_IRUGO, NULL, &proc_nommu_region_list_seqop); return 0; } diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 1763f370489d..7d94fa005b0d 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -38,20 +38,20 @@ static struct net *get_proc_net(const struct inode *inode) return maybe_get_net(PDE_NET(PDE(inode))); } -int seq_open_net(struct inode *ino, struct file *f, - const struct seq_operations *ops, int size) +static int seq_open_net(struct inode *inode, struct file *file) { - struct net *net; + unsigned int state_size = PDE(inode)->state_size; struct seq_net_private *p; + struct net *net; - BUG_ON(size < sizeof(*p)); + WARN_ON_ONCE(state_size < sizeof(*p)); - net = get_proc_net(ino); - if (net == NULL) + net = get_proc_net(inode); + if (!net) return -ENXIO; - p = __seq_open_private(f, ops, size); - if (p == NULL) { + p = __seq_open_private(file, PDE(inode)->seq_ops, state_size); + if (!p) { put_net(net); return -ENOMEM; } @@ -60,51 +60,83 @@ int seq_open_net(struct inode *ino, struct file *f, #endif return 0; } -EXPORT_SYMBOL_GPL(seq_open_net); -int single_open_net(struct inode *inode, struct file *file, - int (*show)(struct seq_file *, void *)) +static int seq_release_net(struct inode *ino, struct file *f) { - int err; - struct net *net; - - err = -ENXIO; - net = get_proc_net(inode); - if (net == NULL) - goto err_net; - - err = single_open(file, show, net); - if (err < 0) - goto err_open; + struct seq_file *seq = f->private_data; + put_net(seq_file_net(seq)); + seq_release_private(ino, f); return 0; +} -err_open: - put_net(net); -err_net: - return err; +static const struct file_operations proc_net_seq_fops = { + .open = seq_open_net, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, const struct seq_operations *ops, + unsigned int state_size, void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_fops = &proc_net_seq_fops; + p->seq_ops = ops; + p->state_size = state_size; + return proc_register(parent, p); } -EXPORT_SYMBOL_GPL(single_open_net); +EXPORT_SYMBOL_GPL(proc_create_net_data); -int seq_release_net(struct inode *ino, struct file *f) +static int single_open_net(struct inode *inode, struct file *file) { - struct seq_file *seq; + struct proc_dir_entry *de = PDE(inode); + struct net *net; + int err; - seq = f->private_data; + net = get_proc_net(inode); + if (!net) + return -ENXIO; - put_net(seq_file_net(seq)); - seq_release_private(ino, f); - return 0; + err = single_open(file, de->single_show, net); + if (err) + put_net(net); + return err; } -EXPORT_SYMBOL_GPL(seq_release_net); -int single_release_net(struct inode *ino, struct file *f) +static int single_release_net(struct inode *ino, struct file *f) { struct seq_file *seq = f->private_data; put_net(seq->private); return single_release(ino, f); } -EXPORT_SYMBOL_GPL(single_release_net); + +static const struct file_operations proc_net_single_fops = { + .open = single_open_net, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; + +struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, + struct proc_dir_entry *parent, + int (*show)(struct seq_file *, void *), void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_fops = &proc_net_single_fops; + p->single_show = show; + return proc_register(parent, p); +} +EXPORT_SYMBOL_GPL(proc_create_net_single); static struct net *get_proc_task_net(struct inode *dir) { diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 8989936f2995..4d765e5e91ed 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -554,9 +554,8 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, if (!inode) goto out; - err = NULL; d_set_d_op(dentry, &proc_sys_dentry_operations); - d_add(dentry, inode); + err = d_splice_alias(inode, dentry); out: if (h) @@ -684,6 +683,7 @@ static bool proc_sys_fill_cache(struct file *file, if (IS_ERR(child)) return false; if (d_in_lookup(child)) { + struct dentry *res; inode = proc_sys_make_inode(dir->d_sb, head, table); if (!inode) { d_lookup_done(child); @@ -691,7 +691,16 @@ static bool proc_sys_fill_cache(struct file *file, return false; } d_set_d_op(child, &proc_sys_dentry_operations); - d_add(child, inode); + res = d_splice_alias(inode, child); + d_lookup_done(child); + if (unlikely(res)) { + if (IS_ERR(res)) { + dput(child); + return false; + } + dput(child); + child = res; + } } } inode = d_inode(child); diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index d0cf1c50bb6c..c69ff191e5d8 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -126,18 +126,6 @@ static const struct seq_operations tty_drivers_op = { .show = show_tty_driver }; -static int tty_drivers_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &tty_drivers_op); -} - -static const struct file_operations proc_tty_drivers_operations = { - .open = tty_drivers_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - /* * This function is called by tty_register_driver() to handle * registering the driver's /proc handler into /proc/tty/driver/<foo> @@ -147,11 +135,11 @@ void proc_tty_register_driver(struct tty_driver *driver) struct proc_dir_entry *ent; if (!driver->driver_name || driver->proc_entry || - !driver->ops->proc_fops) + !driver->ops->proc_show) return; - ent = proc_create_data(driver->driver_name, 0, proc_tty_driver, - driver->ops->proc_fops, driver); + ent = proc_create_single_data(driver->driver_name, 0, proc_tty_driver, + driver->ops->proc_show, driver); driver->proc_entry = ent; } @@ -186,6 +174,6 @@ void __init proc_tty_init(void) * entry. */ proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR|S_IXUSR, NULL); - proc_create("tty/ldiscs", 0, NULL, &tty_ldiscs_proc_fops); - proc_create("tty/drivers", 0, NULL, &proc_tty_drivers_operations); + proc_create_seq("tty/ldiscs", 0, NULL, &tty_ldiscs_seq_ops); + proc_create_seq("tty/drivers", 0, NULL, &tty_drivers_op); } diff --git a/fs/proc/self.c b/fs/proc/self.c index 4d7d061696b3..127265e5c55f 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -12,7 +12,7 @@ static const char *proc_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct pid_namespace *ns = inode->i_sb->s_fs_info; + struct pid_namespace *ns = proc_pid_ns(inode); pid_t tgid = task_tgid_nr_ns(current, ns); char *name; @@ -36,7 +36,7 @@ static unsigned self_inum __ro_after_init; int proc_setup_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); - struct pid_namespace *ns = s->s_fs_info; + struct pid_namespace *ns = proc_pid_ns(root_inode); struct dentry *self; inode_lock(root_inode); diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index 24072cc06e65..12901dcf57e2 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -25,21 +25,9 @@ static int show_softirqs(struct seq_file *p, void *v) return 0; } -static int softirqs_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_softirqs, NULL); -} - -static const struct file_operations proc_softirqs_operations = { - .open = softirqs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_softirqs_init(void) { - proc_create("softirqs", 0, NULL, &proc_softirqs_operations); + proc_create_single("softirqs", 0, NULL, show_softirqs); return 0; } fs_initcall(proc_softirqs_init); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c486ad4b43f0..a20c6e495bb2 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -937,7 +937,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, /* * The soft-dirty tracker uses #PF-s to catch writes * to pages, so write-protect the pte as well. See the - * Documentation/vm/soft-dirty.txt for full description + * Documentation/admin-guide/mm/soft-dirty.rst for full description * of how soft-dirty works. */ pte_t ptent = *pte; @@ -1421,7 +1421,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, * Bits 0-54 page frame number (PFN) if present * Bits 0-4 swap type if swapped * Bits 5-54 swap offset if swapped - * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt) + * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst) * Bit 56 page exclusively mapped * Bits 57-60 zero * Bit 61 page is file-page or shared-anon diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 9d2efaca499f..b905010ca9eb 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -12,7 +12,7 @@ static const char *proc_thread_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct pid_namespace *ns = inode->i_sb->s_fs_info; + struct pid_namespace *ns = proc_pid_ns(inode); pid_t tgid = task_tgid_nr_ns(current, ns); pid_t pid = task_pid_nr_ns(current, ns); char *name; @@ -36,7 +36,7 @@ static unsigned thread_self_inum __ro_after_init; int proc_setup_thread_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); - struct pid_namespace *ns = s->s_fs_info; + struct pid_namespace *ns = proc_pid_ns(root_inode); struct dentry *thread_self; inode_lock(root_inode); diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 95a708d83721..3bd12f955867 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -30,21 +30,9 @@ static int uptime_proc_show(struct seq_file *m, void *v) return 0; } -static int uptime_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, uptime_proc_show, NULL); -} - -static const struct file_operations uptime_proc_fops = { - .open = uptime_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_uptime_init(void) { - proc_create("uptime", 0, NULL, &uptime_proc_fops); + proc_create_single("uptime", 0, NULL, uptime_proc_show); return 0; } fs_initcall(proc_uptime_init); diff --git a/fs/proc/version.c b/fs/proc/version.c index 94901e8e700d..b449f186577f 100644 --- a/fs/proc/version.c +++ b/fs/proc/version.c @@ -15,21 +15,9 @@ static int version_proc_show(struct seq_file *m, void *v) return 0; } -static int version_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, version_proc_show, NULL); -} - -static const struct file_operations version_proc_fops = { - .open = version_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_version_init(void) { - proc_create("version", 0, NULL, &version_proc_fops); + proc_create_single("version", 0, NULL, version_proc_show); return 0; } fs_initcall(proc_version_init); diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c index eca27878079d..8d72221735d7 100644 --- a/fs/qnx4/namei.c +++ b/fs/qnx4/namei.c @@ -114,13 +114,9 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, unsigned i brelse(bh); foundinode = qnx4_iget(dir->i_sb, ino); - if (IS_ERR(foundinode)) { + if (IS_ERR(foundinode)) QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n", PTR_ERR(foundinode))); - return ERR_CAST(foundinode); - } out: - d_add(dentry, foundinode); - - return NULL; + return d_splice_alias(foundinode, dentry); } diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c index 72c2770830be..e2e98e653b8d 100644 --- a/fs/qnx6/namei.c +++ b/fs/qnx6/namei.c @@ -29,15 +29,11 @@ struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry, if (ino) { foundinode = qnx6_iget(dir->i_sb, ino); qnx6_put_page(page); - if (IS_ERR(foundinode)) { + if (IS_ERR(foundinode)) pr_debug("lookup->iget -> error %ld\n", PTR_ERR(foundinode)); - return ERR_CAST(foundinode); - } } else { pr_debug("%s(): not found %s\n", __func__, name); - return NULL; } - d_add(dentry, foundinode); - return NULL; + return d_splice_alias(foundinode, dentry); } diff --git a/fs/read_write.c b/fs/read_write.c index c4eabbfc90df..e83bd9744b5d 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -2023,7 +2023,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) ret = mnt_want_write_file(dst_file); if (ret) { info->status = ret; - goto next_loop; + goto next_fdput; } dst_off = info->dest_offset; @@ -2058,9 +2058,9 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) next_file: mnt_drop_write_file(dst_file); -next_loop: +next_fdput: fdput(dst_fd); - +next_loop: if (fatal_signal_pending(current)) goto out; } diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index bd39a998843d..5089dac02660 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -687,8 +687,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod reiserfs_update_inode_transaction(inode); reiserfs_update_inode_transaction(dir); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); retval = journal_end(&th); out_failed: @@ -771,8 +770,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode goto out_failed; } - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); retval = journal_end(&th); out_failed: @@ -871,8 +869,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode /* the above add_entry did not update dir's stat data */ reiserfs_update_sd(&th, dir); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); retval = journal_end(&th); out_failed: reiserfs_write_unlock(dir->i_sb); @@ -1187,8 +1184,7 @@ static int reiserfs_symlink(struct inode *parent_dir, goto out_failed; } - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); retval = journal_end(&th); out_failed: reiserfs_write_unlock(parent_dir->i_sb); diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index fe999157dd97..e39b3910d24d 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -389,27 +389,13 @@ static int show_journal(struct seq_file *m, void *unused) return 0; } -static int r_open(struct inode *inode, struct file *file) -{ - return single_open(file, PDE_DATA(inode), - proc_get_parent_data(inode)); -} - -static const struct file_operations r_file_operations = { - .open = r_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static struct proc_dir_entry *proc_info_root = NULL; static const char proc_info_root_name[] = "fs/reiserfs"; static void add_file(struct super_block *sb, char *name, int (*func) (struct seq_file *, void *)) { - proc_create_data(name, 0, REISERFS_SB(sb)->procdir, - &r_file_operations, func); + proc_create_single_data(name, 0, REISERFS_SB(sb)->procdir, func, sb); } int reiserfs_proc_info_init(struct super_block *sb) diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 8f06fd1f3d69..6ccb51993a76 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -213,7 +213,7 @@ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { unsigned long offset, maxoff; - struct inode *inode; + struct inode *inode = NULL; struct romfs_inode ri; const char *name; /* got from dentry */ int len, ret; @@ -233,7 +233,7 @@ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry, for (;;) { if (!offset || offset >= maxoff) - goto out0; + break; ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri)); if (ret < 0) @@ -244,37 +244,19 @@ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry, len); if (ret < 0) goto error; - if (ret == 1) + if (ret == 1) { + /* Hard link handling */ + if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD) + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + inode = romfs_iget(dir->i_sb, offset); break; + } /* next entry */ offset = be32_to_cpu(ri.next) & ROMFH_MASK; } - /* Hard link handling */ - if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD) - offset = be32_to_cpu(ri.spec) & ROMFH_MASK; - - inode = romfs_iget(dir->i_sb, offset); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - goto error; - } - goto outi; - - /* - * it's a bit funky, _lookup needs to return an error code - * (negative) or a NULL, both as a dentry. ENOENT should not - * be returned, instead we need to create a negative dentry by - * d_add(dentry, NULL); and return 0 as no error. - * (Although as I see, it only matters on writable file - * systems). - */ -out0: - inode = NULL; -outi: - d_add(dentry, inode); - ret = 0; + return d_splice_alias(inode, dentry); error: return ERR_PTR(ret); } diff --git a/fs/select.c b/fs/select.c index ba879c51288f..bc3cc0f98896 100644 --- a/fs/select.c +++ b/fs/select.c @@ -34,6 +34,29 @@ #include <linux/uaccess.h> +__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt) +{ + if (file->f_op->poll) { + return file->f_op->poll(file, pt); + } else if (file_has_poll_mask(file)) { + unsigned int events = poll_requested_events(pt); + struct wait_queue_head *head; + + if (pt && pt->_qproc) { + head = file->f_op->get_poll_head(file, events); + if (!head) + return DEFAULT_POLLMASK; + if (IS_ERR(head)) + return EPOLLERR; + pt->_qproc(file, head, pt); + } + + return file->f_op->poll_mask(file, events); + } else { + return DEFAULT_POLLMASK; + } +} +EXPORT_SYMBOL_GPL(vfs_poll); /* * Estimate expected accuracy in ns from a timeval. @@ -233,7 +256,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, add_wait_queue(wait_address, &entry->wait); } -int poll_schedule_timeout(struct poll_wqueues *pwq, int state, +static int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack) { int rc = -EINTR; @@ -258,7 +281,6 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state, return rc; } -EXPORT_SYMBOL(poll_schedule_timeout); /** * poll_select_set_timeout - helper function to setup the timeout value @@ -503,14 +525,10 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) continue; f = fdget(i); if (f.file) { - const struct file_operations *f_op; - f_op = f.file->f_op; - mask = DEFAULT_POLLMASK; - if (f_op->poll) { - wait_key_set(wait, in, out, - bit, busy_flag); - mask = (*f_op->poll)(f.file, wait); - } + wait_key_set(wait, in, out, bit, + busy_flag); + mask = vfs_poll(f.file, wait); + fdput(f); if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; @@ -813,34 +831,29 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait, bool *can_busy_poll, __poll_t busy_flag) { - __poll_t mask; - int fd; - - mask = 0; - fd = pollfd->fd; - if (fd >= 0) { - struct fd f = fdget(fd); - mask = EPOLLNVAL; - if (f.file) { - /* userland u16 ->events contains POLL... bitmap */ - __poll_t filter = demangle_poll(pollfd->events) | - EPOLLERR | EPOLLHUP; - mask = DEFAULT_POLLMASK; - if (f.file->f_op->poll) { - pwait->_key = filter; - pwait->_key |= busy_flag; - mask = f.file->f_op->poll(f.file, pwait); - if (mask & busy_flag) - *can_busy_poll = true; - } - /* Mask out unneeded events. */ - mask &= filter; - fdput(f); - } - } + int fd = pollfd->fd; + __poll_t mask = 0, filter; + struct fd f; + + if (fd < 0) + goto out; + mask = EPOLLNVAL; + f = fdget(fd); + if (!f.file) + goto out; + + /* userland u16 ->events contains POLL... bitmap */ + filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP; + pwait->_key = filter | busy_flag; + mask = vfs_poll(f.file, pwait); + if (mask & busy_flag) + *can_busy_poll = true; + mask &= filter; /* Mask out unneeded events. */ + fdput(f); + +out: /* ... and so does ->revents */ pollfd->revents = mangle_poll(mask); - return mask; } diff --git a/fs/seq_file.c b/fs/seq_file.c index c6c27f1f9c98..4cc090b50cc5 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -709,11 +709,6 @@ void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, if (m->count + width >= m->size) goto overflow; - if (num < 10) { - m->buf[m->count++] = num + '0'; - return; - } - len = num_to_str(m->buf + m->count, m->size - m->count, num, width); if (!len) goto overflow; diff --git a/fs/signalfd.c b/fs/signalfd.c index d2187a813376..cbb42f77a2bd 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -81,83 +81,86 @@ static __poll_t signalfd_poll(struct file *file, poll_table *wait) static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, siginfo_t const *kinfo) { - long err; + struct signalfd_siginfo new; BUILD_BUG_ON(sizeof(struct signalfd_siginfo) != 128); /* * Unused members should be zero ... */ - err = __clear_user(uinfo, sizeof(*uinfo)); + memset(&new, 0, sizeof(new)); /* * If you change siginfo_t structure, please be sure * this code is fixed accordingly. */ - err |= __put_user(kinfo->si_signo, &uinfo->ssi_signo); - err |= __put_user(kinfo->si_errno, &uinfo->ssi_errno); - err |= __put_user(kinfo->si_code, &uinfo->ssi_code); + new.ssi_signo = kinfo->si_signo; + new.ssi_errno = kinfo->si_errno; + new.ssi_code = kinfo->si_code; switch (siginfo_layout(kinfo->si_signo, kinfo->si_code)) { case SIL_KILL: - err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); - err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); + new.ssi_pid = kinfo->si_pid; + new.ssi_uid = kinfo->si_uid; break; case SIL_TIMER: - err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid); - err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun); - err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); - err |= __put_user(kinfo->si_int, &uinfo->ssi_int); + new.ssi_tid = kinfo->si_tid; + new.ssi_overrun = kinfo->si_overrun; + new.ssi_ptr = (long) kinfo->si_ptr; + new.ssi_int = kinfo->si_int; break; case SIL_POLL: - err |= __put_user(kinfo->si_band, &uinfo->ssi_band); - err |= __put_user(kinfo->si_fd, &uinfo->ssi_fd); + new.ssi_band = kinfo->si_band; + new.ssi_fd = kinfo->si_fd; break; - case SIL_FAULT: - err |= __put_user((long) kinfo->si_addr, &uinfo->ssi_addr); -#ifdef __ARCH_SI_TRAPNO - err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno); -#endif -#ifdef BUS_MCEERR_AO + case SIL_FAULT_BNDERR: + case SIL_FAULT_PKUERR: /* - * Other callers might not initialize the si_lsb field, - * so check explicitly for the right codes here. + * Fall through to the SIL_FAULT case. Both SIL_FAULT_BNDERR + * and SIL_FAULT_PKUERR are only generated by faults that + * deliver them synchronously to userspace. In case someone + * injects one of these signals and signalfd catches it treat + * it as SIL_FAULT. */ - if (kinfo->si_signo == SIGBUS && - kinfo->si_code == BUS_MCEERR_AO) - err |= __put_user((short) kinfo->si_addr_lsb, - &uinfo->ssi_addr_lsb); + case SIL_FAULT: + new.ssi_addr = (long) kinfo->si_addr; +#ifdef __ARCH_SI_TRAPNO + new.ssi_trapno = kinfo->si_trapno; #endif -#ifdef BUS_MCEERR_AR - /* - * Other callers might not initialize the si_lsb field, - * so check explicitly for the right codes here. - */ - if (kinfo->si_signo == SIGBUS && - kinfo->si_code == BUS_MCEERR_AR) - err |= __put_user((short) kinfo->si_addr_lsb, - &uinfo->ssi_addr_lsb); + break; + case SIL_FAULT_MCEERR: + new.ssi_addr = (long) kinfo->si_addr; +#ifdef __ARCH_SI_TRAPNO + new.ssi_trapno = kinfo->si_trapno; #endif + new.ssi_addr_lsb = (short) kinfo->si_addr_lsb; break; case SIL_CHLD: - err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); - err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); - err |= __put_user(kinfo->si_status, &uinfo->ssi_status); - err |= __put_user(kinfo->si_utime, &uinfo->ssi_utime); - err |= __put_user(kinfo->si_stime, &uinfo->ssi_stime); + new.ssi_pid = kinfo->si_pid; + new.ssi_uid = kinfo->si_uid; + new.ssi_status = kinfo->si_status; + new.ssi_utime = kinfo->si_utime; + new.ssi_stime = kinfo->si_stime; break; case SIL_RT: - default: /* * This case catches also the signals queued by sigqueue(). */ - err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); - err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); - err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); - err |= __put_user(kinfo->si_int, &uinfo->ssi_int); + new.ssi_pid = kinfo->si_pid; + new.ssi_uid = kinfo->si_uid; + new.ssi_ptr = (long) kinfo->si_ptr; + new.ssi_int = kinfo->si_int; + break; + case SIL_SYS: + new.ssi_call_addr = (long) kinfo->si_call_addr; + new.ssi_syscall = kinfo->si_syscall; + new.ssi_arch = kinfo->si_arch; break; } - return err ? -EFAULT: sizeof(*uinfo); + if (copy_to_user(uinfo, &new, sizeof(struct signalfd_siginfo))) + return -EFAULT; + + return sizeof(*uinfo); } static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, siginfo_t *info, diff --git a/fs/super.c b/fs/super.c index 122c402049a2..50728d9c1a05 100644 --- a/fs/super.c +++ b/fs/super.c @@ -121,13 +121,23 @@ static unsigned long super_cache_count(struct shrinker *shrink, sb = container_of(shrink, struct super_block, s_shrink); /* - * Don't call trylock_super as it is a potential - * scalability bottleneck. The counts could get updated - * between super_cache_count and super_cache_scan anyway. - * Call to super_cache_count with shrinker_rwsem held - * ensures the safety of call to list_lru_shrink_count() and - * s_op->nr_cached_objects(). + * We don't call trylock_super() here as it is a scalability bottleneck, + * so we're exposed to partial setup state. The shrinker rwsem does not + * protect filesystem operations backing list_lru_shrink_count() or + * s_op->nr_cached_objects(). Counts can change between + * super_cache_count and super_cache_scan, so we really don't need locks + * here. + * + * However, if we are currently mounting the superblock, the underlying + * filesystem might be in a state of partial construction and hence it + * is dangerous to access it. trylock_super() uses a SB_BORN check to + * avoid this situation, so do the same here. The memory barrier is + * matched with the one in mount_fs() as we don't hold locks here. */ + if (!(sb->s_flags & SB_BORN)) + return 0; + smp_rmb(); + if (sb->s_op && sb->s_op->nr_cached_objects) total_objects = sb->s_op->nr_cached_objects(sb, sc); @@ -937,7 +947,7 @@ void emergency_remount(void) static void do_thaw_all_callback(struct super_block *sb) { down_write(&sb->s_umount); - if (sb->s_root && sb->s_flags & MS_BORN) { + if (sb->s_root && sb->s_flags & SB_BORN) { emergency_thaw_bdev(sb); thaw_super_locked(sb); } else { @@ -1272,6 +1282,14 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) sb = root->d_sb; BUG_ON(!sb); WARN_ON(!sb->s_bdi); + + /* + * Write barrier is for super_cache_count(). We place it before setting + * SB_BORN as the data dependency between the two functions is the + * superblock structure contents that we just set up, not the SB_BORN + * flag. + */ + smp_wmb(); sb->s_flags |= SB_BORN; error = security_sb_kern_mount(sb, flags, secdata); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index b428d317ae92..92682fcc41f6 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -25,7 +25,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, { struct dentry *root; void *ns; - bool new_sb; + bool new_sb = false; if (!(flags & SB_KERNMOUNT)) { if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) @@ -35,9 +35,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); root = kernfs_mount_ns(fs_type, flags, sysfs_root, SYSFS_MAGIC, &new_sb, ns); - if (IS_ERR(root) || !new_sb) + if (!new_sb) kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); - else if (new_sb) + else if (!IS_ERR(root)) root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE; return root; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index 250b0755b908..4d5d20491ffd 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -51,14 +51,9 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, un if (dentry->d_name.len > SYSV_NAMELEN) return ERR_PTR(-ENAMETOOLONG); ino = sysv_inode_by_name(dentry); - - if (ino) { + if (ino) inode = sysv_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); - } - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } static int sysv_mknod(struct inode * dir, struct dentry * dentry, umode_t mode, dev_t rdev) diff --git a/fs/timerfd.c b/fs/timerfd.c index cdad49da3ff7..d84a2bee4f82 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -226,21 +226,20 @@ static int timerfd_release(struct inode *inode, struct file *file) kfree_rcu(ctx, rcu); return 0; } - -static __poll_t timerfd_poll(struct file *file, poll_table *wait) + +static struct wait_queue_head *timerfd_get_poll_head(struct file *file, + __poll_t eventmask) { struct timerfd_ctx *ctx = file->private_data; - __poll_t events = 0; - unsigned long flags; - poll_wait(file, &ctx->wqh, wait); + return &ctx->wqh; +} - spin_lock_irqsave(&ctx->wqh.lock, flags); - if (ctx->ticks) - events |= EPOLLIN; - spin_unlock_irqrestore(&ctx->wqh.lock, flags); +static __poll_t timerfd_poll_mask(struct file *file, __poll_t eventmask) +{ + struct timerfd_ctx *ctx = file->private_data; - return events; + return ctx->ticks ? EPOLLIN : 0; } static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, @@ -364,7 +363,8 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg static const struct file_operations timerfd_fops = { .release = timerfd_release, - .poll = timerfd_poll, + .get_poll_head = timerfd_get_poll_head, + .poll_mask = timerfd_poll_mask, .read = timerfd_read, .llseek = noop_llseek, .show_fdinfo = timerfd_show, diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 9d7fb88e172e..4e267cc21c77 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -214,7 +214,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, int err; union ubifs_key key; struct inode *inode = NULL; - struct ubifs_dent_node *dent; + struct ubifs_dent_node *dent = NULL; struct ubifs_info *c = dir->i_sb->s_fs_info; struct fscrypt_name nm; @@ -229,14 +229,14 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(err); if (fname_len(&nm) > UBIFS_MAX_NLEN) { - err = -ENAMETOOLONG; - goto out_fname; + inode = ERR_PTR(-ENAMETOOLONG); + goto done; } dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); if (!dent) { - err = -ENOMEM; - goto out_fname; + inode = ERR_PTR(-ENOMEM); + goto done; } if (nm.hash) { @@ -250,16 +250,16 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, } if (err) { - if (err == -ENOENT) { + if (err == -ENOENT) dbg_gen("not found"); - goto done; - } - goto out_dent; + else + inode = ERR_PTR(err); + goto done; } if (dbg_check_name(c, dent, &nm)) { - err = -EINVAL; - goto out_dent; + inode = ERR_PTR(-EINVAL); + goto done; } inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum)); @@ -272,7 +272,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, ubifs_err(c, "dead directory entry '%pd', error %d", dentry, err); ubifs_ro_mode(c, err); - goto out_dent; + goto done; } if (ubifs_crypt_is_encrypted(dir) && @@ -280,27 +280,14 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, !fscrypt_has_permitted_context(dir, inode)) { ubifs_warn(c, "Inconsistent encryption contexts: %lu/%lu", dir->i_ino, inode->i_ino); - err = -EPERM; - goto out_inode; + iput(inode); + inode = ERR_PTR(-EPERM); } done: kfree(dent); fscrypt_free_filename(&nm); - /* - * Note, d_splice_alias() would be required instead if we supported - * NFS. - */ - d_add(dentry, inode); - return NULL; - -out_inode: - iput(inode); -out_dent: - kfree(dent); -out_fname: - fscrypt_free_filename(&nm); - return ERR_PTR(err); + return d_splice_alias(inode, dentry); } static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 0458dd47e105..c586026508db 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -622,8 +622,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode) if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; } @@ -733,8 +732,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inc_nlink(dir); dir->i_ctime = dir->i_mtime = current_time(dir); mark_inode_dirty(dir); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 32545cd00ceb..d5f43ba76c59 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -39,8 +39,7 @@ static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode) { int err = ufs_add_link(dentry, inode); if (!err) { - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; } inode_dec_link_count(inode); @@ -193,8 +192,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) if (err) goto out_fail; - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; out_fail: diff --git a/fs/xattr.c b/fs/xattr.c index 61cd28ba25f3..f9cb1db187b7 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -229,7 +229,7 @@ out: } EXPORT_SYMBOL_GPL(vfs_setxattr); -ssize_t +static ssize_t xattr_getsecurity(struct inode *inode, const char *name, void *value, size_t size) { @@ -254,7 +254,6 @@ out: out_noalloc: return len; } -EXPORT_SYMBOL_GPL(xattr_getsecurity); /* * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr @@ -354,7 +353,6 @@ vfs_listxattr(struct dentry *dentry, char *list, size_t size) if (error) return error; if (inode->i_op->listxattr && (inode->i_opflags & IOP_XATTR)) { - error = -EOPNOTSUPP; error = inode->i_op->listxattr(dentry, list, size); } else { error = security_inode_listsecurity(inode, list, size); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 56e405572909..ca6903726689 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -594,7 +594,7 @@ xfs_alloc_ioend( struct xfs_ioend *ioend; struct bio *bio; - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset); + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset); xfs_init_bio_from_bh(bio, bh); ioend = container_of(bio, struct xfs_ioend, io_inline_bio); diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 69346d460dfa..694c85b03813 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -18,7 +18,7 @@ #ifndef __XFS_AOPS_H__ #define __XFS_AOPS_H__ -extern struct bio_set *xfs_ioend_bioset; +extern struct bio_set xfs_ioend_bioset; /* * Types of I/O for bmap clustering and I/O completion tracking. diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 9925d75411bb..b0eb49bb4918 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -260,6 +260,7 @@ xfs_vn_lookup( struct dentry *dentry, unsigned int flags) { + struct inode *inode; struct xfs_inode *cip; struct xfs_name name; int error; @@ -269,14 +270,13 @@ xfs_vn_lookup( xfs_dentry_to_name(&name, dentry); error = xfs_lookup(XFS_I(dir), &name, &cip, NULL); - if (unlikely(error)) { - if (unlikely(error != -ENOENT)) - return ERR_PTR(error); - d_add(dentry, NULL); - return NULL; - } - - return d_splice_alias(VFS_I(cip), dentry); + if (likely(!error)) + inode = VFS_I(cip); + else if (likely(error == -ENOENT)) + inode = NULL; + else + inode = ERR_PTR(error); + return d_splice_alias(inode, dentry); } STATIC struct dentry * diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 056e12b421eb..1cc79907b377 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -113,6 +113,7 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats) } } +#ifdef CONFIG_PROC_FS /* legacy quota interfaces */ #ifdef CONFIG_XFS_QUOTA static int xqm_proc_show(struct seq_file *m, void *v) @@ -124,18 +125,6 @@ static int xqm_proc_show(struct seq_file *m, void *v) return 0; } -static int xqm_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, xqm_proc_show, NULL); -} - -static const struct file_operations xqm_proc_fops = { - .open = xqm_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - /* legacy quota stats interface no 2 */ static int xqmstat_proc_show(struct seq_file *m, void *v) { @@ -147,22 +136,8 @@ static int xqmstat_proc_show(struct seq_file *m, void *v) seq_putc(m, '\n'); return 0; } - -static int xqmstat_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, xqmstat_proc_show, NULL); -} - -static const struct file_operations xqmstat_proc_fops = { - .owner = THIS_MODULE, - .open = xqmstat_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif /* CONFIG_XFS_QUOTA */ -#ifdef CONFIG_PROC_FS int xfs_init_procfs(void) { @@ -174,11 +149,9 @@ xfs_init_procfs(void) goto out; #ifdef CONFIG_XFS_QUOTA - if (!proc_create("fs/xfs/xqmstat", 0, NULL, - &xqmstat_proc_fops)) + if (!proc_create_single("fs/xfs/xqmstat", 0, NULL, xqmstat_proc_show)) goto out; - if (!proc_create("fs/xfs/xqm", 0, NULL, - &xqm_proc_fops)) + if (!proc_create_single("fs/xfs/xqm", 0, NULL, xqm_proc_show)) goto out; #endif return 0; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 4e66e61865fb..ed67389f4948 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -63,7 +63,7 @@ #include <linux/parser.h> static const struct super_operations xfs_super_operations; -struct bio_set *xfs_ioend_bioset; +struct bio_set xfs_ioend_bioset; static struct kset *xfs_kset; /* top-level xfs sysfs dir */ #ifdef DEBUG @@ -1872,10 +1872,9 @@ MODULE_ALIAS_FS("xfs"); STATIC int __init xfs_init_zones(void) { - xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE, + if (bioset_init(&xfs_ioend_bioset, 4 * MAX_BUF_PER_PAGE, offsetof(struct xfs_ioend, io_inline_bio), - BIOSET_NEED_BVECS); - if (!xfs_ioend_bioset) + BIOSET_NEED_BVECS)) goto out; xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), @@ -2017,7 +2016,7 @@ xfs_init_zones(void) out_destroy_log_ticket_zone: kmem_zone_destroy(xfs_log_ticket_zone); out_free_ioend_bioset: - bioset_free(xfs_ioend_bioset); + bioset_exit(&xfs_ioend_bioset); out: return -ENOMEM; } @@ -2048,7 +2047,7 @@ xfs_destroy_zones(void) kmem_zone_destroy(xfs_btree_cur_zone); kmem_zone_destroy(xfs_bmap_free_item_zone); kmem_zone_destroy(xfs_log_ticket_zone); - bioset_free(xfs_ioend_bioset); + bioset_exit(&xfs_ioend_bioset); } STATIC int __init |