| author | Artem Bityutskiy <artem.bityutskiy@linux.intel.com> | 2015-03-25 11:03:07 +0200 |
|---|---|---|
| committer | Artem Bityutskiy <artem.bityutskiy@linux.intel.com> | 2015-03-25 11:03:07 +0200 |
| commit | 3527a86b7ae17c949307d00e1eb7087604bca1b4 (patch) | |
| tree | 8e8eab905e7e73a3bf2c4d47abd356166533c9db /fs | |
| parent | b388e6a7a6ba988998ddd83919ae8d3debf1a13d (diff) | |
| parent | bc465aa9d045feb0e13b4a8f32cc33c1943f62d6 (diff) | |
Merge tag 'v4.0-rc5' into linux-next
Merge the upstream -rc5 tag because we need a more up-to-date base for our
further work.
Diffstat (limited to 'fs')
121 files changed, 2373 insertions, 1118 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 9ee5343d4884..3662f1d1d9cf 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1127,7 +1127,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) } /* Write all dirty data */ - if (S_ISREG(dentry->d_inode->i_mode)) + if (d_is_reg(dentry)) filemap_write_and_wait(dentry->d_inode->i_mapping); retval = p9_client_wstat(fid, &wstat); @@ -1285,7 +1285,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) ret = -EINVAL; if (unlikely(ctx || nr_events == 0)) { - pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n", + pr_debug("EINVAL: ctx %lu nr_events %u\n", ctx, nr_events); goto out; } @@ -1333,7 +1333,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) return ret; } - pr_debug("EINVAL: io_destroy: invalid context id\n"); + pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } @@ -1515,7 +1515,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || ((ssize_t)iocb->aio_nbytes < 0) )) { - pr_debug("EINVAL: io_submit: overflow check\n"); + pr_debug("EINVAL: overflow check\n"); return -EINVAL; } diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index aaf96cb25452..ac7d921ed984 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -95,7 +95,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) */ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in) { - struct autofs_dev_ioctl tmp; + struct autofs_dev_ioctl tmp, *res; if (copy_from_user(&tmp, in, sizeof(tmp))) return ERR_PTR(-EFAULT); @@ -106,7 +106,11 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i if (tmp.size > (PATH_MAX + sizeof(tmp))) return ERR_PTR(-ENAMETOOLONG); - return memdup_user(in, tmp.size); + res = memdup_user(in, tmp.size); + if (!IS_ERR(res)) + res->size = tmp.size; + + return res; } static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index bfdbaba9c2ba..11dd118f75e2 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -374,7 +374,7 @@ static struct dentry *should_expire(struct dentry *dentry, return NULL; } - if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) { + if (dentry->d_inode && d_is_symlink(dentry)) { DPRINTK("checking symlink %p %pd", dentry, dentry); /* * A symlink can't be "busy" in the usual sense so diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index dbb5b7212ce1..7e44fdd03e2d 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -108,7 +108,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) struct dentry *dentry = file->f_path.dentry; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - DPRINTK("file=%p dentry=%p %pD", file, dentry, dentry); + DPRINTK("file=%p dentry=%p %pd", file, dentry, dentry); if (autofs4_oz_mode(sbi)) goto out; @@ -371,7 +371,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) * having d_mountpoint() true, so there's no need to call back * to the daemon. */ - if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) { + if (dentry->d_inode && d_is_symlink(dentry)) { spin_unlock(&sbi->fs_lock); goto done; } @@ -485,7 +485,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) * an incorrect ELOOP error return. 
*/ if ((!d_mountpoint(dentry) && !simple_empty(dentry)) || - (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))) + (dentry->d_inode && d_is_symlink(dentry))) status = -EISDIR; } spin_unlock(&sbi->fs_lock); diff --git a/fs/bad_inode.c b/fs/bad_inode.c index afd2b4408adf..861b1e1c4777 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -15,161 +15,14 @@ #include <linux/namei.h> #include <linux/poll.h> - -static loff_t bad_file_llseek(struct file *file, loff_t offset, int whence) -{ - return -EIO; -} - -static ssize_t bad_file_read(struct file *filp, char __user *buf, - size_t size, loff_t *ppos) -{ - return -EIO; -} - -static ssize_t bad_file_write(struct file *filp, const char __user *buf, - size_t siz, loff_t *ppos) -{ - return -EIO; -} - -static ssize_t bad_file_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - return -EIO; -} - -static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - return -EIO; -} - -static int bad_file_readdir(struct file *file, struct dir_context *ctx) -{ - return -EIO; -} - -static unsigned int bad_file_poll(struct file *filp, poll_table *wait) -{ - return POLLERR; -} - -static long bad_file_unlocked_ioctl(struct file *file, unsigned cmd, - unsigned long arg) -{ - return -EIO; -} - -static long bad_file_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - return -EIO; -} - -static int bad_file_mmap(struct file *file, struct vm_area_struct *vma) -{ - return -EIO; -} - static int bad_file_open(struct inode *inode, struct file *filp) { return -EIO; } -static int bad_file_flush(struct file *file, fl_owner_t id) -{ - return -EIO; -} - -static int bad_file_release(struct inode *inode, struct file *filp) -{ - return -EIO; -} - -static int bad_file_fsync(struct file *file, loff_t start, loff_t end, - int datasync) -{ - return -EIO; -} - -static int bad_file_aio_fsync(struct kiocb *iocb, int datasync) -{ - return -EIO; -} - -static int bad_file_fasync(int fd, struct file *filp, int on) -{ - return -EIO; -} - -static int bad_file_lock(struct file *file, int cmd, struct file_lock *fl) -{ - return -EIO; -} - -static ssize_t bad_file_sendpage(struct file *file, struct page *page, - int off, size_t len, loff_t *pos, int more) -{ - return -EIO; -} - -static unsigned long bad_file_get_unmapped_area(struct file *file, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - return -EIO; -} - -static int bad_file_check_flags(int flags) -{ - return -EIO; -} - -static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl) -{ - return -EIO; -} - -static ssize_t bad_file_splice_write(struct pipe_inode_info *pipe, - struct file *out, loff_t *ppos, size_t len, - unsigned int flags) -{ - return -EIO; -} - -static ssize_t bad_file_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - return -EIO; -} - static const struct file_operations bad_file_ops = { - .llseek = bad_file_llseek, - .read = bad_file_read, - .write = bad_file_write, - .aio_read = bad_file_aio_read, - .aio_write = bad_file_aio_write, - .iterate = bad_file_readdir, - .poll = bad_file_poll, - .unlocked_ioctl = bad_file_unlocked_ioctl, - .compat_ioctl = bad_file_compat_ioctl, - .mmap = bad_file_mmap, .open = bad_file_open, - .flush = bad_file_flush, - .release = bad_file_release, - .fsync = bad_file_fsync, - .aio_fsync = bad_file_aio_fsync, - .fasync = bad_file_fasync, - .lock = 
bad_file_lock, - .sendpage = bad_file_sendpage, - .get_unmapped_area = bad_file_get_unmapped_area, - .check_flags = bad_file_check_flags, - .flock = bad_file_flock, - .splice_write = bad_file_splice_write, - .splice_read = bad_file_splice_read, }; static int bad_inode_create (struct inode *dir, struct dentry *dentry, diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 02b16910f4c9..995986b8e36b 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -645,11 +645,12 @@ out: static unsigned long randomize_stack_top(unsigned long stack_top) { - unsigned int random_variable = 0; + unsigned long random_variable = 0; if ((current->flags & PF_RANDOMIZE) && !(current->personality & ADDR_NO_RANDOMIZE)) { - random_variable = get_random_int() & STACK_RND_MASK; + random_variable = (unsigned long) get_random_int(); + random_variable &= STACK_RND_MASK; random_variable <<= PAGE_SHIFT; } #ifdef CONFIG_STACK_GROWSUP diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 993642199326..6d67f32e648d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1645,14 +1645,14 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, parent_nritems = btrfs_header_nritems(parent); blocksize = root->nodesize; - end_slot = parent_nritems; + end_slot = parent_nritems - 1; - if (parent_nritems == 1) + if (parent_nritems <= 1) return 0; btrfs_set_lock_blocking(parent); - for (i = start_slot; i < end_slot; i++) { + for (i = start_slot; i <= end_slot; i++) { int close = 1; btrfs_node_key(parent, &disk_key, i); @@ -1669,7 +1669,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, other = btrfs_node_blockptr(parent, i - 1); close = close_blocks(blocknr, other, blocksize); } - if (!close && i < end_slot - 2) { + if (!close && i < end_slot) { other = btrfs_node_blockptr(parent, i + 1); close = close_blocks(blocknr, other, blocksize); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 84c3b00f3de8..f9c89cae39ee 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3387,6 +3387,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); int btrfs_free_block_groups(struct btrfs_fs_info *info); int btrfs_read_block_groups(struct btrfs_root *root); @@ -3909,6 +3911,9 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, loff_t actual_len, u64 *alloc_hint); int btrfs_inode_check_errors(struct inode *inode); extern const struct dentry_operations btrfs_dentry_operations; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_inode_set_ops(struct inode *inode); +#endif /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f79f38542a73..639f2663ed3f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3921,7 +3921,7 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, } if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) + sizeof(struct btrfs_chunk)) { - printk(KERN_ERR "BTRFS: system chunk array too small %u < %lu\n", + printk(KERN_ERR "BTRFS: system chunk array too small %u < %zu\n", btrfs_super_sys_array_size(sb), sizeof(struct btrfs_disk_key) + sizeof(struct btrfs_chunk)); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 571f402d3fc4..8b353ad02f03 100644 --- a/fs/btrfs/extent-tree.c +++ 
b/fs/btrfs/extent-tree.c @@ -3208,6 +3208,8 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, return 0; } + if (trans->aborted) + return 0; again: inode = lookup_free_space_inode(root, block_group, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { @@ -3243,6 +3245,20 @@ again: */ BTRFS_I(inode)->generation = 0; ret = btrfs_update_inode(trans, root, inode); + if (ret) { + /* + * So theoretically we could recover from this, simply set the + * super cache generation to 0 so we know to invalidate the + * cache, but then we'd have to keep track of the block groups + * that fail this way so we know we _have_ to reset this cache + * before the next commit or risk reading stale cache. So to + * limit our exposure to horrible edge cases lets just abort the + * transaction, this only happens in really bad situations + * anyway. + */ + btrfs_abort_transaction(trans, root, ret); + goto out_put; + } WARN_ON(ret); if (i_size_read(inode) > 0) { @@ -3309,6 +3325,32 @@ out: return ret; } +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *cache, *tmp; + struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_path *path; + + if (list_empty(&cur_trans->dirty_bgs) || + !btrfs_test_opt(root, SPACE_CACHE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* Could add new block groups, use _safe just in case */ + list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, + dirty_list) { + if (cache->disk_cache_state == BTRFS_DC_CLEAR) + cache_save_setup(cache, trans, path); + } + + btrfs_free_path(path); + return 0; +} + int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -5094,7 +5136,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) num_bytes = ALIGN(num_bytes, root->sectorsize); spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; + nr_extents = (unsigned)div64_u64(num_bytes + + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + BTRFS_I(inode)->outstanding_extents += nr_extents; + nr_extents = 0; if (BTRFS_I(inode)->outstanding_extents > BTRFS_I(inode)->reserved_extents) @@ -5239,6 +5285,9 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); + if (btrfs_test_is_dummy_root(root)) + return; + trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_free, 0); if (root->fs_info->quota_enabled) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c7233ff1d533..d688cfe5d496 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4968,6 +4968,12 @@ static int release_extent_buffer(struct extent_buffer *eb) /* Should be safe to release our pages at this point */ btrfs_release_extent_buffer_page(eb); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) { + __free_extent_buffer(eb); + return 1; + } +#endif call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); return 1; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b78bbbac900d..30982bbd31c3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1811,22 +1811,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, mutex_unlock(&inode->i_mutex); /* - * we want to make sure fsync finds this change - * but we haven't joined a transaction running right now. 
- * - * Later on, someone is sure to update the inode and get the - * real transid recorded. - * - * We set last_trans now to the fs_info generation + 1, - * this will either be one more than the running transaction - * or the generation used for the next transaction if there isn't - * one running right now. - * * We also have to set last_sub_trans to the current log transid, * otherwise subsequent syncs to a file that's been synced in this * transaction will appear to have already occured. */ - BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; BTRFS_I(inode)->last_sub_trans = root->log_transid; if (num_written > 0) { err = generic_write_sync(file, pos, num_written); @@ -1959,25 +1947,37 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); /* - * check the transaction that last modified this inode - * and see if its already been committed - */ - if (!BTRFS_I(inode)->last_trans) { - mutex_unlock(&inode->i_mutex); - goto out; - } - - /* - * if the last transaction that changed this file was before - * the current transaction, we can bail out now without any - * syncing + * If the last transaction that changed this file was before the current + * transaction and we have the full sync flag set in our inode, we can + * bail out now without any syncing. + * + * Note that we can't bail out if the full sync flag isn't set. This is + * because when the full sync flag is set we start all ordered extents + * and wait for them to fully complete - when they complete they update + * the inode's last_trans field through: + * + * btrfs_finish_ordered_io() -> + * btrfs_update_inode_fallback() -> + * btrfs_update_inode() -> + * btrfs_set_inode_last_trans() + * + * So we are sure that last_trans is up to date and can do this check to + * bail out safely. For the fast path, when the full sync flag is not + * set in our inode, we can not do it because we start only our ordered + * extents and don't wait for them to complete (that is when + * btrfs_finish_ordered_io runs), so here at this point their last_trans + * value might be less than or equals to fs_info->last_trans_committed, + * and setting a speculative last_trans for an inode when a buffered + * write is made (such as fs_info->generation + 1 for example) would not + * be reliable since after setting the value and before fsync is called + * any number of transactions can start and commit (transaction kthread + * commits the current transaction periodically), and a transaction + * commit does not start nor waits for ordered extents to complete. */ smp_mb(); if (btrfs_inode_in_log(inode, root->fs_info->generation) || - BTRFS_I(inode)->last_trans <= - root->fs_info->last_trans_committed) { - BTRFS_I(inode)->last_trans = 0; - + (full_sync && BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed)) { /* * We'v had everything committed since the last time we were * modified so clear this flag in case it was set for whatever @@ -2275,6 +2275,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) bool same_page; bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); u64 ino_size; + bool truncated_page = false; + bool updated_inode = false; ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) @@ -2306,13 +2308,18 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) * entire page. 
*/ if (same_page && len < PAGE_CACHE_SIZE) { - if (offset < ino_size) + if (offset < ino_size) { + truncated_page = true; ret = btrfs_truncate_page(inode, offset, len, 0); + } else { + ret = 0; + } goto out_only_mutex; } /* zero back part of the first page */ if (offset < ino_size) { + truncated_page = true; ret = btrfs_truncate_page(inode, offset, 0, 0); if (ret) { mutex_unlock(&inode->i_mutex); @@ -2348,6 +2355,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (!ret) { /* zero the front end of the last page */ if (tail_start + tail_len < ino_size) { + truncated_page = true; ret = btrfs_truncate_page(inode, tail_start + tail_len, 0, 1); if (ret) @@ -2357,8 +2365,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } if (lockend < lockstart) { - mutex_unlock(&inode->i_mutex); - return 0; + ret = 0; + goto out_only_mutex; } while (1) { @@ -2506,6 +2514,7 @@ out_trans: trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); + updated_inode = true; btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); out_free: @@ -2515,6 +2524,22 @@ out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); out_only_mutex: + if (!updated_inode && truncated_page && !ret && !err) { + /* + * If we only end up zeroing part of a page, we still need to + * update the inode item, so that all the time fields are + * updated as well as the necessary btrfs inode in memory fields + * for detecting, at fsync time, if the inode isn't yet in the + * log tree or it's there but not up to date. + */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + } else { + err = btrfs_update_inode(trans, root, inode); + ret = btrfs_end_transaction(trans, root); + } + } mutex_unlock(&inode->i_mutex); if (ret && !err) err = ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a85c23dfcddb..d2e732d7af52 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -108,6 +108,13 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, static int btrfs_dirty_inode(struct inode *inode); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_inode_set_ops(struct inode *inode) +{ + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; +} +#endif + static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, const struct qstr *qstr) @@ -1542,30 +1549,17 @@ static void btrfs_split_extent_hook(struct inode *inode, u64 new_size; /* - * We need the largest size of the remaining extent to see if we - * need to add a new outstanding extent. Think of the following - * case - * - * [MEAX_EXTENT_SIZEx2 - 4k][4k] - * - * The new_size would just be 4k and we'd think we had enough - * outstanding extents for this if we only took one side of the - * split, same goes for the other direction. We need to see if - * the larger size still is the same amount of extents as the - * original size, because if it is we need to add a new - * outstanding extent. But if we split up and the larger size - * is less than the original then we are good to go since we've - * already accounted for the extra extent in our original - * accounting. + * See the explanation in btrfs_merge_extent_hook, the same + * applies here, just in reverse. 
*/ new_size = orig->end - split + 1; - if ((split - orig->start) > new_size) - new_size = split - orig->start; - - num_extents = div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, + num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); - if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, - BTRFS_MAX_EXTENT_SIZE) < num_extents) + new_size = split - orig->start; + num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE) >= num_extents) return; } @@ -1591,8 +1585,10 @@ static void btrfs_merge_extent_hook(struct inode *inode, if (!(other->state & EXTENT_DELALLOC)) return; - old_size = other->end - other->start + 1; - new_size = old_size + (new->end - new->start + 1); + if (new->start > other->start) + new_size = new->end - other->start + 1; + else + new_size = other->end - new->start + 1; /* we're not bigger than the max, unreserve the space and go */ if (new_size <= BTRFS_MAX_EXTENT_SIZE) { @@ -1603,13 +1599,32 @@ static void btrfs_merge_extent_hook(struct inode *inode, } /* - * If we grew by another max_extent, just return, we want to keep that - * reserved amount. + * We have to add up either side to figure out how many extents were + * accounted for before we merged into one big extent. If the number of + * extents we accounted for is <= the amount we need for the new range + * then we can return, otherwise drop. Think of it like this + * + * [ 4k][MAX_SIZE] + * + * So we've grown the extent by a MAX_SIZE extent, this would mean we + * need 2 outstanding extents, on one side we have 1 and the other side + * we have 1 so they are == and we can return. But in this case + * + * [MAX_SIZE+4k][MAX_SIZE+4k] + * + * Each range on their own accounts for 2 extents, but merged together + * they are only 3 extents worth of accounting, so we need to drop in + * this case. */ + old_size = other->end - other->start + 1; num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); + old_size = new->end - new->start + 1; + num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, - BTRFS_MAX_EXTENT_SIZE) > num_extents) + BTRFS_MAX_EXTENT_SIZE) >= num_extents) return; spin_lock(&BTRFS_I(inode)->lock); @@ -1686,6 +1701,10 @@ static void btrfs_set_bit_hook(struct inode *inode, spin_unlock(&BTRFS_I(inode)->lock); } + /* For sanity tests */ + if (btrfs_test_is_dummy_root(root)) + return; + __percpu_counter_add(&root->fs_info->delalloc_bytes, len, root->fs_info->delalloc_batch); spin_lock(&BTRFS_I(inode)->lock); @@ -1741,6 +1760,10 @@ static void btrfs_clear_bit_hook(struct inode *inode, root != root->fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len); + /* For sanity tests. 
*/ + if (btrfs_test_is_dummy_root(root)) + return; + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && do_list && !(state->state & EXTENT_NORESERVE)) btrfs_free_reserved_data_space(inode, len); @@ -7213,7 +7236,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; u64 len = bh_result->b_size; - u64 orig_len = len; + u64 *outstanding_extents = NULL; int unlock_bits = EXTENT_LOCKED; int ret = 0; @@ -7225,6 +7248,16 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, lockstart = start; lockend = start + len - 1; + if (current->journal_info) { + /* + * Need to pull our outstanding extents and set journal_info to NULL so + * that anything that needs to check if there's a transction doesn't get + * confused. + */ + outstanding_extents = current->journal_info; + current->journal_info = NULL; + } + /* * If this errors out it's because we couldn't invalidate pagecache for * this range and we need to fallback to buffered. @@ -7285,7 +7318,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && em->block_start != EXTENT_MAP_HOLE)) { int type; - int ret; u64 block_start, orig_start, orig_block_len, ram_bytes; if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) @@ -7349,11 +7381,20 @@ unlock: if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - if (len < orig_len) { + /* + * If we have an outstanding_extents count still set then we're + * within our reservation, otherwise we need to adjust our inode + * counter appropriately. + */ + if (*outstanding_extents) { + (*outstanding_extents)--; + } else { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); } + + current->journal_info = outstanding_extents; btrfs_free_reserved_data_space(inode, len); } @@ -7377,6 +7418,8 @@ unlock: unlock_err: clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, unlock_bits, 1, 0, &cached_state, GFP_NOFS); + if (outstanding_extents) + current->journal_info = outstanding_extents; return ret; } @@ -8076,6 +8119,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + u64 outstanding_extents = 0; size_t count = 0; int flags = 0; bool wakeup = true; @@ -8113,6 +8157,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, ret = btrfs_delalloc_reserve_space(inode, count); if (ret) goto out; + outstanding_extents = div64_u64(count + + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + + /* + * We need to know how many extents we reserved so that we can + * do the accounting properly if we go over the number we + * originally calculated. Abuse current->journal_info for this. 
+ */ + current->journal_info = &outstanding_extents; } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { inode_dio_done(inode); @@ -8125,6 +8179,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, iter, offset, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); if (rw & WRITE) { + current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) btrfs_delalloc_release_space(inode, count); else if (ret >= 0 && (size_t)ret < count) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d49fe8a0f6b5..74609b931ba5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -776,11 +776,11 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) return -EPERM; if (isdir) { - if (!S_ISDIR(victim->d_inode->i_mode)) + if (!d_is_dir(victim)) return -ENOTDIR; if (IS_ROOT(victim)) return -EBUSY; - } else if (S_ISDIR(victim->d_inode->i_mode)) + } else if (d_is_dir(victim)) return -EISDIR; if (IS_DEADDIR(dir)) return -ENOENT; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 534544e08f76..157cc54fc634 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -452,9 +452,7 @@ void btrfs_get_logged_extents(struct inode *inode, continue; if (entry_end(ordered) <= start) break; - if (!list_empty(&ordered->log_list)) - continue; - if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) + if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) continue; list_add(&ordered->log_list, logged_list); atomic_inc(&ordered->refs); @@ -511,8 +509,7 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)); - if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) - list_add_tail(&ordered->trans_list, &trans->ordered); + list_add_tail(&ordered->trans_list, &trans->ordered); spin_lock_irq(&log->log_extents_lock[index]); } spin_unlock_irq(&log->log_extents_lock[index]); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 97159a8e91d4..058c79eecbfb 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1259,7 +1259,7 @@ static int comp_oper(struct btrfs_qgroup_operation *oper1, if (oper1->seq < oper2->seq) return -1; if (oper1->seq > oper2->seq) - return -1; + return 1; if (oper1->ref_root < oper2->ref_root) return -1; if (oper1->ref_root > oper2->ref_root) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index fe5857223515..d6033f540cc7 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -230,6 +230,7 @@ struct pending_dir_move { u64 parent_ino; u64 ino; u64 gen; + bool is_orphan; struct list_head update_refs; }; @@ -2984,7 +2985,8 @@ static int add_pending_dir_move(struct send_ctx *sctx, u64 ino_gen, u64 parent_ino, struct list_head *new_refs, - struct list_head *deleted_refs) + struct list_head *deleted_refs, + const bool is_orphan) { struct rb_node **p = &sctx->pending_dir_moves.rb_node; struct rb_node *parent = NULL; @@ -2999,6 +3001,7 @@ static int add_pending_dir_move(struct send_ctx *sctx, pm->parent_ino = parent_ino; pm->ino = ino; pm->gen = ino_gen; + pm->is_orphan = is_orphan; INIT_LIST_HEAD(&pm->list); INIT_LIST_HEAD(&pm->update_refs); RB_CLEAR_NODE(&pm->node); @@ -3131,16 +3134,20 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) rmdir_ino = dm->rmdir_ino; free_waiting_dir_move(sctx, dm); - ret = get_first_ref(sctx->parent_root, pm->ino, - &parent_ino, &parent_gen, name); - if (ret < 0) - goto out; - - ret = 
get_cur_path(sctx, parent_ino, parent_gen, - from_path); - if (ret < 0) - goto out; - ret = fs_path_add_path(from_path, name); + if (pm->is_orphan) { + ret = gen_unique_name(sctx, pm->ino, + pm->gen, from_path); + } else { + ret = get_first_ref(sctx->parent_root, pm->ino, + &parent_ino, &parent_gen, name); + if (ret < 0) + goto out; + ret = get_cur_path(sctx, parent_ino, parent_gen, + from_path); + if (ret < 0) + goto out; + ret = fs_path_add_path(from_path, name); + } if (ret < 0) goto out; @@ -3150,7 +3157,8 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) LIST_HEAD(deleted_refs); ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, - &pm->update_refs, &deleted_refs); + &pm->update_refs, &deleted_refs, + pm->is_orphan); if (ret < 0) goto out; if (rmdir_ino) { @@ -3283,6 +3291,127 @@ out: return ret; } +/* + * We might need to delay a directory rename even when no ancestor directory + * (in the send root) with a higher inode number than ours (sctx->cur_ino) was + * renamed. This happens when we rename a directory to the old name (the name + * in the parent root) of some other unrelated directory that got its rename + * delayed due to some ancestor with higher number that got renamed. + * + * Example: + * + * Parent snapshot: + * . (ino 256) + * |---- a/ (ino 257) + * | |---- file (ino 260) + * | + * |---- b/ (ino 258) + * |---- c/ (ino 259) + * + * Send snapshot: + * . (ino 256) + * |---- a/ (ino 258) + * |---- x/ (ino 259) + * |---- y/ (ino 257) + * |----- file (ino 260) + * + * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257 + * from 'a' to 'x/y' happening first, which in turn depends on the rename of + * inode 259 from 'c' to 'x'. So the order of rename commands the send stream + * must issue is: + * + * 1 - rename 259 from 'c' to 'x' + * 2 - rename 257 from 'a' to 'x/y' + * 3 - rename 258 from 'b' to 'a' + * + * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can + * be done right away and < 0 on error. + */ +static int wait_for_dest_dir_move(struct send_ctx *sctx, + struct recorded_ref *parent_ref, + const bool is_orphan) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key di_key; + struct btrfs_dir_item *di; + u64 left_gen; + u64 right_gen; + int ret = 0; + + if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) + return 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = parent_ref->dir; + key.type = BTRFS_DIR_ITEM_KEY; + key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len); + + ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + goto out; + } + + di = btrfs_match_dir_item_name(sctx->parent_root, path, + parent_ref->name, parent_ref->name_len); + if (!di) { + ret = 0; + goto out; + } + /* + * di_key.objectid has the number of the inode that has a dentry in the + * parent directory with the same name that sctx->cur_ino is being + * renamed to. We need to check if that inode is in the send root as + * well and if it is currently marked as an inode with a pending rename, + * if it is, we need to delay the rename of sctx->cur_ino as well, so + * that it happens after that other inode is renamed. 
+ */ + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key); + if (di_key.type != BTRFS_INODE_ITEM_KEY) { + ret = 0; + goto out; + } + + ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL, + &left_gen, NULL, NULL, NULL, NULL); + if (ret < 0) + goto out; + ret = get_inode_info(sctx->send_root, di_key.objectid, NULL, + &right_gen, NULL, NULL, NULL, NULL); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + goto out; + } + + /* Different inode, no need to delay the rename of sctx->cur_ino */ + if (right_gen != left_gen) { + ret = 0; + goto out; + } + + if (is_waiting_for_move(sctx, di_key.objectid)) { + ret = add_pending_dir_move(sctx, + sctx->cur_ino, + sctx->cur_inode_gen, + di_key.objectid, + &sctx->new_refs, + &sctx->deleted_refs, + is_orphan); + if (!ret) + ret = 1; + } +out: + btrfs_free_path(path); + return ret; +} + static int wait_for_parent_move(struct send_ctx *sctx, struct recorded_ref *parent_ref) { @@ -3349,7 +3478,8 @@ out: sctx->cur_inode_gen, ino, &sctx->new_refs, - &sctx->deleted_refs); + &sctx->deleted_refs, + false); if (!ret) ret = 1; } @@ -3372,6 +3502,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) int did_overwrite = 0; int is_orphan = 0; u64 last_dir_ino_rm = 0; + bool can_rename = true; verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); @@ -3490,12 +3621,22 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); } } + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) { + ret = wait_for_dest_dir_move(sctx, cur, is_orphan); + if (ret < 0) + goto out; + if (ret == 1) { + can_rename = false; + *pending_move = 1; + } + } + /* * link/move the ref to the new place. If we have an orphan * inode, move it and update valid_path. If not, link or move * it depending on the inode mode. */ - if (is_orphan) { + if (is_orphan && can_rename) { ret = send_rename(sctx, valid_path, cur->full_path); if (ret < 0) goto out; @@ -3503,7 +3644,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); ret = fs_path_copy(valid_path, cur->full_path); if (ret < 0) goto out; - } else { + } else if (can_rename) { if (S_ISDIR(sctx->cur_inode_mode)) { /* * Dirs can't be linked, so move it. 
For moved diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index a116b55ce788..054fc0d97131 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -911,6 +911,197 @@ out: return ret; } +static int test_extent_accounting(void) +{ + struct inode *inode = NULL; + struct btrfs_root *root = NULL; + int ret = -ENOMEM; + + inode = btrfs_new_test_inode(); + if (!inode) { + test_msg("Couldn't allocate inode\n"); + return ret; + } + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + goto out; + } + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + goto out; + } + + BTRFS_I(inode)->root = root; + btrfs_test_inode_set_ops(inode); + + /* [BTRFS_MAX_EXTENT_SIZE] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, + NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 1) { + ret = -EINVAL; + test_msg("Miscount, wanted 1, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE][4k] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, + BTRFS_MAX_EXTENT_SIZE + 4095, NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 2) { + ret = -EINVAL; + test_msg("Miscount, wanted 2, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE/2][4K HOLE][the rest] */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE >> 1, + (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + EXTENT_DELALLOC | EXTENT_DIRTY | + EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0, + NULL, GFP_NOFS); + if (ret) { + test_msg("clear_extent_bit returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 2) { + ret = -EINVAL; + test_msg("Miscount, wanted 2, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE][4K] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, + (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 2) { + ret = -EINVAL; + test_msg("Miscount, wanted 2, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* + * [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K] + * + * I'm artificially adding 2 to outstanding_extents because in the + * buffered IO case we'd add things up as we go, but I don't feel like + * doing that here, this isn't the interesting case we want to test. 
+ */ + BTRFS_I(inode)->outstanding_extents += 2; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 8192, + (BTRFS_MAX_EXTENT_SIZE << 1) + 12287, + NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 4) { + ret = -EINVAL; + test_msg("Miscount, wanted 4, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE+4k][4k][BTRFS_MAX_EXTENT_SIZE+4k] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, + BTRFS_MAX_EXTENT_SIZE+8191, NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 3) { + ret = -EINVAL; + test_msg("Miscount, wanted 3, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE+4096, + BTRFS_MAX_EXTENT_SIZE+8191, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + NULL, GFP_NOFS); + if (ret) { + test_msg("clear_extent_bit returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 4) { + ret = -EINVAL; + test_msg("Miscount, wanted 4, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* + * Refill the hole again just for good measure, because I thought it + * might fail and I'd rather satisfy my paranoia at this point. + */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, + BTRFS_MAX_EXTENT_SIZE+8191, NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 3) { + ret = -EINVAL; + test_msg("Miscount, wanted 3, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* Empty */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + NULL, GFP_NOFS); + if (ret) { + test_msg("clear_extent_bit returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents) { + ret = -EINVAL; + test_msg("Miscount, wanted 0, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + ret = 0; +out: + if (ret) + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + NULL, GFP_NOFS); + iput(inode); + btrfs_free_dummy_root(root); + return ret; +} + int btrfs_test_inodes(void) { int ret; @@ -924,5 +1115,9 @@ int btrfs_test_inodes(void) if (ret) return ret; test_msg("Running hole first btrfs_get_extent test\n"); - return test_hole_first(); + ret = test_hole_first(); + if (ret) + return ret; + test_msg("Running outstanding_extents tests\n"); + return test_extent_accounting(); } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7e80f32550a6..8be4278e25e8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1023,17 +1023,13 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, u64 old_root_bytenr; u64 old_root_used; struct btrfs_root *tree_root = root->fs_info->tree_root; - bool extent_root = (root->objectid == BTRFS_EXTENT_TREE_OBJECTID); old_root_used = btrfs_root_used(&root->root_item); - btrfs_write_dirty_block_groups(trans, root); while (1) { old_root_bytenr = 
btrfs_root_bytenr(&root->root_item); if (old_root_bytenr == root->node->start && - old_root_used == btrfs_root_used(&root->root_item) && - (!extent_root || - list_empty(&trans->transaction->dirty_bgs))) + old_root_used == btrfs_root_used(&root->root_item)) break; btrfs_set_root_node(&root->root_item, root->node); @@ -1044,17 +1040,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, return ret; old_root_used = btrfs_root_used(&root->root_item); - if (extent_root) { - ret = btrfs_write_dirty_block_groups(trans, root); - if (ret) - return ret; - } - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - if (ret) - return ret; - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - if (ret) - return ret; } return 0; @@ -1071,6 +1056,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; struct list_head *next; struct extent_buffer *eb; int ret; @@ -1098,11 +1084,15 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, if (ret) return ret; + ret = btrfs_setup_space_cache(trans, root); + if (ret) + return ret; + /* run_qgroups might have added some more refs */ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) return ret; - +again: while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; list_del_init(next); @@ -1115,8 +1105,23 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, ret = update_cowonly_root(trans, root); if (ret) return ret; + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) + return ret; } + while (!list_empty(dirty_bgs)) { + ret = btrfs_write_dirty_block_groups(trans, root); + if (ret) + return ret; + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) + return ret; + } + + if (!list_empty(&fs_info->dirty_cowonly_roots)) + goto again; + list_add_tail(&fs_info->extent_root->dirty_list, &trans->transaction->switch_commits); btrfs_after_dev_replace_commit(fs_info); @@ -1814,6 +1819,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, wait_for_commit(root, cur_trans); + if (unlikely(cur_trans->aborted)) + ret = cur_trans->aborted; + btrfs_put_transaction(cur_trans); return ret; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9a37f8b39bae..c5b8ba37f88e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1012,7 +1012,7 @@ again: base = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { - extref = (struct btrfs_inode_extref *)base + cur_offset; + extref = (struct btrfs_inode_extref *)(base + cur_offset); victim_name_len = btrfs_inode_extref_name_len(leaf, extref); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cd4d1315aaa9..8222f6f74147 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4903,10 +4903,17 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) { struct btrfs_bio *bbio = kzalloc( + /* the size of the btrfs_bio */ sizeof(struct btrfs_bio) + + /* plus the variable array for the stripes */ sizeof(struct btrfs_bio_stripe) * (total_stripes) + + /* plus the variable array for the tgt dev */ sizeof(int) * (real_stripes) + - sizeof(u64) * (real_stripes), + /* + * plus the raid_map, which includes both the tgt dev + * and the stripes + */ + sizeof(u64) 
* (total_stripes), GFP_NOFS); if (!bbio) return NULL; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 47b19465f0dc..883b93623bc5 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -111,6 +111,8 @@ static int do_setxattr(struct btrfs_trans_handle *trans, name, name_len, -1); if (!di && (flags & XATTR_REPLACE)) ret = -ENODATA; + else if (IS_ERR(di)) + ret = PTR_ERR(di); else if (di) ret = btrfs_delete_one_dir_name(trans, root, path, di); goto out; @@ -127,10 +129,12 @@ static int do_setxattr(struct btrfs_trans_handle *trans, ASSERT(mutex_is_locked(&inode->i_mutex)); di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name, name_len, 0); - if (!di) { + if (!di) ret = -ENODATA; + else if (IS_ERR(di)) + ret = PTR_ERR(di); + if (ret) goto out; - } btrfs_release_path(path); di = NULL; } diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index ce1b115dcc28..f601def05bdf 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -574,7 +574,7 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) /* extract the directory dentry from the cwd */ get_fs_pwd(current->fs, &path); - if (!S_ISDIR(path.dentry->d_inode->i_mode)) + if (!d_can_lookup(path.dentry)) goto notdir; cachefiles_begin_secure(cache, &saved_cred); @@ -646,7 +646,7 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) /* extract the directory dentry from the cwd */ get_fs_pwd(current->fs, &path); - if (!S_ISDIR(path.dentry->d_inode->i_mode)) + if (!d_can_lookup(path.dentry)) goto notdir; cachefiles_begin_secure(cache, &saved_cred); diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 1c7293c3a93a..232426214fdd 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -437,7 +437,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object) if (!object->backer) return -ENOBUFS; - ASSERT(S_ISREG(object->backer->d_inode->i_mode)); + ASSERT(d_is_reg(object->backer)); fscache_set_store_limit(&object->fscache, ni_size); @@ -501,7 +501,7 @@ static void cachefiles_invalidate_object(struct fscache_operation *op) op->object->debug_id, (unsigned long long)ni_size); if (object->backer) { - ASSERT(S_ISREG(object->backer->d_inode->i_mode)); + ASSERT(d_is_reg(object->backer)); fscache_set_store_limit(&object->fscache, ni_size); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 7f8e83f9d74e..1e51714eb33e 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -277,7 +277,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, _debug("remove %p from %p", rep, dir); /* non-directories can just be unlinked */ - if (!S_ISDIR(rep->d_inode->i_mode)) { + if (!d_is_dir(rep)) { _debug("unlink stale object"); path.mnt = cache->mnt; @@ -323,7 +323,7 @@ try_again: return 0; } - if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) { + if (!d_can_lookup(cache->graveyard)) { unlock_rename(cache->graveyard, dir); cachefiles_io_error(cache, "Graveyard no longer a directory"); return -EIO; @@ -475,7 +475,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent, ASSERT(parent->dentry); ASSERT(parent->dentry->d_inode); - if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) { + if (!(d_is_dir(parent->dentry))) { // TODO: convert file to dir _leave("looking up in none directory"); return -ENOBUFS; @@ -539,7 +539,7 @@ lookup_again: _debug("mkdir -> %p{%p{ino=%lu}}", next, next->d_inode, next->d_inode->i_ino); - } else if (!S_ISDIR(next->d_inode->i_mode)) { + } else if (!d_can_lookup(next)) { 
pr_err("inode %lu is not a directory\n", next->d_inode->i_ino); ret = -ENOBUFS; @@ -568,8 +568,8 @@ lookup_again: _debug("create -> %p{%p{ino=%lu}}", next, next->d_inode, next->d_inode->i_ino); - } else if (!S_ISDIR(next->d_inode->i_mode) && - !S_ISREG(next->d_inode->i_mode) + } else if (!d_can_lookup(next) && + !d_is_reg(next) ) { pr_err("inode %lu is not a file or directory\n", next->d_inode->i_ino); @@ -642,7 +642,7 @@ lookup_again: /* open a file interface onto a data file */ if (object->type != FSCACHE_COOKIE_TYPE_INDEX) { - if (S_ISREG(object->dentry->d_inode->i_mode)) { + if (d_is_reg(object->dentry)) { const struct address_space_operations *aops; ret = -EPERM; @@ -763,7 +763,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, /* we need to make sure the subdir is a directory */ ASSERT(subdir->d_inode); - if (!S_ISDIR(subdir->d_inode->i_mode)) { + if (!d_can_lookup(subdir)) { pr_err("%s is not a directory\n", dirname); ret = -EIO; goto check_error; diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 616db0e77b44..c6cd8d7a4eef 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -900,7 +900,7 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) return -ENOBUFS; } - ASSERT(S_ISREG(object->backer->d_inode->i_mode)); + ASSERT(d_is_reg(object->backer)); cache = container_of(object->fscache.cache, struct cachefiles_cache, cache); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 0411dbb15815..83e9976f7189 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -904,7 +904,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) } else if (ceph_snap(dir) == CEPH_NOSNAP) { dout("unlink/rmdir dir %p dn %p inode %p\n", dir, dentry, inode); - op = S_ISDIR(dentry->d_inode->i_mode) ? + op = d_is_dir(dentry) ? 
CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK; } else goto out; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index a3d774b35149..d533075a823d 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -292,7 +292,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, } if (err) goto out_req; - if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) { + if (dn || dentry->d_inode == NULL || d_is_symlink(dentry)) { /* make vfs retry on splice, ENOENT, or symlink */ dout("atomic_open finish_no_open on dn %p\n", dn); err = finish_no_open(file, dn); diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 281ee011bb6a..60cb88c1dd2b 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -304,7 +304,7 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, (const char *) old_name, (const char *)new_name); if (!error) { if (new_dentry->d_inode) { - if (S_ISDIR(new_dentry->d_inode->i_mode)) { + if (d_is_dir(new_dentry)) { coda_dir_drop_nlink(old_dir); coda_dir_inc_nlink(new_dir); } diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index a315677e44d3..b65d1ef532d5 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -69,14 +69,13 @@ extern struct kmem_cache *configfs_dir_cachep; extern int configfs_is_root(struct config_item *item); extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *); -extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *)); +extern int configfs_create(struct dentry *, umode_t mode, void (*init)(struct inode *)); extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); extern int configfs_make_dirent(struct configfs_dirent *, struct dentry *, void *, umode_t, int); extern int configfs_dirent_is_ready(struct configfs_dirent *); -extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int); extern void configfs_hash_and_remove(struct dentry * dir, const char * name); extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index c9c298bd3058..cf0db005d2f5 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -240,60 +240,26 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd, return 0; } -static int init_dir(struct inode * inode) +static void init_dir(struct inode * inode) { inode->i_op = &configfs_dir_inode_operations; inode->i_fop = &configfs_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ inc_nlink(inode); - return 0; } -static int configfs_init_file(struct inode * inode) +static void configfs_init_file(struct inode * inode) { inode->i_size = PAGE_SIZE; inode->i_fop = &configfs_file_operations; - return 0; } -static int init_symlink(struct inode * inode) +static void init_symlink(struct inode * inode) { inode->i_op = &configfs_symlink_inode_operations; - return 0; -} - -static int create_dir(struct config_item *k, struct dentry *d) -{ - int error; - umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; - struct dentry *p = d->d_parent; - - BUG_ON(!k); - - error = configfs_dirent_exists(p->d_fsdata, d->d_name.name); - if (!error) - error = configfs_make_dirent(p->d_fsdata, d, k, mode, - CONFIGFS_DIR | CONFIGFS_USET_CREATING); - if (!error) { - configfs_set_dir_dirent_depth(p->d_fsdata, d->d_fsdata); - error = configfs_create(d, mode, init_dir); - if (!error) { - inc_nlink(p->d_inode); - } else { - struct configfs_dirent *sd = d->d_fsdata; - if (sd) { - spin_lock(&configfs_dirent_lock); - list_del_init(&sd->s_sibling); - spin_unlock(&configfs_dirent_lock); - configfs_put(sd); - } - } - } - return error; } - /** * configfs_create_dir - create a directory for an config_item. * @item: config_itemwe're creating directory for. @@ -303,11 +269,37 @@ static int create_dir(struct config_item *k, struct dentry *d) * until it is validated by configfs_dir_set_ready() */ -static int configfs_create_dir(struct config_item * item, struct dentry *dentry) +static int configfs_create_dir(struct config_item *item, struct dentry *dentry) { - int error = create_dir(item, dentry); - if (!error) + int error; + umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; + struct dentry *p = dentry->d_parent; + + BUG_ON(!item); + + error = configfs_dirent_exists(p->d_fsdata, dentry->d_name.name); + if (unlikely(error)) + return error; + + error = configfs_make_dirent(p->d_fsdata, dentry, item, mode, + CONFIGFS_DIR | CONFIGFS_USET_CREATING); + if (unlikely(error)) + return error; + + configfs_set_dir_dirent_depth(p->d_fsdata, dentry->d_fsdata); + error = configfs_create(dentry, mode, init_dir); + if (!error) { + inc_nlink(p->d_inode); item->ci_dentry = dentry; + } else { + struct configfs_dirent *sd = dentry->d_fsdata; + if (sd) { + spin_lock(&configfs_dirent_lock); + list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); + configfs_put(sd); + } + } return error; } diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 1d1c41f1014d..56d2cdc9ae0a 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -313,21 +313,6 @@ const struct file_operations configfs_file_operations = { .release = configfs_release, }; - -int configfs_add_file(struct dentry * dir, const struct configfs_attribute * attr, int type) -{ - struct configfs_dirent * parent_sd = dir->d_fsdata; - umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; - int error = 0; - - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL); - error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type); - mutex_unlock(&dir->d_inode->i_mutex); - - return error; -} - - /** * configfs_create_file - create an attribute file for an item. * @item: item we're creating for. 
@@ -336,9 +321,16 @@ int configfs_add_file(struct dentry * dir, const struct configfs_attribute * att int configfs_create_file(struct config_item * item, const struct configfs_attribute * attr) { - BUG_ON(!item || !item->ci_dentry || !attr); + struct dentry *dir = item->ci_dentry; + struct configfs_dirent *parent_sd = dir->d_fsdata; + umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; + int error = 0; - return configfs_add_file(item->ci_dentry, attr, - CONFIGFS_ITEM_ATTR); + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL); + error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, + CONFIGFS_ITEM_ATTR); + mutex_unlock(&dir->d_inode->i_mutex); + + return error; } diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 65af86147154..5423a6a6ecc8 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -176,7 +176,7 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd, #endif /* CONFIG_LOCKDEP */ -int configfs_create(struct dentry * dentry, umode_t mode, int (*init)(struct inode *)) +int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct inode *)) { int error = 0; struct inode *inode = NULL; @@ -198,13 +198,7 @@ int configfs_create(struct dentry * dentry, umode_t mode, int (*init)(struct ino p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; configfs_set_inode_lock_class(sd, inode); - if (init) { - error = init(inode); - if (error) { - iput(inode); - return error; - } - } + init(inode); d_instantiate(dentry, inode); if (S_ISDIR(mode) || S_ISLNK(mode)) dget(dentry); /* pin link and directory dentries in core */ @@ -242,7 +236,7 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) if (dentry) { spin_lock(&dentry->d_lock); - if (!(d_unhashed(dentry) && dentry->d_inode)) { + if (!d_unhashed(dentry) && dentry->d_inode) { dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); diff --git a/fs/coredump.c b/fs/coredump.c index b5c86ffd5033..f319926ddf8c 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -572,7 +572,7 @@ void do_coredump(const siginfo_t *siginfo) * * Normally core limits are irrelevant to pipes, since * we're not writing to the file system, but we use - * cprm.limit of 1 here as a speacial value, this is a + * cprm.limit of 1 here as a special value, this is a * consistent way to catch recursive crashes. * We can still crash if the core_pattern binary sets * RLIM_CORE = !1, but it runs as root, and can do diff --git a/fs/dcache.c b/fs/dcache.c index dc400fd29f4d..c71e3732e53b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1659,9 +1659,25 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) } EXPORT_SYMBOL(d_set_d_op); + +/* + * d_set_fallthru - Mark a dentry as falling through to a lower layer + * @dentry - The dentry to mark + * + * Mark a dentry as falling through to the lower layer (as set with + * d_pin_lower()). This flag may be recorded on the medium. 
+ */ +void d_set_fallthru(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_FALLTHRU; + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(d_set_fallthru); + static unsigned d_flags_for_inode(struct inode *inode) { - unsigned add_flags = DCACHE_FILE_TYPE; + unsigned add_flags = DCACHE_REGULAR_TYPE; if (!inode) return DCACHE_MISS_TYPE; @@ -1674,13 +1690,21 @@ static unsigned d_flags_for_inode(struct inode *inode) else inode->i_opflags |= IOP_LOOKUP; } - } else if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { - if (unlikely(inode->i_op->follow_link)) + goto type_determined; + } + + if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { + if (unlikely(inode->i_op->follow_link)) { add_flags = DCACHE_SYMLINK_TYPE; - else - inode->i_opflags |= IOP_NOFOLLOW; + goto type_determined; + } + inode->i_opflags |= IOP_NOFOLLOW; } + if (unlikely(!S_ISREG(inode->i_mode))) + add_flags = DCACHE_SPECIAL_TYPE; + +type_determined: if (unlikely(IS_AUTOMOUNT(inode))) add_flags |= DCACHE_NEED_AUTOMOUNT; return add_flags; @@ -1691,7 +1715,8 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) unsigned add_flags = d_flags_for_inode(inode); spin_lock(&dentry->d_lock); - __d_set_type(dentry, add_flags); + dentry->d_flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); + dentry->d_flags |= add_flags; if (inode) hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); dentry->d_inode = inode; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 45b18a5e225c..96400ab42d13 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -169,10 +169,19 @@ static int debugfs_show_options(struct seq_file *m, struct dentry *root) return 0; } +static void debugfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_private); +} + static const struct super_operations debugfs_super_operations = { .statfs = simple_statfs, .remount_fs = debugfs_remount, .show_options = debugfs_show_options, + .evict_inode = debugfs_evict_inode, }; static struct vfsmount *debugfs_automount(struct path *path) @@ -511,23 +520,14 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) int ret = 0; if (debugfs_positive(dentry)) { - if (dentry->d_inode) { - dget(dentry); - switch (dentry->d_inode->i_mode & S_IFMT) { - case S_IFDIR: - ret = simple_rmdir(parent->d_inode, dentry); - break; - case S_IFLNK: - kfree(dentry->d_inode->i_private); - /* fall through */ - default: - simple_unlink(parent->d_inode, dentry); - break; - } - if (!ret) - d_delete(dentry); - dput(dentry); - } + dget(dentry); + if (S_ISDIR(dentry->d_inode->i_mode)) + ret = simple_rmdir(parent->d_inode, dentry); + else + simple_unlink(parent->d_inode, dentry); + if (!ret) + d_delete(dentry); + dput(dentry); } return ret; } @@ -690,7 +690,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, } d_move(old_dentry, dentry); fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name, - S_ISDIR(old_dentry->d_inode->i_mode), + d_is_dir(old_dentry), NULL, old_dentry); fsnotify_oldname_free(old_name); unlock_rename(new_dir, old_dir); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 90d1882b306f..5ba029e627cc 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -124,7 +124,7 @@ ecryptfs_get_key_payload_data(struct key *key) } #define ECRYPTFS_MAX_KEYSET_SIZE 1024 -#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32 +#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 
31 #define ECRYPTFS_MAX_NUM_ENC_KEYS 64 #define ECRYPTFS_MAX_IV_BYTES 16 /* 128 bits */ #define ECRYPTFS_SALT_BYTES 2 @@ -237,7 +237,7 @@ struct ecryptfs_crypt_stat { struct crypto_ablkcipher *tfm; struct crypto_hash *hash_tfm; /* Crypto context for generating * the initialization vectors */ - unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE]; + unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; unsigned char key[ECRYPTFS_MAX_KEY_BYTES]; unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES]; struct list_head keysig_list; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 6f4e659f508f..fd39bad6f1bd 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -230,7 +230,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file) } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); - if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { + if (d_is_dir(ecryptfs_dentry)) { ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); mutex_lock(&crypt_stat->cs_mutex); crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); @@ -303,9 +303,22 @@ ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct file *lower_file = ecryptfs_file_to_lower(file); long rc = -ENOTTY; - if (lower_file->f_op->unlocked_ioctl) + if (!lower_file->f_op->unlocked_ioctl) + return rc; + + switch (cmd) { + case FITRIM: + case FS_IOC_GETFLAGS: + case FS_IOC_SETFLAGS: + case FS_IOC_GETVERSION: + case FS_IOC_SETVERSION: rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); - return rc; + fsstack_copy_attr_all(file_inode(file), file_inode(lower_file)); + + return rc; + default: + return rc; + } } #ifdef CONFIG_COMPAT @@ -315,9 +328,22 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct file *lower_file = ecryptfs_file_to_lower(file); long rc = -ENOIOCTLCMD; - if (lower_file->f_op->compat_ioctl) + if (!lower_file->f_op->compat_ioctl) + return rc; + + switch (cmd) { + case FITRIM: + case FS_IOC32_GETFLAGS: + case FS_IOC32_SETFLAGS: + case FS_IOC32_GETVERSION: + case FS_IOC32_SETVERSION: rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); - return rc; + fsstack_copy_attr_all(file_inode(file), file_inode(lower_file)); + + return rc; + default: + return rc; + } } #endif diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 34b36a504059..b08b5187f662 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -907,9 +907,9 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) lower_inode = ecryptfs_inode_to_lower(inode); lower_dentry = ecryptfs_dentry_to_lower(dentry); mutex_lock(&crypt_stat->cs_mutex); - if (S_ISDIR(dentry->d_inode->i_mode)) + if (d_is_dir(dentry)) crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); - else if (S_ISREG(dentry->d_inode->i_mode) + else if (d_is_reg(dentry) && (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED) || !(crypt_stat->flags & ECRYPTFS_KEY_VALID))) { struct ecryptfs_mount_crypt_stat *mount_crypt_stat; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 917bd5c9776a..6bd67e2011f0 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -891,7 +891,7 @@ struct ecryptfs_parse_tag_70_packet_silly_stack { struct blkcipher_desc desc; char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1]; char iv[ECRYPTFS_MAX_IV_BYTES]; - char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE]; + char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; }; /** diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 1895d60f4122..c095d3264259 100644 --- a/fs/ecryptfs/main.c +++ 
b/fs/ecryptfs/main.c @@ -407,7 +407,7 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, if (!cipher_name_set) { int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); - BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE); + BUG_ON(cipher_name_len > ECRYPTFS_MAX_CIPHER_NAME_SIZE); strcpy(mount_crypt_stat->global_default_cipher_name, ECRYPTFS_DEFAULT_CIPHER); } diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index fdfd206c737a..714cd37a6ba3 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -429,7 +429,7 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, if (IS_ERR(result)) return result; - if (S_ISDIR(result->d_inode->i_mode)) { + if (d_is_dir(result)) { /* * This request is for a directory. * diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 982d934fd9ac..f63c3d5805c4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -364,7 +364,8 @@ struct flex_groups { #define EXT4_DIRTY_FL 0x00000100 #define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ #define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ -#define EXT4_ECOMPR_FL 0x00000800 /* Compression error */ + /* nb: was previously EXT2_ECOMPR_FL */ +#define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */ /* End compression flags --- maybe not all used */ #define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ #define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ @@ -421,7 +422,7 @@ enum { EXT4_INODE_DIRTY = 8, EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ EXT4_INODE_NOCOMPR = 10, /* Don't compress */ - EXT4_INODE_ECOMPR = 11, /* Compression error */ + EXT4_INODE_ENCRYPT = 11, /* Compression error */ /* End compression flags --- maybe not all used */ EXT4_INODE_INDEX = 12, /* hash-indexed directory */ EXT4_INODE_IMAGIC = 13, /* AFS directory */ @@ -466,7 +467,7 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(DIRTY); CHECK_FLAG_VALUE(COMPRBLK); CHECK_FLAG_VALUE(NOCOMPR); - CHECK_FLAG_VALUE(ECOMPR); + CHECK_FLAG_VALUE(ENCRYPT); CHECK_FLAG_VALUE(INDEX); CHECK_FLAG_VALUE(IMAGIC); CHECK_FLAG_VALUE(JOURNAL_DATA); @@ -1048,6 +1049,12 @@ extern void ext4_set_bits(void *bm, int cur, int len); /* Metadata checksum algorithm codes */ #define EXT4_CRC32C_CHKSUM 1 +/* Encryption algorithms */ +#define EXT4_ENCRYPTION_MODE_INVALID 0 +#define EXT4_ENCRYPTION_MODE_AES_256_XTS 1 +#define EXT4_ENCRYPTION_MODE_AES_256_GCM 2 +#define EXT4_ENCRYPTION_MODE_AES_256_CBC 3 + /* * Structure of the super block */ @@ -1161,7 +1168,8 @@ struct ext4_super_block { __le32 s_grp_quota_inum; /* inode for tracking group quota */ __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ - __le32 s_reserved[106]; /* Padding to the end of the block */ + __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ + __le32 s_reserved[105]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; @@ -1527,6 +1535,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) * GDT_CSUM bits are mutually exclusive. 
*/ #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 +#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -1542,6 +1551,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 6b9878a24182..45fe924f82bc 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1401,10 +1401,7 @@ end_range: * to free. Everything was covered by the start * of the range. */ - return 0; - } else { - /* Shared branch grows from an indirect block */ - partial2--; + goto do_indirects; } } else { /* @@ -1435,56 +1432,96 @@ end_range: /* Punch happened within the same level (n == n2) */ partial = ext4_find_shared(inode, n, offsets, chain, &nr); partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); - /* - * ext4_find_shared returns Indirect structure which - * points to the last element which should not be - * removed by truncate. But this is end of the range - * in punch_hole so we need to point to the next element - */ - partial2->p++; - while ((partial > chain) || (partial2 > chain2)) { - /* We're at the same block, so we're almost finished */ - if ((partial->bh && partial2->bh) && - (partial->bh->b_blocknr == partial2->bh->b_blocknr)) { - if ((partial > chain) && (partial2 > chain2)) { + + /* Free top, but only if partial2 isn't its subtree. */ + if (nr) { + int level = min(partial - chain, partial2 - chain2); + int i; + int subtree = 1; + + for (i = 0; i <= level; i++) { + if (offsets[i] != offsets2[i]) { + subtree = 0; + break; + } + } + + if (!subtree) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, + (chain+n-1) - partial); + *partial->p = 0; + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); ext4_free_branches(handle, inode, partial->bh, - partial->p + 1, - partial2->p, + partial->p, + partial->p+1, (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - BUFFER_TRACE(partial2->bh, "call brelse"); - brelse(partial2->bh); } - return 0; } + } + + if (!nr2) { /* - * Clear the ends of indirect blocks on the shared branch - * at the start of the range + * ext4_find_shared returns Indirect structure which + * points to the last element which should not be + * removed by truncate. But this is end of the range + * in punch_hole so we need to point to the next element */ - if (partial > chain) { + partial2->p++; + } + + while (partial > chain || partial2 > chain2) { + int depth = (chain+n-1) - partial; + int depth2 = (chain2+n2-1) - partial2; + + if (partial > chain && partial2 > chain2 && + partial->bh->b_blocknr == partial2->bh->b_blocknr) { + /* + * We've converged on the same block. Clear the range, + * then we're done. 
+ */ ext4_free_branches(handle, inode, partial->bh, - partial->p + 1, - (__le32 *)partial->bh->b_data+addr_per_block, - (chain+n-1) - partial); + partial->p + 1, + partial2->p, + (chain+n-1) - partial); BUFFER_TRACE(partial->bh, "call brelse"); brelse(partial->bh); - partial--; + BUFFER_TRACE(partial2->bh, "call brelse"); + brelse(partial2->bh); + return 0; } + /* - * Clear the ends of indirect blocks on the shared branch - * at the end of the range + * The start and end partial branches may not be at the same + * level even though the punch happened within one level. So, we + * give them a chance to arrive at the same level, then walk + * them in step with each other until we converge on the same + * block. */ - if (partial2 > chain2) { + if (partial > chain && depth <= depth2) { + ext4_free_branches(handle, inode, partial->bh, + partial->p + 1, + (__le32 *)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } + if (partial2 > chain2 && depth2 <= depth) { ext4_free_branches(handle, inode, partial2->bh, (__le32 *)partial2->bh->b_data, partial2->p, - (chain2+n-1) - partial2); + (chain2+n2-1) - partial2); BUFFER_TRACE(partial2->bh, "call brelse"); brelse(partial2->bh); partial2--; } } + return 0; do_indirects: /* Kill the remaining (whole) subtrees */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 85404f15e53a..5cb9a212b86f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1024,6 +1024,7 @@ static int ext4_write_end(struct file *file, { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; int ret = 0, ret2; int i_size_changed = 0; @@ -1054,6 +1055,8 @@ static int ext4_write_end(struct file *file, unlock_page(page); page_cache_release(page); + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under page lock. First, it unnecessarily * makes the holding time of page lock longer. 
Second, it forces lock @@ -1095,6 +1098,7 @@ static int ext4_journalled_write_end(struct file *file, { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; int ret = 0, ret2; int partial = 0; unsigned from, to; @@ -1127,6 +1131,9 @@ static int ext4_journalled_write_end(struct file *file, unlock_page(page); page_cache_release(page); + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); + if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1adac6868e6f..e061e66c8280 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2779,6 +2779,12 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) if (readonly) return 1; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_READONLY)) { + ext4_msg(sb, KERN_INFO, "filesystem is read-only"); + sb->s_flags |= MS_RDONLY; + return 1; + } + /* Check that feature set is OK for a read-write mount */ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) { ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " @@ -3936,9 +3942,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); - init_timer(&sbi->s_err_report); - sbi->s_err_report.function = print_daily_error_info; - sbi->s_err_report.data = (unsigned long) sb; + setup_timer(&sbi->s_err_report, print_daily_error_info, + (unsigned long) sb); /* Register extent status tree shrinker */ if (ext4_es_register_shrinker(sbi)) @@ -4866,9 +4871,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal && sbi->s_journal->j_task->io_context) journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; - /* - * Allow the "check" option to be passed as a remount option. 
- */ if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { err = -EINVAL; goto restore_opts; @@ -4877,17 +4879,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "changing journal_checksum " - "during remount not supported"); - err = -EINVAL; - goto restore_opts; - } - - if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ - test_opt(sb, JOURNAL_CHECKSUM)) { - ext4_msg(sb, KERN_ERR, "changing journal_checksum " - "during remount not supported"); - err = -EINVAL; - goto restore_opts; + "during remount not supported; ignoring"); + sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM; } if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { @@ -4963,7 +4956,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_mark_recovery_complete(sb, es); } else { /* Make sure we can mount this feature set readwrite */ - if (!ext4_feature_set_ok(sb, 0)) { + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_READONLY) || + !ext4_feature_set_ok(sb, 0)) { err = -EROFS; goto restore_opts; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 073657f755d4..e907052eeadb 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -769,9 +769,9 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, struct inode *inode = wb_inode(wb->b_io.prev); struct super_block *sb = inode->i_sb; - if (!grab_super_passive(sb)) { + if (!trylock_super(sb)) { /* - * grab_super_passive() may fail consistently due to + * trylock_super() may fail consistently due to * s_umount being grabbed by someone else. Don't use * requeue_io() to avoid busy retrying the inode/sb. */ @@ -779,7 +779,7 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, continue; } wrote += writeback_sb_inodes(sb, wb, work); - drop_super(sb); + up_read(&sb->s_umount); /* refer to the same tests at the end of writeback_sb_inodes */ if (wrote) { diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ed19a7d622fa..39706c57ad3c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -890,8 +890,8 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) newpage = buf->page; - if (WARN_ON(!PageUptodate(newpage))) - return -EIO; + if (!PageUptodate(newpage)) + SetPageUptodate(newpage); ClearPageMappedToDisk(newpage); @@ -1353,6 +1353,17 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, return err; } +static int fuse_dev_open(struct inode *inode, struct file *file) +{ + /* + * The fuse device's file's private_data is used to hold + * the fuse_conn(ection) when it is mounted, and is used to + * keep track of whether the file has been mounted already. 
+ */ + file->private_data = NULL; + return 0; +} + static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { @@ -1797,6 +1808,9 @@ copy_finish: static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { + /* Don't try to move pages (yet) */ + cs->move_pages = 0; + switch (code) { case FUSE_NOTIFY_POLL: return fuse_notify_poll(fc, size, cs); @@ -2217,6 +2231,7 @@ static int fuse_dev_fasync(int fd, struct file *file, int on) const struct file_operations fuse_dev_operations = { .owner = THIS_MODULE, + .open = fuse_dev_open, .llseek = no_llseek, .read = do_sync_read, .aio_read = fuse_dev_read, diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 08e7b1a9d5d0..1545b711ddcf 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -971,7 +971,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, err = -EBUSY; goto badentry; } - if (S_ISDIR(entry->d_inode->i_mode)) { + if (d_is_dir(entry)) { shrink_dcache_parent(entry); if (!simple_empty(entry)) { err = -ENOTEMPTY; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 6371192961e2..487527b42d94 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1809,7 +1809,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) gfs2_consist_inode(dip); dip->i_entries--; dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv; - if (S_ISDIR(dentry->d_inode->i_mode)) + if (d_is_dir(dentry)) drop_nlink(&dip->i_inode); mark_inode_dirty(&dip->i_inode); diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 435bea231cc6..f0235c1640af 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -530,7 +530,7 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, /* Unlink destination if it already exists */ if (new_dentry->d_inode) { - if (S_ISDIR(new_dentry->d_inode->i_mode)) + if (d_is_dir(new_dentry)) res = hfsplus_rmdir(new_dir, new_dentry); else res = hfsplus_unlink(new_dir, new_dentry); diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 5f2755117ce7..043ac9d77262 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -678,10 +678,10 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) return NULL; } - if (S_ISDIR(dentry->d_inode->i_mode)) { + if (d_is_dir(dentry)) { inode->i_op = &hppfs_dir_iops; inode->i_fop = &hppfs_dir_fops; - } else if (S_ISLNK(dentry->d_inode->i_mode)) { + } else if (d_is_symlink(dentry)) { inode->i_op = &hppfs_link_iops; inode->i_fop = &hppfs_file_fops; } else { diff --git a/fs/internal.h b/fs/internal.h index 30459dab409d..01dce1d1476b 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -84,7 +84,7 @@ extern struct file *get_empty_filp(void); * super.c */ extern int do_remount_sb(struct super_block *, int, void *, int); -extern bool grab_super_passive(struct super_block *sb); +extern bool trylock_super(struct super_block *sb); extern struct dentry *mount_fs(struct file_system_type *, int, const char *, void *); extern struct super_block *user_get_super(dev_t); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index bcbef08a4d8f..b5128c6e63ad 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -524,6 +524,9 @@ static int do_one_pass(journal_t *journal, if (descr_csum_size > 0 && !jbd2_descr_block_csum_verify(journal, bh->b_data)) { + printk(KERN_ERR "JBD2: Invalid checksum " + "recovering block %lu in log\n", + next_log_block); err = -EIO; brelse(bh); goto failed; diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 938556025d64..f21b6fb5e4c4 100644 
--- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -252,7 +252,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de if (!f->inocache) return -EIO; - if (S_ISDIR(old_dentry->d_inode->i_mode)) + if (d_is_dir(old_dentry)) return -EPERM; /* XXX: This is ugly */ @@ -772,7 +772,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, */ if (new_dentry->d_inode) { victim_f = JFFS2_INODE_INFO(new_dentry->d_inode); - if (S_ISDIR(new_dentry->d_inode->i_mode)) { + if (d_is_dir(new_dentry)) { struct jffs2_full_dirent *fd; mutex_lock(&victim_f->sem); @@ -807,7 +807,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, if (victim_f) { /* There was a victim. Kill it off nicely */ - if (S_ISDIR(new_dentry->d_inode->i_mode)) + if (d_is_dir(new_dentry)) clear_nlink(new_dentry->d_inode); else drop_nlink(new_dentry->d_inode); @@ -815,7 +815,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, inode which didn't exist. */ if (victim_f->inocache) { mutex_lock(&victim_f->sem); - if (S_ISDIR(new_dentry->d_inode->i_mode)) + if (d_is_dir(new_dentry)) victim_f->inocache->pino_nlink = 0; else victim_f->inocache->pino_nlink--; @@ -825,7 +825,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, /* If it was a directory we moved, and there was no victim, increase i_nlink on its new parent */ - if (S_ISDIR(old_dentry->d_inode->i_mode) && !victim_f) + if (d_is_dir(old_dentry) && !victim_f) inc_nlink(new_dir_i); /* Unlink the original */ @@ -839,7 +839,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, struct jffs2_inode_info *f = JFFS2_INODE_INFO(old_dentry->d_inode); mutex_lock(&f->sem); inc_nlink(old_dentry->d_inode); - if (f->inocache && !S_ISDIR(old_dentry->d_inode->i_mode)) + if (f->inocache && !d_is_dir(old_dentry)) f->inocache->pino_nlink++; mutex_unlock(&f->sem); @@ -852,7 +852,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, return ret; } - if (S_ISDIR(old_dentry->d_inode->i_mode)) + if (d_is_dir(old_dentry)) drop_nlink(old_dir_i); new_dir_i->i_mtime = new_dir_i->i_ctime = old_dir_i->i_mtime = old_dir_i->i_ctime = ITIME(now); diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 0918f0e2e266..3d76f28a2ba9 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -138,7 +138,7 @@ static struct dentry *jffs2_get_parent(struct dentry *child) struct jffs2_inode_info *f; uint32_t pino; - BUG_ON(!S_ISDIR(child->d_inode->i_mode)); + BUG_ON(!d_is_dir(child)); f = JFFS2_INODE_INFO(child->d_inode); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index b684e8a132e6..2bacb9988566 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -207,6 +207,7 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, goto out_free; } + of->event = atomic_read(&of->kn->attr.open->event); ops = kernfs_ops(of->kn); if (ops->read) len = ops->read(of, buf, len, *ppos); diff --git a/fs/libfs.c b/fs/libfs.c index b2ffdb045be4..0ab65122ee45 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -329,7 +329,7 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct inode *inode = old_dentry->d_inode; - int they_are_dirs = S_ISDIR(old_dentry->d_inode->i_mode); + int they_are_dirs = d_is_dir(old_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; diff --git a/fs/locks.c b/fs/locks.c index 365c82e1b3a9..528fedfda15e 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1665,7 
+1665,8 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr } if (my_fl != NULL) { - error = lease->fl_lmops->lm_change(my_fl, arg, &dispose); + lease = my_fl; + error = lease->fl_lmops->lm_change(lease, arg, &dispose); if (error) goto out; goto out_setup; @@ -1727,7 +1728,7 @@ static int generic_delete_lease(struct file *filp, void *owner) break; } } - trace_generic_delete_lease(inode, fl); + trace_generic_delete_lease(inode, victim); if (victim) error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); diff --git a/fs/namei.c b/fs/namei.c index 96ca11dea4a2..c83145af4bfc 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2814,7 +2814,7 @@ no_open: } else if (!dentry->d_inode) { goto out; } else if ((open_flag & O_TRUNC) && - S_ISREG(dentry->d_inode->i_mode)) { + d_is_reg(dentry)) { goto out; } /* will fail later, go on to get the right error */ diff --git a/fs/namespace.c b/fs/namespace.c index 72a286e0d33e..82ef1405260e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1907,8 +1907,8 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER) return -EINVAL; - if (S_ISDIR(mp->m_dentry->d_inode->i_mode) != - S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode)) + if (d_is_dir(mp->m_dentry) != + d_is_dir(mnt->mnt.mnt_root)) return -ENOTDIR; return attach_recursive_mnt(mnt, p, mp, NULL); @@ -2180,8 +2180,8 @@ static int do_move_mount(struct path *path, const char *old_name) if (!mnt_has_parent(old)) goto out1; - if (S_ISDIR(path->dentry->d_inode->i_mode) != - S_ISDIR(old_path.dentry->d_inode->i_mode)) + if (d_is_dir(path->dentry) != + d_is_dir(old_path.dentry)) goto out1; /* * Don't move a mount residing in a shared parent. @@ -2271,7 +2271,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) goto unlock; err = -EINVAL; - if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode)) + if (d_is_symlink(newmnt->mnt.mnt_root)) goto unlock; newmnt->mnt.mnt_flags = mnt_flags; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index e36a9d78ea49..197806fb87ff 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -427,6 +427,8 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, if (clp == NULL) goto out; + if (!(clp->cl_session->flags & SESSION4_BACK_CHAN)) + goto out; tbl = &clp->cl_session->bc_slot_table; spin_lock(&tbl->slot_tbl_lock); diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index f4ccfe6521ec..19ca95cdfd9b 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -313,7 +313,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, goto out; } - args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL); + args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL); if (!args->devs) { status = htonl(NFS4ERR_DELAY); goto out; @@ -415,7 +415,7 @@ static __be32 decode_rc_list(struct xdr_stream *xdr, rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t)); if (unlikely(p == NULL)) goto out; - rc_list->rcl_refcalls = kmalloc(rc_list->rcl_nrefcalls * + rc_list->rcl_refcalls = kmalloc_array(rc_list->rcl_nrefcalls, sizeof(*rc_list->rcl_refcalls), GFP_KERNEL); if (unlikely(rc_list->rcl_refcalls == NULL)) @@ -464,8 +464,10 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, for (i = 0; i < args->csa_nrclists; i++) { status = decode_rc_list(xdr, &args->csa_rclists[i]); - if (status) + if (status) { + args->csa_nrclists = i; goto out_free; + } } } status = 0; diff --git a/fs/nfs/client.c 
b/fs/nfs/client.c index f9f4845db989..19874151e95c 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -433,7 +433,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat static bool nfs_client_init_is_complete(const struct nfs_client *clp) { - return clp->cl_cons_state != NFS_CS_INITING; + return clp->cl_cons_state <= NFS_CS_READY; } int nfs_wait_client_init_complete(const struct nfs_client *clp) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index da5433230bb1..a6ad68865880 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -180,10 +180,9 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, delegation->cred = get_rpccred(cred); clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); - NFS_I(inode)->delegation_state = delegation->type; spin_unlock(&delegation->lock); - put_rpccred(oldcred); rcu_read_unlock(); + put_rpccred(oldcred); trace_nfs4_reclaim_delegation(inode, res->delegation_type); } else { /* We appear to have raced with a delegation return. */ @@ -275,7 +274,6 @@ nfs_detach_delegation_locked(struct nfs_inode *nfsi, set_bit(NFS_DELEGATION_RETURNING, &delegation->flags); list_del_rcu(&delegation->super_list); delegation->inode = NULL; - nfsi->delegation_state = 0; rcu_assign_pointer(nfsi->delegation, NULL); spin_unlock(&delegation->lock); return delegation; @@ -355,7 +353,6 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct &delegation->stateid)) { nfs_update_inplace_delegation(old_delegation, delegation); - nfsi->delegation_state = old_delegation->type; goto out; } /* @@ -373,13 +370,15 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct delegation = NULL; goto out; } - freeme = nfs_detach_delegation_locked(nfsi, + if (test_and_set_bit(NFS_DELEGATION_RETURNING, + &old_delegation->flags)) + goto out; + freeme = nfs_detach_delegation_locked(nfsi, old_delegation, clp); if (freeme == NULL) goto out; } list_add_rcu(&delegation->super_list, &server->delegations); - nfsi->delegation_state = delegation->type; rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; @@ -437,6 +436,8 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) { bool ret = false; + if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + goto out; if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) ret = true; if (test_and_clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) && !ret) { @@ -448,6 +449,7 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) ret = true; spin_unlock(&delegation->lock); } +out: return ret; } @@ -475,14 +477,20 @@ restart: super_list) { if (!nfs_delegation_need_return(delegation)) continue; - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) + if (!nfs_sb_active(server->super)) continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) { + rcu_read_unlock(); + nfs_sb_deactive(server->super); + goto restart; + } delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); err = nfs_end_delegation_return(inode, delegation, 0); iput(inode); + nfs_sb_deactive(server->super); if (!err) goto restart; set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); @@ -813,19 +821,30 @@ restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + if (test_bit(NFS_DELEGATION_RETURNING, + &delegation->flags)) + continue; if 
(test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) continue; - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) + if (!nfs_sb_active(server->super)) continue; - delegation = nfs_detach_delegation(NFS_I(inode), - delegation, server); + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) { + rcu_read_unlock(); + nfs_sb_deactive(server->super); + goto restart; + } + delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); - - if (delegation != NULL) - nfs_free_delegation(delegation); + if (delegation != NULL) { + delegation = nfs_detach_delegation(NFS_I(inode), + delegation, server); + if (delegation != NULL) + nfs_free_delegation(delegation); + } iput(inode); + nfs_sb_deactive(server->super); goto restart; } } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 9b0c55cb2a2e..c19e16f0b2d0 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -408,14 +408,22 @@ static int xdr_decode(nfs_readdir_descriptor_t *desc, return 0; } +/* Match file and dirent using either filehandle or fileid + * Note: caller is responsible for checking the fsid + */ static int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) { + struct nfs_inode *nfsi; + if (dentry->d_inode == NULL) goto different; - if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0) - goto different; - return 1; + + nfsi = NFS_I(dentry->d_inode); + if (entry->fattr->fileid == nfsi->fileid) + return 1; + if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0) + return 1; different: return 0; } @@ -469,6 +477,10 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) struct inode *inode; int status; + if (!(entry->fattr->valid & NFS_ATTR_FATTR_FILEID)) + return; + if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID)) + return; if (filename.name[0] == '.') { if (filename.len == 1) return; @@ -479,6 +491,10 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) dentry = d_lookup(parent, &filename); if (dentry != NULL) { + /* Is there a mountpoint here? 
If so, just exit */ + if (!nfs_fsid_equal(&NFS_SB(dentry->d_sb)->fsid, + &entry->fattr->fsid)) + goto out; if (nfs_same_file(dentry, entry)) { nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); status = nfs_refresh_inode(dentry->d_inode, entry->fattr); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 7077521acdf4..e907c8cf732e 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -283,7 +283,7 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages) void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq) { - cinfo->lock = &dreq->lock; + cinfo->lock = &dreq->inode->i_lock; cinfo->mds = &dreq->mds_cinfo; cinfo->ds = &dreq->ds_cinfo; cinfo->dreq = dreq; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 94712fc781fa..e679d24c39d3 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -178,7 +178,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) iocb->ki_filp, iov_iter_count(to), (unsigned long) iocb->ki_pos); - result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); + result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping); if (!result) { result = generic_file_read_iter(iocb, to); if (result > 0) @@ -199,7 +199,7 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos, dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", filp, (unsigned long) count, (unsigned long long) *ppos); - res = nfs_revalidate_mapping(inode, filp->f_mapping); + res = nfs_revalidate_mapping_protected(inode, filp->f_mapping); if (!res) { res = generic_file_splice_read(filp, ppos, pipe, count, flags); if (res > 0) @@ -372,6 +372,10 @@ start: nfs_wait_bit_killable, TASK_KILLABLE); if (ret) return ret; + /* + * Wait for O_DIRECT to complete + */ + nfs_inode_dio_wait(mapping->host); page = grab_cache_page_write_begin(mapping, index, flags); if (!page) @@ -619,6 +623,9 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) /* make sure the cache has finished storing the page */ nfs_fscache_wait_on_page_write(NFS_I(inode), page); + wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING, + nfs_wait_bit_killable, TASK_KILLABLE); + lock_page(page); mapping = page_file_mapping(page); if (mapping != inode->i_mapping) diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 7ae1c263c5cf..91e88a7ecef0 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -960,52 +960,19 @@ filelayout_mark_request_commit(struct nfs_page *req, { struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); u32 i, j; - struct list_head *list; - struct pnfs_commit_bucket *buckets; if (fl->commit_through_mds) { - list = &cinfo->mds->list; - spin_lock(cinfo->lock); - goto mds_commit; - } - - /* Note that we are calling nfs4_fl_calc_j_index on each page - * that ends up being committed to a data server. An attractive - * alternative is to add a field to nfs_write_data and nfs_page - * to store the value calculated in filelayout_write_pagelist - * and just use that here. - */ - j = nfs4_fl_calc_j_index(lseg, req_offset(req)); - i = select_bucket_index(fl, j); - spin_lock(cinfo->lock); - buckets = cinfo->ds->buckets; - list = &buckets[i].written; - if (list_empty(list)) { - /* Non-empty buckets hold a reference on the lseg. That ref - * is normally transferred to the COMMIT call and released - * there. 
It could also be released if the last req is pulled - * off due to a rewrite, in which case it will be done in - * pnfs_generic_clear_request_commit + nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo); + } else { + /* Note that we are calling nfs4_fl_calc_j_index on each page + * that ends up being committed to a data server. An attractive + * alternative is to add a field to nfs_write_data and nfs_page + * to store the value calculated in filelayout_write_pagelist + * and just use that here. */ - buckets[i].wlseg = pnfs_get_lseg(lseg); - } - set_bit(PG_COMMIT_TO_DS, &req->wb_flags); - cinfo->ds->nwritten++; - -mds_commit: - /* nfs_request_add_commit_list(). We need to add req to list without - * dropping cinfo lock. - */ - set_bit(PG_CLEAN, &(req)->wb_flags); - nfs_list_add_request(req, list); - cinfo->mds->ncommit++; - spin_unlock(cinfo->lock); - if (!cinfo->dreq) { - inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host), - BDI_RECLAIMABLE); - __mark_inode_dirty(req->wb_context->dentry->d_inode, - I_DIRTY_DATASYNC); + j = nfs4_fl_calc_j_index(lseg, req_offset(req)); + i = select_bucket_index(fl, j); + pnfs_layout_mark_request_commit(req, lseg, cinfo, i); } } diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index c22ecaa86c1c..315cc68945b9 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1332,47 +1332,6 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) return PNFS_ATTEMPTED; } -static void -ff_layout_mark_request_commit(struct nfs_page *req, - struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo, - u32 ds_commit_idx) -{ - struct list_head *list; - struct pnfs_commit_bucket *buckets; - - spin_lock(cinfo->lock); - buckets = cinfo->ds->buckets; - list = &buckets[ds_commit_idx].written; - if (list_empty(list)) { - /* Non-empty buckets hold a reference on the lseg. That ref - * is normally transferred to the COMMIT call and released - * there. It could also be released if the last req is pulled - * off due to a rewrite, in which case it will be done in - * pnfs_common_clear_request_commit - */ - WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL); - buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg); - } - set_bit(PG_COMMIT_TO_DS, &req->wb_flags); - cinfo->ds->nwritten++; - - /* nfs_request_add_commit_list(). We need to add req to list without - * dropping cinfo lock. 
- */ - set_bit(PG_CLEAN, &(req)->wb_flags); - nfs_list_add_request(req, list); - cinfo->mds->ncommit++; - spin_unlock(cinfo->lock); - if (!cinfo->dreq) { - inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host), - BDI_RECLAIMABLE); - __mark_inode_dirty(req->wb_context->dentry->d_inode, - I_DIRTY_DATASYNC); - } -} - static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) { return i; @@ -1540,7 +1499,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .pg_write_ops = &ff_layout_pg_write_ops, .get_ds_info = ff_layout_get_ds_info, .free_deviceid_node = ff_layout_free_deveiceid_node, - .mark_request_commit = ff_layout_mark_request_commit, + .mark_request_commit = pnfs_layout_mark_request_commit, .clear_request_commit = pnfs_generic_clear_request_commit, .scan_commit_lists = pnfs_generic_scan_commit_lists, .recover_commit_reqs = pnfs_generic_recover_commit_reqs, diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e4f0dcef8f54..d42dff6d5e98 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -556,6 +556,7 @@ EXPORT_SYMBOL_GPL(nfs_setattr); * This is a copy of the common vmtruncate, but with the locking * corrected to take into account the fact that NFS requires * inode->i_size to be updated under the inode->i_lock. + * Note: must be called with inode->i_lock held! */ static int nfs_vmtruncate(struct inode * inode, loff_t offset) { @@ -565,14 +566,14 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) if (err) goto out; - spin_lock(&inode->i_lock); i_size_write(inode, offset); /* Optimisation */ if (offset == 0) NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA; - spin_unlock(&inode->i_lock); + spin_unlock(&inode->i_lock); truncate_pagecache(inode, offset); + spin_lock(&inode->i_lock); out: return err; } @@ -585,10 +586,15 @@ out: * Note: we do this in the *proc.c in order to ensure that * it works for things like exclusive creates too. */ -void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) +void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, + struct nfs_fattr *fattr) { + /* Barrier: bump the attribute generation count. 
*/ + nfs_fattr_set_barrier(fattr); + + spin_lock(&inode->i_lock); + NFS_I(inode)->attr_gencount = fattr->gencount; if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { - spin_lock(&inode->i_lock); if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; @@ -600,12 +606,13 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) inode->i_gid = attr->ia_gid; nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL); - spin_unlock(&inode->i_lock); } if ((attr->ia_valid & ATTR_SIZE) != 0) { nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); nfs_vmtruncate(inode, attr->ia_size); } + nfs_update_inode(inode, fattr); + spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); @@ -1028,6 +1035,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map if (mapping->nrpages != 0) { if (S_ISREG(inode->i_mode)) { + unmap_mapping_range(mapping, 0, 0, 0); ret = nfs_sync_mapping(mapping); if (ret < 0) return ret; @@ -1060,11 +1068,14 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode) } /** - * nfs_revalidate_mapping - Revalidate the pagecache + * __nfs_revalidate_mapping - Revalidate the pagecache * @inode - pointer to host inode * @mapping - pointer to mapping + * @may_lock - take inode->i_mutex? */ -int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +static int __nfs_revalidate_mapping(struct inode *inode, + struct address_space *mapping, + bool may_lock) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long *bitlock = &nfsi->flags; @@ -1113,7 +1124,12 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); - ret = nfs_invalidate_mapping(inode, mapping); + if (may_lock) { + mutex_lock(&inode->i_mutex); + ret = nfs_invalidate_mapping(inode, mapping); + mutex_unlock(&inode->i_mutex); + } else + ret = nfs_invalidate_mapping(inode, mapping); trace_nfs_invalidate_mapping_exit(inode, ret); clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); @@ -1123,6 +1139,29 @@ out: return ret; } +/** + * nfs_revalidate_mapping - Revalidate the pagecache + * @inode - pointer to host inode + * @mapping - pointer to mapping + */ +int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +{ + return __nfs_revalidate_mapping(inode, mapping, false); +} + +/** + * nfs_revalidate_mapping_protected - Revalidate the pagecache + * @inode - pointer to host inode + * @mapping - pointer to mapping + * + * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex + * while invalidating the mapping. 
+ */ +int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping) +{ + return __nfs_revalidate_mapping(inode, mapping, true); +} + static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); @@ -1231,13 +1270,6 @@ static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fat return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; } -static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) -{ - if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) - return 0; - return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); -} - static atomic_long_t nfs_attr_generation_counter; static unsigned long nfs_read_attr_generation_counter(void) @@ -1249,6 +1281,7 @@ unsigned long nfs_inc_attr_generation_counter(void) { return atomic_long_inc_return(&nfs_attr_generation_counter); } +EXPORT_SYMBOL_GPL(nfs_inc_attr_generation_counter); void nfs_fattr_init(struct nfs_fattr *fattr) { @@ -1260,6 +1293,22 @@ void nfs_fattr_init(struct nfs_fattr *fattr) } EXPORT_SYMBOL_GPL(nfs_fattr_init); +/** + * nfs_fattr_set_barrier + * @fattr: attributes + * + * Used to set a barrier after an attribute was updated. This + * barrier ensures that older attributes from RPC calls that may + * have raced with our update cannot clobber these new values. + * Note that you are still responsible for ensuring that other + * operations which change the attribute on the server do not + * collide. + */ +void nfs_fattr_set_barrier(struct nfs_fattr *fattr) +{ + fattr->gencount = nfs_inc_attr_generation_counter(); +} + struct nfs_fattr *nfs_alloc_fattr(void) { struct nfs_fattr *fattr; @@ -1370,7 +1419,6 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 || nfs_ctime_need_update(inode, fattr) || - nfs_size_need_update(inode, fattr) || ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); } @@ -1460,6 +1508,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) int status; spin_lock(&inode->i_lock); + nfs_fattr_set_barrier(fattr); status = nfs_post_op_update_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); @@ -1468,7 +1517,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); /** - * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache + * nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache * @inode - pointer to inode * @fattr - updated attributes * @@ -1478,11 +1527,10 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); * * This function is mainly designed to be used by the ->write_done() functions. 
*/ -int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) +int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr) { int status; - spin_lock(&inode->i_lock); /* Don't do a WCC update if these attributes are already stale */ if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !nfs_inode_attrs_need_update(inode, fattr)) { @@ -1514,6 +1562,27 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa } out_noforce: status = nfs_post_op_update_inode_locked(inode, fattr); + return status; +} + +/** + * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * After an operation that has changed the inode metadata, mark the + * attribute cache as being invalid, then try to update it. Fake up + * weak cache consistency data, if none exist. + * + * This function is mainly designed to be used by the ->write_done() functions. + */ +int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) +{ + int status; + + spin_lock(&inode->i_lock); + nfs_fattr_set_barrier(fattr); + status = nfs_post_op_update_inode_force_wcc_locked(inode, fattr); spin_unlock(&inode->i_lock); return status; } @@ -1715,6 +1784,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; + /* Set barrier to be more recent than all outstanding updates */ nfsi->attr_gencount = nfs_inc_attr_generation_counter(); } else { if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { @@ -1722,6 +1792,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; } + /* Set the barrier to be more recent than this fattr */ + if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0) + nfsi->attr_gencount = fattr->gencount; } invalid &= ~NFS_INO_INVALID_ATTR; /* Don't invalidate the data if we were to blame */ @@ -1775,7 +1848,6 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) #if IS_ENABLED(CONFIG_NFS_V4) INIT_LIST_HEAD(&nfsi->open_states); nfsi->delegation = NULL; - nfsi->delegation_state = 0; init_rwsem(&nfsi->rwsem); nfsi->layout = NULL; #endif diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 212b8c883d22..9e6475bc5ba2 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -459,6 +459,7 @@ void nfs_mark_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo, u32 ds_commit_idx); int nfs_write_need_commit(struct nfs_pgio_header *); +void nfs_writeback_update_inode(struct nfs_pgio_header *hdr); int nfs_generic_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo); void nfs_retry_commit(struct list_head *page_list, @@ -598,6 +599,19 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) } /* + * Record the page as unstable and mark its inode as dirty. 
+ */ +static inline +void nfs_mark_page_unstable(struct page *page) +{ + struct inode *inode = page_file_mapping(page)->host; + + inc_zone_page_state(page, NR_UNSTABLE_NFS); + inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE); + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); +} + +/* * Determine the number of bytes of data the page contains */ static inline diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 78e557c3ab87..1f11d2533ee4 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -138,7 +138,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status == 0) - nfs_setattr_update_inode(inode, sattr); + nfs_setattr_update_inode(inode, sattr, fattr); dprintk("NFS reply setattr: %d\n", status); return status; } @@ -834,7 +834,7 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) if (nfs3_async_handle_jukebox(task, inode)) return -EAGAIN; if (task->tk_status >= 0) - nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr); + nfs_writeback_update_inode(hdr); return 0; } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 2a932fdc57cb..53852a4bd88b 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1987,6 +1987,11 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (entry->fattr->valid & NFS_ATTR_FATTR_V3) entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); + if (entry->fattr->fileid != entry->ino) { + entry->fattr->mounted_on_fileid = entry->ino; + entry->fattr->valid |= NFS_ATTR_FATTR_MOUNTED_ON_FILEID; + } + /* In fact, a post_op_fh3: */ p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 8646af9b11d2..86d6214ea022 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -621,6 +621,9 @@ int nfs41_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { + if (pos == new) + goto found; + if (pos->rpc_ops != new->rpc_ops) continue; @@ -639,10 +642,6 @@ int nfs41_walk_client_list(struct nfs_client *new, prev = pos; status = nfs_wait_client_init_complete(pos); - if (pos->cl_cons_state == NFS_CS_SESSION_INITING) { - nfs4_schedule_lease_recovery(pos); - status = nfs4_wait_clnt_recover(pos); - } spin_lock(&nn->nfs_client_lock); if (status < 0) break; @@ -668,7 +667,7 @@ int nfs41_walk_client_list(struct nfs_client *new, */ if (!nfs4_match_client_owner_id(pos, new)) continue; - +found: atomic_inc(&pos->cl_count); *result = pos; status = 0; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2e7c9f7a6f7c..627f37c44456 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -901,6 +901,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) if (!cinfo->atomic || cinfo->before != dir->i_version) nfs_force_lookup_revalidate(dir); dir->i_version = cinfo->after; + nfsi->attr_gencount = nfs_inc_attr_generation_counter(); nfs_fscache_invalidate(dir); spin_unlock(&dir->i_lock); } @@ -1552,6 +1553,9 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod opendata->o_arg.open_flags = 0; opendata->o_arg.fmode = fmode; + opendata->o_arg.share_access = nfs4_map_atomic_open_share( + NFS_SB(opendata->dentry->d_sb), + fmode, 0); memset(&opendata->o_res, 0, sizeof(opendata->o_res)); memset(&opendata->c_res, 0, sizeof(opendata->c_res)); nfs4_init_opendata_res(opendata); @@ -2413,8 +2417,8 @@ static int _nfs4_do_open(struct inode 
*dir, opendata->o_res.f_attr, sattr, state, label, olabel); if (status == 0) { - nfs_setattr_update_inode(state->inode, sattr); - nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr); + nfs_setattr_update_inode(state->inode, sattr, + opendata->o_res.f_attr); nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); } } @@ -2651,7 +2655,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) case -NFS4ERR_BAD_STATEID: case -NFS4ERR_EXPIRED: if (!nfs4_stateid_match(&calldata->arg.stateid, - &state->stateid)) { + &state->open_stateid)) { rpc_restart_call_prepare(task); goto out_release; } @@ -2687,7 +2691,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags); is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags); is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags); - nfs4_stateid_copy(&calldata->arg.stateid, &state->stateid); + nfs4_stateid_copy(&calldata->arg.stateid, &state->open_stateid); /* Calculate the change in open mode */ calldata->arg.fmode = 0; if (state->n_rdwr == 0) { @@ -3288,7 +3292,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label); if (status == 0) { - nfs_setattr_update_inode(inode, sattr); + nfs_setattr_update_inode(inode, sattr, fattr); nfs_setsecurity(inode, fattr, label); } nfs4_label_free(label); @@ -4234,7 +4238,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, } if (task->tk_status >= 0) { renew_lease(NFS_SERVER(inode), hdr->timestamp); - nfs_post_op_update_inode_force_wcc(inode, &hdr->fattr); + nfs_writeback_update_inode(hdr); } return 0; } @@ -6648,47 +6652,47 @@ nfs41_same_server_scope(struct nfs41_server_scope *a, int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred) { int status; + struct nfs41_bind_conn_to_session_args args = { + .client = clp, + .dir = NFS4_CDFC4_FORE_OR_BOTH, + }; struct nfs41_bind_conn_to_session_res res; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_BIND_CONN_TO_SESSION], - .rpc_argp = clp, + .rpc_argp = &args, .rpc_resp = &res, .rpc_cred = cred, }; dprintk("--> %s\n", __func__); - res.session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); - if (unlikely(res.session == NULL)) { - status = -ENOMEM; - goto out; - } + nfs4_copy_sessionid(&args.sessionid, &clp->cl_session->sess_id); + if (!(clp->cl_session->flags & SESSION4_BACK_CHAN)) + args.dir = NFS4_CDFC4_FORE; status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); trace_nfs4_bind_conn_to_session(clp, status); if (status == 0) { - if (memcmp(res.session->sess_id.data, + if (memcmp(res.sessionid.data, clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { dprintk("NFS: %s: Session ID mismatch\n", __func__); status = -EIO; - goto out_session; + goto out; } - if (res.dir != NFS4_CDFS4_BOTH) { + if ((res.dir & args.dir) != res.dir || res.dir == 0) { dprintk("NFS: %s: Unexpected direction from server\n", __func__); status = -EIO; - goto out_session; + goto out; } - if (res.use_conn_in_rdma_mode) { + if (res.use_conn_in_rdma_mode != args.use_conn_in_rdma_mode) { dprintk("NFS: %s: Server returned RDMA mode = true\n", __func__); status = -EIO; - goto out_session; + goto out; } } -out_session: - kfree(res.session); out: dprintk("<-- %s status= %d\n", __func__, status); return status; @@ -6893,9 +6897,13 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, if (status == 0) { clp->cl_clientid = 
res.clientid; - clp->cl_exchange_flags = (res.flags & ~EXCHGID4_FLAG_CONFIRMED_R); - if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) + clp->cl_exchange_flags = res.flags; + /* Client ID is not confirmed */ + if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) { + clear_bit(NFS4_SESSION_ESTABLISHED, + &clp->cl_session->session_state); clp->cl_seqid = res.seqid; + } kfree(clp->cl_serverowner); clp->cl_serverowner = res.server_owner; @@ -7166,10 +7174,11 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) args->bc_attrs.max_reqs); } -static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session) +static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, + struct nfs41_create_session_res *res) { struct nfs4_channel_attrs *sent = &args->fc_attrs; - struct nfs4_channel_attrs *rcvd = &session->fc_attrs; + struct nfs4_channel_attrs *rcvd = &res->fc_attrs; if (rcvd->max_resp_sz > sent->max_resp_sz) return -EINVAL; @@ -7188,11 +7197,14 @@ static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args return 0; } -static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session) +static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, + struct nfs41_create_session_res *res) { struct nfs4_channel_attrs *sent = &args->bc_attrs; - struct nfs4_channel_attrs *rcvd = &session->bc_attrs; + struct nfs4_channel_attrs *rcvd = &res->bc_attrs; + if (!(res->flags & SESSION4_BACK_CHAN)) + goto out; if (rcvd->max_rqst_sz > sent->max_rqst_sz) return -EINVAL; if (rcvd->max_resp_sz < sent->max_resp_sz) @@ -7204,18 +7216,33 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args return -EINVAL; if (rcvd->max_reqs != sent->max_reqs) return -EINVAL; +out: return 0; } static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args, - struct nfs4_session *session) + struct nfs41_create_session_res *res) { int ret; - ret = nfs4_verify_fore_channel_attrs(args, session); + ret = nfs4_verify_fore_channel_attrs(args, res); if (ret) return ret; - return nfs4_verify_back_channel_attrs(args, session); + return nfs4_verify_back_channel_attrs(args, res); +} + +static void nfs4_update_session(struct nfs4_session *session, + struct nfs41_create_session_res *res) +{ + nfs4_copy_sessionid(&session->sess_id, &res->sessionid); + /* Mark client id and session as being confirmed */ + session->clp->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R; + set_bit(NFS4_SESSION_ESTABLISHED, &session->session_state); + session->flags = res->flags; + memcpy(&session->fc_attrs, &res->fc_attrs, sizeof(session->fc_attrs)); + if (res->flags & SESSION4_BACK_CHAN) + memcpy(&session->bc_attrs, &res->bc_attrs, + sizeof(session->bc_attrs)); } static int _nfs4_proc_create_session(struct nfs_client *clp, @@ -7224,11 +7251,12 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, struct nfs4_session *session = clp->cl_session; struct nfs41_create_session_args args = { .client = clp, + .clientid = clp->cl_clientid, + .seqid = clp->cl_seqid, .cb_program = NFS4_CALLBACK, }; - struct nfs41_create_session_res res = { - .client = clp, - }; + struct nfs41_create_session_res res; + struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION], .rpc_argp = &args, @@ -7245,11 +7273,15 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, if (!status) { /* Verify the session's negotiated channel_attrs values */ - 
status = nfs4_verify_channel_attrs(&args, session); + status = nfs4_verify_channel_attrs(&args, &res); /* Increment the clientid slot sequence id */ - clp->cl_seqid++; + if (clp->cl_seqid == res.seqid) + clp->cl_seqid++; + if (status) + goto out; + nfs4_update_session(session, &res); } - +out: return status; } @@ -7301,8 +7333,8 @@ int nfs4_proc_destroy_session(struct nfs4_session *session, dprintk("--> nfs4_proc_destroy_session\n"); /* session is still being setup */ - if (session->clp->cl_cons_state != NFS_CS_READY) - return status; + if (!test_and_clear_bit(NFS4_SESSION_ESTABLISHED, &session->session_state)) + return 0; status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); trace_nfs4_destroy_session(session->clp, status); diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index e799dc3c3b1d..e23366effcfb 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -450,7 +450,7 @@ int nfs4_setup_session_slot_tables(struct nfs4_session *ses) tbl = &ses->fc_slot_table; tbl->session = ses; status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1); - if (status) /* -ENOMEM */ + if (status || !(ses->flags & SESSION4_BACK_CHAN)) /* -ENOMEM */ return status; /* Back channel */ tbl = &ses->bc_slot_table; diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index b34ada9bc6a2..e3ea2c5324d6 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -70,6 +70,7 @@ struct nfs4_session { enum nfs4_session_state { NFS4_SESSION_INITING, + NFS4_SESSION_ESTABLISHED, }; extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, @@ -118,6 +119,12 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp) return 0; } +static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst, + const struct nfs4_sessionid *src) +{ + memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN); +} + #ifdef CONFIG_CRC32 /* * nfs_session_id_hash - calculate the crc32 hash for the session id diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 5ad908e9ce9c..f95e3b58bbc3 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -346,9 +346,23 @@ int nfs41_discover_server_trunking(struct nfs_client *clp, status = nfs4_proc_exchange_id(clp, cred); if (status != NFS4_OK) return status; - set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); - return nfs41_walk_client_list(clp, result, cred); + status = nfs41_walk_client_list(clp, result, cred); + if (status < 0) + return status; + if (clp != *result) + return 0; + + /* Purge state if the client id was established in a prior instance */ + if (clp->cl_exchange_flags & EXCHGID4_FLAG_CONFIRMED_R) + set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state); + else + set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + nfs4_schedule_state_manager(clp); + status = nfs_wait_client_init_complete(clp); + if (status < 0) + nfs_put_client(clp); + return status; } #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index e23a0a664e12..5c399ec41079 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1715,17 +1715,17 @@ static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, stru #if defined(CONFIG_NFS_V4_1) /* NFSv4.1 operations */ static void encode_bind_conn_to_session(struct xdr_stream *xdr, - struct nfs4_session *session, + struct nfs41_bind_conn_to_session_args *args, struct compound_hdr *hdr) { __be32 *p; encode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION, decode_bind_conn_to_session_maxsz, hdr); - encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); + 
encode_opaque_fixed(xdr, args->sessionid.data, NFS4_MAX_SESSIONID_LEN); p = xdr_reserve_space(xdr, 8); - *p++ = cpu_to_be32(NFS4_CDFC4_BACK_OR_BOTH); - *p = 0; /* use_conn_in_rdma_mode = False */ + *p++ = cpu_to_be32(args->dir); + *p = (args->use_conn_in_rdma_mode) ? cpu_to_be32(1) : cpu_to_be32(0); } static void encode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map) @@ -1806,8 +1806,8 @@ static void encode_create_session(struct xdr_stream *xdr, encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr); p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12); - p = xdr_encode_hyper(p, clp->cl_clientid); - *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ + p = xdr_encode_hyper(p, args->clientid); + *p++ = cpu_to_be32(args->seqid); /*Sequence id */ *p++ = cpu_to_be32(args->flags); /*flags */ /* Fore Channel */ @@ -2734,14 +2734,14 @@ static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req, */ static void nfs4_xdr_enc_bind_conn_to_session(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_client *clp) + struct nfs41_bind_conn_to_session_args *args) { struct compound_hdr hdr = { - .minorversion = clp->cl_mvops->minor_version, + .minorversion = args->client->cl_mvops->minor_version, }; encode_compound_hdr(xdr, req, &hdr); - encode_bind_conn_to_session(xdr, clp->cl_session, &hdr); + encode_bind_conn_to_session(xdr, args, &hdr); encode_nops(&hdr); } @@ -5613,7 +5613,7 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr, status = decode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION); if (!status) - status = decode_sessionid(xdr, &res->session->sess_id); + status = decode_sessionid(xdr, &res->sessionid); if (unlikely(status)) return status; @@ -5641,12 +5641,10 @@ static int decode_create_session(struct xdr_stream *xdr, { __be32 *p; int status; - struct nfs_client *clp = res->client; - struct nfs4_session *session = clp->cl_session; status = decode_op_hdr(xdr, OP_CREATE_SESSION); if (!status) - status = decode_sessionid(xdr, &session->sess_id); + status = decode_sessionid(xdr, &res->sessionid); if (unlikely(status)) return status; @@ -5654,13 +5652,13 @@ static int decode_create_session(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - clp->cl_seqid = be32_to_cpup(p++); - session->flags = be32_to_cpup(p); + res->seqid = be32_to_cpup(p++); + res->flags = be32_to_cpup(p); /* Channel attributes */ - status = decode_chan_attrs(xdr, &session->fc_attrs); + status = decode_chan_attrs(xdr, &res->fc_attrs); if (!status) - status = decode_chan_attrs(xdr, &session->bc_attrs); + status = decode_chan_attrs(xdr, &res->bc_attrs); return status; out_overflow: print_overflow_msg(__func__, xdr); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 797cd6253adf..635f0865671c 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -344,6 +344,10 @@ void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags); +void pnfs_layout_mark_request_commit(struct nfs_page *req, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + u32 ds_commit_idx); static inline bool nfs_have_layout(struct inode *inode) { diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index fdc4f6562bb7..54e36b38fb5f 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -838,3 +838,33 @@ out_err: return NULL; } EXPORT_SYMBOL_GPL(nfs4_decode_mp_ds_addr); + +void +pnfs_layout_mark_request_commit(struct nfs_page *req, + struct 
pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + u32 ds_commit_idx) +{ + struct list_head *list; + struct pnfs_commit_bucket *buckets; + + spin_lock(cinfo->lock); + buckets = cinfo->ds->buckets; + list = &buckets[ds_commit_idx].written; + if (list_empty(list)) { + /* Non-empty buckets hold a reference on the lseg. That ref + * is normally transferred to the COMMIT call and released + * there. It could also be released if the last req is pulled + * off due to a rewrite, in which case it will be done in + * pnfs_common_clear_request_commit + */ + WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL); + buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg); + } + set_bit(PG_COMMIT_TO_DS, &req->wb_flags); + cinfo->ds->nwritten++; + spin_unlock(cinfo->lock); + + nfs_request_add_commit_list(req, list, cinfo); +} +EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index b09cc23d6f43..c63189acd052 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -139,7 +139,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status == 0) - nfs_setattr_update_inode(inode, sattr); + nfs_setattr_update_inode(inode, sattr, fattr); dprintk("NFS reply setattr: %d\n", status); return status; } @@ -609,10 +609,8 @@ static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { - struct inode *inode = hdr->inode; - if (task->tk_status >= 0) - nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr); + nfs_writeback_update_inode(hdr); return 0; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 88a6d2196ece..849ed784d6ac 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -789,13 +789,8 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, nfs_list_add_request(req, dst); cinfo->mds->ncommit++; spin_unlock(cinfo->lock); - if (!cinfo->dreq) { - inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host), - BDI_RECLAIMABLE); - __mark_inode_dirty(req->wb_context->dentry->d_inode, - I_DIRTY_DATASYNC); - } + if (!cinfo->dreq) + nfs_mark_page_unstable(req->wb_page); } EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); @@ -1382,6 +1377,36 @@ static int nfs_should_remove_suid(const struct inode *inode) return 0; } +static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr, + struct nfs_fattr *fattr) +{ + struct nfs_pgio_args *argp = &hdr->args; + struct nfs_pgio_res *resp = &hdr->res; + + if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) + return; + if (argp->offset + resp->count != fattr->size) + return; + if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode)) + return; + /* Set attribute barrier */ + nfs_fattr_set_barrier(fattr); +} + +void nfs_writeback_update_inode(struct nfs_pgio_header *hdr) +{ + struct nfs_fattr *fattr = hdr->res.fattr; + struct inode *inode = hdr->inode; + + if (fattr == NULL) + return; + spin_lock(&inode->i_lock); + nfs_writeback_check_extend(hdr, fattr); + nfs_post_op_update_inode_force_wcc_locked(inode, fattr); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL_GPL(nfs_writeback_update_inode); + /* * This function is called when the WRITE call is complete. 
*/ @@ -1605,11 +1630,8 @@ void nfs_retry_commit(struct list_head *page_list, req = nfs_list_entry(page_list->next); nfs_list_remove_request(req); nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx); - if (!cinfo->dreq) { - dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host), - BDI_RECLAIMABLE); - } + if (!cinfo->dreq) + nfs_clear_page_commit(req->wb_page); nfs_unlock_and_release_request(req); } } diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 3c1bfa155571..1028a0629543 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -587,8 +587,6 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str)); - nfsd4_cb_layout_fail(ls); - printk(KERN_WARNING "nfsd: client %s failed to respond to layout recall. " " Fencing..\n", addr_str); diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index cc6a76072009..1c307f02baa8 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -583,7 +583,7 @@ nfs4_reset_recoverydir(char *recdir) if (status) return status; status = -ENOTDIR; - if (S_ISDIR(path.dentry->d_inode->i_mode)) { + if (d_is_dir(path.dentry)) { strcpy(user_recovery_dirname, recdir); status = 0; } @@ -1426,7 +1426,7 @@ nfsd4_client_tracking_init(struct net *net) nn->client_tracking_ops = &nfsd4_legacy_tracking_ops; status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path); if (!status) { - status = S_ISDIR(path.dentry->d_inode->i_mode); + status = d_is_dir(path.dentry); path_put(&path); if (status) goto do_init; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f6b2a09f793f..d2f2c37dc2db 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1638,7 +1638,7 @@ __destroy_client(struct nfs4_client *clp) nfs4_put_stid(&dp->dl_stid); } while (!list_empty(&clp->cl_revoked)) { - dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); + dp = list_entry(clp->cl_revoked.next, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); nfs4_put_stid(&dp->dl_stid); } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 965b478d50fc..e9fa966fc37f 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -114,8 +114,8 @@ static inline __be32 check_pseudo_root(struct svc_rqst *rqstp, * We're exposing only the directories and symlinks that have to be * traversed on the way to real exports: */ - if (unlikely(!S_ISDIR(dentry->d_inode->i_mode) && - !S_ISLNK(dentry->d_inode->i_mode))) + if (unlikely(!d_is_dir(dentry) && + !d_is_symlink(dentry))) return nfserr_stale; /* * A pseudoroot export gives permission to access only one @@ -259,7 +259,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) goto out; } - if (S_ISDIR(dentry->d_inode->i_mode) && + if (d_is_dir(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED)) { printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %pd2\n", dentry); @@ -414,7 +414,7 @@ static inline void _fh_update_old(struct dentry *dentry, { fh->ofh_ino = ino_t_to_u32(dentry->d_inode->i_ino); fh->ofh_generation = dentry->d_inode->i_generation; - if (S_ISDIR(dentry->d_inode->i_mode) || + if (d_is_dir(dentry) || (exp->ex_flags & NFSEXP_NOSUBTREECHECK)) fh->ofh_dirino = 0; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 5685c679dd93..368526582429 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -615,9 +615,9 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor export = fhp->fh_export; dentry = 
fhp->fh_dentry; - if (S_ISREG(dentry->d_inode->i_mode)) + if (d_is_reg(dentry)) map = nfs3_regaccess; - else if (S_ISDIR(dentry->d_inode->i_mode)) + else if (d_is_dir(dentry)) map = nfs3_diraccess; else map = nfs3_anyaccess; @@ -1402,7 +1402,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, switch (createmode) { case NFS3_CREATE_UNCHECKED: - if (! S_ISREG(dchild->d_inode->i_mode)) + if (! d_is_reg(dchild)) goto out; else if (truncp) { /* in nfsv4, we need to treat this case a little @@ -1615,7 +1615,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, if (err) goto out; err = nfserr_isdir; - if (S_ISDIR(tfhp->fh_dentry->d_inode->i_mode)) + if (d_is_dir(tfhp->fh_dentry)) goto out; err = nfserr_perm; if (!len) diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index b2e3ff347620..ecdbae19a766 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -31,6 +31,8 @@ #include "alloc.h" #include "dat.h" +static void __nilfs_btree_init(struct nilfs_bmap *bmap); + static struct nilfs_btree_path *nilfs_btree_alloc_path(void) { struct nilfs_btree_path *path; @@ -368,6 +370,34 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, return ret; } +/** + * nilfs_btree_root_broken - verify consistency of btree root node + * @node: btree root node to be examined + * @ino: inode number + * + * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned. + */ +static int nilfs_btree_root_broken(const struct nilfs_btree_node *node, + unsigned long ino) +{ + int level, flags, nchildren; + int ret = 0; + + level = nilfs_btree_node_get_level(node); + flags = nilfs_btree_node_get_flags(node); + nchildren = nilfs_btree_node_get_nchildren(node); + + if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN || + level > NILFS_BTREE_LEVEL_MAX || + nchildren < 0 || + nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) { + pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n", + ino, level, flags, nchildren); + ret = 1; + } + return ret; +} + int nilfs_btree_broken_node_block(struct buffer_head *bh) { int ret; @@ -1713,7 +1743,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree, /* convert and insert */ dat = NILFS_BMAP_USE_VBN(btree) ? 
nilfs_bmap_get_dat(btree) : NULL; - nilfs_btree_init(btree); + __nilfs_btree_init(btree); if (nreq != NULL) { nilfs_bmap_commit_alloc_ptr(btree, dreq, dat); nilfs_bmap_commit_alloc_ptr(btree, nreq, dat); @@ -2294,12 +2324,23 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = { .bop_gather_data = NULL, }; -int nilfs_btree_init(struct nilfs_bmap *bmap) +static void __nilfs_btree_init(struct nilfs_bmap *bmap) { bmap->b_ops = &nilfs_btree_ops; bmap->b_nchildren_per_block = NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap)); - return 0; +} + +int nilfs_btree_init(struct nilfs_bmap *bmap) +{ + int ret = 0; + + __nilfs_btree_init(bmap); + + if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap), + bmap->b_inode->i_ino)) + ret = -EIO; + return ret; } void nilfs_btree_init_gc(struct nilfs_bmap *bmap) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 469086b9f99b..0c3f303baf32 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1907,6 +1907,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { struct nilfs_inode_info *ii, *n; + int during_mount = !(sci->sc_super->s_flags & MS_ACTIVE); int defer_iput = false; spin_lock(&nilfs->ns_inode_lock); @@ -1919,10 +1920,10 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, brelse(ii->i_bh); ii->i_bh = NULL; list_del_init(&ii->i_dirty); - if (!ii->vfs_inode.i_nlink) { + if (!ii->vfs_inode.i_nlink || during_mount) { /* - * Defer calling iput() to avoid a deadlock - * over I_SYNC flag for inodes with i_nlink == 0 + * Defer calling iput() to avoid deadlocks if + * i_nlink == 0 or mount is not yet finished. */ list_add_tail(&ii->i_dirty, &sci->sc_iput_queue); defer_iput = true; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 51ceb8107284..d2f97ecca6a5 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -115,8 +115,8 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, return false; /* sorry, fanotify only gives a damn about files and dirs */ - if (!S_ISREG(path->dentry->d_inode->i_mode) && - !S_ISDIR(path->dentry->d_inode->i_mode)) + if (!d_is_reg(path->dentry) && + !d_can_lookup(path->dentry)) return false; if (inode_mark && vfsmnt_mark) { @@ -139,11 +139,12 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, BUG(); } - if (S_ISDIR(path->dentry->d_inode->i_mode) && + if (d_is_dir(path->dentry) && !(marks_mask & FS_ISDIR & ~marks_ignored_mask)) return false; - if (event_mask & marks_mask & ~marks_ignored_mask) + if (event_mask & FAN_ALL_OUTGOING_EVENTS & marks_mask & + ~marks_ignored_mask) return true; return false; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 8490c64d34fe..460c6c37e683 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -502,7 +502,7 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb) static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb) { - if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_APPEND_DIO) + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_APPEND_DIO) return 1; return 0; } diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 20e37a3ed26f..db64ce2d4667 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -102,11 +102,11 @@ | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \ - | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO) + | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO \ + | 
OCFS2_FEATURE_INCOMPAT_APPEND_DIO) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ - | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA \ - | OCFS2_FEATURE_RO_COMPAT_APPEND_DIO) + | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) /* * Heartbeat-only devices are missing journals and other files. The @@ -179,6 +179,11 @@ #define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000 /* + * Append Direct IO support + */ +#define OCFS2_FEATURE_INCOMPAT_APPEND_DIO 0x8000 + +/* * backup superblock flag is used to indicate that this volume * has backup superblocks. */ @@ -200,10 +205,6 @@ #define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 #define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 -/* - * Append Direct IO support - */ -#define OCFS2_FEATURE_RO_COMPAT_APPEND_DIO 0x0008 /* The byte offset of the first backup block will be 1G. * The following will be 4G, 16G, 64G, 256G and 1T. diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index ea10a8719107..24f640441bd9 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -191,7 +191,6 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) ovl_set_timestamps(upperdentry, stat); return err; - } static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, @@ -385,7 +384,7 @@ int ovl_copy_up(struct dentry *dentry) struct kstat stat; enum ovl_path_type type = ovl_path_type(dentry); - if (type != OVL_PATH_LOWER) + if (OVL_TYPE_UPPER(type)) break; next = dget(dentry); @@ -394,7 +393,7 @@ int ovl_copy_up(struct dentry *dentry) parent = dget_parent(next); type = ovl_path_type(parent); - if (type != OVL_PATH_LOWER) + if (OVL_TYPE_UPPER(type)) break; dput(next); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 8ffc4b980f1b..d139405d2bfa 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -19,7 +19,7 @@ void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) int err; dget(wdentry); - if (S_ISDIR(wdentry->d_inode->i_mode)) + if (d_is_dir(wdentry)) err = ovl_do_rmdir(wdir, wdentry); else err = ovl_do_unlink(wdir, wdentry); @@ -118,14 +118,14 @@ int ovl_create_real(struct inode *dir, struct dentry *newdentry, static int ovl_set_opaque(struct dentry *upperdentry) { - return ovl_do_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0); + return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0); } static void ovl_remove_opaque(struct dentry *upperdentry) { int err; - err = ovl_do_removexattr(upperdentry, ovl_opaque_xattr); + err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE); if (err) { pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n", upperdentry->d_name.name, err); @@ -152,7 +152,7 @@ static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, * correct link count. nlink=1 seems to pacify 'find' and * other utilities. 
*/ - if (type == OVL_PATH_MERGE) + if (OVL_TYPE_MERGE(type)) stat->nlink = 1; return 0; @@ -506,7 +506,7 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) struct dentry *opaquedir = NULL; int err; - if (is_dir) { + if (is_dir && OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) { opaquedir = ovl_check_empty_and_clear(dentry); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) @@ -630,7 +630,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) goto out_drop_write; type = ovl_path_type(dentry); - if (type == OVL_PATH_PURE_UPPER) { + if (OVL_TYPE_PURE_UPPER(type)) { err = ovl_remove_upper(dentry, is_dir); } else { const struct cred *old_cred; @@ -693,7 +693,7 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, bool new_create = false; bool cleanup_whiteout = false; bool overwrite = !(flags & RENAME_EXCHANGE); - bool is_dir = S_ISDIR(old->d_inode->i_mode); + bool is_dir = d_is_dir(old); bool new_is_dir = false; struct dentry *opaquedir = NULL; const struct cred *old_cred = NULL; @@ -712,7 +712,7 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, /* Don't copy up directory trees */ old_type = ovl_path_type(old); err = -EXDEV; - if ((old_type == OVL_PATH_LOWER || old_type == OVL_PATH_MERGE) && is_dir) + if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir) goto out; if (new->d_inode) { @@ -720,30 +720,30 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, if (err) goto out; - if (S_ISDIR(new->d_inode->i_mode)) + if (d_is_dir(new)) new_is_dir = true; new_type = ovl_path_type(new); err = -EXDEV; - if (!overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) + if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) goto out; err = 0; - if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) { + if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) { if (ovl_dentry_lower(old)->d_inode == ovl_dentry_lower(new)->d_inode) goto out; } - if (new_type != OVL_PATH_LOWER && old_type != OVL_PATH_LOWER) { + if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) { if (ovl_dentry_upper(old)->d_inode == ovl_dentry_upper(new)->d_inode) goto out; } } else { if (ovl_dentry_is_opaque(new)) - new_type = OVL_PATH_UPPER; + new_type = __OVL_PATH_UPPER; else - new_type = OVL_PATH_PURE_UPPER; + new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE; } err = ovl_want_write(old); @@ -763,8 +763,8 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, goto out_drop_write; } - old_opaque = old_type != OVL_PATH_PURE_UPPER; - new_opaque = new_type != OVL_PATH_PURE_UPPER; + old_opaque = !OVL_TYPE_PURE_UPPER(old_type); + new_opaque = !OVL_TYPE_PURE_UPPER(new_type); if (old_opaque || new_opaque) { err = -ENOMEM; @@ -787,7 +787,7 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, old_cred = override_creds(override_cred); } - if (overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) { + if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) { opaquedir = ovl_check_empty_and_clear(new); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) { diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 07d74b24913b..04f124884687 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -205,7 +205,7 @@ static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) static bool ovl_is_private_xattr(const char *name) { - return strncmp(name, "trusted.overlay.", 14) == 0; + return strncmp(name, OVL_XATTR_PRE_NAME, 
OVL_XATTR_PRE_LEN) == 0; } int ovl_setxattr(struct dentry *dentry, const char *name, @@ -238,7 +238,10 @@ out: static bool ovl_need_xattr_filter(struct dentry *dentry, enum ovl_path_type type) { - return type == OVL_PATH_UPPER && S_ISDIR(dentry->d_inode->i_mode); + if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER) + return S_ISDIR(dentry->d_inode->i_mode); + else + return false; } ssize_t ovl_getxattr(struct dentry *dentry, const char *name, @@ -299,7 +302,7 @@ int ovl_removexattr(struct dentry *dentry, const char *name) if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name)) goto out_drop_write; - if (type == OVL_PATH_LOWER) { + if (!OVL_TYPE_UPPER(type)) { err = vfs_getxattr(realpath.dentry, name, NULL, 0); if (err < 0) goto out_drop_write; @@ -321,7 +324,7 @@ out: static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type, struct dentry *realdentry) { - if (type != OVL_PATH_LOWER) + if (OVL_TYPE_UPPER(type)) return false; if (special_file(realdentry->d_inode->i_mode)) @@ -430,5 +433,4 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, } return inode; - } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 814bed33dd07..17ac5afc9ffb 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -12,13 +12,20 @@ struct ovl_entry; enum ovl_path_type { - OVL_PATH_PURE_UPPER, - OVL_PATH_UPPER, - OVL_PATH_MERGE, - OVL_PATH_LOWER, + __OVL_PATH_PURE = (1 << 0), + __OVL_PATH_UPPER = (1 << 1), + __OVL_PATH_MERGE = (1 << 2), }; -extern const char *ovl_opaque_xattr; +#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER) +#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE) +#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE) +#define OVL_TYPE_MERGE_OR_LOWER(type) \ + (OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type)) + +#define OVL_XATTR_PRE_NAME "trusted.overlay." 
+#define OVL_XATTR_PRE_LEN 16 +#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque" static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) { @@ -130,6 +137,7 @@ void ovl_dentry_version_inc(struct dentry *dentry); void ovl_path_upper(struct dentry *dentry, struct path *path); void ovl_path_lower(struct dentry *dentry, struct path *path); enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); +int ovl_path_next(int idx, struct dentry *dentry, struct path *path); struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index c0205990a9f5..907870e81a72 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -24,7 +24,6 @@ struct ovl_cache_entry { struct list_head l_node; struct rb_node node; bool is_whiteout; - bool is_cursor; char name[]; }; @@ -40,6 +39,7 @@ struct ovl_readdir_data { struct rb_root root; struct list_head *list; struct list_head middle; + struct dentry *dir; int count; int err; }; @@ -48,7 +48,7 @@ struct ovl_dir_file { bool is_real; bool is_upper; struct ovl_dir_cache *cache; - struct ovl_cache_entry cursor; + struct list_head *cursor; struct file *realfile; struct file *upperfile; }; @@ -79,23 +79,49 @@ static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, return NULL; } -static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len, +static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir, + const char *name, int len, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; size_t size = offsetof(struct ovl_cache_entry, name[len + 1]); p = kmalloc(size, GFP_KERNEL); - if (p) { - memcpy(p->name, name, len); - p->name[len] = '\0'; - p->len = len; - p->type = d_type; - p->ino = ino; - p->is_whiteout = false; - p->is_cursor = false; - } + if (!p) + return NULL; + + memcpy(p->name, name, len); + p->name[len] = '\0'; + p->len = len; + p->type = d_type; + p->ino = ino; + p->is_whiteout = false; + + if (d_type == DT_CHR) { + struct dentry *dentry; + const struct cred *old_cred; + struct cred *override_cred; + + override_cred = prepare_creds(); + if (!override_cred) { + kfree(p); + return NULL; + } + + /* + * CAP_DAC_OVERRIDE for lookup + */ + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + old_cred = override_creds(override_cred); + dentry = lookup_one_len(name, dir, len); + if (!IS_ERR(dentry)) { + p->is_whiteout = ovl_is_whiteout(dentry); + dput(dentry); + } + revert_creds(old_cred); + put_cred(override_cred); + } return p; } @@ -122,7 +148,7 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, return 0; } - p = ovl_cache_entry_new(name, len, ino, d_type); + p = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type); if (p == NULL) return -ENOMEM; @@ -143,7 +169,7 @@ static int ovl_fill_lower(struct ovl_readdir_data *rdd, if (p) { list_move_tail(&p->l_node, &rdd->middle); } else { - p = ovl_cache_entry_new(name, namelen, ino, d_type); + p = ovl_cache_entry_new(rdd->dir, name, namelen, ino, d_type); if (p == NULL) rdd->err = -ENOMEM; else @@ -168,7 +194,6 @@ static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) { struct ovl_dir_cache *cache = od->cache; - list_del_init(&od->cursor.l_node); WARN_ON(cache->refcount <= 0); cache->refcount--; if (!cache->refcount) { @@ -204,6 +229,7 @@ static inline int ovl_dir_read(struct path *realpath, if (IS_ERR(realfile)) return 
PTR_ERR(realfile); + rdd->dir = realpath->dentry; rdd->ctx.pos = 0; do { rdd->count = 0; @@ -227,108 +253,58 @@ static void ovl_dir_reset(struct file *file) if (cache && ovl_dentry_version_get(dentry) != cache->version) { ovl_cache_put(od, dentry); od->cache = NULL; + od->cursor = NULL; } - WARN_ON(!od->is_real && type != OVL_PATH_MERGE); - if (od->is_real && type == OVL_PATH_MERGE) + WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type)); + if (od->is_real && OVL_TYPE_MERGE(type)) od->is_real = false; } -static int ovl_dir_mark_whiteouts(struct dentry *dir, - struct ovl_readdir_data *rdd) -{ - struct ovl_cache_entry *p; - struct dentry *dentry; - const struct cred *old_cred; - struct cred *override_cred; - - override_cred = prepare_creds(); - if (!override_cred) { - ovl_cache_free(rdd->list); - return -ENOMEM; - } - - /* - * CAP_DAC_OVERRIDE for lookup - */ - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - old_cred = override_creds(override_cred); - - mutex_lock(&dir->d_inode->i_mutex); - list_for_each_entry(p, rdd->list, l_node) { - if (p->is_cursor) - continue; - - if (p->type != DT_CHR) - continue; - - dentry = lookup_one_len(p->name, dir, p->len); - if (IS_ERR(dentry)) - continue; - - p->is_whiteout = ovl_is_whiteout(dentry); - dput(dentry); - } - mutex_unlock(&dir->d_inode->i_mutex); - - revert_creds(old_cred); - put_cred(override_cred); - - return 0; -} - static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) { int err; - struct path lowerpath; - struct path upperpath; + struct path realpath; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, .list = list, .root = RB_ROOT, .is_merge = false, }; + int idx, next; - ovl_path_lower(dentry, &lowerpath); - ovl_path_upper(dentry, &upperpath); + for (idx = 0; idx != -1; idx = next) { + next = ovl_path_next(idx, dentry, &realpath); - if (upperpath.dentry) { - err = ovl_dir_read(&upperpath, &rdd); - if (err) - goto out; - - if (lowerpath.dentry) { - err = ovl_dir_mark_whiteouts(upperpath.dentry, &rdd); + if (next != -1) { + err = ovl_dir_read(&realpath, &rdd); if (err) - goto out; + break; + } else { + /* + * Insert lowest layer entries before upper ones, this + * allows offsets to be reasonably constant + */ + list_add(&rdd.middle, rdd.list); + rdd.is_merge = true; + err = ovl_dir_read(&realpath, &rdd); + list_del(&rdd.middle); } } - if (lowerpath.dentry) { - /* - * Insert lowerpath entries before upperpath ones, this allows - * offsets to be reasonably constant - */ - list_add(&rdd.middle, rdd.list); - rdd.is_merge = true; - err = ovl_dir_read(&lowerpath, &rdd); - list_del(&rdd.middle); - } -out: return err; } static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos) { - struct ovl_cache_entry *p; + struct list_head *p; loff_t off = 0; - list_for_each_entry(p, &od->cache->entries, l_node) { - if (p->is_cursor) - continue; + list_for_each(p, &od->cache->entries) { if (off >= pos) break; off++; } - list_move_tail(&od->cursor.l_node, &p->l_node); + /* Cursor is safe since the cache is stable */ + od->cursor = p; } static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) @@ -367,6 +343,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) { struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; + struct ovl_cache_entry *p; if (!ctx->pos) ovl_dir_reset(file); @@ -385,19 +362,13 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) ovl_seek_cursor(od, ctx->pos); } - while (od->cursor.l_node.next != &od->cache->entries) { - 
struct ovl_cache_entry *p; - - p = list_entry(od->cursor.l_node.next, struct ovl_cache_entry, l_node); - /* Skip cursors */ - if (!p->is_cursor) { - if (!p->is_whiteout) { - if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) - break; - } - ctx->pos++; - } - list_move(&od->cursor.l_node, &p->l_node); + while (od->cursor != &od->cache->entries) { + p = list_entry(od->cursor, struct ovl_cache_entry, l_node); + if (!p->is_whiteout) + if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) + break; + od->cursor = p->l_node.next; + ctx->pos++; } return 0; } @@ -452,7 +423,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, /* * Need to check if we started out being a lower dir, but got copied up */ - if (!od->is_upper && ovl_path_type(dentry) != OVL_PATH_LOWER) { + if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) { struct inode *inode = file_inode(file); realfile = lockless_dereference(od->upperfile); @@ -516,11 +487,9 @@ static int ovl_dir_open(struct inode *inode, struct file *file) kfree(od); return PTR_ERR(realfile); } - INIT_LIST_HEAD(&od->cursor.l_node); od->realfile = realfile; - od->is_real = (type != OVL_PATH_MERGE); - od->is_upper = (type != OVL_PATH_LOWER); - od->cursor.is_cursor = true; + od->is_real = !OVL_TYPE_MERGE(type); + od->is_upper = OVL_TYPE_UPPER(type); file->private_data = od; return 0; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index f16d318b71f8..5f0d1993e6e3 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -35,7 +35,8 @@ struct ovl_config { /* private information held for overlayfs's superblock */ struct ovl_fs { struct vfsmount *upper_mnt; - struct vfsmount *lower_mnt; + unsigned numlower; + struct vfsmount **lower_mnt; struct dentry *workdir; long lower_namelen; /* pathnames of lower and upper dirs, for show_options */ @@ -47,7 +48,6 @@ struct ovl_dir_cache; /* private information held for every overlayfs dentry */ struct ovl_entry { struct dentry *__upperdentry; - struct dentry *lowerdentry; struct ovl_dir_cache *cache; union { struct { @@ -56,30 +56,36 @@ struct ovl_entry { }; struct rcu_head rcu; }; + unsigned numlower; + struct path lowerstack[]; }; -const char *ovl_opaque_xattr = "trusted.overlay.opaque"; +#define OVL_MAX_STACK 500 +static struct dentry *__ovl_dentry_lower(struct ovl_entry *oe) +{ + return oe->numlower ? 
oe->lowerstack[0].dentry : NULL; +} enum ovl_path_type ovl_path_type(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; + enum ovl_path_type type = 0; if (oe->__upperdentry) { - if (oe->lowerdentry) { + type = __OVL_PATH_UPPER; + + if (oe->numlower) { if (S_ISDIR(dentry->d_inode->i_mode)) - return OVL_PATH_MERGE; - else - return OVL_PATH_UPPER; - } else { - if (oe->opaque) - return OVL_PATH_UPPER; - else - return OVL_PATH_PURE_UPPER; + type |= __OVL_PATH_MERGE; + } else if (!oe->opaque) { + type |= __OVL_PATH_PURE; } } else { - return OVL_PATH_LOWER; + if (oe->numlower > 1) + type |= __OVL_PATH_MERGE; } + return type; } static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe) @@ -98,10 +104,9 @@ void ovl_path_upper(struct dentry *dentry, struct path *path) enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) { - enum ovl_path_type type = ovl_path_type(dentry); - if (type == OVL_PATH_LOWER) + if (!OVL_TYPE_UPPER(type)) ovl_path_lower(dentry, path); else ovl_path_upper(dentry, path); @@ -120,7 +125,7 @@ struct dentry *ovl_dentry_lower(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; - return oe->lowerdentry; + return __ovl_dentry_lower(oe); } struct dentry *ovl_dentry_real(struct dentry *dentry) @@ -130,7 +135,7 @@ struct dentry *ovl_dentry_real(struct dentry *dentry) realdentry = ovl_upperdentry_dereference(oe); if (!realdentry) - realdentry = oe->lowerdentry; + realdentry = __ovl_dentry_lower(oe); return realdentry; } @@ -143,7 +148,7 @@ struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper) if (realdentry) { *is_upper = true; } else { - realdentry = oe->lowerdentry; + realdentry = __ovl_dentry_lower(oe); *is_upper = false; } return realdentry; @@ -165,11 +170,9 @@ void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache) void ovl_path_lower(struct dentry *dentry, struct path *path) { - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; struct ovl_entry *oe = dentry->d_fsdata; - path->mnt = ofs->lower_mnt; - path->dentry = oe->lowerdentry; + *path = oe->numlower ? oe->lowerstack[0] : (struct path) { NULL, NULL }; } int ovl_want_write(struct dentry *dentry) @@ -249,7 +252,7 @@ static bool ovl_is_opaquedir(struct dentry *dentry) if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr) return false; - res = inode->i_op->getxattr(dentry, ovl_opaque_xattr, &val, 1); + res = inode->i_op->getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1); if (res == 1 && val == 'y') return true; @@ -261,8 +264,11 @@ static void ovl_dentry_release(struct dentry *dentry) struct ovl_entry *oe = dentry->d_fsdata; if (oe) { + unsigned int i; + dput(oe->__upperdentry); - dput(oe->lowerdentry); + for (i = 0; i < oe->numlower; i++) + dput(oe->lowerstack[i].dentry); kfree_rcu(oe, rcu); } } @@ -271,9 +277,15 @@ static const struct dentry_operations ovl_dentry_operations = { .d_release = ovl_dentry_release, }; -static struct ovl_entry *ovl_alloc_entry(void) +static struct ovl_entry *ovl_alloc_entry(unsigned int numlower) { - return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL); + size_t size = offsetof(struct ovl_entry, lowerstack[numlower]); + struct ovl_entry *oe = kzalloc(size, GFP_KERNEL); + + if (oe) + oe->numlower = numlower; + + return oe; } static inline struct dentry *ovl_lookup_real(struct dentry *dir, @@ -295,82 +307,154 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir, return dentry; } +/* + * Returns next layer in stack starting from top. + * Returns -1 if this is the last layer. 
+ */ +int ovl_path_next(int idx, struct dentry *dentry, struct path *path) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + BUG_ON(idx < 0); + if (idx == 0) { + ovl_path_upper(dentry, path); + if (path->dentry) + return oe->numlower ? 1 : -1; + idx++; + } + BUG_ON(idx > oe->numlower); + *path = oe->lowerstack[idx - 1]; + + return (idx < oe->numlower) ? idx + 1 : -1; +} + struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct ovl_entry *oe; - struct dentry *upperdir; - struct dentry *lowerdir; - struct dentry *upperdentry = NULL; - struct dentry *lowerdentry = NULL; + struct ovl_entry *poe = dentry->d_parent->d_fsdata; + struct path *stack = NULL; + struct dentry *upperdir, *upperdentry = NULL; + unsigned int ctr = 0; struct inode *inode = NULL; + bool upperopaque = false; + struct dentry *this, *prev = NULL; + unsigned int i; int err; - err = -ENOMEM; - oe = ovl_alloc_entry(); - if (!oe) - goto out; - - upperdir = ovl_dentry_upper(dentry->d_parent); - lowerdir = ovl_dentry_lower(dentry->d_parent); - + upperdir = ovl_upperdentry_dereference(poe); if (upperdir) { - upperdentry = ovl_lookup_real(upperdir, &dentry->d_name); - err = PTR_ERR(upperdentry); - if (IS_ERR(upperdentry)) - goto out_put_dir; - - if (lowerdir && upperdentry) { - if (ovl_is_whiteout(upperdentry)) { - dput(upperdentry); - upperdentry = NULL; - oe->opaque = true; - } else if (ovl_is_opaquedir(upperdentry)) { - oe->opaque = true; + this = ovl_lookup_real(upperdir, &dentry->d_name); + err = PTR_ERR(this); + if (IS_ERR(this)) + goto out; + + if (this) { + if (ovl_is_whiteout(this)) { + dput(this); + this = NULL; + upperopaque = true; + } else if (poe->numlower && ovl_is_opaquedir(this)) { + upperopaque = true; } } + upperdentry = prev = this; } - if (lowerdir && !oe->opaque) { - lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name); - err = PTR_ERR(lowerdentry); - if (IS_ERR(lowerdentry)) - goto out_dput_upper; + + if (!upperopaque && poe->numlower) { + err = -ENOMEM; + stack = kcalloc(poe->numlower, sizeof(struct path), GFP_KERNEL); + if (!stack) + goto out_put_upper; } - if (lowerdentry && upperdentry && - (!S_ISDIR(upperdentry->d_inode->i_mode) || - !S_ISDIR(lowerdentry->d_inode->i_mode))) { - dput(lowerdentry); - lowerdentry = NULL; - oe->opaque = true; + for (i = 0; !upperopaque && i < poe->numlower; i++) { + bool opaque = false; + struct path lowerpath = poe->lowerstack[i]; + + this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name); + err = PTR_ERR(this); + if (IS_ERR(this)) { + /* + * If it's positive, then treat ENAMETOOLONG as ENOENT. + */ + if (err == -ENAMETOOLONG && (upperdentry || ctr)) + continue; + goto out_put; + } + if (!this) + continue; + if (ovl_is_whiteout(this)) { + dput(this); + break; + } + /* + * Only makes sense to check opaque dir if this is not the + * lowermost layer. + */ + if (i < poe->numlower - 1 && ovl_is_opaquedir(this)) + opaque = true; + + if (prev && (!S_ISDIR(prev->d_inode->i_mode) || + !S_ISDIR(this->d_inode->i_mode))) { + /* + * FIXME: check for upper-opaqueness maybe better done + * in remove code. + */ + if (prev == upperdentry) + upperopaque = true; + dput(this); + break; + } + /* + * If this is a non-directory then stop here. 
+ */ + if (!S_ISDIR(this->d_inode->i_mode)) + opaque = true; + + stack[ctr].dentry = this; + stack[ctr].mnt = lowerpath.mnt; + ctr++; + prev = this; + if (opaque) + break; } - if (lowerdentry || upperdentry) { + oe = ovl_alloc_entry(ctr); + err = -ENOMEM; + if (!oe) + goto out_put; + + if (upperdentry || ctr) { struct dentry *realdentry; - realdentry = upperdentry ? upperdentry : lowerdentry; + realdentry = upperdentry ? upperdentry : stack[0].dentry; + err = -ENOMEM; inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode, oe); if (!inode) - goto out_dput; + goto out_free_oe; ovl_copyattr(realdentry->d_inode, inode); } + oe->opaque = upperopaque; oe->__upperdentry = upperdentry; - oe->lowerdentry = lowerdentry; - + memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr); + kfree(stack); dentry->d_fsdata = oe; d_add(dentry, inode); return NULL; -out_dput: - dput(lowerdentry); -out_dput_upper: - dput(upperdentry); -out_put_dir: +out_free_oe: kfree(oe); +out_put: + for (i = 0; i < ctr; i++) + dput(stack[i].dentry); + kfree(stack); +out_put_upper: + dput(upperdentry); out: return ERR_PTR(err); } @@ -383,10 +467,12 @@ struct file *ovl_path_open(struct path *path, int flags) static void ovl_put_super(struct super_block *sb) { struct ovl_fs *ufs = sb->s_fs_info; + unsigned i; dput(ufs->workdir); mntput(ufs->upper_mnt); - mntput(ufs->lower_mnt); + for (i = 0; i < ufs->numlower; i++) + mntput(ufs->lower_mnt[i]); kfree(ufs->config.lowerdir); kfree(ufs->config.upperdir); @@ -400,7 +486,7 @@ static void ovl_put_super(struct super_block *sb) * @buf: The struct kstatfs to fill in with stats * * Get the filesystem statistics. As writes always target the upper layer - * filesystem pass the statfs to the same filesystem. + * filesystem pass the statfs to the upper filesystem (if it exists) */ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) { @@ -409,7 +495,7 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) struct path path; int err; - ovl_path_upper(root_dentry, &path); + ovl_path_real(root_dentry, &path); err = vfs_statfs(&path, buf); if (!err) { @@ -432,8 +518,20 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) struct ovl_fs *ufs = sb->s_fs_info; seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir); - seq_printf(m, ",upperdir=%s", ufs->config.upperdir); - seq_printf(m, ",workdir=%s", ufs->config.workdir); + if (ufs->config.upperdir) { + seq_printf(m, ",upperdir=%s", ufs->config.upperdir); + seq_printf(m, ",workdir=%s", ufs->config.workdir); + } + return 0; +} + +static int ovl_remount(struct super_block *sb, int *flags, char *data) +{ + struct ovl_fs *ufs = sb->s_fs_info; + + if (!(*flags & MS_RDONLY) && !ufs->upper_mnt) + return -EROFS; + return 0; } @@ -441,6 +539,7 @@ static const struct super_operations ovl_super_operations = { .put_super = ovl_put_super, .statfs = ovl_statfs, .show_options = ovl_show_options, + .remount_fs = ovl_remount, }; enum { @@ -515,9 +614,19 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) break; default: + pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p); return -EINVAL; } } + + /* Workdir is useless in non-upper mount */ + if (!config->upperdir && config->workdir) { + pr_info("overlayfs: option \"workdir=%s\" is useless in a non-upper mount, ignore\n", + config->workdir); + kfree(config->workdir); + config->workdir = NULL; + } + return 0; } @@ -585,24 +694,6 @@ static void ovl_unescape(char *s) } } -static int ovl_mount_dir(const char *name, struct path *path) 
-{ - int err; - char *tmp = kstrdup(name, GFP_KERNEL); - - if (!tmp) - return -ENOMEM; - - ovl_unescape(tmp); - err = kern_path(tmp, LOOKUP_FOLLOW, path); - if (err) { - pr_err("overlayfs: failed to resolve '%s': %i\n", tmp, err); - err = -EINVAL; - } - kfree(tmp); - return err; -} - static bool ovl_is_allowed_fs_type(struct dentry *root) { const struct dentry_operations *dop = root->d_op; @@ -622,6 +713,75 @@ static bool ovl_is_allowed_fs_type(struct dentry *root) return true; } +static int ovl_mount_dir_noesc(const char *name, struct path *path) +{ + int err = -EINVAL; + + if (!*name) { + pr_err("overlayfs: empty lowerdir\n"); + goto out; + } + err = kern_path(name, LOOKUP_FOLLOW, path); + if (err) { + pr_err("overlayfs: failed to resolve '%s': %i\n", name, err); + goto out; + } + err = -EINVAL; + if (!ovl_is_allowed_fs_type(path->dentry)) { + pr_err("overlayfs: filesystem on '%s' not supported\n", name); + goto out_put; + } + if (!S_ISDIR(path->dentry->d_inode->i_mode)) { + pr_err("overlayfs: '%s' not a directory\n", name); + goto out_put; + } + return 0; + +out_put: + path_put(path); +out: + return err; +} + +static int ovl_mount_dir(const char *name, struct path *path) +{ + int err = -ENOMEM; + char *tmp = kstrdup(name, GFP_KERNEL); + + if (tmp) { + ovl_unescape(tmp); + err = ovl_mount_dir_noesc(tmp, path); + kfree(tmp); + } + return err; +} + +static int ovl_lower_dir(const char *name, struct path *path, long *namelen, + int *stack_depth) +{ + int err; + struct kstatfs statfs; + + err = ovl_mount_dir_noesc(name, path); + if (err) + goto out; + + err = vfs_statfs(path, &statfs); + if (err) { + pr_err("overlayfs: statfs failed on '%s'\n", name); + goto out_put; + } + *namelen = max(*namelen, statfs.f_namelen); + *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth); + + return 0; + +out_put: + path_put(path); +out: + return err; +} + /* Workdir should not be subdir of upperdir and vice versa */ static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir) { @@ -634,16 +794,39 @@ static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir) return ok; } +static unsigned int ovl_split_lowerdirs(char *str) +{ + unsigned int ctr = 1; + char *s, *d; + + for (s = d = str;; s++, d++) { + if (*s == '\\') { + s++; + } else if (*s == ':') { + *d = '\0'; + ctr++; + continue; + } + *d = *s; + if (!*s) + break; + } + return ctr; +} + static int ovl_fill_super(struct super_block *sb, void *data, int silent) { - struct path lowerpath; - struct path upperpath; - struct path workpath; - struct inode *root_inode; + struct path upperpath = { NULL, NULL }; + struct path workpath = { NULL, NULL }; struct dentry *root_dentry; struct ovl_entry *oe; struct ovl_fs *ufs; - struct kstatfs statfs; + struct path *stack = NULL; + char *lowertmp; + char *lower; + unsigned int numlower; + unsigned int stacklen = 0; + unsigned int i; int err; err = -ENOMEM; @@ -655,123 +838,147 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (err) goto out_free_config; - /* FIXME: workdir is not needed for a R/O mount */ err = -EINVAL; - if (!ufs->config.upperdir || !ufs->config.lowerdir || - !ufs->config.workdir) { - pr_err("overlayfs: missing upperdir or lowerdir or workdir\n"); + if (!ufs->config.lowerdir) { + pr_err("overlayfs: missing 'lowerdir'\n"); goto out_free_config; } - err = -ENOMEM; - oe = ovl_alloc_entry(); - if (oe == NULL) - goto out_free_config; - - err = ovl_mount_dir(ufs->config.upperdir, &upperpath); - if (err) - goto out_free_oe; + 
sb->s_stack_depth = 0; + if (ufs->config.upperdir) { + if (!ufs->config.workdir) { + pr_err("overlayfs: missing 'workdir'\n"); + goto out_free_config; + } - err = ovl_mount_dir(ufs->config.lowerdir, &lowerpath); - if (err) - goto out_put_upperpath; + err = ovl_mount_dir(ufs->config.upperdir, &upperpath); + if (err) + goto out_free_config; - err = ovl_mount_dir(ufs->config.workdir, &workpath); - if (err) - goto out_put_lowerpath; + /* Upper fs should not be r/o */ + if (upperpath.mnt->mnt_sb->s_flags & MS_RDONLY) { + pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n"); + err = -EINVAL; + goto out_put_upperpath; + } - err = -EINVAL; - if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) || - !S_ISDIR(lowerpath.dentry->d_inode->i_mode) || - !S_ISDIR(workpath.dentry->d_inode->i_mode)) { - pr_err("overlayfs: upperdir or lowerdir or workdir not a directory\n"); - goto out_put_workpath; - } + err = ovl_mount_dir(ufs->config.workdir, &workpath); + if (err) + goto out_put_upperpath; - if (upperpath.mnt != workpath.mnt) { - pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); - goto out_put_workpath; + err = -EINVAL; + if (upperpath.mnt != workpath.mnt) { + pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); + goto out_put_workpath; + } + if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) { + pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); + goto out_put_workpath; + } + sb->s_stack_depth = upperpath.mnt->mnt_sb->s_stack_depth; } - if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) { - pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); + err = -ENOMEM; + lowertmp = kstrdup(ufs->config.lowerdir, GFP_KERNEL); + if (!lowertmp) goto out_put_workpath; - } - if (!ovl_is_allowed_fs_type(upperpath.dentry)) { - pr_err("overlayfs: filesystem of upperdir is not supported\n"); - goto out_put_workpath; + err = -EINVAL; + stacklen = ovl_split_lowerdirs(lowertmp); + if (stacklen > OVL_MAX_STACK) { + pr_err("overlayfs: too many lower directries, limit is %d\n", + OVL_MAX_STACK); + goto out_free_lowertmp; + } else if (!ufs->config.upperdir && stacklen == 1) { + pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n"); + goto out_free_lowertmp; } - if (!ovl_is_allowed_fs_type(lowerpath.dentry)) { - pr_err("overlayfs: filesystem of lowerdir is not supported\n"); - goto out_put_workpath; - } + stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL); + if (!stack) + goto out_free_lowertmp; - err = vfs_statfs(&lowerpath, &statfs); - if (err) { - pr_err("overlayfs: statfs failed on lowerpath\n"); - goto out_put_workpath; - } - ufs->lower_namelen = statfs.f_namelen; + lower = lowertmp; + for (numlower = 0; numlower < stacklen; numlower++) { + err = ovl_lower_dir(lower, &stack[numlower], + &ufs->lower_namelen, &sb->s_stack_depth); + if (err) + goto out_put_lowerpath; - sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth, - lowerpath.mnt->mnt_sb->s_stack_depth) + 1; + lower = strchr(lower, '\0') + 1; + } err = -EINVAL; + sb->s_stack_depth++; if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { pr_err("overlayfs: maximum fs stacking depth exceeded\n"); - goto out_put_workpath; + goto out_put_lowerpath; } - ufs->upper_mnt = clone_private_mount(&upperpath); - err = PTR_ERR(ufs->upper_mnt); - if (IS_ERR(ufs->upper_mnt)) { - pr_err("overlayfs: failed to clone upperpath\n"); - goto out_put_workpath; - } + if (ufs->config.upperdir) { + ufs->upper_mnt = 
clone_private_mount(&upperpath); + err = PTR_ERR(ufs->upper_mnt); + if (IS_ERR(ufs->upper_mnt)) { + pr_err("overlayfs: failed to clone upperpath\n"); + goto out_put_lowerpath; + } - ufs->lower_mnt = clone_private_mount(&lowerpath); - err = PTR_ERR(ufs->lower_mnt); - if (IS_ERR(ufs->lower_mnt)) { - pr_err("overlayfs: failed to clone lowerpath\n"); - goto out_put_upper_mnt; + ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry); + err = PTR_ERR(ufs->workdir); + if (IS_ERR(ufs->workdir)) { + pr_err("overlayfs: failed to create directory %s/%s\n", + ufs->config.workdir, OVL_WORKDIR_NAME); + goto out_put_upper_mnt; + } } - ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry); - err = PTR_ERR(ufs->workdir); - if (IS_ERR(ufs->workdir)) { - pr_err("overlayfs: failed to create directory %s/%s\n", - ufs->config.workdir, OVL_WORKDIR_NAME); - goto out_put_lower_mnt; - } + err = -ENOMEM; + ufs->lower_mnt = kcalloc(numlower, sizeof(struct vfsmount *), GFP_KERNEL); + if (ufs->lower_mnt == NULL) + goto out_put_workdir; + for (i = 0; i < numlower; i++) { + struct vfsmount *mnt = clone_private_mount(&stack[i]); - /* - * Make lower_mnt R/O. That way fchmod/fchown on lower file - * will fail instead of modifying lower fs. - */ - ufs->lower_mnt->mnt_flags |= MNT_READONLY; + err = PTR_ERR(mnt); + if (IS_ERR(mnt)) { + pr_err("overlayfs: failed to clone lowerpath\n"); + goto out_put_lower_mnt; + } + /* + * Make lower_mnt R/O. That way fchmod/fchown on lower file + * will fail instead of modifying lower fs. + */ + mnt->mnt_flags |= MNT_READONLY; + + ufs->lower_mnt[ufs->numlower] = mnt; + ufs->numlower++; + } - /* If the upper fs is r/o, we mark overlayfs r/o too */ - if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY) + /* If the upper fs is nonexistent, we mark overlayfs r/o too */ + if (!ufs->upper_mnt) sb->s_flags |= MS_RDONLY; sb->s_d_op = &ovl_dentry_operations; err = -ENOMEM; - root_inode = ovl_new_inode(sb, S_IFDIR, oe); - if (!root_inode) - goto out_put_workdir; + oe = ovl_alloc_entry(numlower); + if (!oe) + goto out_put_lower_mnt; - root_dentry = d_make_root(root_inode); + root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, oe)); if (!root_dentry) - goto out_put_workdir; + goto out_free_oe; mntput(upperpath.mnt); - mntput(lowerpath.mnt); + for (i = 0; i < numlower; i++) + mntput(stack[i].mnt); path_put(&workpath); + kfree(lowertmp); oe->__upperdentry = upperpath.dentry; - oe->lowerdentry = lowerpath.dentry; + for (i = 0; i < numlower; i++) { + oe->lowerstack[i].dentry = stack[i].dentry; + oe->lowerstack[i].mnt = ufs->lower_mnt[i]; + } root_dentry->d_fsdata = oe; @@ -782,20 +989,26 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) return 0; +out_free_oe: + kfree(oe); +out_put_lower_mnt: + for (i = 0; i < ufs->numlower; i++) + mntput(ufs->lower_mnt[i]); + kfree(ufs->lower_mnt); out_put_workdir: dput(ufs->workdir); -out_put_lower_mnt: - mntput(ufs->lower_mnt); out_put_upper_mnt: mntput(ufs->upper_mnt); +out_put_lowerpath: + for (i = 0; i < numlower; i++) + path_put(&stack[i]); + kfree(stack); +out_free_lowertmp: + kfree(lowertmp); out_put_workpath: path_put(&workpath); -out_put_lowerpath: - path_put(&lowerpath); out_put_upperpath: path_put(&upperpath); -out_free_oe: - kfree(oe); out_free_config: kfree(ufs->config.lowerdir); kfree(ufs->config.upperdir); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 0855f772cd41..3a48bb789c9f 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -564,13 +564,11 @@ posix_acl_create(struct inode *dir, umode_t *mode, *acl = 
posix_acl_clone(p, GFP_NOFS); if (!*acl) - return -ENOMEM; + goto no_mem; ret = posix_acl_create_masq(*acl, mode); - if (ret < 0) { - posix_acl_release(*acl); - return -ENOMEM; - } + if (ret < 0) + goto no_mem_clone; if (ret == 0) { posix_acl_release(*acl); @@ -591,6 +589,12 @@ no_acl: *default_acl = NULL; *acl = NULL; return 0; + +no_mem_clone: + posix_acl_release(*acl); +no_mem: + posix_acl_release(p); + return -ENOMEM; } EXPORT_SYMBOL_GPL(posix_acl_create); @@ -772,7 +776,7 @@ posix_acl_xattr_get(struct dentry *dentry, const char *name, if (!IS_POSIXACL(dentry->d_inode)) return -EOPNOTSUPP; - if (S_ISLNK(dentry->d_inode->i_mode)) + if (d_is_symlink(dentry)) return -EOPNOTSUPP; acl = get_acl(dentry->d_inode, type); @@ -832,7 +836,7 @@ posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size, if (!IS_POSIXACL(dentry->d_inode)) return -EOPNOTSUPP; - if (S_ISLNK(dentry->d_inode->i_mode)) + if (d_is_symlink(dentry)) return -EOPNOTSUPP; if (type == ACL_TYPE_ACCESS) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 3309f59d421b..be65b2082135 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -19,7 +19,6 @@ #include <linux/mount.h> #include <linux/init.h> #include <linux/idr.h> -#include <linux/namei.h> #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/completion.h> @@ -223,17 +222,6 @@ void proc_free_inum(unsigned int inum) spin_unlock_irqrestore(&proc_inum_lock, flags); } -static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - nd_set_link(nd, __PDE_DATA(dentry->d_inode)); - return NULL; -} - -static const struct inode_operations proc_link_inode_operations = { - .readlink = generic_readlink, - .follow_link = proc_follow_link, -}; - /* * Don't create negative dentries here, return -ENOENT by hand * instead. 
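The reworked error handling above makes sure both the clone (*acl) and the ACL it was cloned from (p) are released before returning -ENOMEM. A minimal standalone sketch of the same reverse-order unwind idiom, with invented names:

#include <stdlib.h>

static int setup_pair(char **a, char **b)
{
	char *first = malloc(64);
	char *second;

	if (!first)
		goto no_mem;

	second = malloc(64);
	if (!second)
		goto no_mem_first;

	*a = first;
	*b = second;
	return 0;

no_mem_first:
	free(first);	/* release in reverse order of acquisition */
no_mem:
	return -1;
}

int main(void)
{
	char *a, *b;

	if (setup_pair(&a, &b) == 0) {
		free(b);
		free(a);
	}
	return 0;
}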
diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 13a50a32652d..7697b6621cfd 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -23,6 +23,7 @@ #include <linux/slab.h> #include <linux/mount.h> #include <linux/magic.h> +#include <linux/namei.h> #include <asm/uaccess.h> @@ -393,6 +394,26 @@ static const struct file_operations proc_reg_file_ops_no_compat = { }; #endif +static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct proc_dir_entry *pde = PDE(dentry->d_inode); + if (unlikely(!use_pde(pde))) + return ERR_PTR(-EINVAL); + nd_set_link(nd, pde->data); + return pde; +} + +static void proc_put_link(struct dentry *dentry, struct nameidata *nd, void *p) +{ + unuse_pde(p); +} + +const struct inode_operations proc_link_inode_operations = { + .readlink = generic_readlink, + .follow_link = proc_follow_link, + .put_link = proc_put_link, +}; + struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) { struct inode *inode = new_inode_pseudo(sb); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 6fcdba573e0f..c835b94c0cd3 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -200,6 +200,7 @@ struct pde_opener { int closing; struct completion *c; }; +extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 956b75d61809..6dee68d013ff 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1325,6 +1325,9 @@ out: static int pagemap_open(struct inode *inode, struct file *file) { + /* do not disclose physical addresses: attack vector */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " "to stop being page-shift some time soon. See the " "linux/Documentation/vm/pagemap.txt for details.\n"); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 04b06146bae2..4e781e697c90 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -266,7 +266,7 @@ static int reiserfs_for_each_xattr(struct inode *inode, for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) { struct dentry *dentry = buf.dentries[i]; - if (!S_ISDIR(dentry->d_inode->i_mode)) + if (!d_is_dir(dentry)) err = action(dentry, data); dput(dentry); @@ -322,7 +322,7 @@ static int delete_one_xattr(struct dentry *dentry, void *data) struct inode *dir = dentry->d_parent->d_inode; /* This is the xattr dir, handle specially. */ - if (S_ISDIR(dentry->d_inode->i_mode)) + if (d_is_dir(dentry)) return xattr_rmdir(dir, dentry); return xattr_unlink(dir, dentry); diff --git a/fs/super.c b/fs/super.c index 65a53efc1cf4..2b7dc90ccdbb 100644 --- a/fs/super.c +++ b/fs/super.c @@ -71,7 +71,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink, if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; - if (!grab_super_passive(sb)) + if (!trylock_super(sb)) return SHRINK_STOP; if (sb->s_op->nr_cached_objects) @@ -105,7 +105,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink, freed += sb->s_op->free_cached_objects(sb, sc); } - drop_super(sb); + up_read(&sb->s_umount); return freed; } @@ -118,7 +118,7 @@ static unsigned long super_cache_count(struct shrinker *shrink, sb = container_of(shrink, struct super_block, s_shrink); /* - * Don't call grab_super_passive as it is a potential + * Don't call trylock_super as it is a potential * scalability bottleneck. The counts could get updated * between super_cache_count and super_cache_scan anyway. 
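The pagemap_open() change above turns physical-address disclosure into a privileged operation; a small check of the visible effect (on a kernel carrying this change, the open is expected to fail with EPERM for an unprivileged process):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0) {
		printf("open failed: %s\n", strerror(errno));	/* EPERM without CAP_SYS_ADMIN */
		return 0;
	}
	printf("open succeeded, CAP_SYS_ADMIN is present\n");
	close(fd);
	return 0;
}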
* Call to super_cache_count with shrinker_rwsem held @@ -348,35 +348,31 @@ static int grab_super(struct super_block *s) __releases(sb_lock) } /* - * grab_super_passive - acquire a passive reference + * trylock_super - try to grab ->s_umount shared * @sb: reference we are trying to grab * - * Tries to acquire a passive reference. This is used in places where we + * Try to prevent fs shutdown. This is used in places where we * cannot take an active reference but we need to ensure that the - * superblock does not go away while we are working on it. It returns - * false if a reference was not gained, and returns true with the s_umount - * lock held in read mode if a reference is gained. On successful return, - * the caller must drop the s_umount lock and the passive reference when - * done. + * filesystem is not shut down while we are working on it. It returns + * false if we cannot acquire s_umount or if we lose the race and + * filesystem already got into shutdown, and returns true with the s_umount + * lock held in read mode in case of success. On successful return, + * the caller must drop the s_umount lock when done. + * + * Note that unlike get_super() et.al. this one does *not* bump ->s_count. + * The reason why it's safe is that we are OK with doing trylock instead + * of down_read(). There's a couple of places that are OK with that, but + * it's very much not a general-purpose interface. */ -bool grab_super_passive(struct super_block *sb) +bool trylock_super(struct super_block *sb) { - spin_lock(&sb_lock); - if (hlist_unhashed(&sb->s_instances)) { - spin_unlock(&sb_lock); - return false; - } - - sb->s_count++; - spin_unlock(&sb_lock); - if (down_read_trylock(&sb->s_umount)) { - if (sb->s_root && (sb->s_flags & MS_BORN)) + if (!hlist_unhashed(&sb->s_instances) && + sb->s_root && (sb->s_flags & MS_BORN)) return true; up_read(&sb->s_umount); } - put_super(sb); return false; } diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index d61799949580..df6828570e87 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -121,3 +121,4 @@ xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_PROC_FS) += xfs_stats.o xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o +xfs-$(CONFIG_NFSD_PNFS) += xfs_pnfs.o diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 5eb4a14e0a0f..b97359ba2648 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -30,6 +30,7 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_log.h" +#include "xfs_pnfs.h" /* * Note that we only accept fileids which are long enough rather than allow @@ -245,4 +246,9 @@ const struct export_operations xfs_export_operations = { .fh_to_parent = xfs_fs_fh_to_parent, .get_parent = xfs_fs_get_parent, .commit_metadata = xfs_fs_nfs_commit_metadata, +#ifdef CONFIG_NFSD_PNFS + .get_uuid = xfs_fs_get_uuid, + .map_blocks = xfs_fs_map_blocks, + .commit_blocks = xfs_fs_commit_blocks, +#endif }; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1cdba95c78cb..a2e1cb8a568b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -36,6 +36,7 @@ #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_icache.h" +#include "xfs_pnfs.h" #include <linux/aio.h> #include <linux/dcache.h> @@ -396,7 +397,8 @@ STATIC int /* error (positive) */ xfs_zero_last_block( struct xfs_inode *ip, xfs_fsize_t offset, - xfs_fsize_t isize) + xfs_fsize_t isize, + bool *did_zeroing) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize); @@ -424,6 +426,7 @@ xfs_zero_last_block( zero_len = 
mp->m_sb.sb_blocksize - zero_offset; if (isize + zero_len > offset) zero_len = offset - isize; + *did_zeroing = true; return xfs_iozero(ip, isize, zero_len); } @@ -442,7 +445,8 @@ int /* error (positive) */ xfs_zero_eof( struct xfs_inode *ip, xfs_off_t offset, /* starting I/O offset */ - xfs_fsize_t isize) /* current inode size */ + xfs_fsize_t isize, /* current inode size */ + bool *did_zeroing) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t start_zero_fsb; @@ -464,7 +468,7 @@ xfs_zero_eof( * We only zero a part of that block so it is handled specially. */ if (XFS_B_FSB_OFFSET(mp, isize) != 0) { - error = xfs_zero_last_block(ip, offset, isize); + error = xfs_zero_last_block(ip, offset, isize, did_zeroing); if (error) return error; } @@ -524,6 +528,7 @@ xfs_zero_eof( if (error) return error; + *did_zeroing = true; start_zero_fsb = imap.br_startoff + imap.br_blockcount; ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); } @@ -554,6 +559,10 @@ restart: if (error) return error; + error = xfs_break_layouts(inode, iolock); + if (error) + return error; + /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this @@ -562,13 +571,15 @@ restart: * having to redo all checks before. */ if (*pos > i_size_read(inode)) { + bool zero = false; + if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, *iolock); goto restart; } - error = xfs_zero_eof(ip, *pos, i_size_read(inode)); + error = xfs_zero_eof(ip, *pos, i_size_read(inode), &zero); if (error) return error; } @@ -822,6 +833,7 @@ xfs_file_fallocate( struct xfs_inode *ip = XFS_I(inode); long error; enum xfs_prealloc_flags flags = 0; + uint iolock = XFS_IOLOCK_EXCL; loff_t new_size = 0; if (!S_ISREG(inode->i_mode)) @@ -830,7 +842,11 @@ xfs_file_fallocate( FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; - xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_ilock(ip, iolock); + error = xfs_break_layouts(inode, &iolock); + if (error) + goto out_unlock; + if (mode & FALLOC_FL_PUNCH_HOLE) { error = xfs_free_file_space(ip, offset, len); if (error) @@ -894,7 +910,7 @@ xfs_file_fallocate( } out_unlock: - xfs_iunlock(ip, XFS_IOLOCK_EXCL); + xfs_iunlock(ip, iolock); return error; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index fba6532efba4..74efe5b760dc 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -602,6 +602,12 @@ xfs_growfs_data( if (!mutex_trylock(&mp->m_growlock)) return -EWOULDBLOCK; error = xfs_growfs_data_private(mp, in); + /* + * Increment the generation unconditionally, the error could be from + * updating the secondary superblocks, in which case the new size + * is live already. 
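The fallocate path above now breaks pNFS layouts before punching holes or otherwise removing blocks. As a reminder of the userspace side of that operation, a short hole-punching example (the temporary file name is arbitrary):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char tmpl[] = "/tmp/punch-XXXXXX";
	int fd = mkstemp(tmpl);
	char buf[8192], got = 1;

	if (fd < 0)
		return 1;
	memset(buf, 0xaa, sizeof(buf));
	pwrite(fd, buf, sizeof(buf), 0);

	/* KEEP_SIZE is required together with PUNCH_HOLE; file length stays 8192 */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, 4096))
		perror("fallocate");

	pread(fd, &got, 1, 512);
	printf("byte inside the hole: %d\n", got);	/* expected: 0 */
	unlink(tmpl);
	close(fd);
	return 0;
}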
+ */ + mp->m_generation++; mutex_unlock(&mp->m_growlock); return error; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index daafa1f6d260..6163767aa856 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2867,6 +2867,10 @@ xfs_rename( * Handle RENAME_EXCHANGE flags */ if (flags & RENAME_EXCHANGE) { + if (target_ip == NULL) { + error = -EINVAL; + goto error_return; + } error = xfs_cross_rename(tp, src_dp, src_name, src_ip, target_dp, target_name, target_ip, &free_list, &first_block, spaceres); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 86cd6b39bed7..a1cd55f3f351 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -384,10 +384,11 @@ enum xfs_prealloc_flags { XFS_PREALLOC_INVISIBLE = (1 << 4), }; -int xfs_update_prealloc_flags(struct xfs_inode *, - enum xfs_prealloc_flags); -int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); -int xfs_iozero(struct xfs_inode *, loff_t, size_t); +int xfs_update_prealloc_flags(struct xfs_inode *ip, + enum xfs_prealloc_flags flags); +int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, + xfs_fsize_t isize, bool *did_zeroing); +int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); #define IHOLD(ip) \ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index f7afb86c9148..ac4feae45eb3 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -39,6 +39,7 @@ #include "xfs_icache.h" #include "xfs_symlink.h" #include "xfs_trans.h" +#include "xfs_pnfs.h" #include <linux/capability.h> #include <linux/dcache.h> @@ -286,7 +287,7 @@ xfs_readlink_by_handle( return PTR_ERR(dentry); /* Restrict this handle operation to symlinks only. */ - if (!S_ISLNK(dentry->d_inode->i_mode)) { + if (!d_is_symlink(dentry)) { error = -EINVAL; goto out_dput; } @@ -608,6 +609,7 @@ xfs_ioc_space( { struct iattr iattr; enum xfs_prealloc_flags flags = 0; + uint iolock = XFS_IOLOCK_EXCL; int error; /* @@ -636,7 +638,10 @@ xfs_ioc_space( if (error) return error; - xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_ilock(ip, iolock); + error = xfs_break_layouts(inode, &iolock); + if (error) + goto out_unlock; switch (bf->l_whence) { case 0: /*SEEK_SET*/ @@ -725,7 +730,7 @@ xfs_ioc_space( error = xfs_update_prealloc_flags(ip, flags); out_unlock: - xfs_iunlock(ip, XFS_IOLOCK_EXCL); + xfs_iunlock(ip, iolock); mnt_drop_write_file(filp); return error; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ce80eeb8faa4..e53a90331422 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -37,6 +37,7 @@ #include "xfs_da_btree.h" #include "xfs_dir2.h" #include "xfs_trans_space.h" +#include "xfs_pnfs.h" #include <linux/capability.h> #include <linux/xattr.h> @@ -505,7 +506,7 @@ xfs_setattr_mode( inode->i_mode |= mode & ~S_IFMT; } -static void +void xfs_setattr_time( struct xfs_inode *ip, struct iattr *iattr) @@ -750,6 +751,7 @@ xfs_setattr_size( int error; uint lock_flags = 0; uint commit_flags = 0; + bool did_zeroing = false; trace_xfs_setattr(ip); @@ -793,20 +795,16 @@ xfs_setattr_size( return error; /* - * Now we can make the changes. Before we join the inode to the - * transaction, take care of the part of the truncation that must be - * done without the inode lock. This needs to be done before joining - * the inode to the transaction, because the inode cannot be unlocked - * once it is a part of the transaction. + * File data changes must be complete before we start the transaction to + * modify the inode. 
This needs to be done before joining the inode to + * the transaction because the inode cannot be unlocked once it is a + * part of the transaction. + * + * Start with zeroing any data block beyond EOF that we may expose on + * file extension. */ if (newsize > oldsize) { - /* - * Do the first part of growing a file: zero any data in the - * last block that is beyond the old EOF. We need to do this - * before the inode is joined to the transaction to modify - * i_size. - */ - error = xfs_zero_eof(ip, newsize, oldsize); + error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); if (error) return error; } @@ -816,23 +814,18 @@ xfs_setattr_size( * any previous writes that are beyond the on disk EOF and the new * EOF that have not been written out need to be written here. If we * do not write the data out, we expose ourselves to the null files - * problem. - * - * Only flush from the on disk size to the smaller of the in memory - * file size or the new size as that's the range we really care about - * here and prevents waiting for other data not within the range we - * care about here. + * problem. Note that this includes any block zeroing we did above; + * otherwise those blocks may not be zeroed after a crash. */ - if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { + if (newsize > ip->i_d.di_size && + (oldsize != ip->i_d.di_size || did_zeroing)) { error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ip->i_d.di_size, newsize); if (error) return error; } - /* - * Wait for all direct I/O to complete. - */ + /* Now wait for all direct I/O to complete. */ inode_dio_wait(inode); /* @@ -979,9 +972,13 @@ xfs_vn_setattr( int error; if (iattr->ia_valid & ATTR_SIZE) { - xfs_ilock(ip, XFS_IOLOCK_EXCL); - error = xfs_setattr_size(ip, iattr); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); + uint iolock = XFS_IOLOCK_EXCL; + + xfs_ilock(ip, iolock); + error = xfs_break_layouts(dentry->d_inode, &iolock); + if (!error) + error = xfs_setattr_size(ip, iattr); + xfs_iunlock(ip, iolock); } else { error = xfs_setattr_nonsize(ip, iattr, 0); } diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index 1c34e4335920..ea7a98e9cb70 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -32,6 +32,7 @@ extern void xfs_setup_inode(struct xfs_inode *); */ #define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */ +extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr); extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags); extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a5b2ff822653..0d8abd6364d9 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -174,6 +174,17 @@ typedef struct xfs_mount { struct workqueue_struct *m_reclaim_workqueue; struct workqueue_struct *m_log_workqueue; struct workqueue_struct *m_eofblocks_workqueue; + + /* + * Generation of the filesysyem layout. This is incremented by each + * growfs, and used by the pNFS server to ensure the client updates + * its view of the block device once it gets a layout that might + * reference the newly added blocks. Does not need to be persistent + * as long as we only allow file system size increments, but if we + * ever support shrinks it would have to be persisted in addition + * to various other kinds of pain inflicted on the pNFS server. 
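The xfs_setattr_size() rework above is about making sure the zeroing done while growing a file actually reaches disk before the new size is committed. The user-visible guarantee being defended is simply that the range between the old EOF and the new size reads back as zeroes, e.g.:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	char tmpl[] = "/tmp/grow-XXXXXX";
	int fd = mkstemp(tmpl);
	char byte = 0x55, got = 1;

	if (fd < 0)
		return 1;
	pwrite(fd, &byte, 1, 0);	/* old EOF is at offset 1 */
	ftruncate(fd, 1 << 20);		/* grow the file well past it */
	pread(fd, &got, 1, 4096);	/* read somewhere in the newly exposed range */
	printf("byte beyond the old EOF: %d\n", got);	/* expected: 0 */
	unlink(tmpl);
	close(fd);
	return 0;
}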
+ */ + __uint32_t m_generation; } xfs_mount_t; /* diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c new file mode 100644 index 000000000000..365dd57ea760 --- /dev/null +++ b/fs/xfs/xfs_pnfs.c @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_log.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_error.h" +#include "xfs_iomap.h" +#include "xfs_shared.h" +#include "xfs_bit.h" +#include "xfs_pnfs.h" + +/* + * Ensure that we do not have any outstanding pNFS layouts that can be used by + * clients to directly read from or write to this inode. This must be called + * before every operation that can remove blocks from the extent map. + * Additionally we call it during the write operation, where aren't concerned + * about exposing unallocated blocks but just want to provide basic + * synchronization between a local writer and pNFS clients. mmap writes would + * also benefit from this sort of synchronization, but due to the tricky locking + * rules in the page fault path we don't bother. + */ +int +xfs_break_layouts( + struct inode *inode, + uint *iolock) +{ + struct xfs_inode *ip = XFS_I(inode); + int error; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)); + + while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { + xfs_iunlock(ip, *iolock); + error = break_layout(inode, true); + *iolock = XFS_IOLOCK_EXCL; + xfs_ilock(ip, *iolock); + } + + return error; +} + +/* + * Get a unique ID including its location so that the client can identify + * the exported device. + */ +int +xfs_fs_get_uuid( + struct super_block *sb, + u8 *buf, + u32 *len, + u64 *offset) +{ + struct xfs_mount *mp = XFS_M(sb); + + printk_once(KERN_NOTICE +"XFS (%s): using experimental pNFS feature, use at your own risk!\n", + mp->m_fsname); + + if (*len < sizeof(uuid_t)) + return -EINVAL; + + memcpy(buf, &mp->m_sb.sb_uuid, sizeof(uuid_t)); + *len = sizeof(uuid_t); + *offset = offsetof(struct xfs_dsb, sb_uuid); + return 0; +} + +static void +xfs_bmbt_to_iomap( + struct xfs_inode *ip, + struct iomap *iomap, + struct xfs_bmbt_irec *imap) +{ + struct xfs_mount *mp = ip->i_mount; + + if (imap->br_startblock == HOLESTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_HOLE; + } else if (imap->br_startblock == DELAYSTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_DELALLOC; + } else { + iomap->blkno = + XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock); + if (imap->br_state == XFS_EXT_UNWRITTEN) + iomap->type = IOMAP_UNWRITTEN; + else + iomap->type = IOMAP_MAPPED; + } + iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); + iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); +} + +/* + * Get a layout for the pNFS client. + */ +int +xfs_fs_map_blocks( + struct inode *inode, + loff_t offset, + u64 length, + struct iomap *iomap, + bool write, + u32 *device_generation) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + xfs_fileoff_t offset_fsb, end_fsb; + loff_t limit; + int bmapi_flags = XFS_BMAPI_ENTIRE; + int nimaps = 1; + uint lock_flags; + int error = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* + * We can't export inodes residing on the realtime device. The realtime + * device doesn't have a UUID to identify it, so the client has no way + * to find it. 
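xfs_break_layouts() above encodes a common retry shape: attempt a non-blocking recall while holding the I/O lock, and if that would block, drop the lock, wait, then come back with the exclusive lock and re-check. A standalone sketch of the same shape, with all names invented and a pthread rwlock standing in for the XFS iolock:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t iolock = PTHREAD_RWLOCK_INITIALIZER;
static int outstanding_layouts = 1;		/* stand-in for pNFS state */

static bool try_break_layouts(void)		/* non-blocking attempt */
{
	return outstanding_layouts == 0;
}

static void wait_for_layout_return(void)	/* blocking wait, simplified */
{
	outstanding_layouts = 0;
}

static void break_layouts(bool *excl)
{
	while (!try_break_layouts()) {
		pthread_rwlock_unlock(&iolock);	/* cannot sleep under the lock */
		wait_for_layout_return();
		pthread_rwlock_wrlock(&iolock);	/* come back with the exclusive lock */
		*excl = true;
	}
}

int main(void)
{
	bool excl = false;

	pthread_rwlock_rdlock(&iolock);
	break_layouts(&excl);
	printf("layouts broken, exclusive=%d\n", excl);
	pthread_rwlock_unlock(&iolock);
	return 0;
}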
+ */ + if (XFS_IS_REALTIME_INODE(ip)) + return -ENXIO; + + /* + * Lock out any other I/O before we flush and invalidate the pagecache, + * and then hand out a layout to the remote system. This is very + * similar to direct I/O, except that the synchronization is much more + * complicated. See the comment near xfs_break_layouts for a detailed + * explanation. + */ + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + error = -EINVAL; + limit = mp->m_super->s_maxbytes; + if (!write) + limit = max(limit, round_up(i_size_read(inode), + inode->i_sb->s_blocksize)); + if (offset > limit) + goto out_unlock; + if (offset > limit - length) + length = limit - offset; + + error = filemap_write_and_wait(inode->i_mapping); + if (error) + goto out_unlock; + error = invalidate_inode_pages2(inode->i_mapping); + if (WARN_ON_ONCE(error)) + return error; + + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + + lock_flags = xfs_ilock_data_map_shared(ip); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, bmapi_flags); + xfs_iunlock(ip, lock_flags); + + if (error) + goto out_unlock; + + if (write) { + enum xfs_prealloc_flags flags = 0; + + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + + if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) { + error = xfs_iomap_write_direct(ip, offset, length, + &imap, nimaps); + if (error) + goto out_unlock; + + /* + * Ensure the next transaction is committed + * synchronously so that the blocks allocated and + * handed out to the client are guaranteed to be + * present even after a server crash. + */ + flags |= XFS_PREALLOC_SET | XFS_PREALLOC_SYNC; + } + + error = xfs_update_prealloc_flags(ip, flags); + if (error) + goto out_unlock; + } + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + + xfs_bmbt_to_iomap(ip, iomap, &imap); + *device_generation = mp->m_generation; + return error; +out_unlock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} + +/* + * Ensure the size update falls into a valid allocated block. + */ +static int +xfs_pnfs_validate_isize( + struct xfs_inode *ip, + xfs_off_t isize) +{ + struct xfs_bmbt_irec imap; + int nimaps = 1; + int error = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + error = xfs_bmapi_read(ip, XFS_B_TO_FSBT(ip->i_mount, isize - 1), 1, + &imap, &nimaps, 0); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (error) + return error; + + if (imap.br_startblock == HOLESTARTBLOCK || + imap.br_startblock == DELAYSTARTBLOCK || + imap.br_state == XFS_EXT_UNWRITTEN) + return -EIO; + return 0; +} + +/* + * Make sure the blocks described by maps are stable on disk. This includes + * converting any unwritten extents, flushing the disk cache and updating the + * time stamps. + * + * Note that we rely on the caller to always send us a timestamp update so that + * we always commit a transaction here. If that stops being true we will have + * to manually flush the cache here similar to what the fsync code path does + * for datasyncs on files that have no dirty metadata. 
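The offset/length handling in xfs_fs_map_blocks() above clamps the client's request against s_maxbytes (and, for reads, the rounded-up file size) before mapping anything. A standalone sketch of that clamping, with invented names:

#include <inttypes.h>
#include <stdio.h>

static int clamp_request(int64_t offset, int64_t *length, int64_t limit)
{
	if (offset > limit)
		return -1;			/* nothing mappable at all */
	if (offset > limit - *length)		/* i.e. offset + length > limit */
		*length = limit - offset;
	return 0;
}

int main(void)
{
	int64_t len = 4096;

	if (clamp_request(1 << 20, &len, (1 << 20) + 512) == 0)
		printf("clamped length: %" PRId64 "\n", len);	/* prints 512 */
	return 0;
}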
+ */ +int +xfs_fs_commit_blocks( + struct inode *inode, + struct iomap *maps, + int nr_maps, + struct iattr *iattr) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + bool update_isize = false; + int error, i; + loff_t size; + + ASSERT(iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)); + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + size = i_size_read(inode); + if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size > size) { + update_isize = true; + size = iattr->ia_size; + } + + for (i = 0; i < nr_maps; i++) { + u64 start, length, end; + + start = maps[i].offset; + if (start > size) + continue; + + end = start + maps[i].length; + if (end > size) + end = size; + + length = end - start; + if (!length) + continue; + + /* + * Make sure reads through the pagecache see the new data. + */ + error = invalidate_inode_pages2_range(inode->i_mapping, + start >> PAGE_CACHE_SHIFT, + (end - 1) >> PAGE_CACHE_SHIFT); + WARN_ON_ONCE(error); + + error = xfs_iomap_write_unwritten(ip, start, length); + if (error) + goto out_drop_iolock; + } + + if (update_isize) { + error = xfs_pnfs_validate_isize(ip, size); + if (error) + goto out_drop_iolock; + } + + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_drop_iolock; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + xfs_setattr_time(ip, iattr); + if (update_isize) { + i_size_write(inode, iattr->ia_size); + ip->i_d.di_size = iattr->ia_size; + } + + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + +out_drop_iolock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h new file mode 100644 index 000000000000..b7fbfce660f6 --- /dev/null +++ b/fs/xfs/xfs_pnfs.h @@ -0,0 +1,18 @@ +#ifndef _XFS_PNFS_H +#define _XFS_PNFS_H 1 + +#ifdef CONFIG_NFSD_PNFS +int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset); +int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, + struct iomap *iomap, bool write, u32 *device_generation); +int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, + struct iattr *iattr); + +int xfs_break_layouts(struct inode *inode, uint *iolock); +#else +static inline int xfs_break_layouts(struct inode *inode, uint *iolock) +{ + return 0; +} +#endif /* CONFIG_NFSD_PNFS */ +#endif /* _XFS_PNFS_H */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 53cc2aaf8d2b..fbbb9e62e274 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -836,6 +836,11 @@ xfs_qm_reset_dqcounts( */ xfs_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR, "xfs_quotacheck"); + /* + * Reset type in case we are reusing group quota file for + * project quotas or vice versa + */ + ddq->d_flags = type; ddq->d_bcount = 0; ddq->d_icount = 0; ddq->d_rtbcount = 0; |