Diffstat (limited to 'fs')
53 files changed, 823 insertions, 365 deletions
diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 4dd97afa536c..5219182e52e1 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1358,6 +1358,7 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, op->dentry = dentry; op->create.mode = S_IFDIR | mode; op->create.reason = afs_edit_dir_for_mkdir; + op->mtime = current_time(dir); op->ops = &afs_mkdir_operation; return afs_do_sync_operation(op); } @@ -1661,6 +1662,7 @@ static int afs_create(struct mnt_idmap *idmap, struct inode *dir, op->dentry = dentry; op->create.mode = S_IFREG | mode; op->create.reason = afs_edit_dir_for_create; + op->mtime = current_time(dir); op->ops = &afs_create_operation; return afs_do_sync_operation(op); @@ -1796,6 +1798,7 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, op->ops = &afs_symlink_operation; op->create.reason = afs_edit_dir_for_symlink; op->create.symlink = content; + op->mtime = current_time(dir); return afs_do_sync_operation(op); error: diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2b1b227505f3..88e6d1072a35 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -242,7 +242,6 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { struct btrfs_fs_info *fs_info = eb->fs_info; - u64 start = eb->start; int i, num_pages = num_extent_pages(eb); int ret = 0; @@ -251,12 +250,14 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; + u64 start = max_t(u64, eb->start, page_offset(p)); + u64 end = min_t(u64, eb->start + eb->len, page_offset(p) + PAGE_SIZE); + u32 len = end - start; - ret = btrfs_repair_io_failure(fs_info, 0, start, PAGE_SIZE, - start, p, start - page_offset(p), mirror_num); + ret = btrfs_repair_io_failure(fs_info, 0, start, len, + start, p, offset_in_page(start), mirror_num); if (ret) break; - start += PAGE_SIZE; } return ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 7c666517d3d3..50c241aba1a1 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -134,8 +134,14 @@ struct scrub_stripe { * The errors hit during the initial read of the stripe. * * Would be utilized for error reporting and repair. + * + * The remaining init_nr_* records the number of errors hit, only used + * by error reporting. */ unsigned long init_error_bitmap; + unsigned int init_nr_io_errors; + unsigned int init_nr_csum_errors; + unsigned int init_nr_meta_errors; /* * The following error bitmaps are all for the current status. 
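The btrfs_repair_eb_io_failure() hunk above stops assuming that an extent buffer begins and ends on page boundaries: for each backing page it repairs only the intersection of the buffer range with that page. A minimal userspace sketch of the same clamping arithmetic, using a made-up 16K buffer that starts 2K into a page (the geometry is illustrative, not taken from the patch):

#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_SIZE 4096ULL	/* stands in for PAGE_SIZE */

static uint64_t max64(uint64_t a, uint64_t b) { return a > b ? a : b; }
static uint64_t min64(uint64_t a, uint64_t b) { return a < b ? a : b; }

int main(void)
{
	uint64_t eb_start = 3 * EX_PAGE_SIZE + 2048;	/* hypothetical, not page aligned */
	uint64_t eb_len = 16384;
	uint64_t page = eb_start & ~(EX_PAGE_SIZE - 1);	/* first backing page */

	for (; page < eb_start + eb_len; page += EX_PAGE_SIZE) {
		/* Clamp [eb_start, eb_start + eb_len) to this page. */
		uint64_t start = max64(eb_start, page);
		uint64_t end = min64(eb_start + eb_len, page + EX_PAGE_SIZE);

		printf("page %#llx: repair %llu bytes at %#llx, offset %llu in page\n",
		       (unsigned long long)page,
		       (unsigned long long)(end - start),
		       (unsigned long long)start,
		       (unsigned long long)(start & (EX_PAGE_SIZE - 1)));
	}
	return 0;
}

The start/end/length values printed here correspond to the max_t()/min_t()/offset_in_page() computation in the hunk; the kernel version feeds them to btrfs_repair_io_failure() per page instead of assuming a full PAGE_SIZE stride.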
@@ -1003,12 +1009,9 @@ skip: sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits; sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits; sctx->stat.no_csum += nr_nodatacsum_sectors; - sctx->stat.read_errors += - bitmap_weight(&stripe->io_error_bitmap, stripe->nr_sectors); - sctx->stat.csum_errors += - bitmap_weight(&stripe->csum_error_bitmap, stripe->nr_sectors); - sctx->stat.verify_errors += - bitmap_weight(&stripe->meta_error_bitmap, stripe->nr_sectors); + sctx->stat.read_errors += stripe->init_nr_io_errors; + sctx->stat.csum_errors += stripe->init_nr_csum_errors; + sctx->stat.verify_errors += stripe->init_nr_meta_errors; sctx->stat.uncorrectable_errors += bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors); sctx->stat.corrected_errors += nr_repaired_sectors; @@ -1041,6 +1044,12 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap); /* Save the initial failed bitmap for later repair and report usage. */ stripe->init_error_bitmap = stripe->error_bitmap; + stripe->init_nr_io_errors = bitmap_weight(&stripe->io_error_bitmap, + stripe->nr_sectors); + stripe->init_nr_csum_errors = bitmap_weight(&stripe->csum_error_bitmap, + stripe->nr_sectors); + stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap, + stripe->nr_sectors); if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) goto out; @@ -1490,6 +1499,9 @@ static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) { stripe->extent_sector_bitmap = 0; stripe->init_error_bitmap = 0; + stripe->init_nr_io_errors = 0; + stripe->init_nr_csum_errors = 0; + stripe->init_nr_meta_errors = 0; stripe->error_bitmap = 0; stripe->io_error_bitmap = 0; stripe->csum_error_bitmap = 0; @@ -1730,7 +1742,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) break; } } - } else { + } else if (!sctx->readonly) { for (int i = 0; i < nr_stripes; i++) { unsigned long repaired; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ec18e2210602..efeb1a9d040a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1841,6 +1841,12 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) btrfs_clear_sb_rdonly(sb); set_bit(BTRFS_FS_OPEN, &fs_info->flags); + + /* + * If we've gone from readonly -> read/write, we need to get + * our sync/async discard lists in the right state. 
+ */ + btrfs_discard_resume(fs_info); } out: /* diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 789be30d6ee2..2321e5ddb664 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1627,6 +1627,7 @@ void ceph_flush_snaps(struct ceph_inode_info *ci, struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_mds_session *session = NULL; + bool need_put = false; int mds; dout("ceph_flush_snaps %p\n", inode); @@ -1671,8 +1672,13 @@ out: ceph_put_mds_session(session); /* we flushed them all; remove this inode from the queue */ spin_lock(&mdsc->snap_flush_lock); + if (!list_empty(&ci->i_snap_flush_item)) + need_put = true; list_del_init(&ci->i_snap_flush_item); spin_unlock(&mdsc->snap_flush_lock); + + if (need_put) + iput(inode); } /* diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 0b236ebd989f..2e73ba62bd7a 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -693,8 +693,10 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, capsnap->size); spin_lock(&mdsc->snap_flush_lock); - if (list_empty(&ci->i_snap_flush_item)) + if (list_empty(&ci->i_snap_flush_item)) { + ihold(inode); list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); + } spin_unlock(&mdsc->snap_flush_lock); return 1; /* caller may want to ceph_flush_snaps */ } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 980483455cc0..266d45c7685b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1805,7 +1805,11 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, { int ret = default_wake_function(wq_entry, mode, sync, key); - list_del_init(&wq_entry->entry); + /* + * Pairs with list_empty_careful in ep_poll, and ensures future loop + * iterations see the cause of this wakeup. + */ + list_del_init_careful(&wq_entry->entry); return ret; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 300844f50dcd..cb62c8f07d1e 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -784,9 +784,13 @@ static inline bool should_fault_in_pages(struct iov_iter *i, if (!user_backed_iter(i)) return false; + /* + * Try to fault in multiple pages initially. When that doesn't result + * in any progress, fall back to a single page. 
+ */ size = PAGE_SIZE; offs = offset_in_page(iocb->ki_pos); - if (*prev_count != count || !*window_size) { + if (*prev_count != count) { size_t nr_dirtied; nr_dirtied = max(current->nr_dirtied_pause - @@ -870,6 +874,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, struct gfs2_inode *ip = GFS2_I(inode); size_t prev_count = 0, window_size = 0; size_t written = 0; + bool enough_retries; ssize_t ret; /* @@ -913,11 +918,17 @@ retry: if (ret > 0) written = ret; + enough_retries = prev_count == iov_iter_count(from) && + window_size <= PAGE_SIZE; if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { gfs2_glock_dq(gh); window_size -= fault_in_iov_iter_readable(from, window_size); - if (window_size) - goto retry; + if (window_size) { + if (!enough_retries) + goto retry; + /* fall back to buffered I/O */ + ret = 0; + } } out_unlock: if (gfs2_holder_queued(gh)) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index e956f886a1a1..5710833ac1cc 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -285,6 +285,14 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc, if (nbh == NULL) { /* blocksize == pagesize */ xa_erase_irq(&btnc->i_pages, newkey); unlock_page(ctxt->bh->b_page); - } else - brelse(nbh); + } else { + /* + * When canceling a buffer that a prepare operation has + * allocated to copy a node block to another location, use + * nilfs_btnode_delete() to initialize and release the buffer + * so that the buffer flags will not be in an inconsistent + * state when it is reallocated. + */ + nilfs_btnode_delete(nbh); + } } diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index dc359b56fdfa..2c6078a6b8ec 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -779,6 +779,15 @@ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs) goto out_header; sui->ncleansegs -= nsegs - newnsegs; + + /* + * If the sufile is successfully truncated, immediately adjust + * the segment allocation space while locking the semaphore + * "mi_sem" so that nilfs_sufile_alloc() never allocates + * segments in the truncated space. + */ + sui->allocmax = newnsegs - 1; + sui->allocmin = 0; } kaddr = kmap_atomic(header_bh->b_page); diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 2894152a6b25..0f0667957c81 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -405,6 +405,18 @@ unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs) 100)); } +/** + * nilfs_max_segment_count - calculate the maximum number of segments + * @nilfs: nilfs object + */ +static u64 nilfs_max_segment_count(struct the_nilfs *nilfs) +{ + u64 max_count = U64_MAX; + + do_div(max_count, nilfs->ns_blocks_per_segment); + return min_t(u64, max_count, ULONG_MAX); +} + void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs) { nilfs->ns_nsegments = nsegs; @@ -414,6 +426,8 @@ void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs) static int nilfs_store_disk_layout(struct the_nilfs *nilfs, struct nilfs_super_block *sbp) { + u64 nsegments, nblocks; + if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) { nilfs_err(nilfs->ns_sb, "unsupported revision (superblock rev.=%d.%d, current rev.=%d.%d). 
Please check the version of mkfs.nilfs(2).", @@ -457,7 +471,34 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs, return -EINVAL; } - nilfs_set_nsegments(nilfs, le64_to_cpu(sbp->s_nsegments)); + nsegments = le64_to_cpu(sbp->s_nsegments); + if (nsegments > nilfs_max_segment_count(nilfs)) { + nilfs_err(nilfs->ns_sb, + "segment count %llu exceeds upper limit (%llu segments)", + (unsigned long long)nsegments, + (unsigned long long)nilfs_max_segment_count(nilfs)); + return -EINVAL; + } + + nblocks = sb_bdev_nr_blocks(nilfs->ns_sb); + if (nblocks) { + u64 min_block_count = nsegments * nilfs->ns_blocks_per_segment; + /* + * To avoid failing to mount early device images without a + * second superblock, exclude that block count from the + * "min_block_count" calculation. + */ + + if (nblocks < min_block_count) { + nilfs_err(nilfs->ns_sb, + "total number of segment blocks %llu exceeds device size (%llu blocks)", + (unsigned long long)min_block_count, + (unsigned long long)nblocks); + return -EINVAL; + } + } + + nilfs_set_nsegments(nilfs, nsegments); nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed); return 0; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index efb09de4343d..b173c36bcab3 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2100,14 +2100,20 @@ static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, struct ocfs2_space_resv sr; int change_size = 1; int cmd = OCFS2_IOC_RESVSP64; + int ret = 0; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; if (!ocfs2_writes_unwritten_extents(osb)) return -EOPNOTSUPP; - if (mode & FALLOC_FL_KEEP_SIZE) + if (mode & FALLOC_FL_KEEP_SIZE) { change_size = 0; + } else { + ret = inode_newsize_ok(inode, offset + len); + if (ret) + return ret; + } if (mode & FALLOC_FL_PUNCH_HOLE) cmd = OCFS2_IOC_UNRESVSP64; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 0b0e6a132101..988d1c076861 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -952,8 +952,10 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (!sb_has_quota_loaded(sb, type)) continue; - oinfo = sb_dqinfo(sb, type)->dqi_priv; - cancel_delayed_work_sync(&oinfo->dqi_sync_work); + if (!sb_has_quota_suspended(sb, type)) { + oinfo = sb_dqinfo(sb, type)->dqi_priv; + cancel_delayed_work_sync(&oinfo->dqi_sync_work); + } inode = igrab(sb->s_dquot.files[type]); /* Turn off quotas. 
This will remove all dquot structures from * memory and so they will be automatically synced to global diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 5034b862cec2..b279f745466e 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/uaccess.h> +#include <uapi/linux/ethtool.h> #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" @@ -130,12 +131,14 @@ cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan) struct TCP_Server_Info *server = chan->server; seq_printf(m, "\n\n\t\tChannel: %d ConnectionId: 0x%llx" - "\n\t\tNumber of credits: %d Dialect 0x%x" + "\n\t\tNumber of credits: %d,%d,%d Dialect 0x%x" "\n\t\tTCP status: %d Instance: %d" "\n\t\tLocal Users To Server: %d SecMode: 0x%x Req On Wire: %d" "\n\t\tIn Send: %d In MaxReq Wait: %d", i+1, server->conn_id, server->credits, + server->echo_credits, + server->oplock_credits, server->dialect, server->tcpStatus, server->reconnect_instance, @@ -146,18 +149,62 @@ cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan) atomic_read(&server->num_waiters)); } +static inline const char *smb_speed_to_str(size_t bps) +{ + size_t mbps = bps / 1000 / 1000; + + switch (mbps) { + case SPEED_10: + return "10Mbps"; + case SPEED_100: + return "100Mbps"; + case SPEED_1000: + return "1Gbps"; + case SPEED_2500: + return "2.5Gbps"; + case SPEED_5000: + return "5Gbps"; + case SPEED_10000: + return "10Gbps"; + case SPEED_14000: + return "14Gbps"; + case SPEED_20000: + return "20Gbps"; + case SPEED_25000: + return "25Gbps"; + case SPEED_40000: + return "40Gbps"; + case SPEED_50000: + return "50Gbps"; + case SPEED_56000: + return "56Gbps"; + case SPEED_100000: + return "100Gbps"; + case SPEED_200000: + return "200Gbps"; + case SPEED_400000: + return "400Gbps"; + case SPEED_800000: + return "800Gbps"; + default: + return "Unknown"; + } +} + static void cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface) { struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr; struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&iface->sockaddr; - seq_printf(m, "\tSpeed: %zu bps\n", iface->speed); + seq_printf(m, "\tSpeed: %s\n", smb_speed_to_str(iface->speed)); seq_puts(m, "\t\tCapabilities: "); if (iface->rdma_capable) seq_puts(m, "rdma "); if (iface->rss_capable) seq_puts(m, "rss "); + if (!iface->rdma_capable && !iface->rss_capable) + seq_puts(m, "None"); seq_putc(m, '\n'); if (iface->sockaddr.ss_family == AF_INET) seq_printf(m, "\t\tIPv4: %pI4\n", &ipv4->sin_addr); @@ -350,8 +397,11 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) atomic_read(&server->smbd_conn->mr_used_count)); skip_rdma: #endif - seq_printf(m, "\nNumber of credits: %d Dialect 0x%x", - server->credits, server->dialect); + seq_printf(m, "\nNumber of credits: %d,%d,%d Dialect 0x%x", + server->credits, + server->echo_credits, + server->oplock_credits, + server->dialect); if (server->compress_algorithm == SMB3_COMPRESS_LZNT1) seq_printf(m, " COMPRESS_LZNT1"); else if (server->compress_algorithm == SMB3_COMPRESS_LZ77) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 0d84bb1a8cd9..b212a4e16b39 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -970,43 +970,6 @@ release_iface(struct kref *ref) kfree(iface); } -/* - * compare two interfaces a and b - * return 0 if everything matches. 
- * return 1 if a has higher link speed, or rdma capable, or rss capable - * return -1 otherwise. - */ -static inline int -iface_cmp(struct cifs_server_iface *a, struct cifs_server_iface *b) -{ - int cmp_ret = 0; - - WARN_ON(!a || !b); - if (a->speed == b->speed) { - if (a->rdma_capable == b->rdma_capable) { - if (a->rss_capable == b->rss_capable) { - cmp_ret = memcmp(&a->sockaddr, &b->sockaddr, - sizeof(a->sockaddr)); - if (!cmp_ret) - return 0; - else if (cmp_ret > 0) - return 1; - else - return -1; - } else if (a->rss_capable > b->rss_capable) - return 1; - else - return -1; - } else if (a->rdma_capable > b->rdma_capable) - return 1; - else - return -1; - } else if (a->speed > b->speed) - return 1; - else - return -1; -} - struct cifs_chan { unsigned int in_reconnect : 1; /* if session setup in progress for this channel */ struct TCP_Server_Info *server; diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index c1c704990b98..d127aded2f28 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -87,6 +87,7 @@ extern int cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid); extern int smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx); extern int smb3_parse_opt(const char *options, const char *key, char **val); +extern int cifs_ipaddr_cmp(struct sockaddr *srcaddr, struct sockaddr *rhs); extern bool cifs_match_ipaddr(struct sockaddr *srcaddr, struct sockaddr *rhs); extern int cifs_discard_remaining_data(struct TCP_Server_Info *server); extern int cifs_call_async(struct TCP_Server_Info *server, diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 8e9a672320ab..9d16626e7a66 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -1288,6 +1288,56 @@ next_pdu: module_put_and_kthread_exit(0); } +int +cifs_ipaddr_cmp(struct sockaddr *srcaddr, struct sockaddr *rhs) +{ + struct sockaddr_in *saddr4 = (struct sockaddr_in *)srcaddr; + struct sockaddr_in *vaddr4 = (struct sockaddr_in *)rhs; + struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr; + struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs; + + switch (srcaddr->sa_family) { + case AF_UNSPEC: + switch (rhs->sa_family) { + case AF_UNSPEC: + return 0; + case AF_INET: + case AF_INET6: + return 1; + default: + return -1; + } + case AF_INET: { + switch (rhs->sa_family) { + case AF_UNSPEC: + return -1; + case AF_INET: + return memcmp(saddr4, vaddr4, + sizeof(struct sockaddr_in)); + case AF_INET6: + return 1; + default: + return -1; + } + } + case AF_INET6: { + switch (rhs->sa_family) { + case AF_UNSPEC: + case AF_INET: + return -1; + case AF_INET6: + return memcmp(saddr6, + vaddr6, + sizeof(struct sockaddr_in6)); + default: + return -1; + } + } + default: + return -1; /* don't expect to be here */ + } +} + /* * Returns true if srcaddr isn't specified and rhs isn't specified, or * if srcaddr is specified and matches the IP address of the rhs argument @@ -4086,16 +4136,17 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* only send once per connect */ spin_lock(&tcon->tc_lock); + if (tcon->status == TID_GOOD) { + spin_unlock(&tcon->tc_lock); + return 0; + } + if (tcon->status != TID_NEW && tcon->status != TID_NEED_TCON) { spin_unlock(&tcon->tc_lock); return -EHOSTDOWN; } - if (tcon->status == TID_GOOD) { - spin_unlock(&tcon->tc_lock); - return 0; - } tcon->status = TID_IN_TCON; spin_unlock(&tcon->tc_lock); diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index 2f93bf8c3325..2390b2fedd6a 100644 
--- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -575,16 +575,17 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* only send once per connect */ spin_lock(&tcon->tc_lock); + if (tcon->status == TID_GOOD) { + spin_unlock(&tcon->tc_lock); + return 0; + } + if (tcon->status != TID_NEW && tcon->status != TID_NEED_TCON) { spin_unlock(&tcon->tc_lock); return -EHOSTDOWN; } - if (tcon->status == TID_GOOD) { - spin_unlock(&tcon->tc_lock); - return 0; - } tcon->status = TID_IN_TCON; spin_unlock(&tcon->tc_lock); diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index df88b8c04d03..051283386e22 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -4942,9 +4942,13 @@ oplock_break_ack: * disconnected since oplock already released by the server */ if (!oplock_break_cancelled) { - rc = tcon->ses->server->ops->oplock_response(tcon, persistent_fid, + /* check for server null since can race with kill_sb calling tree disconnect */ + if (tcon->ses && tcon->ses->server) { + rc = tcon->ses->server->ops->oplock_response(tcon, persistent_fid, volatile_fid, net_fid, cinode); - cifs_dbg(FYI, "Oplock release rc = %d\n", rc); + cifs_dbg(FYI, "Oplock release rc = %d\n", rc); + } else + pr_warn_once("lease break not sent for unmounted share\n"); } cifs_done_oplock_break(cinode); diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 6e3be58cfe49..a8bb9d00d33a 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -34,6 +34,8 @@ static int change_conf(struct TCP_Server_Info *server) { server->credits += server->echo_credits + server->oplock_credits; + if (server->credits > server->max_credits) + server->credits = server->max_credits; server->oplock_credits = server->echo_credits = 0; switch (server->credits) { case 0: @@ -91,6 +93,7 @@ smb2_add_credits(struct TCP_Server_Info *server, server->conn_id, server->hostname, *val, add, server->in_flight); } + WARN_ON_ONCE(server->in_flight == 0); server->in_flight--; if (server->in_flight == 0 && ((optype & CIFS_OP_MASK) != CIFS_NEG_OP) && @@ -510,6 +513,43 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) return rsize; } +/* + * compare two interfaces a and b + * return 0 if everything matches. + * return 1 if a is rdma capable, or rss capable, or has higher link speed + * return -1 otherwise. 
+ */ +static int +iface_cmp(struct cifs_server_iface *a, struct cifs_server_iface *b) +{ + int cmp_ret = 0; + + WARN_ON(!a || !b); + if (a->rdma_capable == b->rdma_capable) { + if (a->rss_capable == b->rss_capable) { + if (a->speed == b->speed) { + cmp_ret = cifs_ipaddr_cmp((struct sockaddr *) &a->sockaddr, + (struct sockaddr *) &b->sockaddr); + if (!cmp_ret) + return 0; + else if (cmp_ret > 0) + return 1; + else + return -1; + } else if (a->speed > b->speed) + return 1; + else + return -1; + } else if (a->rss_capable > b->rss_capable) + return 1; + else + return -1; + } else if (a->rdma_capable > b->rdma_capable) + return 1; + else + return -1; +} + static int parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, size_t buf_len, struct cifs_ses *ses, bool in_mount) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 7063b395d22f..17fe212ab895 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -1305,7 +1305,12 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) } /* enough to enable echos and oplocks and one max size write */ - req->hdr.CreditRequest = cpu_to_le16(130); + if (server->credits >= server->max_credits) + req->hdr.CreditRequest = cpu_to_le16(0); + else + req->hdr.CreditRequest = cpu_to_le16( + min_t(int, server->max_credits - + server->credits, 130)); /* only one of SMB2 signing flags may be set in SMB2 request */ if (server->sign) @@ -1899,7 +1904,12 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, rqst.rq_nvec = 2; /* Need 64 for max size write so ask for more in case not there yet */ - req->hdr.CreditRequest = cpu_to_le16(64); + if (server->credits >= server->max_credits) + req->hdr.CreditRequest = cpu_to_le16(0); + else + req->hdr.CreditRequest = cpu_to_le16( + min_t(int, server->max_credits - + server->credits, 64)); rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -4227,6 +4237,7 @@ smb2_async_readv(struct cifs_readdata *rdata) struct TCP_Server_Info *server; struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); unsigned int total_len; + int credit_request; cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n", __func__, rdata->offset, rdata->bytes); @@ -4258,7 +4269,13 @@ smb2_async_readv(struct cifs_readdata *rdata) if (rdata->credits.value > 0) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, SMB2_MAX_BUFFER_SIZE)); - shdr->CreditRequest = cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 8); + credit_request = le16_to_cpu(shdr->CreditCharge) + 8; + if (server->credits >= server->max_credits) + shdr->CreditRequest = cpu_to_le16(0); + else + shdr->CreditRequest = cpu_to_le16( + min_t(int, server->max_credits - + server->credits, credit_request)); rc = adjust_credits(server, &rdata->credits, rdata->bytes); if (rc) @@ -4468,6 +4485,7 @@ smb2_async_writev(struct cifs_writedata *wdata, unsigned int total_len; struct cifs_io_parms _io_parms; struct cifs_io_parms *io_parms = NULL; + int credit_request; if (!wdata->server) server = wdata->server = cifs_pick_channel(tcon->ses); @@ -4572,7 +4590,13 @@ smb2_async_writev(struct cifs_writedata *wdata, if (wdata->credits.value > 0) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, SMB2_MAX_BUFFER_SIZE)); - shdr->CreditRequest = cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 8); + credit_request = le16_to_cpu(shdr->CreditCharge) + 8; + if (server->credits >= server->max_credits) + shdr->CreditRequest = cpu_to_le16(0); + else + shdr->CreditRequest = cpu_to_le16( + min_t(int, server->max_credits - + 
server->credits, credit_request)); rc = adjust_credits(server, &wdata->credits, io_parms->length); if (rc) diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 24bdd5f4d3bc..0474d0bba0a2 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -55,7 +55,7 @@ alloc_mid(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) temp->pid = current->pid; temp->command = cpu_to_le16(smb_buffer->Command); cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command); - /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ + /* easier to use jiffies */ /* when mid allocated can be before when sent */ temp->when_alloc = jiffies; temp->server = server; diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 4882a812ea86..2a717d158f02 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -294,6 +294,9 @@ bool ksmbd_conn_alive(struct ksmbd_conn *conn) return true; } +#define SMB1_MIN_SUPPORTED_HEADER_SIZE (sizeof(struct smb_hdr)) +#define SMB2_MIN_SUPPORTED_HEADER_SIZE (sizeof(struct smb2_hdr) + 4) + /** * ksmbd_conn_handler_loop() - session thread to listen on new smb requests * @p: connection instance @@ -350,6 +353,9 @@ int ksmbd_conn_handler_loop(void *p) if (pdu_size > MAX_STREAM_PROT_LEN) break; + if (pdu_size < SMB1_MIN_SUPPORTED_HEADER_SIZE) + break; + /* 4 for rfc1002 length field */ /* 1 for implied bcc[0] */ size = pdu_size + 4 + 1; @@ -358,8 +364,6 @@ int ksmbd_conn_handler_loop(void *p) break; memcpy(conn->request_buf, hdr_buf, sizeof(hdr_buf)); - if (!ksmbd_smb_request(conn)) - break; /* * We already read 4 bytes to find out PDU size, now @@ -377,6 +381,15 @@ int ksmbd_conn_handler_loop(void *p) continue; } + if (!ksmbd_smb_request(conn)) + break; + + if (((struct smb2_hdr *)smb2_get_msg(conn->request_buf))->ProtocolId == + SMB2_PROTO_NUMBER) { + if (pdu_size < SMB2_MIN_SUPPORTED_HEADER_SIZE) + break; + } + if (!default_conn_ops.process_fn) { pr_err("No connection request callback\n"); break; diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index db181bdad73a..844b303baf29 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -1415,56 +1415,38 @@ void create_lease_buf(u8 *rbuf, struct lease *lease) */ struct lease_ctx_info *parse_lease_state(void *open_req) { - char *data_offset; struct create_context *cc; - unsigned int next = 0; - char *name; - bool found = false; struct smb2_create_req *req = (struct smb2_create_req *)open_req; - struct lease_ctx_info *lreq = kzalloc(sizeof(struct lease_ctx_info), - GFP_KERNEL); + struct lease_ctx_info *lreq; + + cc = smb2_find_context_vals(req, SMB2_CREATE_REQUEST_LEASE, 4); + if (IS_ERR_OR_NULL(cc)) + return NULL; + + lreq = kzalloc(sizeof(struct lease_ctx_info), GFP_KERNEL); if (!lreq) return NULL; - data_offset = (char *)req + le32_to_cpu(req->CreateContextsOffset); - cc = (struct create_context *)data_offset; - do { - cc = (struct create_context *)((char *)cc + next); - name = le16_to_cpu(cc->NameOffset) + (char *)cc; - if (le16_to_cpu(cc->NameLength) != 4 || - strncmp(name, SMB2_CREATE_REQUEST_LEASE, 4)) { - next = le32_to_cpu(cc->Next); - continue; - } - found = true; - break; - } while (next != 0); + if (sizeof(struct lease_context_v2) == le32_to_cpu(cc->DataLength)) { + struct create_lease_v2 *lc = (struct create_lease_v2 *)cc; - if (found) { - if (sizeof(struct lease_context_v2) == le32_to_cpu(cc->DataLength)) { - struct create_lease_v2 *lc = (struct create_lease_v2 *)cc; - - memcpy(lreq->lease_key, 
lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); - lreq->req_state = lc->lcontext.LeaseState; - lreq->flags = lc->lcontext.LeaseFlags; - lreq->duration = lc->lcontext.LeaseDuration; - memcpy(lreq->parent_lease_key, lc->lcontext.ParentLeaseKey, - SMB2_LEASE_KEY_SIZE); - lreq->version = 2; - } else { - struct create_lease *lc = (struct create_lease *)cc; + memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); + lreq->req_state = lc->lcontext.LeaseState; + lreq->flags = lc->lcontext.LeaseFlags; + lreq->duration = lc->lcontext.LeaseDuration; + memcpy(lreq->parent_lease_key, lc->lcontext.ParentLeaseKey, + SMB2_LEASE_KEY_SIZE); + lreq->version = 2; + } else { + struct create_lease *lc = (struct create_lease *)cc; - memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); - lreq->req_state = lc->lcontext.LeaseState; - lreq->flags = lc->lcontext.LeaseFlags; - lreq->duration = lc->lcontext.LeaseDuration; - lreq->version = 1; - } - return lreq; + memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); + lreq->req_state = lc->lcontext.LeaseState; + lreq->flags = lc->lcontext.LeaseFlags; + lreq->duration = lc->lcontext.LeaseDuration; + lreq->version = 1; } - - kfree(lreq); - return NULL; + return lreq; } /** diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 7a81541de602..25c0ba04c59d 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -963,13 +963,13 @@ static void decode_sign_cap_ctxt(struct ksmbd_conn *conn, static __le32 deassemble_neg_contexts(struct ksmbd_conn *conn, struct smb2_negotiate_req *req, - int len_of_smb) + unsigned int len_of_smb) { /* +4 is to account for the RFC1001 len field */ struct smb2_neg_context *pctx = (struct smb2_neg_context *)req; int i = 0, len_of_ctxts; - int offset = le32_to_cpu(req->NegotiateContextOffset); - int neg_ctxt_cnt = le16_to_cpu(req->NegotiateContextCount); + unsigned int offset = le32_to_cpu(req->NegotiateContextOffset); + unsigned int neg_ctxt_cnt = le16_to_cpu(req->NegotiateContextCount); __le32 status = STATUS_INVALID_PARAMETER; ksmbd_debug(SMB, "decoding %d negotiate contexts\n", neg_ctxt_cnt); @@ -983,7 +983,7 @@ static __le32 deassemble_neg_contexts(struct ksmbd_conn *conn, while (i++ < neg_ctxt_cnt) { int clen, ctxt_len; - if (len_of_ctxts < sizeof(struct smb2_neg_context)) + if (len_of_ctxts < (int)sizeof(struct smb2_neg_context)) break; pctx = (struct smb2_neg_context *)((char *)pctx + offset); @@ -1038,9 +1038,8 @@ static __le32 deassemble_neg_contexts(struct ksmbd_conn *conn, } /* offsets must be 8 byte aligned */ - clen = (clen + 7) & ~0x7; - offset = clen + sizeof(struct smb2_neg_context); - len_of_ctxts -= clen + sizeof(struct smb2_neg_context); + offset = (ctxt_len + 7) & ~0x7; + len_of_ctxts -= offset; } return status; } diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index af0c2a9b8529..569e5eecdf3d 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -158,7 +158,19 @@ int ksmbd_verify_smb_message(struct ksmbd_work *work) */ bool ksmbd_smb_request(struct ksmbd_conn *conn) { - return conn->request_buf[0] == 0; + __le32 *proto = (__le32 *)smb2_get_msg(conn->request_buf); + + if (*proto == SMB2_COMPRESSION_TRANSFORM_ID) { + pr_err_ratelimited("smb2 compression not supported yet"); + return false; + } + + if (*proto != SMB1_PROTO_NUMBER && + *proto != SMB2_PROTO_NUMBER && + *proto != SMB2_TRANSFORM_PROTO_NUM) + return false; + + return true; } static bool supported_protocol(int idx) diff --git a/fs/smb/server/smbacl.c
b/fs/smb/server/smbacl.c index 6d6cfb6957a9..0a5862a61c77 100644 --- a/fs/smb/server/smbacl.c +++ b/fs/smb/server/smbacl.c @@ -1290,7 +1290,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) { posix_acls = get_inode_acl(d_inode(path->dentry), ACL_TYPE_ACCESS); - if (posix_acls && !found) { + if (!IS_ERR_OR_NULL(posix_acls) && !found) { unsigned int id = -1; pa_entry = posix_acls->a_entries; @@ -1314,7 +1314,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, } } } - if (posix_acls) + if (!IS_ERR_OR_NULL(posix_acls)) posix_acl_release(posix_acls); } diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 6f302919e9f7..f9fb778247e7 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -1321,7 +1321,7 @@ static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct mnt_idmap *id return NULL; posix_acls = get_inode_acl(inode, acl_type); - if (!posix_acls) + if (IS_ERR_OR_NULL(posix_acls)) return NULL; smb_acl = kzalloc(sizeof(struct xattr_smb_acl) + @@ -1830,7 +1830,7 @@ int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, return -EOPNOTSUPP; acls = get_inode_acl(parent_inode, ACL_TYPE_DEFAULT); - if (!acls) + if (IS_ERR_OR_NULL(acls)) return -ENOENT; pace = acls->a_entries; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 0fd96d6e39ce..4e800bb7d2ab 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1332,6 +1332,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, bool basic_ioctls; unsigned long start, end, vma_end; struct vma_iterator vmi; + pgoff_t pgoff; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1459,6 +1460,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma_iter_set(&vmi, start); prev = vma_prev(&vmi); + if (vma->vm_start < start) + prev = vma; ret = 0; for_each_vma_range(vmi, vma, end) { @@ -1482,8 +1485,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma_end = min(end, vma->vm_end); new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, - vma->anon_vma, vma->vm_file, vma->vm_pgoff, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), ((struct vm_userfaultfd_ctx){ ctx }), anon_vma_name(vma)); @@ -1563,6 +1567,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; struct vma_iterator vmi; + pgoff_t pgoff; ret = -EFAULT; if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) @@ -1625,6 +1630,9 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, vma_iter_set(&vmi, start); prev = vma_prev(&vmi); + if (vma->vm_start < start) + prev = vma; + ret = 0; for_each_vma_range(vmi, vma, end) { cond_resched(); @@ -1662,8 +1670,9 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, uffd_wp_range(vma, start, vma_end - start, false); new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, - vma->anon_vma, vma->vm_file, vma->vm_pgoff, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) { diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 9b373a0c7aaf..ee84835ebc66 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -984,7 +984,10 @@ 
xfs_ag_shrink_space( if (err2 != -ENOSPC) goto resv_err; - __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, true); + err2 = __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, + true); + if (err2) + goto resv_err; /* * Roll the transaction before trying to re-init the per-ag diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index fdfa08cbf4db..c20fe99405d8 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -628,6 +628,25 @@ xfs_alloc_fixup_trees( return 0; } +/* + * We do not verify the AGFL contents against AGF-based index counters here, + * even though we may have access to the perag that contains shadow copies. We + * don't know if the AGF based counters have been checked, and if they have they + * still may be inconsistent because they haven't yet been reset on the first + * allocation after the AGF has been read in. + * + * This means we can only check that all agfl entries contain valid or null + * values because we can't reliably determine the active range to exclude + * NULLAGBNO as a valid value. + * + * However, we can't even do that for v4 format filesystems because there are + * old versions of mkfs out there that do not initialise the AGFL to known, + * verifiable values. Hence we can't tell the difference between an AGFL block + * allocated by mkfs and a corrupted AGFL block here on v4 filesystems. + * + * As a result, we can only fully validate AGFL block numbers when we pull them + * from the freelist in xfs_alloc_get_freelist(). + */ static xfs_failaddr_t xfs_agfl_verify( struct xfs_buf *bp) @@ -637,12 +656,6 @@ xfs_agfl_verify( __be32 *agfl_bno = xfs_buf_to_agfl_bno(bp); int i; - /* - * There is no verification of non-crc AGFLs because mkfs does not - * initialise the AGFL to zero or NULL. Hence the only valid part of the - * AGFL is what the AGF says is active. We can't get to the AGF, so we - * can't verify just those entries are valid. - */ if (!xfs_has_crc(mp)) return NULL; @@ -2321,12 +2334,16 @@ xfs_free_agfl_block( } /* - * Check the agfl fields of the agf for inconsistency or corruption. The purpose - * is to detect an agfl header padding mismatch between current and early v5 - * kernels. This problem manifests as a 1-slot size difference between the - * on-disk flcount and the active [first, last] range of a wrapped agfl. This - * may also catch variants of agfl count corruption unrelated to padding. Either - * way, we'll reset the agfl and warn the user. + * Check the agfl fields of the agf for inconsistency or corruption. + * + * The original purpose was to detect an agfl header padding mismatch between + * current and early v5 kernels. This problem manifests as a 1-slot size + * difference between the on-disk flcount and the active [first, last] range of + * a wrapped agfl. + * + * However, we need to use these same checks to catch agfl count corruptions + * unrelated to padding. This could occur on any v4 or v5 filesystem, so either + * way, we need to reset the agfl and warn the user. * * Return true if a reset is required before the agfl can be used, false * otherwise. @@ -2342,10 +2359,6 @@ xfs_agfl_needs_reset( int agfl_size = xfs_agfl_size(mp); int active; - /* no agfl header on v4 supers */ - if (!xfs_has_crc(mp)) - return false; - /* * The agf read verifier catches severe corruption of these fields. * Repeat some sanity checks to cover a packed -> unpacked mismatch if @@ -2418,7 +2431,7 @@ xfs_agfl_reset( * the real allocation can proceed.
Deferring the free disconnects freeing up * the AGFL slot from freeing the block. */ -STATIC void +static int xfs_defer_agfl_block( struct xfs_trans *tp, xfs_agnumber_t agno, @@ -2437,17 +2450,21 @@ xfs_defer_agfl_block( xefi->xefi_blockcount = 1; xefi->xefi_owner = oinfo->oi_owner; + if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, xefi->xefi_startblock))) + return -EFSCORRUPTED; + trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); xfs_extent_free_get_group(mp, xefi); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); + return 0; } /* * Add the extent to the list of extents to be free at transaction end. * The list is maintained sorted (by block number). */ -void +int __xfs_free_extent_later( struct xfs_trans *tp, xfs_fsblock_t bno, @@ -2474,6 +2491,9 @@ __xfs_free_extent_later( #endif ASSERT(xfs_extfree_item_cache != NULL); + if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len))) + return -EFSCORRUPTED; + xefi = kmem_cache_zalloc(xfs_extfree_item_cache, GFP_KERNEL | __GFP_NOFAIL); xefi->xefi_startblock = bno; @@ -2497,6 +2517,7 @@ __xfs_free_extent_later( xfs_extent_free_get_group(mp, xefi); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); + return 0; } #ifdef DEBUG @@ -2657,7 +2678,9 @@ xfs_alloc_fix_freelist( goto out_agbp_relse; /* defer agfl frees */ - xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo); + error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo); + if (error) + goto out_agbp_relse; } targs.tp = tp; @@ -2767,6 +2790,9 @@ xfs_alloc_get_freelist( */ agfl_bno = xfs_buf_to_agfl_bno(agflbp); bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); + if (XFS_IS_CORRUPT(tp->t_mountp, !xfs_verify_agbno(pag, bno))) + return -EFSCORRUPTED; + be32_add_cpu(&agf->agf_flfirst, 1); xfs_trans_brelse(tp, agflbp); if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) @@ -2889,6 +2915,19 @@ xfs_alloc_put_freelist( return 0; } +/* + * Verify the AGF is consistent. + * + * We do not verify the AGFL indexes in the AGF are fully consistent here + * because of issues with variable on-disk structure sizes. Instead, we check + * the agfl indexes for consistency when we initialise the perag from the AGF + * information after a read completes. + * + * If the index is inconsistent, then we mark the perag as needing an AGFL + * reset. The first AGFL update performed then resets the AGFL indexes and + * refills the AGFL with known good free blocks, allowing the filesystem to + * continue operating normally at the cost of a few leaked free space blocks. 
+ */ static xfs_failaddr_t xfs_agf_verify( struct xfs_buf *bp) @@ -2962,7 +3001,6 @@ xfs_agf_verify( return __this_address; return NULL; - } static void @@ -3187,7 +3225,8 @@ xfs_alloc_vextent_check_args( */ static int xfs_alloc_vextent_prepare_ag( - struct xfs_alloc_arg *args) + struct xfs_alloc_arg *args, + uint32_t flags) { bool need_pag = !args->pag; int error; @@ -3196,7 +3235,7 @@ xfs_alloc_vextent_prepare_ag( args->pag = xfs_perag_get(args->mp, args->agno); args->agbp = NULL; - error = xfs_alloc_fix_freelist(args, 0); + error = xfs_alloc_fix_freelist(args, flags); if (error) { trace_xfs_alloc_vextent_nofix(args); if (need_pag) @@ -3336,7 +3375,7 @@ xfs_alloc_vextent_this_ag( return error; } - error = xfs_alloc_vextent_prepare_ag(args); + error = xfs_alloc_vextent_prepare_ag(args, 0); if (!error && args->agbp) error = xfs_alloc_ag_vextent_size(args); @@ -3380,7 +3419,7 @@ restart: for_each_perag_wrap_range(mp, start_agno, restart_agno, mp->m_sb.sb_agcount, agno, args->pag) { args->agno = agno; - error = xfs_alloc_vextent_prepare_ag(args); + error = xfs_alloc_vextent_prepare_ag(args, flags); if (error) break; if (!args->agbp) { @@ -3546,7 +3585,7 @@ xfs_alloc_vextent_exact_bno( return error; } - error = xfs_alloc_vextent_prepare_ag(args); + error = xfs_alloc_vextent_prepare_ag(args, 0); if (!error && args->agbp) error = xfs_alloc_ag_vextent_exact(args); @@ -3587,7 +3626,7 @@ xfs_alloc_vextent_near_bno( if (needs_perag) args->pag = xfs_perag_grab(mp, args->agno); - error = xfs_alloc_vextent_prepare_ag(args); + error = xfs_alloc_vextent_prepare_ag(args, 0); if (!error && args->agbp) error = xfs_alloc_ag_vextent_near(args); diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 5dbb25546d0b..85ac470be0da 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -230,7 +230,7 @@ xfs_buf_to_agfl_bno( return bp->b_addr; } -void __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, +int __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, bool skip_discard); @@ -254,14 +254,14 @@ void xfs_extent_free_get_group(struct xfs_mount *mp, #define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ #define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ -static inline void +static inline int xfs_free_extent_later( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo) { - __xfs_free_extent_later(tp, bno, len, oinfo, false); + return __xfs_free_extent_later(tp, bno, len, oinfo, false); } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index cd8870a16fd1..fef35696adb7 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -572,8 +572,12 @@ xfs_bmap_btree_to_extents( cblock = XFS_BUF_TO_BLOCK(cbp); if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) return error; + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); - xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo); + error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo); + if (error) + return error; + ip->i_nblocks--; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); xfs_trans_binval(tp, cbp); @@ -5230,10 +5234,12 @@ xfs_bmap_del_extent_real( if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { xfs_refcount_decrease_extent(tp, del); } else { - __xfs_free_extent_later(tp, del->br_startblock, + error = __xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, (bflags & XFS_BMAPI_NODISCARD) || 
del->br_state == XFS_EXT_UNWRITTEN); + if (error) + goto done; } } diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 1b40e5f8b1ec..36564ae3084f 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -268,11 +268,14 @@ xfs_bmbt_free_block( struct xfs_trans *tp = cur->bc_tp; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); struct xfs_owner_info oinfo; + int error; xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); - xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo); - ip->i_nblocks--; + error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo); + if (error) + return error; + ip->i_nblocks--; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); return 0; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index a16d5de16933..34600f94c2f4 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1834,7 +1834,7 @@ retry: * might be sparse and only free the regions that are allocated as part of the * chunk. */ -STATIC void +static int xfs_difree_inode_chunk( struct xfs_trans *tp, xfs_agnumber_t agno, @@ -1851,10 +1851,10 @@ xfs_difree_inode_chunk( if (!xfs_inobt_issparse(rec->ir_holemask)) { /* not sparse, calculate extent info directly */ - xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), - M_IGEO(mp)->ialloc_blks, - &XFS_RMAP_OINFO_INODES); - return; + return xfs_free_extent_later(tp, + XFS_AGB_TO_FSB(mp, agno, sagbno), + M_IGEO(mp)->ialloc_blks, + &XFS_RMAP_OINFO_INODES); } /* holemask is only 16-bits (fits in an unsigned long) */ @@ -1871,6 +1871,8 @@ xfs_difree_inode_chunk( XFS_INOBT_HOLEMASK_BITS); nextbit = startidx + 1; while (startidx < XFS_INOBT_HOLEMASK_BITS) { + int error; + nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS, nextbit); /* @@ -1896,8 +1898,11 @@ xfs_difree_inode_chunk( ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); - xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno), - contigblk, &XFS_RMAP_OINFO_INODES); + error = xfs_free_extent_later(tp, + XFS_AGB_TO_FSB(mp, agno, agbno), + contigblk, &XFS_RMAP_OINFO_INODES); + if (error) + return error; /* reset range to current bit and carry on... */ startidx = endidx = nextbit; @@ -1905,6 +1910,7 @@ xfs_difree_inode_chunk( next: nextbit++; } + return 0; } STATIC int @@ -2003,7 +2009,9 @@ xfs_difree_inobt( goto error0; } - xfs_difree_inode_chunk(tp, pag->pag_agno, &rec); + error = xfs_difree_inode_chunk(tp, pag->pag_agno, &rec); + if (error) + goto error0; } else { xic->deleted = false; diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index f13e0809dc63..269573c82808 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -324,7 +324,6 @@ struct xfs_inode_log_format_32 { #define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */ #define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */ - /* * The timestamps are dirty, but not necessarily anything else in the inode * core. Unlike the other fields above this one must never make it to disk @@ -333,6 +332,14 @@ struct xfs_inode_log_format_32 { */ #define XFS_ILOG_TIMESTAMP 0x4000 +/* + * The version field has been changed, but not necessarily anything else of + * interest. 
This must never make it to disk - it is used purely to ensure that + * the inode item ->precommit operation can update the fsync flag triggers + * in the inode item correctly. + */ +#define XFS_ILOG_IVERSION 0x8000 + #define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index c1c65774dcc2..b6e21433925c 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1151,8 +1151,10 @@ xfs_refcount_adjust_extents( fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, tmp.rc_startblock); - xfs_free_extent_later(cur->bc_tp, fsbno, + error = xfs_free_extent_later(cur->bc_tp, fsbno, tmp.rc_blockcount, NULL); + if (error) + goto out_error; } (*agbno) += tmp.rc_blockcount; @@ -1210,8 +1212,10 @@ xfs_refcount_adjust_extents( fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, ext.rc_startblock); - xfs_free_extent_later(cur->bc_tp, fsbno, + error = xfs_free_extent_later(cur->bc_tp, fsbno, ext.rc_blockcount, NULL); + if (error) + goto out_error; } skip: @@ -1976,7 +1980,10 @@ xfs_refcount_recover_cow_leftovers( rr->rr_rrec.rc_blockcount); /* Free the block. */ - xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL); + error = xfs_free_extent_later(tp, fsb, + rr->rr_rrec.rc_blockcount, NULL); + if (error) + goto out_trans; error = xfs_trans_commit(tp); if (error) diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 8b5547073379..cb4796b6e693 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -40,9 +40,8 @@ xfs_trans_ijoin( iip->ili_lock_flags = lock_flags; ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); - /* - * Get a log_item_desc to point at the new item. - */ + /* Reset the per-tx dirty context and add the item to the tx. */ + iip->ili_dirty_flags = 0; xfs_trans_add_item(tp, &iip->ili_item); } @@ -76,17 +75,10 @@ xfs_trans_ichgtime( /* * This is called to mark the fields indicated in fieldmask as needing to be * logged when the transaction is committed. The inode must already be - * associated with the given transaction. - * - * The values for fieldmask are defined in xfs_inode_item.h. We always log all - * of the core inode if any of it has changed, and we always log all of the - * inline data/extents/b-tree root if any of them has changed. - * - * Grab and pin the cluster buffer associated with this inode to avoid RMW - * cycles at inode writeback time. Avoid the need to add error handling to every - * xfs_trans_log_inode() call by shutting down on read error. This will cause - * transactions to fail and everything to error out, just like if we return a - * read error in a dirty transaction and cancel it. + * associated with the given transaction. All we do here is record where the + * inode was dirtied and mark the transaction and inode log item dirty; + * everything else is done in the ->precommit log item operation after the + * changes in the transaction have been completed. 
*/ void xfs_trans_log_inode( @@ -96,7 +88,6 @@ xfs_trans_log_inode( { struct xfs_inode_log_item *iip = ip->i_itemp; struct inode *inode = VFS_I(ip); - uint iversion_flags = 0; ASSERT(iip); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -105,18 +96,6 @@ xfs_trans_log_inode( tp->t_flags |= XFS_TRANS_DIRTY; /* - * Don't bother with i_lock for the I_DIRTY_TIME check here, as races - * don't matter - we either will need an extra transaction in 24 hours - * to log the timestamps, or will clear already cleared fields in the - * worst case. - */ - if (inode->i_state & I_DIRTY_TIME) { - spin_lock(&inode->i_lock); - inode->i_state &= ~I_DIRTY_TIME; - spin_unlock(&inode->i_lock); - } - - /* * First time we log the inode in a transaction, bump the inode change * counter if it is configured for this to occur. While we have the * inode locked exclusively for metadata modification, we can usually @@ -128,86 +107,10 @@ xfs_trans_log_inode( if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) { if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE)) - iversion_flags = XFS_ILOG_CORE; - } - - /* - * If we're updating the inode core or the timestamps and it's possible - * to upgrade this inode to bigtime format, do so now. - */ - if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) && - xfs_has_bigtime(ip->i_mount) && - !xfs_inode_has_bigtime(ip)) { - ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME; - flags |= XFS_ILOG_CORE; - } - - /* - * Inode verifiers do not check that the extent size hint is an integer - * multiple of the rt extent size on a directory with both rtinherit - * and extszinherit flags set. If we're logging a directory that is - * misconfigured in this way, clear the hint. - */ - if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && - (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && - (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) { - ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | - XFS_DIFLAG_EXTSZINHERIT); - ip->i_extsize = 0; - flags |= XFS_ILOG_CORE; + flags |= XFS_ILOG_IVERSION; } - /* - * Record the specific change for fdatasync optimisation. This allows - * fdatasync to skip log forces for inodes that are only timestamp - * dirty. - */ - spin_lock(&iip->ili_lock); - iip->ili_fsync_fields |= flags; - - if (!iip->ili_item.li_buf) { - struct xfs_buf *bp; - int error; - - /* - * We hold the ILOCK here, so this inode is not going to be - * flushed while we are here. Further, because there is no - * buffer attached to the item, we know that there is no IO in - * progress, so nothing will clear the ili_fields while we read - * in the buffer. Hence we can safely drop the spin lock and - * read the buffer knowing that the state will not change from - * here. - */ - spin_unlock(&iip->ili_lock); - error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp); - if (error) { - xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR); - return; - } - - /* - * We need an explicit buffer reference for the log item but - * don't want the buffer to remain attached to the transaction. - * Hold the buffer but release the transaction reference once - * we've attached the inode log item to the buffer log item - * list. - */ - xfs_buf_hold(bp); - spin_lock(&iip->ili_lock); - iip->ili_item.li_buf = bp; - bp->b_flags |= _XBF_INODES; - list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); - xfs_trans_brelse(tp, bp); - } - - /* - * Always OR in the bits from the ili_last_fields field. 
This is to - * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines - * in the eventual clearing of the ili_fields bits. See the big comment - * in xfs_iflush() for an explanation of this coordination mechanism. - */ - iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags); - spin_unlock(&iip->ili_lock); + iip->ili_dirty_flags |= flags; } int diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 69bc89d0fc68..5bf4326e9783 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -769,14 +769,14 @@ xchk_are_bmaps_contiguous( * mapping or false if there are no more mappings. Caller must ensure that * @info.icur is zeroed before the first call. */ -static int +static bool xchk_bmap_iext_iter( struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { struct xfs_bmbt_irec got; struct xfs_ifork *ifp; - xfs_filblks_t prev_len; + unsigned int nr = 0; ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); @@ -790,12 +790,12 @@ xchk_bmap_iext_iter( irec->br_startoff); return false; } + nr++; /* * Iterate subsequent iextent records and merge them with the one * that we just read, if possible. */ - prev_len = irec->br_blockcount; while (xfs_iext_peek_next_extent(ifp, &info->icur, &got)) { if (!xchk_are_bmaps_contiguous(irec, &got)) break; @@ -805,20 +805,21 @@ xchk_bmap_iext_iter( got.br_startoff); return false; } - - /* - * Notify the user of mergeable records in the data or attr - * forks. CoW forks only exist in memory so we ignore them. - */ - if (info->whichfork != XFS_COW_FORK && - prev_len + got.br_blockcount > BMBT_BLOCKCOUNT_MASK) - xchk_ino_set_preen(info->sc, info->sc->ip->i_ino); + nr++; irec->br_blockcount += got.br_blockcount; - prev_len = got.br_blockcount; xfs_iext_next(ifp, &info->icur); } + /* + * If the merged mapping could be expressed with fewer bmbt records + * than we actually found, notify the user that this fork could be + * optimized. CoW forks only exist in memory so we ignore them. + */ + if (nr > 1 && info->whichfork != XFS_COW_FORK && + howmany_64(irec->br_blockcount, XFS_MAX_BMBT_EXTLEN) < nr) + xchk_ino_set_preen(info->sc, info->sc->ip->i_ino); + return true; } diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index b38e93830dde..e113f2f5c254 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -105,10 +105,10 @@ struct xfs_scrub { }; /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */ -#define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */ -#define XCHK_FSGATES_DRAIN (1 << 2) /* defer ops draining enabled */ -#define XCHK_NEED_DRAIN (1 << 3) /* scrub needs to drain defer ops */ -#define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */ +#define XCHK_TRY_HARDER (1U << 0) /* can't get resources, try again */ +#define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */ +#define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */ +#define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ /* * The XCHK_FSGATES* flags reflect functionality in the main filesystem that diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index df7322ed73fa..023d4e0385dd 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -452,10 +452,18 @@ xfs_buf_item_format( * This is called to pin the buffer associated with the buf log item in memory * so it cannot be written out. * - * We also always take a reference to the buffer log item here so that the bli - * is held while the item is pinned in memory. 
This means that we can - * unconditionally drop the reference count a transaction holds when the - * transaction is completed. + * We take a reference to the buffer log item here so that the BLI life cycle + * extends at least until the buffer is unpinned via xfs_buf_item_unpin() and + * inserted into the AIL. + * + * We also need to take a reference to the buffer itself as the BLI unpin + * processing requires accessing the buffer after the BLI has dropped the final + * BLI reference. See xfs_buf_item_unpin() for an explanation. + * If unpins race to drop the final BLI reference and only the + * BLI owns a reference to the buffer, then the loser of the race can have the + * buffer freed from under it (e.g. on shutdown). Taking a buffer reference per + * pin count ensures the life cycle of the buffer extends for as + * long as we hold the buffer pin reference in xfs_buf_item_unpin(). */ STATIC void xfs_buf_item_pin( @@ -470,13 +478,30 @@ xfs_buf_item_pin( trace_xfs_buf_item_pin(bip); + xfs_buf_hold(bip->bli_buf); atomic_inc(&bip->bli_refcount); atomic_inc(&bip->bli_buf->b_pin_count); } /* - * This is called to unpin the buffer associated with the buf log item which - * was previously pinned with a call to xfs_buf_item_pin(). + * This is called to unpin the buffer associated with the buf log item which was + * previously pinned with a call to xfs_buf_item_pin(). We enter this function + * with a buffer pin count, a buffer reference and a BLI reference. + * + * We must drop the BLI reference before we unpin the buffer because the AIL + * doesn't acquire a BLI reference whenever it accesses it. Therefore if the + * refcount drops to zero, the bli could still be AIL resident and the buffer + * submitted for I/O at any point before we return. This can result in IO + * completion freeing the buffer while we are still trying to access it here. + * This race condition can also occur in shutdown situations where we abort and + * unpin buffers from contexts other than journal IO completion. + * + * Hence we have to hold a buffer reference per pin count to ensure that the + * buffer cannot be freed until we have finished processing the unpin operation. + * The reference is taken in xfs_buf_item_pin(), and we must hold it until we + * are done processing the buffer state. In the case of an abort (remove = + * true) we re-use the current pin reference as the IO reference we hand + * off to IO failure handling. */ STATIC void xfs_buf_item_unpin( @@ -493,24 +518,18 @@ xfs_buf_item_unpin( trace_xfs_buf_item_unpin(bip); - /* - * Drop the bli ref associated with the pin and grab the hold required - * for the I/O simulation failure in the abort case. We have to do this - * before the pin count drops because the AIL doesn't acquire a bli - * reference. Therefore if the refcount drops to zero, the bli could - * still be AIL resident and the buffer submitted for I/O (and freed on - * completion) at any point before we return. This can be removed once - * the AIL properly holds a reference on the bli. - */ freed = atomic_dec_and_test(&bip->bli_refcount); - if (freed && !stale && remove) - xfs_buf_hold(bp); if (atomic_dec_and_test(&bp->b_pin_count)) wake_up_all(&bp->b_waiters); - /* nothing to do but drop the pin count if the bli is active */ - if (!freed) + /* + * Nothing to do but drop the buffer pin reference if the BLI is + * still active.
+ */ + if (!freed) { + xfs_buf_rele(bp); return; + } if (stale) { ASSERT(bip->bli_flags & XFS_BLI_STALE); @@ -523,6 +542,15 @@ xfs_buf_item_unpin( trace_xfs_buf_item_unpin_stale(bip); /* + * The buffer has been locked and referenced since it was marked + * stale so we own both lock and reference exclusively here. We + * do not need the pin reference any more, so drop it now so + * that we only have one reference to drop once item completion + * processing is complete. + */ + xfs_buf_rele(bp); + + /* * If we get called here because of an IO error, we may or may * not have the item on the AIL. xfs_trans_ail_delete() will * take care of that situation. xfs_trans_ail_delete() drops @@ -538,16 +566,30 @@ xfs_buf_item_unpin( ASSERT(bp->b_log_item == NULL); } xfs_buf_relse(bp); - } else if (remove) { + return; + } + + if (remove) { /* - * The buffer must be locked and held by the caller to simulate - * an async I/O failure. We acquired the hold for this case - * before the buffer was unpinned. + * We need to simulate an async IO failure here to ensure that + * the correct error completion is run on this buffer. This + * requires a reference to the buffer and for the buffer to be + * locked. We can safely pass ownership of the pin reference to + * the IO to ensure that nothing can free the buffer while we + * wait for the lock and then run the IO failure completion. */ xfs_buf_lock(bp); bp->b_flags |= XBF_ASYNC; xfs_buf_ioend_fail(bp); + return; } + + /* + * BLI has no more active references - it will be moved to the AIL to + * manage the remaining BLI/buffer life cycle. There is nothing left for + * us to do here so drop the pin reference to the buffer. + */ + xfs_buf_rele(bp); } STATIC uint diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 22c13933c8f8..2fc98d313708 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -78,7 +78,6 @@ restart: *longest = 0; err = xfs_bmap_longest_free_extent(pag, NULL, longest); if (err) { - xfs_perag_rele(pag); if (err != -EAGAIN) break; /* Couldn't lock the AGF, skip this AG. */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0f60e301eb1f..453890942d9f 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -454,6 +454,27 @@ xfs_inodegc_queue_all( return ret; } +/* Wait for all queued work and collect errors */ +static int +xfs_inodegc_wait_all( + struct xfs_mount *mp) +{ + int cpu; + int error = 0; + + flush_workqueue(mp->m_inodegc_wq); + for_each_online_cpu(cpu) { + struct xfs_inodegc *gc; + + gc = per_cpu_ptr(mp->m_inodegc, cpu); + if (gc->error && !error) + error = gc->error; + gc->error = 0; + } + + return error; +} + /* * Check the validity of the inode we just found in the cache */ @@ -1491,15 +1512,14 @@ xfs_blockgc_free_space( if (error) return error; - xfs_inodegc_flush(mp); - return 0; + return xfs_inodegc_flush(mp); } /* * Reclaim all the free space that we can by scheduling the background blockgc * and inodegc workers immediately and waiting for them all to clear. */ -void +int xfs_blockgc_flush_all( struct xfs_mount *mp) { @@ -1520,7 +1540,7 @@ xfs_blockgc_flush_all( for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) flush_delayed_work(&pag->pag_blockgc_work); - xfs_inodegc_flush(mp); + return xfs_inodegc_flush(mp); } /* @@ -1842,13 +1862,17 @@ xfs_inodegc_set_reclaimable( * This is the last chance to make changes to an otherwise unreferenced file * before incore reclamation happens.
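The per-cpu sweep added above in xfs_inodegc_wait_all() is a first-error-wins pass: after the workqueue is flushed, the first recorded error is kept and every per-cpu slot is cleared for reuse. A standalone sketch of that aggregation pattern, using invented names (gc_context, collect_and_clear_errors) rather than the kernel's per-cpu types:

struct gc_context {
	int	error;		/* first inactivation error recorded by a worker */
};

static int
collect_and_clear_errors(
	struct gc_context	*ctxs,
	int			nr)
{
	int			error = 0;

	for (int i = 0; i < nr; i++) {
		/* keep the first error seen; clear every slot for reuse */
		if (ctxs[i].error && !error)
			error = ctxs[i].error;
		ctxs[i].error = 0;
	}
	return error;
}

One consequence of this design choice is that only the first error per flush is reported; later errors from other CPUs are dropped, matching the usual "return one errno" convention of the callers.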
*/ -static void +static int xfs_inodegc_inactivate( struct xfs_inode *ip) { + int error; + trace_xfs_inode_inactivating(ip); - xfs_inactive(ip); + error = xfs_inactive(ip); xfs_inodegc_set_reclaimable(ip); + return error; + } void @@ -1880,8 +1904,12 @@ xfs_inodegc_worker( WRITE_ONCE(gc->shrinker_hits, 0); llist_for_each_entry_safe(ip, n, node, i_gclist) { + int error; + xfs_iflags_set(ip, XFS_INACTIVATING); - xfs_inodegc_inactivate(ip); + error = xfs_inodegc_inactivate(ip); + if (error && !gc->error) + gc->error = error; } memalloc_nofs_restore(nofs_flag); @@ -1905,13 +1933,13 @@ xfs_inodegc_push( * Force all currently queued inode inactivation work to run immediately and * wait for the work to finish. */ -void +int xfs_inodegc_flush( struct xfs_mount *mp) { xfs_inodegc_push(mp); trace_xfs_inodegc_flush(mp, __return_address); - flush_workqueue(mp->m_inodegc_wq); + return xfs_inodegc_wait_all(mp); } /* diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 87910191a9dd..1dcdcb23796e 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -62,7 +62,7 @@ int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp, unsigned int iwalk_flags); int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int iwalk_flags); int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_icwalk *icm); -void xfs_blockgc_flush_all(struct xfs_mount *mp); +int xfs_blockgc_flush_all(struct xfs_mount *mp); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); @@ -80,7 +80,7 @@ void xfs_blockgc_start(struct xfs_mount *mp); void xfs_inodegc_worker(struct work_struct *work); void xfs_inodegc_push(struct xfs_mount *mp); -void xfs_inodegc_flush(struct xfs_mount *mp); +int xfs_inodegc_flush(struct xfs_mount *mp); void xfs_inodegc_stop(struct xfs_mount *mp); void xfs_inodegc_start(struct xfs_mount *mp); void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5808abab786c..9e62cc500140 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1620,16 +1620,7 @@ xfs_inactive_ifree( */ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1); - /* - * Just ignore errors at this point. There is nothing we can do except - * to try to keep going. Make sure it's not a silent error. - */ - error = xfs_trans_commit(tp); - if (error) - xfs_notice(mp, "%s: xfs_trans_commit returned error %d", - __func__, error); - - return 0; + return xfs_trans_commit(tp); } /* @@ -1693,12 +1684,12 @@ xfs_inode_needs_inactive( * now be truncated. Also, we clear all of the read-ahead state * kept for the inode here since the file is now closed. */ -void +int xfs_inactive( xfs_inode_t *ip) { struct xfs_mount *mp; - int error; + int error = 0; int truncate = 0; /* @@ -1736,7 +1727,7 @@ xfs_inactive( * reference to the inode at this point anyways. */ if (xfs_can_free_eofblocks(ip, true)) - xfs_free_eofblocks(ip); + error = xfs_free_eofblocks(ip); goto out; } @@ -1773,7 +1764,7 @@ xfs_inactive( /* * Free the inode. */ - xfs_inactive_ifree(ip); + error = xfs_inactive_ifree(ip); out: /* @@ -1781,6 +1772,7 @@ out: * the attached dquots. 
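With xfs_inactive() now returning its first failure instead of swallowing it, the error travels worker -> per-cpu gc->error -> xfs_inodegc_flush(), and callers of the flush path can finally observe inactivation failures. A hedged sketch of an assumed caller (example_reclaim_pass() is invented; xfs_inodegc_flush() is the real interface changed above):

static int
example_reclaim_pass(
	struct xfs_mount	*mp)
{
	int			error;

	/* returns the first error collected from the per-cpu workers */
	error = xfs_inodegc_flush(mp);
	if (error)
		return error;	/* propagate, e.g. to a sync or unmount path */
	return 0;
}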
*/ xfs_qm_dqdetach(ip); + return error; } /* diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 69d21e42c10a..7547caf2f2ab 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -470,7 +470,7 @@ enum layout_break_reason { (xfs_has_grpid((pip)->i_mount) || (VFS_I(pip)->i_mode & S_ISGID)) int xfs_release(struct xfs_inode *ip); -void xfs_inactive(struct xfs_inode *ip); +int xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); int xfs_create(struct mnt_idmap *idmap, diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index ca2941ab6cbc..91c847a84e10 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -29,6 +29,153 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_inode_log_item, ili_item); } +static uint64_t +xfs_inode_item_sort( + struct xfs_log_item *lip) +{ + return INODE_ITEM(lip)->ili_inode->i_ino; +} + +/* + * Prior to finally logging the inode, we have to ensure that all the + * per-modification inode state changes are applied. This includes VFS inode + * state updates, format conversions, verifier state synchronisation and + * ensuring the inode buffer remains in memory whilst the inode is dirty. + * + * We have to be careful when we grab the inode cluster buffer due to lock + * ordering constraints. The unlinked inode modifications (xfs_iunlink_item) + * require AGI -> inode cluster buffer lock order. The inode cluster buffer is + * not locked until ->precommit, so it happens after everything else has been + * modified. + * + * Further, we have AGI -> AGF lock ordering, and with O_TMPFILE handling we + * have AGI -> AGF -> iunlink item -> inode cluster buffer lock order. Hence we + * cannot safely lock the inode cluster buffer in xfs_trans_log_inode() because + * it can be called on an inode (e.g. via bumplink/droplink) before we take the + * AGF lock modifying directory blocks. + * + * Rather than force a complete rework of all the transactions to call + * xfs_trans_log_inode() once and once only at the end of every transaction, we + * move the pinning of the inode cluster buffer to a ->precommit operation. This + * matches how the xfs_iunlink_item locks the inode cluster buffer, and it + * ensures that the inode cluster buffer locking is always done last in a + * transaction. i.e. we ensure the lock order is always AGI -> AGF -> inode + * cluster buffer. + * + * If we return the inode number as the precommit sort key then we'll also + * guarantee that the order of inode cluster buffer locking is the same for all + * the inodes and unlink items in the transaction. + */ +static int +xfs_inode_item_precommit( + struct xfs_trans *tp, + struct xfs_log_item *lip) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + struct inode *inode = VFS_I(ip); + unsigned int flags = iip->ili_dirty_flags; + + /* + * Don't bother with i_lock for the I_DIRTY_TIME check here, as races + * don't matter - we either will need an extra transaction in 24 hours + * to log the timestamps, or will clear already cleared fields in the + * worst case. + */ + if (inode->i_state & I_DIRTY_TIME) { + spin_lock(&inode->i_lock); + inode->i_state &= ~I_DIRTY_TIME; + spin_unlock(&inode->i_lock); + } + + /* + * If we're updating the inode core or the timestamps and it's possible + * to upgrade this inode to bigtime format, do so now.
+ */ + if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) && + xfs_has_bigtime(ip->i_mount) && + !xfs_inode_has_bigtime(ip)) { + ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME; + flags |= XFS_ILOG_CORE; + } + + /* + * Inode verifiers do not check that the extent size hint is an integer + * multiple of the rt extent size on a directory with both rtinherit + * and extszinherit flags set. If we're logging a directory that is + * misconfigured in this way, clear the hint. + */ + if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && + (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) { + ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + ip->i_extsize = 0; + flags |= XFS_ILOG_CORE; + } + + /* + * Record the specific change for fdatasync optimisation. This allows + * fdatasync to skip log forces for inodes that are only timestamp + * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it + * to XFS_ILOG_CORE so that the actual on-disk dirty tracking + * (ili_fields) correctly tracks that the version has changed. + */ + spin_lock(&iip->ili_lock); + iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION); + if (flags & XFS_ILOG_IVERSION) + flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); + + if (!iip->ili_item.li_buf) { + struct xfs_buf *bp; + int error; + + /* + * We hold the ILOCK here, so this inode is not going to be + * flushed while we are here. Further, because there is no + * buffer attached to the item, we know that there is no IO in + * progress, so nothing will clear the ili_fields while we read + * in the buffer. Hence we can safely drop the spin lock and + * read the buffer knowing that the state will not change from + * here. + */ + spin_unlock(&iip->ili_lock); + error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp); + if (error) + return error; + + /* + * We need an explicit buffer reference for the log item but + * don't want the buffer to remain attached to the transaction. + * Hold the buffer but release the transaction reference once + * we've attached the inode log item to the buffer log item + * list. + */ + xfs_buf_hold(bp); + spin_lock(&iip->ili_lock); + iip->ili_item.li_buf = bp; + bp->b_flags |= _XBF_INODES; + list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); + xfs_trans_brelse(tp, bp); + } + + /* + * Always OR in the bits from the ili_last_fields field. This is to + * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines + * in the eventual clearing of the ili_fields bits. See the big comment + * in xfs_iflush() for an explanation of this coordination mechanism. + */ + iip->ili_fields |= (flags | iip->ili_last_fields); + spin_unlock(&iip->ili_lock); + + /* + * We are done with the log item transaction dirty state, so clear it so + * that it doesn't pollute future transactions. + */ + iip->ili_dirty_flags = 0; + return 0; +} + /* * The logged size of an inode fork is always the current size of the inode * fork. 
This means that when an inode fork is relogged, the size of the logged @@ -662,6 +809,8 @@ xfs_inode_item_committing( } static const struct xfs_item_ops xfs_inode_item_ops = { + .iop_sort = xfs_inode_item_sort, + .iop_precommit = xfs_inode_item_precommit, .iop_size = xfs_inode_item_size, .iop_format = xfs_inode_item_format, .iop_pin = xfs_inode_item_pin, diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index bbd836a44ff0..377e06007804 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -17,6 +17,7 @@ struct xfs_inode_log_item { struct xfs_log_item ili_item; /* common portion */ struct xfs_inode *ili_inode; /* inode ptr */ unsigned short ili_lock_flags; /* inode lock flags */ + unsigned int ili_dirty_flags; /* dirty in current tx */ /* * The ili_lock protects the interactions between the dirty state and * the flush state of the inode log item. This allows us to do atomic diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 322eb2ee6c55..82c81d20459d 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2711,7 +2711,9 @@ xlog_recover_iunlink_bucket( * just to flush the inodegc queue and wait for it to * complete. */ - xfs_inodegc_flush(mp); + error = xfs_inodegc_flush(mp); + if (error) + break; } prev_agino = agino; @@ -2719,10 +2721,15 @@ xlog_recover_iunlink_bucket( } if (prev_ip) { + int error2; + ip->i_prev_unlinked = prev_agino; xfs_irele(prev_ip); + + error2 = xfs_inodegc_flush(mp); + if (error2 && !error) + return error2; } - xfs_inodegc_flush(mp); return error; } @@ -2789,7 +2796,6 @@ xlog_recover_iunlink_ag( * bucket and remaining inodes on it unreferenced and * unfreeable. */ - xfs_inodegc_flush(pag->pag_mount); xlog_recover_clear_agi_bucket(pag, bucket); } } @@ -2806,13 +2812,6 @@ xlog_recover_process_iunlinks( for_each_perag(log->l_mp, agno, pag) xlog_recover_iunlink_ag(pag); - - /* - * Flush the pending unlinked inodes to ensure that the inactivations - * are fully completed on disk and the incore inodes can be reclaimed - * before we signal that recovery is complete. - */ - xfs_inodegc_flush(log->l_mp); } STATIC void diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index aaaf5ec13492..6c09f89534d3 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -62,6 +62,7 @@ struct xfs_error_cfg { struct xfs_inodegc { struct llist_head list; struct delayed_work work; + int error; /* approximate count of inodes in the list */ unsigned int items; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index f5dc46ce9803..abcc559f3c64 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -616,8 +616,10 @@ xfs_reflink_cancel_cow_blocks( xfs_refcount_free_cow_extent(*tpp, del.br_startblock, del.br_blockcount); - xfs_free_extent_later(*tpp, del.br_startblock, + error = xfs_free_extent_later(*tpp, del.br_startblock, del.br_blockcount, NULL); + if (error) + break; /* Roll the transaction */ error = xfs_defer_finish(tpp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 7e706255f165..4120bd1cba90 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1100,6 +1100,7 @@ xfs_inodegc_init_percpu( #endif init_llist_head(&gc->list); gc->items = 0; + gc->error = 0; INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker); } return 0; diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 8afc0c080861..8c0bfc9a33b1 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -290,7 +290,9 @@ retry: * Do not perform a synchronous scan because callers can hold * other locks. 
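For context, the hunk below feeds the standard reserve/flush/retry shape: when a transaction reservation hits ENOSPC, the allocation path kicks garbage collection once, and now must propagate any GC error rather than silently retrying. A minimal standalone sketch of that shape, with invented helpers (reserve_space(), flush_gc()) standing in for the real reservation and xfs_blockgc_flush_all() calls:

static int reserve_space(void);	/* invented: attempt the reservation */
static int flush_gc(void);	/* invented: kick blockgc/inodegc and wait */

static int
alloc_with_retry(void)
{
	bool			want_retry = true;
	int			error;

retry:
	error = reserve_space();
	if (error == -ENOSPC && want_retry) {
		/* free space once; fail fast if GC itself failed */
		error = flush_gc();
		if (error)
			return error;
		want_retry = false;
		goto retry;
	}
	return error;
}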
*/ - xfs_blockgc_flush_all(mp); + error = xfs_blockgc_flush_all(mp); + if (error) + return error; want_retry = false; goto retry; } @@ -970,6 +972,11 @@ __xfs_trans_commit( error = xfs_defer_finish_noroll(&tp); if (error) goto out_unreserve; + + /* Run precommits from final tx in defer chain. */ + error = xfs_trans_run_precommits(tp); + if (error) + goto out_unreserve; } /*
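A closing note on the added xfs_trans_run_precommits() call: it sorts the transaction's log items by their ->iop_sort() key (the inode item returns the inode number, per the xfs_inode_item_sort() hunk above) and then runs each dirty item's ->iop_precommit() method, stopping at the first error. A simplified user-space model of that sort-then-run contract (an illustrative sketch with invented names, not the kernel implementation):

#include <stdint.h>
#include <stdlib.h>

/*
 * Model only: struct log_item, cmp_key() and run_precommits() are invented.
 * The kernel sorts tp->t_items with list_sort() using ->iop_sort() keys,
 * then calls each dirty item's ->iop_precommit().
 */
struct log_item {
	uint64_t	key;	/* e.g. the inode number for inode items */
	int		(*precommit)(struct log_item *li);
};

static int
cmp_key(const void *a, const void *b)
{
	const struct log_item	*l = a;
	const struct log_item	*r = b;

	return (l->key > r->key) - (l->key < r->key);
}

int
run_precommits(struct log_item *items, size_t nr)
{
	int	error;

	/* a stable lock order falls out of sorting by key first */
	qsort(items, nr, sizeof(*items), cmp_key);
	for (size_t i = 0; i < nr; i++) {
		error = items[i].precommit(&items[i]);
		if (error)
			return error;
	}
	return 0;
}

Sorting before the precommit pass is what turns the per-item lock acquisitions into a globally consistent order, so two transactions touching the same inodes cannot deadlock ABBA-style on their cluster buffers.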