summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorBenjamin Coddington <bcodding@redhat.com>2024-11-22 10:11:12 -0500
committerTrond Myklebust <trond.myklebust@hammerspace.com>2024-11-28 12:55:32 -0500
commit614733f9441ed53bb442d4734112ec1e24bd6da7 (patch)
tree93b20d6e21c8ede6c9ec1302dec8281a46f47cf8 /fs
parent3a4ce14d9a6b868e0787e4582420b721c04ee41e (diff)
nfs/blocklayout: Limit repeat device registration on failure
Every pNFS SCSI IO wants to do LAYOUTGET, then within the layout find the device which can drive GETDEVINFO, then finally may need to prep the device with a reservation. This slow work makes a mess of IO latencies if one of the later steps is going to fail for awhile. If we're unable to register a SCSI device, ensure we mark the device as unavailable so that it will timeout and be re-added via GETDEVINFO. This avoids repeated doomed attempts to register a device in the IO path. Add some clarifying comments as well. Fixes: d869da91cccb ("nfs/blocklayout: Fix premature PR key unregistration") Signed-off-by: Benjamin Coddington <bcodding@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/nfs/blocklayout/blocklayout.c15
1 files changed, 14 insertions, 1 deletions
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 0becdec12970..47189476b553 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -571,19 +571,32 @@ retry:
if (!node)
return ERR_PTR(-ENODEV);
+ /*
+ * Devices that are marked unavailable are left in the cache with a
+ * timeout to avoid sending GETDEVINFO after every LAYOUTGET, or
+ * constantly attempting to register the device. Once marked as
+ * unavailable they must be deleted and never reused.
+ */
if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) {
unsigned long end = jiffies;
unsigned long start = end - PNFS_DEVICE_RETRY_TIMEOUT;
if (!time_in_range(node->timestamp_unavailable, start, end)) {
+ /* Uncork subsequent GETDEVINFO operations for this device */
nfs4_delete_deviceid(node->ld, node->nfs_client, id);
goto retry;
}
goto out_put;
}
- if (!bl_register_dev(container_of(node, struct pnfs_block_dev, node)))
+ if (!bl_register_dev(container_of(node, struct pnfs_block_dev, node))) {
+ /*
+ * If we cannot register, treat this device as transient:
+ * Make a negative cache entry for the device
+ */
+ nfs4_mark_deviceid_unavailable(node);
goto out_put;
+ }
return node;