From ad26967b9afa7faee22c3b79370cb5d9ab553493 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 30 Aug 2019 12:31:02 -0500 Subject: gfs2: Use async glocks for rename Because s_vfs_rename_mutex is not cluster-wide, multiple nodes can reverse the roles of which directories are "old" and which are "new" for the purposes of rename. This can cause deadlocks where two nodes end up waiting for each other. There can be several layers of directory dependencies across many nodes. This patch fixes the problem by acquiring all gfs2_rename's inode glocks asychronously and waiting for all glocks to be acquired. That way all inodes are locked regardless of the order. The timeout value for multiple asynchronous glocks is calculated to be the total of the individual wait times for each glock times two. Since gfs2_exchange is very similar to gfs2_rename, both functions are patched in the same way. A new async glock wait queue, sd_async_glock_wait, keeps a list of waiters for these events. If gfs2's holder_wake function detects an async holder, it wakes up any waiters for the event. The waiter only tests whether any of its requests are still pending. Since the glocks are sent to dlm asychronously, the wait function needs to check to see which glocks, if any, were granted. If a glock is granted by dlm (and therefore held), its minimum hold time is checked and adjusted as necessary, as other glock grants do. If the event times out, all glocks held thus far must be dequeued to resolve any existing deadlocks. Then, if there are any outstanding locking requests, we need to loop around and wait for dlm to respond to those requests too. After we release all requests, we return -ESTALE to the caller (vfs rename) which loops around and retries the request. Node1 Node2 --------- --------- 1. Enqueue A Enqueue B 2. Enqueue B Enqueue A 3. A granted 6. B granted 7. Wait for B 8. Wait for A 9. A times out (since Node 1 holds A) 10. Dequeue B (since it was granted) 11. Wait for all requests from DLM 12. B Granted (since Node2 released it in step 10) 13. Rename 14. Dequeue A 15. DLM Grants A 16. Dequeue A (due to the timeout and since we no longer have B held for our task). 17. Dequeue B 18. Return -ESTALE to vfs 19. VFS retries the operation, goto step 1. This release-all-locks / acquire-all-locks may slow rename / exchange down as both nodes struggle in the same way and do the same thing. However, this will only happen when there is contention for the same inodes, which ought to be rare. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/inode.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'fs/gfs2/inode.c') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 50eeb15c6f4f..e1e18fb587eb 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1388,16 +1388,18 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, } num_gh = 1; - gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs); if (odip != ndip) { - gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE,GL_ASYNC, + ghs + num_gh); num_gh++; } - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs + num_gh); num_gh++; if (nip) { - gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, + ghs + num_gh); num_gh++; } @@ -1406,6 +1408,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_gunlock; } + error = gfs2_glock_async_wait(num_gh, ghs); + if (error) + goto out_gunlock; if (nip) { /* Grab the resource group glock for unlink flag twiddling. @@ -1555,7 +1560,8 @@ out_gunlock: gfs2_glock_dq_uninit(&rd_gh); while (x--) { - gfs2_glock_dq(ghs + x); + if (gfs2_holder_queued(ghs + x)) + gfs2_glock_dq(ghs + x); gfs2_holder_uninit(ghs + x); } out_gunlock_r: @@ -1585,7 +1591,7 @@ static int gfs2_exchange(struct inode *odir, struct dentry *odentry, struct gfs2_inode *oip = GFS2_I(odentry->d_inode); struct gfs2_inode *nip = GFS2_I(ndentry->d_inode); struct gfs2_sbd *sdp = GFS2_SB(odir); - struct gfs2_holder ghs[5], r_gh; + struct gfs2_holder ghs[4], r_gh; unsigned int num_gh; unsigned int x; umode_t old_mode = oip->i_inode.i_mode; @@ -1619,15 +1625,16 @@ static int gfs2_exchange(struct inode *odir, struct dentry *odentry, } num_gh = 1; - gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs); if (odip != ndip) { - gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, + ghs + num_gh); num_gh++; } - gfs2_holder_init(oip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + gfs2_holder_init(oip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs + num_gh); num_gh++; - gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs + num_gh); num_gh++; for (x = 0; x < num_gh; x++) { @@ -1636,6 +1643,10 @@ static int gfs2_exchange(struct inode *odir, struct dentry *odentry, goto out_gunlock; } + error = gfs2_glock_async_wait(num_gh, ghs); + if (error) + goto out_gunlock; + error = -ENOENT; if (oip->i_inode.i_nlink == 0 || nip->i_inode.i_nlink == 0) goto out_gunlock; @@ -1696,7 +1707,8 @@ out_end_trans: gfs2_trans_end(sdp); out_gunlock: while (x--) { - gfs2_glock_dq(ghs + x); + if (gfs2_holder_queued(ghs + x)) + gfs2_glock_dq(ghs + x); gfs2_holder_uninit(ghs + x); } out_gunlock_r: -- cgit v1.2.3-70-g09d2