diff options
author | Dave Chinner <dchinner@redhat.com> | 2022-05-12 15:12:57 +1000 |
---|---|---|
committer | Dave Chinner <david@fromorbit.com> | 2022-05-12 15:12:57 +1000 |
commit | 45ff8b471cdc58701a7ba5c5dcd8dfc57ae06829 (patch) | |
tree | 36dc8d8d0387718127c2cac40ce5b526623e97f6 | |
parent | 51e6104fdb95f377c8741794778319bd413f4fff (diff) |
xfs: can't use kmem_zalloc() for attribute buffers
Because heap allocation of 64kB buffers will fail:
....
XFS: fs_mark(8414) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
XFS: fs_mark(8417) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
XFS: fs_mark(8409) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
XFS: fs_mark(8428) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
XFS: fs_mark(8430) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
XFS: fs_mark(8437) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
XFS: fs_mark(8433) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
XFS: fs_mark(8406) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
XFS: fs_mark(8412) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
XFS: fs_mark(8432) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
XFS: fs_mark(8424) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
....
I'd use kvmalloc() instead, but....
- 48.19% xfs_attr_create_intent
- 46.89% xfs_attri_init
- kvmalloc_node
- 46.04% __kmalloc_node
- kmalloc_large_node
- 45.99% __alloc_pages
- 39.39% __alloc_pages_slowpath.constprop.0
- 38.89% __alloc_pages_direct_compact
- 38.71% try_to_compact_pages
- compact_zone_order
- compact_zone
- 21.09% isolate_migratepages_block
10.31% PageHuge
5.82% set_pfnblock_flags_mask
0.86% get_pfnblock_flags_mask
- 4.48% __reset_isolation_suitable
4.44% __reset_isolation_pfn
- 3.56% __pageblock_pfn_to_page
1.33% pfn_to_online_page
2.83% get_pfnblock_flags_mask
- 0.87% migrate_pages
0.86% compaction_alloc
0.84% find_suitable_fallback
- 6.60% get_page_from_freelist
4.99% clear_page_erms
- 1.19% _raw_spin_lock_irqsave
- do_raw_spin_lock
__pv_queued_spin_lock_slowpath
- 0.86% __vmalloc_node_range
0.65% __alloc_pages_bulk
.... this is just yet another reminder of how much kvmalloc() sucks.
So lift xlog_cil_kvmalloc(), rename it to xlog_kvmalloc() and use
that instead....
We also clean up the attribute name and value lengths as they no
longer need to be rounded out to sizes compatible with log vectors.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
-rw-r--r-- | fs/xfs/xfs_attr_item.c | 35 | ||||
-rw-r--r-- | fs/xfs/xfs_log_cil.c | 35 | ||||
-rw-r--r-- | fs/xfs/xfs_log_priv.h | 34 |
3 files changed, 50 insertions, 54 deletions
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 56f678c965b7..e8ac88d9fd14 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -44,7 +44,7 @@ xfs_attri_item_free( struct xfs_attri_log_item *attrip) { kmem_free(attrip->attri_item.li_lv_shadow); - kmem_free(attrip); + kvfree(attrip); } /* @@ -119,11 +119,11 @@ xfs_attri_item_format( sizeof(struct xfs_attri_log_format)); xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NAME, attrip->attri_name, - xlog_calc_iovec_len(attrip->attri_name_len)); + attrip->attri_name_len); if (attrip->attri_value_len > 0) xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_VALUE, attrip->attri_value, - xlog_calc_iovec_len(attrip->attri_value_len)); + attrip->attri_value_len); } /* @@ -163,26 +163,21 @@ xfs_attri_init( { struct xfs_attri_log_item *attrip; - uint32_t name_vec_len = 0; - uint32_t value_vec_len = 0; - uint32_t buffer_size; - - if (name_len) - name_vec_len = xlog_calc_iovec_len(name_len); - if (value_len) - value_vec_len = xlog_calc_iovec_len(value_len); - - buffer_size = name_vec_len + value_vec_len; + uint32_t buffer_size = name_len + value_len; if (buffer_size) { - attrip = kmem_zalloc(sizeof(struct xfs_attri_log_item) + - buffer_size, KM_NOFS); - if (attrip == NULL) - return NULL; + /* + * This could be over 64kB in length, so we have to use + * kvmalloc() for this. But kvmalloc() utterly sucks, so we + * use own version. + */ + attrip = xlog_kvmalloc(sizeof(struct xfs_attri_log_item) + + buffer_size); } else { - attrip = kmem_cache_zalloc(xfs_attri_cache, - GFP_NOFS | __GFP_NOFAIL); + attrip = kmem_cache_alloc(xfs_attri_cache, + GFP_NOFS | __GFP_NOFAIL); } + memset(attrip, 0, sizeof(struct xfs_attri_log_item)); attrip->attri_name_len = name_len; if (name_len) @@ -195,7 +190,7 @@ xfs_attri_init( if (value_len) attrip->attri_value = ((char *)attrip) + sizeof(struct xfs_attri_log_item) + - name_vec_len; + name_len; else attrip->attri_value = NULL; diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 70f718d76ceb..6ca6fe8f2747 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -135,39 +135,6 @@ xlog_cil_iovec_space( } /* - * shadow buffers can be large, so we need to use kvmalloc() here to ensure - * success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts to fall - * back to vmalloc, so we can't actually do anything useful with gfp flags to - * control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() will do - * direct reclaim and compaction in the slow path, both of which are - * horrendously expensive. We just want kmalloc to fail fast and fall back to - * vmalloc if it can't get somethign straight away from the free lists or buddy - * allocator. Hence we have to open code kvmalloc outselves here. - * - * Also, we are in memalloc_nofs_save task context here, so despite the use of - * GFP_KERNEL here, we are actually going to be doing GFP_NOFS allocations. This - * is actually the only way to make vmalloc() do GFP_NOFS allocations, so lets - * just all pretend this is a GFP_KERNEL context operation.... - */ -static inline void * -xlog_cil_kvmalloc( - size_t buf_size) -{ - gfp_t flags = GFP_KERNEL; - void *p; - - flags &= ~__GFP_DIRECT_RECLAIM; - flags |= __GFP_NOWARN | __GFP_NORETRY; - do { - p = kmalloc(buf_size, flags); - if (!p) - p = vmalloc(buf_size); - } while (!p); - - return p; -} - -/* * Allocate or pin log vector buffers for CIL insertion. * * The CIL currently uses disposable buffers for copying a snapshot of the @@ -283,7 +250,7 @@ xlog_cil_alloc_shadow_bufs( * storage. */ kmem_free(lip->li_lv_shadow); - lv = xlog_cil_kvmalloc(buf_size); + lv = xlog_kvmalloc(buf_size); memset(lv, 0, xlog_cil_iovec_space(niovecs)); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 4f7e844d28ad..67fd9789e69a 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -651,4 +651,38 @@ xlog_valid_lsn( return valid; } +/* + * Log vector and shadow buffers can be large, so we need to use kvmalloc() here + * to ensure success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts + * to fall back to vmalloc, so we can't actually do anything useful with gfp + * flags to control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() + * will do direct reclaim and compaction in the slow path, both of which are + * horrendously expensive. We just want kmalloc to fail fast and fall back to + * vmalloc if it can't get somethign straight away from the free lists or + * buddy allocator. Hence we have to open code kvmalloc outselves here. + * + * This assumes that the caller uses memalloc_nofs_save task context here, so + * despite the use of GFP_KERNEL here, we are going to be doing GFP_NOFS + * allocations. This is actually the only way to make vmalloc() do GFP_NOFS + * allocations, so lets just all pretend this is a GFP_KERNEL context + * operation.... + */ +static inline void * +xlog_kvmalloc( + size_t buf_size) +{ + gfp_t flags = GFP_KERNEL; + void *p; + + flags &= ~__GFP_DIRECT_RECLAIM; + flags |= __GFP_NOWARN | __GFP_NORETRY; + do { + p = kmalloc(buf_size, flags); + if (!p) + p = vmalloc(buf_size); + } while (!p); + + return p; +} + #endif /* __XFS_LOG_PRIV_H__ */ |