108 lines
4.2 KiB
Diff
From 8fdde9dd48edc32a2f5e5a061a12e8dad54c3c7e Mon Sep 17 00:00:00 2001
|
|
From: Omar Sandoval <osandov@fb.com>
|
|
Date: Thu, 15 Feb 2024 09:27:54 +0100
|
|
Subject: xfs: fix internal error from AGFL exhaustion
|
|
|
|
Source kernel commit: f63a5b3769ad7659da4c0420751d78958ab97675
|
|
|
|
We've been seeing XFS errors like the following:
|
|
|
|
XFS: Internal error i != 1 at line 3526 of file fs/xfs/libxfs/xfs_btree.c. Caller xfs_btree_insert+0x1ec/0x280
|
|
...
|
|
Call Trace:
|
|
xfs_corruption_error+0x94/0xa0
|
|
xfs_btree_insert+0x221/0x280
|
|
xfs_alloc_fixup_trees+0x104/0x3e0
|
|
xfs_alloc_ag_vextent_size+0x667/0x820
|
|
xfs_alloc_fix_freelist+0x5d9/0x750
|
|
xfs_free_extent_fix_freelist+0x65/0xa0
|
|
__xfs_free_extent+0x57/0x180
|
|
...
|
|
|
|
This is the XFS_IS_CORRUPT() check in xfs_btree_insert() when
xfs_btree_insrec() fails.
|
|
|
|
After converting this into a panic and dissecting the core dump, I found
that xfs_btree_insrec() is failing because it's trying to split a leaf
node in the cntbt when the AG free list is empty. In particular, it's
failing to get a block from the AGFL _while trying to refill the AGFL_.
|
|
|
|
If a single operation splits every level of the bnobt and the cntbt (and
the rmapbt if it is enabled) at once, the free list will be empty. Then,
when the next operation tries to refill the free list, it allocates
space. If the allocation does not use a full extent, it will need to
insert records for the remaining space in the bnobt and cntbt. And if
those new records go in full leaves, the leaves (and potentially more
nodes up to the old root) need to be split.
|
|
|
|
Fix it by accounting for the additional splits that may be required to
refill the free list in the calculation for the minimum free list size.
|
|
|
|
P.S. As far as I can tell, this bug has existed for a long time -- maybe
back to xfs-history commit afdf80ae7405 ("Add XFS_AG_MAXLEVELS macros
...") in April 1994! It requires a very unlucky sequence of events, and
in fact we didn't hit it until a particular sparse mmap workload updated
from 5.12 to 5.19. But this bug existed in 5.12, so it must've been
exposed by some other change in allocation or writeback patterns. It's
also much less likely to be hit with the rmapbt enabled, since that
increases the minimum free list size and is unlikely to split at the
same time as the bnobt and cntbt.
|
|
|
|
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
|
|
Reviewed-by: Dave Chinner <dchinner@redhat.com>
|
|
Signed-off-by: Omar Sandoval <osandov@fb.com>
|
|
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
|
|
Signed-off-by: Carlos Maiolino <cem@kernel.org>
|
|
---
|
|
libxfs/xfs_alloc.c | 27 ++++++++++++++++++++++++---
|
|
1 file changed, 24 insertions(+), 3 deletions(-)
|
|
|
|
diff --git a/libxfs/xfs_alloc.c b/libxfs/xfs_alloc.c
|
|
index 4519a05551..7ac7c2f6cc 100644
|
|
--- a/libxfs/xfs_alloc.c
|
|
+++ b/libxfs/xfs_alloc.c
|
|
@@ -2271,16 +2271,37 @@ xfs_alloc_min_freelist(
|
|
|
|
ASSERT(mp->m_alloc_maxlevels > 0);
|
|
|
|
+ /*
|
|
+ * For a btree shorter than the maximum height, the worst case is that
|
|
+ * every level gets split and a new level is added, then while inserting
|
|
+ * another entry to refill the AGFL, every level under the old root gets
|
|
+ * split again. This is:
|
|
+ *
|
|
+ * (full height split reservation) + (AGFL refill split height)
|
|
+ * = (current height + 1) + (current height - 1)
|
|
+ * = (new height) + (new height - 2)
|
|
+ * = 2 * new height - 2
|
|
+ *
|
|
+ * For a btree of maximum height, the worst case is that every level
|
|
+ * under the root gets split, then while inserting another entry to
|
|
+ * refill the AGFL, every level under the root gets split again. This is
|
|
+ * also:
|
|
+ *
|
|
+ * 2 * (current height - 1)
|
|
+ * = 2 * (new height - 1)
|
|
+ * = 2 * new height - 2
|
|
+ */
|
|
+
|
|
/* space needed by-bno freespace btree */
|
|
min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
|
|
- mp->m_alloc_maxlevels);
|
|
+ mp->m_alloc_maxlevels) * 2 - 2;
|
|
/* space needed by-size freespace btree */
|
|
min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
|
|
- mp->m_alloc_maxlevels);
|
|
+ mp->m_alloc_maxlevels) * 2 - 2;
|
|
/* space needed reverse mapping used space btree */
|
|
if (xfs_has_rmapbt(mp))
|
|
min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
|
|
- mp->m_rmap_maxlevels);
|
|
+ mp->m_rmap_maxlevels) * 2 - 2;
|
|
|
|
return min_free;
|
|
}
|
|
--
|
|
cgit 1.2.3-korg
|
|
|