Skip to content

Commit b186158

Browse files
committed
range_tree: Add zfs_recover_rt parameter and extra debug info
There are production cases where unexpected range tree segment addition or removal leads to a panic. Root-cause investigation requires more debug info about the range tree and the segments in question when this happens. In addition, the zfs_recover_rt parameter allows converting such panics into warnings, with a potential space leak as a trade-off.

Signed-off-by: Igor Ostapenko <igor.ostapenko@klarasystems.com>
1 parent a44f423 commit b186158

File tree

9 files changed

+228
-70
lines changed

9 files changed

+228
-70
lines changed

include/sys/range_tree.h

+12
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,13 @@ typedef enum zfs_range_seg_type {
4848
ZFS_RANGE_SEG_NUM_TYPES,
4949
} zfs_range_seg_type_t;
5050

51+
typedef enum zfs_range_tree_usecase {
52+
ZFS_RANGE_TREE_UC_UNKNOWN,
53+
ZFS_RANGE_TREE_UC_ALLOCATED_SPACE,
54+
ZFS_RANGE_TREE_UC_FREE_SPACE,
55+
ZFS_RANGE_TREE_UC_NUM_CASES,
56+
} zfs_range_tree_usecase_t;
57+
5158
/*
5259
* Note: the range_tree may not be accessed concurrently; consumers
5360
* must provide external locking if required.
@@ -66,6 +73,8 @@ typedef struct zfs_range_tree {
6673
const zfs_range_tree_ops_t *rt_ops;
6774
void *rt_arg;
6875
uint64_t rt_gap; /* allowable inter-segment gap */
76+
zfs_range_tree_usecase_t rt_usecase;
77+
char *rt_instance; /* use case details for debug */
6978

7079
/*
7180
* The rt_histogram maintains a histogram of ranges. Each bucket,
@@ -280,6 +289,9 @@ zfs_range_tree_t *zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
280289
uint64_t gap);
281290
zfs_range_tree_t *zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
282291
zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift);
292+
zfs_range_tree_t *zfs_range_tree_create_usecase(const zfs_range_tree_ops_t *ops,
293+
zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
294+
zfs_range_tree_usecase_t usecase, char *instance);
283295
void zfs_range_tree_destroy(zfs_range_tree_t *rt);
284296
boolean_t zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start,
285297
uint64_t size);

man/man4/zfs.4

+6
Original file line numberDiff line numberDiff line change
@@ -1987,6 +1987,12 @@ Set to attempt to recover from fatal errors.
19871987
This should only be used as a last resort,
19881988
as it typically results in leaked space, or worse.
19891989
.
1990+
.It Sy zfs_recover_rt Ns = Ns Sy 0 Ns | Ns 1 Pq int
1991+
Set to attempt to recover from fatal errors caused by adding unexpected
segments to, or removing them from, a range tree.
This should only be used as a last resort,
as it typically results in leaked space.
1995+
.
19901996
.It Sy zfs_removal_ignore_errors Ns = Ns Sy 0 Ns | Ns 1 Pq int
19911997
Ignore hard I/O errors during device removal.
19921998
When set, if a device encounters a hard I/O error during the removal process

module/zfs/dnode.c

+4-2
Original file line numberDiff line numberDiff line change
@@ -2435,8 +2435,10 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
24352435
{
24362436
int txgoff = tx->tx_txg & TXG_MASK;
24372437
if (dn->dn_free_ranges[txgoff] == NULL) {
2438-
dn->dn_free_ranges[txgoff] = zfs_range_tree_create(NULL,
2439-
ZFS_RANGE_SEG64, NULL, 0, 0);
2438+
dn->dn_free_ranges[txgoff] =
2439+
zfs_range_tree_create_usecase(
2440+
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
2441+
ZFS_RANGE_TREE_UC_FREE_SPACE, "dn_free_ranges");
24402442
}
24412443
zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
24422444
zfs_range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);

module/zfs/metaslab.c

+47-17
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,22 @@ static metaslab_stats_t metaslab_stats = {
368368
#define METASLABSTAT_BUMP(stat) \
369369
atomic_inc_64(&metaslab_stats.stat.value.ui64);
370370

371+
static inline char *
372+
metaslab_range_tree_instance(metaslab_group_t *mg, metaslab_t *ms,
373+
char *rt_name)
374+
{
375+
const size_t len = 4 * ZFS_MAX_DATASET_NAME_LEN;
376+
char *buf = kmem_zalloc(len, KM_SLEEP);
377+
378+
snprintf(buf, len, "{spa=%s vdev_guid=%llu ms_id=%llu %s}",
379+
mg->mg_vd->vdev_spa->spa_name,
380+
mg->mg_vd->vdev_guid,
381+
ms->ms_id,
382+
rt_name);
383+
384+
return (buf);
385+
}
386+
371387

372388
static kstat_t *metaslab_ksp;
373389

@@ -2753,30 +2769,42 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
27532769
zfs_range_seg_type_t type =
27542770
metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
27552771

2756-
ms->ms_allocatable = zfs_range_tree_create(NULL, type, NULL, start,
2757-
shift);
2772+
ms->ms_allocatable = zfs_range_tree_create_usecase(NULL, type, NULL,
2773+
start, shift, ZFS_RANGE_TREE_UC_FREE_SPACE,
2774+
metaslab_range_tree_instance(mg, ms, "ms_allocatable"));
27582775
for (int t = 0; t < TXG_SIZE; t++) {
2759-
ms->ms_allocating[t] = zfs_range_tree_create(NULL, type,
2760-
NULL, start, shift);
2761-
}
2762-
ms->ms_freeing = zfs_range_tree_create(NULL, type, NULL, start, shift);
2763-
ms->ms_freed = zfs_range_tree_create(NULL, type, NULL, start, shift);
2776+
ms->ms_allocating[t] = zfs_range_tree_create_usecase(NULL, type,
2777+
NULL, start, shift, ZFS_RANGE_TREE_UC_ALLOCATED_SPACE,
2778+
metaslab_range_tree_instance(mg, ms, "ms_allocating"));
2779+
}
2780+
ms->ms_freeing = zfs_range_tree_create_usecase(NULL, type, NULL, start,
2781+
shift, ZFS_RANGE_TREE_UC_FREE_SPACE,
2782+
metaslab_range_tree_instance(mg, ms, "ms_freeing"));
2783+
ms->ms_freed = zfs_range_tree_create_usecase(NULL, type, NULL, start,
2784+
shift, ZFS_RANGE_TREE_UC_FREE_SPACE,
2785+
metaslab_range_tree_instance(mg, ms, "ms_freed"));
27642786
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2765-
ms->ms_defer[t] = zfs_range_tree_create(NULL, type, NULL,
2766-
start, shift);
2787+
ms->ms_defer[t] = zfs_range_tree_create_usecase(NULL, type,
2788+
NULL, start, shift, ZFS_RANGE_TREE_UC_FREE_SPACE,
2789+
metaslab_range_tree_instance(mg, ms, "ms_defer"));
27672790
}
2768-
ms->ms_checkpointing =
2769-
zfs_range_tree_create(NULL, type, NULL, start, shift);
2770-
ms->ms_unflushed_allocs =
2771-
zfs_range_tree_create(NULL, type, NULL, start, shift);
2791+
ms->ms_checkpointing = zfs_range_tree_create_usecase(NULL, type, NULL,
2792+
start, shift, ZFS_RANGE_TREE_UC_FREE_SPACE,
2793+
metaslab_range_tree_instance(mg, ms, "ms_checkpointing"));
2794+
ms->ms_unflushed_allocs = zfs_range_tree_create_usecase(NULL, type,
2795+
NULL, start, shift, ZFS_RANGE_TREE_UC_ALLOCATED_SPACE,
2796+
metaslab_range_tree_instance(mg, ms, "ms_unflushed_allocs"));
27722797

27732798
metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
27742799
mrap->mra_bt = &ms->ms_unflushed_frees_by_size;
27752800
mrap->mra_floor_shift = metaslab_by_size_min_shift;
2776-
ms->ms_unflushed_frees = zfs_range_tree_create(&metaslab_rt_ops,
2777-
type, mrap, start, shift);
2801+
ms->ms_unflushed_frees = zfs_range_tree_create_usecase(&metaslab_rt_ops,
2802+
type, mrap, start, shift, ZFS_RANGE_TREE_UC_FREE_SPACE,
2803+
metaslab_range_tree_instance(mg, ms, "ms_unflushed_frees"));
27782804

2779-
ms->ms_trim = zfs_range_tree_create(NULL, type, NULL, start, shift);
2805+
ms->ms_trim = zfs_range_tree_create_usecase(NULL, type, NULL, start,
2806+
shift, ZFS_RANGE_TREE_UC_FREE_SPACE,
2807+
metaslab_range_tree_instance(mg, ms, "ms_trim"));
27802808

27812809
metaslab_group_add(mg, ms);
27822810
metaslab_set_fragmentation(ms, B_FALSE);
@@ -3750,7 +3778,9 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
37503778
type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
37513779
&start, &shift);
37523780

3753-
condense_tree = zfs_range_tree_create(NULL, type, NULL, start, shift);
3781+
condense_tree = zfs_range_tree_create_usecase(NULL, type, NULL, start,
3782+
shift, ZFS_RANGE_TREE_UC_FREE_SPACE,
3783+
metaslab_range_tree_instance(msp->ms_group, msp, "condense_tree"));
37543784

37553785
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
37563786
zfs_range_tree_walk(msp->ms_defer[t],

0 commit comments

Comments
 (0)