
Commit ac2b7cc

Add zfs_recover_ms parameter

Signed-off-by: Igor Ostapenko <[email protected]>

Parent: 523e3ad

File tree: 5 files changed, +102 -23 lines

  include/sys/range_tree.h
  include/sys/zfs_debug.h
  module/zfs/metaslab.c
  module/zfs/range_tree.c
  module/zfs/spa_misc.c

include/sys/range_tree.h (+11)
@@ -48,6 +48,13 @@ typedef enum zfs_range_seg_type {
 	ZFS_RANGE_SEG_NUM_TYPES,
 } zfs_range_seg_type_t;
 
+typedef enum zfs_range_tree_usecase {
+	ZFS_RANGE_TREE_UC_UNKNOWN,
+	ZFS_RANGE_TREE_UC_ALLOCATED_SPACE,
+	ZFS_RANGE_TREE_UC_FREE_SPACE,
+	ZFS_RANGE_TREE_UC_NUM_CASES,
+} zfs_range_tree_usecase_t;
+
 /*
  * Note: the range_tree may not be accessed concurrently; consumers
  * must provide external locking if required.
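
The new zfs_range_tree_usecase_t declares what a tree's contents represent: ZFS_RANGE_TREE_UC_ALLOCATED_SPACE for trees that track allocated space, ZFS_RANGE_TREE_UC_FREE_SPACE for trees that track free space, and ZFS_RANGE_TREE_UC_UNKNOWN as the default for everything else. As the module/zfs/range_tree.c changes below show, the use case decides whether it is safe to continue after an inconsistency is detected: an overlapping add can be patched up only in an allocated-space tree, and a partially overlapping remove only in a free-space tree.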
@@ -66,6 +73,7 @@ typedef struct zfs_range_tree {
 	const zfs_range_tree_ops_t *rt_ops;
 	void *rt_arg;
 	uint64_t rt_gap;	/* allowable inter-segment gap */
+	zfs_range_tree_usecase_t rt_usecase;
 
 	/*
 	 * The rt_histogram maintains a histogram of ranges. Each bucket,
@@ -280,6 +288,9 @@ zfs_range_tree_t *zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
     uint64_t gap);
 zfs_range_tree_t *zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
     zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift);
+zfs_range_tree_t *zfs_range_tree_create_usecase(const zfs_range_tree_ops_t *ops,
+    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
+    zfs_range_tree_usecase_t usecase);
 void zfs_range_tree_destroy(zfs_range_tree_t *rt);
 boolean_t zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start,
     uint64_t size);
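
As a usage illustration (not part of this commit), a consumer tags a tree at creation time. The offsets and sizes below are arbitrary, and ZFS_RANGE_SEG64 is assumed from the zfs_range_seg_type_t enum this header defines; metaslab_init() in module/zfs/metaslab.c is the one real caller the commit adds:

	/*
	 * Hypothetical caller: tag a tree as tracking free space, so the
	 * recovery paths added in module/zfs/range_tree.c can apply to it.
	 */
	zfs_range_tree_t *rt = zfs_range_tree_create_usecase(NULL,
	    ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RANGE_TREE_UC_FREE_SPACE);

	zfs_range_tree_add(rt, 0x1000, 0x2000);	/* record a free run */
	zfs_range_tree_remove(rt, 0x1000, 0x2000);	/* tree is empty again */
	zfs_range_tree_destroy(rt);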

include/sys/zfs_debug.h (+1)
@@ -98,6 +98,7 @@ extern void __dprintf(boolean_t dprint, const char *file, const char *func,
 #endif /* ZFS_DEBUG */
 
 extern void zfs_panic_recover(const char *fmt, ...);
+extern void zfs_panic_recover_ms(const char *fmt, ...);
 
 extern void zfs_dbgmsg_init(void);
 extern void zfs_dbgmsg_fini(void);
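
zfs_panic_recover_ms() is a sibling of zfs_panic_recover(); its definition in module/zfs/spa_misc.c below differs only in also honoring the new zfs_recover_ms tunable before choosing between a warning and a panic.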

module/zfs/metaslab.c (+2 -2)
@@ -2753,8 +2753,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
 	zfs_range_seg_type_t type =
 	    metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
 
-	ms->ms_allocatable = zfs_range_tree_create(NULL, type, NULL, start,
-	    shift);
+	ms->ms_allocatable = zfs_range_tree_create_usecase(NULL, type, NULL,
+	    start, shift, ZFS_RANGE_TREE_UC_FREE_SPACE);
 	for (int t = 0; t < TXG_SIZE; t++) {
 		ms->ms_allocating[t] = zfs_range_tree_create(NULL, type,
 		    NULL, start, shift);
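
Only ms_allocatable is tagged: it tracks the metaslab's free space, hence ZFS_RANGE_TREE_UC_FREE_SPACE. The per-txg trees created in the loop below it keep using plain zfs_range_tree_create() and therefore default to ZFS_RANGE_TREE_UC_UNKNOWN, which preserves their strict panic-on-inconsistency behavior.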

module/zfs/range_tree.c (+73 -21)
@@ -200,15 +200,16 @@ ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, zfs_range_seg64_t,
 ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf,
     zfs_range_seg_gap_t, zfs_range_tree_seg_gap_compare)
 
-zfs_range_tree_t *
-zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
+static zfs_range_tree_t *
+zfs_range_tree_create_impl(const zfs_range_tree_ops_t *ops,
     zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
-    uint64_t gap)
+    uint64_t gap, zfs_range_tree_usecase_t usecase)
 {
 	zfs_range_tree_t *rt = kmem_zalloc(sizeof (zfs_range_tree_t), KM_SLEEP);
 
 	ASSERT3U(shift, <, 64);
 	ASSERT3U(type, <=, ZFS_RANGE_SEG_NUM_TYPES);
+	ASSERT3U(usecase, <, ZFS_RANGE_TREE_UC_NUM_CASES);
 	size_t size;
 	int (*compare) (const void *, const void *);
 	bt_find_in_buf_f bt_find;
@@ -235,6 +236,7 @@ zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
 
 	rt->rt_ops = ops;
 	rt->rt_gap = gap;
+	rt->rt_usecase = usecase;
 	rt->rt_arg = arg;
 	rt->rt_type = type;
 	rt->rt_start = start;
@@ -246,11 +248,30 @@
 	return (rt);
 }
 
+zfs_range_tree_t *
+zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
+    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
+    uint64_t gap)
+{
+	return (zfs_range_tree_create_impl(ops, type, arg, start, shift, gap,
+	    ZFS_RANGE_TREE_UC_UNKNOWN));
+}
+
 zfs_range_tree_t *
 zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
     zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift)
 {
-	return (zfs_range_tree_create_gap(ops, type, arg, start, shift, 0));
+	return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0,
+	    ZFS_RANGE_TREE_UC_UNKNOWN));
+}
+
+zfs_range_tree_t *
+zfs_range_tree_create_usecase(const zfs_range_tree_ops_t *ops,
+    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
+    zfs_range_tree_usecase_t usecase)
+{
+	return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0,
+	    usecase));
 }
 
 void
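
All three public constructors are now thin wrappers around the static zfs_range_tree_create_impl(), which carries the extra usecase argument. Existing callers of zfs_range_tree_create() and zfs_range_tree_create_gap() are unchanged and get ZFS_RANGE_TREE_UC_UNKNOWN, so only explicitly tagged trees opt into the new recovery behavior.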
@@ -318,14 +339,25 @@ zfs_range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
 	 * the normal code paths.
 	 */
 	if (rs != NULL) {
+		uint64_t rstart = zfs_rs_get_start(rs, rt);
+		uint64_t rend = zfs_rs_get_end(rs, rt);
 		if (gap == 0) {
-			zfs_panic_recover("zfs: adding existent segment to "
-			    "range tree (offset=%llx size=%llx)",
-			    (longlong_t)start, (longlong_t)size);
+			zfs_panic_recover_ms("zfs: adding segment "
+			    "(offset=%llx size=%llx) overlapping with "
+			    "existing one (offset=%llx size=%llx)",
+			    (longlong_t)start, (longlong_t)size,
+			    (longlong_t)rstart, (longlong_t)(rend - rstart));
+			if (rt->rt_usecase != ZFS_RANGE_TREE_UC_ALLOCATED_SPACE)
+				return;
+			/* add non-overlapping chunks */
+			if (rstart > start)
+				range_tree_add_impl(rt, start, rstart - start,
+				    rstart - start);
+			if (rend < end)
+				range_tree_add_impl(rt, rend, end - rend,
+				    end - rend);
 			return;
 		}
-		uint64_t rstart = zfs_rs_get_start(rs, rt);
-		uint64_t rend = zfs_rs_get_end(rs, rt);
 		if (rstart <= start && rend >= end) {
 			zfs_range_tree_adjust_fill(rt, rs, fill);
 			return;
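
An overlapping add into a gapless tree no longer just reports and gives up. If the tree is tagged as allocated space and recovery is enabled, the non-overlapping head and tail of the new segment are still inserted recursively. A worked example with hypothetical numbers: adding (offset=0x1000 size=0x2000) to a tree that already holds (offset=0x1800 size=0x1000) reports the overlap, then inserts only [0x1000, 0x1800) and [0x2800, 0x3000). For any other use case the function returns right after reporting, as before.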
@@ -450,6 +482,7 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
 	zfs_range_seg_t *rs;
 	zfs_range_seg_max_t rsearch, rs_tmp;
 	uint64_t end = start + size;
+	uint64_t rstart, rend;
 	boolean_t left_over, right_over;
 
 	VERIFY3U(size, !=, 0);
@@ -463,12 +496,15 @@
 
 	/* Make sure we completely overlap with someone */
 	if (rs == NULL) {
-		zfs_panic_recover("zfs: removing nonexistent segment from "
+		zfs_panic_recover_ms("zfs: removing nonexistent segment from "
 		    "range tree (offset=%llx size=%llx)",
 		    (longlong_t)start, (longlong_t)size);
 		return;
 	}
 
+	rstart = zfs_rs_get_start(rs, rt);
+	rend = zfs_rs_get_end(rs, rt);
+
 	/*
 	 * Range trees with gap support must only remove complete segments
 	 * from the tree. This allows us to maintain accurate fill accounting
@@ -478,31 +514,47 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
 	if (rt->rt_gap != 0) {
 		if (do_fill) {
 			if (zfs_rs_get_fill(rs, rt) == size) {
-				start = zfs_rs_get_start(rs, rt);
-				end = zfs_rs_get_end(rs, rt);
+				start = rstart;
+				end = rend;
 				size = end - start;
 			} else {
 				zfs_range_tree_adjust_fill(rt, rs, -size);
 				return;
 			}
-		} else if (zfs_rs_get_start(rs, rt) != start ||
-		    zfs_rs_get_end(rs, rt) != end) {
+		} else if (rstart != start || rend != end) {
 			zfs_panic_recover("zfs: freeing partial segment of "
 			    "gap tree (offset=%llx size=%llx) of "
 			    "(offset=%llx size=%llx)",
 			    (longlong_t)start, (longlong_t)size,
-			    (longlong_t)zfs_rs_get_start(rs, rt),
-			    (longlong_t)zfs_rs_get_end(rs, rt) -
-			    zfs_rs_get_start(rs, rt));
+			    (longlong_t)rstart,
+			    (longlong_t)(rend - rstart));
 			return;
 		}
 	}
 
-	VERIFY3U(zfs_rs_get_start(rs, rt), <=, start);
-	VERIFY3U(zfs_rs_get_end(rs, rt), >=, end);
+	if (!(rstart <= start && rend >= end)) {
+		zfs_panic_recover_ms("zfs: removing segment "
+		    "(offset=%llx size=%llx) not completely overlapped by "
+		    "existing one (offset=%llx size=%llx)",
+		    (longlong_t)start, (longlong_t)size,
+		    (longlong_t)rstart, (longlong_t)(rend - rstart));
+		if (rt->rt_usecase != ZFS_RANGE_TREE_UC_FREE_SPACE)
+			return;
+		/* perform removal of the chunks */
+		if (rstart > start)
+			range_tree_remove_impl(rt, start, rstart - start,
+			    do_fill);
+		uint64_t mstart = MAX(rstart, start);
+		uint64_t mend = MIN(rend, end);
+		range_tree_remove_impl(rt, mstart, mend - mstart, do_fill);
+		if (rend < end)
+			range_tree_remove_impl(rt, rend, end - rend,
+			    do_fill);
+		return;
+	}
 
-	left_over = (zfs_rs_get_start(rs, rt) != start);
-	right_over = (zfs_rs_get_end(rs, rt) != end);
+	left_over = (rstart != start);
+	right_over = (rend != end);
 
 	zfs_range_tree_stat_decr(rt, rs);
 
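Removal gets the symmetric treatment. The old VERIFY3U pair panicked unconditionally on a partial overlap; the new code reports through zfs_panic_recover_ms() and, for a free-space tree with recovery enabled, splits the request into up to three chunks handled recursively: the part before the found segment, the part it covers, and the part after it. Continuing the example above, removing (offset=0x1000 size=0x2000) when only (offset=0x1800 size=0x1000) is present recurses on [0x1000, 0x1800), removes the fully covered [0x1800, 0x2800), and recurses on [0x2800, 0x3000).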

module/zfs/spa_misc.c (+15)
@@ -267,6 +267,8 @@ int zfs_flags = 0;
  * in leaked space, or worse.
  */
 int zfs_recover = B_FALSE;
+/* Localized to metaslab loading only */
+int zfs_recover_ms = B_FALSE;
 
 /*
  * If destroy encounters an EIO while reading metadata (e.g. indirect
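
Like zfs_recover, the new zfs_recover_ms defaults to off. Its scope is narrower: it is consulted only by zfs_panic_recover_ms(), whose call sites are the range-tree checks above, the ones relevant during metaslab loading.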
@@ -1671,6 +1673,16 @@
 	va_end(adx);
 }
 
+void
+zfs_panic_recover_ms(const char *fmt, ...)
+{
+	va_list adx;
+
+	va_start(adx, fmt);
+	vcmn_err((zfs_recover || zfs_recover_ms) ? CE_WARN : CE_PANIC, fmt, adx);
+	va_end(adx);
+}
+
 /*
  * This is a stripped-down version of strtoull, suitable only for converting
  * lowercase hexadecimal numbers that don't overflow.
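
The only difference from zfs_panic_recover() is the predicate: the message is downgraded from CE_PANIC to CE_WARN when either the global zfs_recover or the new zfs_recover_ms is set, so metaslab-level recovery can be enabled without loosening every other consistency check in the module.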
@@ -3133,6 +3145,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, flags, UINT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs, zfs_, recover, INT, ZMOD_RW,
 	"Set to attempt to recover from fatal errors");
 
+ZFS_MODULE_PARAM(zfs, zfs_, recover_ms, INT, ZMOD_RW,
+	"Set to attempt to recover from fatal errors during metaslab loading");
+
 ZFS_MODULE_PARAM(zfs, zfs_, free_leak_on_eio, INT, ZMOD_RW,
 	"Set to ignore IO errors during free and permanently leak the space");
 