Skip to content

Commit 1e63fa6

Browse files
author
Paul Dagnelie
committed
Handle interaction between gang blocks, copies, and FDT.
With the advent of fast dedup, there are no longer separate dedup tables for different copies values. There is now logic that will add DVAs to the dedup table entry if more copies are needed for new writes. However, this interacts poorly with ganging. There are two different cases that can result in mixed gang/non-gang BPs, which are illegal in ZFS. This change forces updates of existing FDT entries to only add more DVAs of the same type; if there are already gang DVAs in the FDT, we require that the new allocations also be gangs. if there are non-gang DVAs in the FDT, then this allocation may not be gangs. If it would gang, we have to redo the whole write as a non-dedup write. Sponsored by: iXsystems, Inc. Sponsored-by: Klara, Inc. Signed-off-by: Paul Dagnelie <[email protected]>
1 parent 414895e commit 1e63fa6

File tree

8 files changed

+132
-9
lines changed

8 files changed

+132
-9
lines changed

include/sys/ddt.h

+2
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,8 @@ extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt,
352352
const ddt_entry_t *dde, const blkptr_t *bp);
353353
extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp,
354354
ddt_phys_variant_t v);
355+
extern int ddt_phys_gang_count(const ddt_univ_phys_t *ddp,
356+
ddt_phys_variant_t v, boolean_t encrypted);
355357
extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
356358
boolean_t encrypted);
357359

include/sys/zio.h

+1
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,7 @@ typedef struct zio_prop {
358358
boolean_t zp_encrypt;
359359
boolean_t zp_byteorder;
360360
boolean_t zp_direct_write;
361+
boolean_t zp_must_gang;
361362
uint8_t zp_salt[ZIO_DATA_SALT_LEN];
362363
uint8_t zp_iv[ZIO_DATA_IV_LEN];
363364
uint8_t zp_mac[ZIO_DATA_MAC_LEN];

module/zfs/ddt.c

+14
Original file line numberDiff line numberDiff line change
@@ -814,6 +814,20 @@ ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
814814
return (ddp->ddp_trad[v].ddp_phys_birth);
815815
}
816816

817+
int
818+
ddt_phys_gang_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
819+
boolean_t encrypted)
820+
{
821+
ASSERT3U(v, <, DDT_PHYS_NONE);
822+
823+
const dva_t *dvas = (v == DDT_PHYS_FLAT) ?
824+
ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
825+
826+
return (DVA_GET_GANG(&dvas[0]) +
827+
DVA_GET_GANG(&dvas[1] +
828+
DVA_GET_GANG(&dvas[2]) * !encrypted));
829+
}
830+
817831
int
818832
ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
819833
boolean_t encrypted)

module/zfs/dmu.c

+1
Original file line numberDiff line numberDiff line change
@@ -2493,6 +2493,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
24932493
zp->zp_encrypt = encrypt;
24942494
zp->zp_byteorder = ZFS_HOST_BYTEORDER;
24952495
zp->zp_direct_write = (wp & WP_DIRECT_WR) ? B_TRUE : B_FALSE;
2496+
zp->zp_must_gang = B_FALSE;
24962497
memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN);
24972498
memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN);
24982499
memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN);

module/zfs/zio.c

+46-8
Original file line numberDiff line numberDiff line change
@@ -3138,6 +3138,17 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
31383138
* encryption, which uses DVA[2] for the IV+salt.
31393139
*/
31403140
int gbh_copies = gio->io_prop.zp_gang_copies;
3141+
if (gbh_copies == 0) {
3142+
/*
3143+
* This should only happen in the case where we're filling in
3144+
* DDT entries for a parent that wants more copies than the DDT
3145+
* has. In that case, we cannot gang without creating a mixed
3146+
* blkptr, which is illegal.
3147+
*/
3148+
ASSERT3U(gio->io_child_type, ==, ZIO_CHILD_DDT);
3149+
pio->io_error = EAGAIN;
3150+
return (pio);
3151+
}
31413152
ASSERT3S(gbh_copies, >, 0);
31423153
ASSERT3S(gbh_copies, <=, SPA_DVAS_PER_BP);
31433154

@@ -3149,7 +3160,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
31493160

31503161
flags |= METASLAB_ASYNC_ALLOC;
31513162
VERIFY(zfs_refcount_held(&mc->mc_allocator[pio->io_allocator].
3152-
mca_alloc_slots, pio));
3163+
mca_alloc_slots, pio) || gio->io_prop.zp_copies == 0);
31533164

31543165
/*
31553166
* The logical zio has already placed a reservation for
@@ -3229,6 +3240,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
32293240
zp.zp_encrypt = gio->io_prop.zp_encrypt;
32303241
zp.zp_byteorder = gio->io_prop.zp_byteorder;
32313242
zp.zp_direct_write = B_FALSE;
3243+
zp.zp_must_gang = B_FALSE;
32323244
memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN);
32333245
memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN);
32343246
memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN);
@@ -3784,8 +3796,9 @@ zio_ddt_write(zio_t *zio)
37843796
*/
37853797
int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp));
37863798
IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0);
3799+
int gang_dvas = ddt_phys_gang_count(ddp, v, BP_IS_ENCRYPTED(bp));
37873800

3788-
/* Number of DVAs requested bya the IO. */
3801+
/* Number of DVAs requested by the IO. */
37893802
uint8_t need_dvas = zp->zp_copies;
37903803

37913804
/*
@@ -3937,7 +3950,19 @@ zio_ddt_write(zio_t *zio)
39373950
* grow the DDT entry by to satisfy the request.
39383951
*/
39393952
zio_prop_t czp = *zp;
3940-
czp.zp_copies = czp.zp_gang_copies = need_dvas;
3953+
if (gang_dvas > 0 && have_dvas > 0) {
3954+
czp.zp_gang_copies = need_dvas +
3955+
(zp->zp_gang_copies - zp->zp_copies);
3956+
czp.zp_copies = need_dvas;
3957+
czp.zp_must_gang = B_TRUE;
3958+
} else if (gang_dvas == 0 && have_dvas > 0) {
3959+
czp.zp_copies = need_dvas;
3960+
czp.zp_gang_copies = 0;
3961+
} else {
3962+
czp.zp_copies = need_dvas;
3963+
czp.zp_gang_copies = need_dvas +
3964+
(zp->zp_gang_copies - zp->zp_copies);
3965+
}
39413966
zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
39423967
zio->io_orig_size, zio->io_orig_size, &czp,
39433968
zio_ddt_child_write_ready, NULL,
@@ -4109,6 +4134,7 @@ zio_dva_allocate(zio_t *zio)
41094134
ASSERT(BP_IS_HOLE(bp));
41104135
ASSERT0(BP_GET_NDVAS(bp));
41114136
ASSERT3U(zio->io_prop.zp_copies, >, 0);
4137+
41124138
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
41134139
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
41144140

@@ -4141,14 +4167,15 @@ zio_dva_allocate(zio_t *zio)
41414167
* back to spa_sync() which is abysmal for performance.
41424168
*/
41434169
ASSERT(ZIO_HAS_ALLOCATOR(zio));
4144-
error = metaslab_alloc(spa, mc, zio->io_size, bp,
4145-
zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
4146-
&zio->io_alloc_list, zio, zio->io_allocator);
4170+
error = zio->io_prop.zp_must_gang ? ENOSPC : metaslab_alloc(spa,
4171+
mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL,
4172+
flags, &zio->io_alloc_list, zio, zio->io_allocator);
41474173

41484174
/*
41494175
* Fallback to normal class when an alloc class is full
41504176
*/
4151-
if (error == ENOSPC && mc != spa_normal_class(spa)) {
4177+
if (error == ENOSPC && mc != spa_normal_class(spa) &&
4178+
!zio->io_prop.zp_must_gang) {
41524179
/*
41534180
* When the dedup or special class is spilling into the normal
41544181
* class, there can still be significant space available due
@@ -4200,7 +4227,8 @@ zio_dva_allocate(zio_t *zio)
42004227
}
42014228

42024229
if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
4203-
if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
4230+
if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC &&
4231+
!zio->io_prop.zp_must_gang) {
42044232
zfs_dbgmsg("%s: metaslab allocation failure, "
42054233
"trying ganging: zio %px, size %llu, error %d",
42064234
spa_name(spa), zio, (u_longlong_t)zio->io_size,
@@ -5422,6 +5450,16 @@ zio_done(zio_t *zio)
54225450
}
54235451

54245452
if (zio->io_error && zio == zio->io_logical) {
5453+
5454+
/*
5455+
* A DDT child tried to create a mixed gang/non-gang BP. We're
5456+
* going to have to just retry as a non-dedup IO.
5457+
*/
5458+
if (zio->io_error == EAGAIN && IO_IS_ALLOCATING(zio) &&
5459+
zio->io_prop.zp_dedup) {
5460+
zio->io_reexecute |= ZIO_REEXECUTE_NOW;
5461+
zio->io_prop.zp_dedup = B_FALSE;
5462+
}
54255463
/*
54265464
* Determine whether zio should be reexecuted. This will
54275465
* propagate all the way to the root via zio_notify_parent().

tests/runfiles/common.run

+1-1
Original file line numberDiff line numberDiff line change
@@ -724,7 +724,7 @@ tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg',
724724
tags = ['functional', 'features', 'large_dnode']
725725

726726
[tests/functional/gang_blocks]
727-
tests = ['gang_blocks_redundant']
727+
tests = ['gang_blocks_redundant', 'gang_blocks_ddt_copies']
728728
tags = ['functional', 'gang_blocks']
729729

730730
[tests/functional/grow]

tests/zfs-tests/tests/Makefile.am

+1
Original file line numberDiff line numberDiff line change
@@ -1559,6 +1559,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
15591559
functional/features/large_dnode/large_dnode_009_pos.ksh \
15601560
functional/features/large_dnode/setup.ksh \
15611561
functional/gang_blocks/cleanup.ksh \
1562+
functional/gang_blocks/gang_blocks_ddt_copies.ksh \
15621563
functional/gang_blocks/gang_blocks_redundant.ksh \
15631564
functional/gang_blocks/setup.ksh \
15641565
functional/grow/grow_pool_001_pos.ksh \
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#!/bin/ksh
2+
#
3+
# This file and its contents are supplied under the terms of the
4+
# Common Development and Distribution License ("CDDL"), version 1.0.
5+
# You may only use this file in accordance with the terms of version
6+
# 1.0 of the CDDL.
7+
#
8+
# A full copy of the text of the CDDL should have accompanied this
9+
# source. A copy of the CDDL is also available via the Internet at
10+
# http://www.illumos.org/license/CDDL.
11+
#
12+
13+
#
14+
# Copyright (c) 2025 by Klara Inc.
15+
#
16+
17+
#
18+
# Description:
19+
# Verify that mixed gang blocks and copies interact correctly in FDT
20+
#
21+
# Strategy:
22+
# 1. Store a block with copies = 1 in the DDT unganged.
23+
# 2. Add a new entry with copies = 2 that gangs, ensure it doesn't panic
24+
# 3. Store a block with copies = 1 in the DDT ganged.
25+
# 4. Add a new entry with copies = 3 that doesn't gang, ensure that it doesn't panic.
26+
#
27+
28+
. $STF_SUITE/include/libtest.shlib
29+
. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib
30+
31+
log_assert "Verify that mixed gang blocks and copies interact correctly in FDT"
32+
33+
save_tunable DEDUP_LOG_TXG_MAX
34+
35+
function cleanup2
36+
{
37+
zfs destroy $TESTPOOL/fs1
38+
zfs destroy $TESTPOOL/fs2
39+
restore_tunable DEDUP_LOG_TXG_MAX
40+
cleanup
41+
}
42+
43+
preamble
44+
log_onexit cleanup2
45+
46+
log_must zpool create -f -o ashift=9 $TESTPOOL $DISKS
47+
log_must zfs create -o recordsize=64k -o dedup=on $TESTPOOL/fs1
48+
log_must zfs create -o recordsize=64k -o dedup=on -o copies=3 $TESTPOOL/fs2
49+
set_tunable32 DEDUP_LOG_TXG_MAX 1
50+
log_must dd if=/dev/urandom of=/$TESTPOOL/fs1/f1 bs=64k count=1
51+
log_must sync_pool $TESTPOOL
52+
set_tunable32 METASLAB_FORCE_GANGING 20000
53+
set_tunable32 METASLAB_FORCE_GANGING_PCT 100
54+
log_must dd if=/$TESTPOOL/fs1/f1 of=/$TESTPOOL/fs2/f1 bs=64k count=1
55+
log_must sync_pool $TESTPOOL
56+
57+
log_must rm /$TESTPOOL/fs*/f1
58+
log_must sync_pool $TESTPOOL
59+
log_must dd if=/dev/urandom of=/$TESTPOOL/fs1/f1 bs=64k count=1
60+
log_must sync_pool $TESTPOOL
61+
log_must zdb -D $TESTPOOL
62+
set_tunable32 METASLAB_FORCE_GANGING_PCT 0
63+
log_must dd if=/$TESTPOOL/fs1/f1 of=/$TESTPOOL/fs2/f1 bs=64k count=1
64+
log_must sync_pool $TESTPOOL # This will panic, case 3
65+
66+
log_pass "Verify that mixed gang blocks and copies interact correctly in FDT"

0 commit comments

Comments
 (0)