From f63eee5a7dd35b9f04ab7fc044cb2912b99975a6 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 6 Dec 2022 14:45:43 -0600 Subject: [PATCH 1/4] Implement vblock split write amp reduction algo During a vblock split, we can save write amp by using mpool_mblock_punch() to FALLOC_FL_PUNCH_HOLE a portion of the mblock for both the left and right hand side destination mblocks. Signed-off-by: Tristan Partin Co-authored-by: Nabeel Meeramohideen Mohamed --- lib/cn/kvset_split.c | 339 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 270 insertions(+), 69 deletions(-) diff --git a/lib/cn/kvset_split.c b/lib/cn/kvset_split.c index 749ef8e7d..54b1ea5df 100644 --- a/lib/cn/kvset_split.c +++ b/lib/cn/kvset_split.c @@ -3,6 +3,7 @@ * SPDX-FileCopyrightText: Copyright 2022 Micron Technology, Inc. */ +#include #include #include @@ -393,13 +394,19 @@ kblocks_split( return err; } +struct vgroup_split_metadata { + bool overlaps; /**< Whether the vgroup overlaps the split key. */ + uint16_t vblk_idx; /**< Where the left kvset's vblocks end and the right kvset's vblocks begin */ + off_t offset; /**< Offset into the vblock where the access first occurs */ +}; + /** * Return a split vblock index for the specified range of vblocks [start, end] by comparing * the min/max keys stored in a vblock footer against the split key. * * Return values: * v >= start and overlap = false: left: [start, v - 1], right [v, end] - * v >= start and overlap = true : left: [start, v], right [clone(v), end] + * v >= start and overlap = true : left: [start, punched(v)], right [clone(v), end] * * NOTES: * v = start and overlap = false: All vblocks go to the right @@ -416,12 +423,12 @@ get_vblk_split_index( uint16_t v; INVARIANT(ks && split_key && overlap); + INVARIANT(start <= end && end < kvset_get_num_vblocks(ks)); *overlap = false; - assert(start <= end && end < kvset_get_num_vblocks(ks)); for (v = start; v <= end; v++) { - struct key_obj min_key = { 0 }; + struct key_obj min_key; const struct vblock_desc *vbd = kvset_get_nth_vblock_desc(ks, v); key2kobj(&min_key, vbd->vbd_mblkdesc->map_base + vbd->vbd_min_koff, vbd->vbd_min_klen); @@ -431,7 +438,7 @@ get_vblk_split_index( } if (v > start) { - struct key_obj max_key = { 0 }; + struct key_obj max_key; const struct vblock_desc *vbd = kvset_get_nth_vblock_desc(ks, v - 1); key2kobj(&max_key, vbd->vbd_mblkdesc->map_base + vbd->vbd_max_koff, vbd->vbd_max_klen); @@ -445,6 +452,120 @@ get_vblk_split_index( return v; } +static void +find_max_key_among_overlapping_vblocks( + const uint32_t nvgroups, + const struct vgroup_split_metadata *metadatav, + struct kvset *const ks, + struct key_obj *max_key) +{ + for (uint32_t i = 0; i < nvgroups; i++) { + struct key_obj curr_key = { 0 }; + const struct vblock_desc *vbd; + const struct vgroup_split_metadata *metadata = metadatav + i; + + if (!metadata->overlaps) + continue; + + vbd = kvset_get_nth_vblock_desc(ks, metadata->vblk_idx); + + key2kobj(&curr_key, vbd->vbd_mblkdesc->map_base + vbd->vbd_max_koff, vbd->vbd_max_klen); + + if (key_obj_cmp(max_key, &curr_key) < 0) + *max_key = curr_key; + } +} + +static merr_t +mark_vgroup_accesses( + const uint32_t nvgroups, + struct vgroup_split_metadata *metadatav, + struct kvset *const ks, + const struct key_obj *split_key, + const struct key_obj *max_key) +{ + merr_t err; + bool first = true; + uint32_t accesses = 0; + struct kv_iterator *iter; + + INVARIANT(key_obj_cmp(split_key, max_key) <= 0); + + err = kvset_iter_create(ks, NULL, NULL, NULL, kvset_iter_flag_mmap, &iter); + if (ev(err)) + return err; + + err = kvset_iter_seek(iter, split_key->ko_sfx, split_key->ko_sfx_len, NULL); + if (ev(err)) + goto out; + + while (true) { + uint vlen; + uint vbidx; + uint vboff; + uint complen; + uint64_t seqno; + const void *vdata; + enum kmd_vtype vtype; + struct key_obj curr_key; + struct kvset_iter_vctx vc; + + if (iter->kvi_eof) + break; + + err = kvset_iter_next_key(iter, &curr_key, &vc); + if (ev(err)) + goto out; + + /* In the event that this kvset contains the split key, skip it. */ + if (first) { + first = false; + if (key_obj_cmp(&curr_key, split_key) == 0) + continue; + } + + if (key_obj_cmp(&curr_key, max_key) > 0) + break; + + while (kvset_iter_next_vref(iter, &vc, &seqno, &vtype, &vbidx, &vboff, &vdata, &vlen, + &complen)) { + uint64_t vgidx; + const struct vblock_desc *vbd; + struct vgroup_split_metadata *metadata; + + switch (vtype) { + case VTYPE_UCVAL: + case VTYPE_CVAL: + vbd = kvset_get_nth_vblock_desc(ks, vbidx); + /* TODO: Why do I need the -1? vblock_reader.c always adds 1 for + * some reason that doesn't make sense to me. + */ + vgidx = atomic_read(&vbd->vbd_vgidx) - 1; + assert(vgidx < nvgroups); + metadata = metadatav + vgidx; + if (metadata->offset == -1) { + metadata->offset = vboff; + + /* Exit because we have marked the offset for all vgroups */ + if (++accesses == nvgroups) + goto out; + } + /* fallthrough */ + case VTYPE_IVAL: + case VTYPE_ZVAL: + case VTYPE_TOMB: + case VTYPE_PTOMB: + continue; + } + } + } + +out: + kvset_iter_release(iter); + + return err; +} + /** * @vbidx_left, @vbidx_right - tracks vblock index for the left and right kvsets * @vgidx_left, @vgidx_right - tracks vgroup index for the left and right kvsets @@ -458,6 +579,8 @@ vblocks_split( struct perfc_set *pc, struct kvset_split_res *result) { + struct key_obj max_key; + struct vgroup_split_metadata *metadatav; struct vgmap *vgmap_src = ks->ks_vgmap; struct vgmap *vgmap_left = work[LEFT].vgmap; struct vgmap *vgmap_right = work[RIGHT].vgmap; @@ -466,113 +589,188 @@ vblocks_split( uint16_t vbidx_left = 0, vbidx_right = 0; uint32_t vgidx_left = 0, vgidx_right = 0; uint32_t nvgroups = kvset_get_vgroups(ks), perfc_rwc = 0; - bool move_left = (blks_right->kblks.idc == 0); - bool move_right = (blks_left->kblks.idc == 0); + const bool move_left = (blks_right->kblks.idc == 0); + const bool move_right = (blks_left->kblks.idc == 0); uint64_t perfc_rwb = 0; merr_t err; + log_debug("splitting"); + if (move_left && move_right) { assert(nvgroups == 0); return 0; } + metadatav = calloc(nvgroups, sizeof(*metadatav)); + if (ev(!metadatav)) + return merr(ENOMEM); for (uint32_t i = 0; i < nvgroups; i++) { - uint16_t src_start, src_end, src_split, end; - uint32_t vbcnt = 0; - bool overlap = false; + /* Negative offset implies overlapping vblock was not accessed */ + metadatav[i].offset = -1; + } - /* Per vgroup start and end output vblock index in the source kvset - */ - src_start = vgmap_vbidx_out_start(ks, i); - src_end = vgmap_vbidx_out_end(ks, i); + for (uint32_t i = 0; i < nvgroups; i++) { + uint16_t start, end; + struct vgroup_split_metadata *metadata = metadatav + i; + + /* Per vgroup start and end output vblock index in the source kvset */ + start = vgmap_vbidx_out_start(ks, i); + end = vgmap_vbidx_out_end(ks, i); if (move_left || move_right) { - /* If all the kblocks are on one side then all the vblocks can be safely moved - * to the same side - */ - src_split = move_right ? src_start : src_end + 1; - assert(!overlap); + metadata->vblk_idx = move_right ? start : end; + metadata->overlaps = false; } else { - src_split = get_vblk_split_index(ks, src_start, src_end, split_key, &overlap); + metadata->vblk_idx = get_vblk_split_index(ks, start, end, split_key, + &metadata->overlaps); } - assert(src_split >= src_start && src_split <= src_end + 1); + assert(metadata->vblk_idx >= start && metadata->vblk_idx <= end + 1); + } - /* Add vblocks in [src_start, end - 1] to the left kvset - */ - end = overlap ? src_split + 1 : src_split; - for (uint16_t j = src_start; j < end; j++) { - err = blk_list_append(&blks_left->vblks, kvset_get_nth_vblock_id(ks, j)); - if (err) - return err; + max_key = *split_key; + find_max_key_among_overlapping_vblocks(nvgroups, metadatav, ks, &max_key); + + err = mark_vgroup_accesses(nvgroups, metadatav, ks, split_key, &max_key); + if (ev(err)) + goto out; + + for (uint32_t i = 0; i < nvgroups; i++) { + uint32_t vbcnt = 0; + bool overlapping_access; + uint16_t split, start, end, boundary; + const struct vgroup_split_metadata *metadata = metadatav + i; + + overlapping_access = metadata->overlaps && metadata->offset >= 0; + + /* Per vgroup start and end output vblock index in the source kvset */ + start = vgmap_vbidx_out_start(ks, i); + end = vgmap_vbidx_out_end(ks, i); + split = metadata->vblk_idx; + + log_debug("start=%u end=%u split=%u overlaps=%d offset=%jd overlapping_access=%d", start, end, split, metadata->overlaps, metadata->offset, overlapping_access); + + /* Add the vblocks in [boundary, end] to the right kvset */ + boundary = split; + for (uint16_t j = boundary; j <= end; j++) { + uint32_t alen; + uint64_t mbid; + + mbid = kvset_get_nth_vblock_id(ks, j); + + if (j == split && overlapping_access) { + off_t off; + uint64_t clone_mbid; + struct mblock_props props; + + off = metadata->offset; + off = off < PAGE_SIZE ? 0 : roundup(off - PAGE_SIZE, PAGE_SIZE); + + err = mpool_mblock_clone(ks->ks_mp, mbid, off, 0, &clone_mbid); + if (!err) { + err = blk_list_append(&blks_right->vblks, clone_mbid); + if (!err) + err = blk_list_append(result->ks[RIGHT].blks_commit, clone_mbid); + } + + if (err) + goto out; + + log_debug("Cloned mblock (0x%" PRIx64 ") starting at offset %jd", mbid, off); + + err = mpool_mblock_props_get(ks->ks_mp, clone_mbid, &props); + if (ev(err)) + goto out; + + perfc_rwc++; + if (perfc_ison(pc, PERFC_RA_CNCOMP_RBYTES) || + perfc_ison(pc, PERFC_RA_CNCOMP_WBYTES)) + perfc_rwb += props.mpr_write_len - off; + + alen = props.mpr_alloc_cap - VBLOCK_FOOTER_LEN; + } else { + err = blk_list_append(&blks_right->vblks, mbid); + if (err) + goto out; + + alen = kvset_get_nth_vblock_alen(ks, j); + } vbcnt++; - blks_left->bl_vtotal += kvset_get_nth_vblock_wlen(ks, j); + blks_right->bl_vtotal += alen; } if (vbcnt > 0) { - vbidx_left += vbcnt; + vbidx_right += vbcnt; - err = vgmap_vbidx_set(vgmap_src, end - 1, vgmap_left, vbidx_left - 1, vgidx_left); + err = vgmap_vbidx_set(vgmap_src, end, vgmap_right, vbidx_right - 1, vgidx_right); if (err) - return err; + goto out; - vgidx_left++; + vgidx_right++; } + /* Add vblocks in [start, boundary] to the left kvset + */ vbcnt = 0; /* reset vbcnt for the right kvset */ - if (overlap) { - /* Append a clone of the overlapping vblock to the right kvset */ - const uint64_t src_mbid = kvset_get_nth_vblock_id(ks, src_split); - uint64_t clone_mbid; - - err = mpool_mblock_clone(ks->ks_mp, src_mbid, 0, 0, &clone_mbid); - if (!err) { - err = blk_list_append(&blks_right->vblks, clone_mbid); - if (!err) - err = blk_list_append(result->ks[RIGHT].blks_commit, clone_mbid); - } + boundary = overlapping_access ? split : split - 1; + for (uint16_t j = start; j <= boundary; j++) { + uint32_t alen; + uint64_t mbid; - if (err) - return err; + mbid = kvset_get_nth_vblock_id(ks, j); - perfc_rwc++; - if (perfc_ison(pc, PERFC_RA_CNCOMP_RBYTES) || perfc_ison(pc, PERFC_RA_CNCOMP_WBYTES)) { + if (j == split) { + off_t off; + uint32_t wlen; struct mblock_props props; - err = mpool_mblock_props_get(ks->ks_mp, src_mbid, &props); - if (!ev(err)) - perfc_rwb += props.mpr_write_len; - else - err = 0; - } + /* Offset must be page aligned. Punching the rest of the vblock + * from the page aligned offset up to the vblock footer. + */ + off = roundup(metadata->offset, PAGE_SIZE); + wlen = kvset_get_nth_vblock_wlen(ks, j) - off; - vbcnt++; - blks_right->bl_vtotal += kvset_get_nth_vblock_wlen(ks, src_split); - src_split++; - } + err = mpool_mblock_punch(ks->ks_mp, mbid, off, wlen); + if (ev(err)) + goto out; - /* Add the remaining vblocks in [src_split, src_end] to the right kvset - */ - for (uint16_t j = src_split; j <= src_end; j++) { - err = blk_list_append(&blks_right->vblks, kvset_get_nth_vblock_id(ks, j)); - if (err) - return err; + log_debug("Punched mblock (0x%" PRIx64 ") starting at offset %jd for %u bytes", + mbid, off, wlen); + + err = mpool_mblock_props_get(ks->ks_mp, mbid, &props); + if (ev(err)) + goto out; + + alen = props.mpr_alloc_cap - VBLOCK_FOOTER_LEN; + } else { + alen = kvset_get_nth_vblock_alen(ks, j); + } + + err = blk_list_append(&blks_left->vblks, mbid); + if (ev(err)) + goto out; vbcnt++; - blks_right->bl_vtotal += kvset_get_nth_vblock_wlen(ks, j); + blks_left->bl_vtotal += alen; } if (vbcnt > 0) { - vbidx_right += vbcnt; + vbidx_left += vbcnt; - err = vgmap_vbidx_set(vgmap_src, src_end, vgmap_right, vbidx_right - 1, vgidx_right); + err = vgmap_vbidx_set(vgmap_src, boundary, vgmap_left, vbidx_left - 1, vgidx_left); if (err) - return err; + goto out; - vgidx_right++; + vgidx_left++; } } + /* Sanity check, so that we don't fall into these asserts elsewhere later + * on. + */ + assert(blks_left->bl_vtotal >= blks_left->bl_vused); + assert(blks_right->bl_vtotal >= blks_right->bl_vused); + if (nvgroups > 0) { assert(vgidx_left <= nvgroups); assert(vgidx_right <= nvgroups); @@ -587,7 +785,10 @@ vblocks_split( perfc_add2(pc, PERFC_RA_CNCOMP_WREQS, perfc_rwc, PERFC_RA_CNCOMP_WBYTES, perfc_rwb); } - return 0; +out: + free(metadatav); + + return err; } /** From f496dc457fb6e86bd2dcf4608e74d1ca86dc8519 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 13 Feb 2023 10:02:08 -0600 Subject: [PATCH 2/4] current status --- lib/cn/kvset_split.c | 285 +++++++++++++++++++++++-------------------- 1 file changed, 153 insertions(+), 132 deletions(-) diff --git a/lib/cn/kvset_split.c b/lib/cn/kvset_split.c index 54b1ea5df..5f4db5eff 100644 --- a/lib/cn/kvset_split.c +++ b/lib/cn/kvset_split.c @@ -4,6 +4,7 @@ */ #include + #include #include @@ -16,6 +17,7 @@ #include #include #include +#include #include #include @@ -396,7 +398,8 @@ kblocks_split( struct vgroup_split_metadata { bool overlaps; /**< Whether the vgroup overlaps the split key. */ - uint16_t vblk_idx; /**< Where the left kvset's vblocks end and the right kvset's vblocks begin */ + uint16_t + vblk_idx; /**< Where the left kvset's vblocks end and the right kvset's vblocks begin */ off_t offset; /**< Offset into the vblock where the access first occurs */ }; @@ -452,35 +455,11 @@ get_vblk_split_index( return v; } -static void -find_max_key_among_overlapping_vblocks( - const uint32_t nvgroups, - const struct vgroup_split_metadata *metadatav, - struct kvset *const ks, - struct key_obj *max_key) -{ - for (uint32_t i = 0; i < nvgroups; i++) { - struct key_obj curr_key = { 0 }; - const struct vblock_desc *vbd; - const struct vgroup_split_metadata *metadata = metadatav + i; - - if (!metadata->overlaps) - continue; - - vbd = kvset_get_nth_vblock_desc(ks, metadata->vblk_idx); - - key2kobj(&curr_key, vbd->vbd_mblkdesc->map_base + vbd->vbd_max_koff, vbd->vbd_max_klen); - - if (key_obj_cmp(max_key, &curr_key) < 0) - *max_key = curr_key; - } -} - static merr_t mark_vgroup_accesses( const uint32_t nvgroups, struct vgroup_split_metadata *metadatav, - struct kvset *const ks, + struct kvset * const ks, const struct key_obj *split_key, const struct key_obj *max_key) { @@ -527,8 +506,9 @@ mark_vgroup_accesses( if (key_obj_cmp(&curr_key, max_key) > 0) break; - while (kvset_iter_next_vref(iter, &vc, &seqno, &vtype, &vbidx, &vboff, &vdata, &vlen, - &complen)) { + while (kvset_iter_next_vref( + iter, &vc, &seqno, &vtype, &vbidx, &vboff, &vdata, &vlen, &complen)) + { uint64_t vgidx; const struct vblock_desc *vbd; struct vgroup_split_metadata *metadata; @@ -545,7 +525,7 @@ mark_vgroup_accesses( metadata = metadatav + vgidx; if (metadata->offset == -1) { metadata->offset = vboff; - + log_debug("vgidx=%lu vbidx=%u offset=%u", vgidx, vbidx, vboff); /* Exit because we have marked the offset for all vgroups */ if (++accesses == nvgroups) goto out; @@ -580,6 +560,7 @@ vblocks_split( struct kvset_split_res *result) { struct key_obj max_key; + char buf[HSE_KVS_KEY_LEN_MAX + 1]; struct vgroup_split_metadata *metadatav; struct vgmap *vgmap_src = ks->ks_vgmap; struct vgmap *vgmap_left = work[LEFT].vgmap; @@ -594,21 +575,18 @@ vblocks_split( uint64_t perfc_rwb = 0; merr_t err; - log_debug("splitting"); + log_debug("splitting %d vgroups", nvgroups); if (move_left && move_right) { assert(nvgroups == 0); return 0; } - metadatav = calloc(nvgroups, sizeof(*metadatav)); + metadatav = malloc(nvgroups * sizeof(*metadatav)); if (ev(!metadatav)) return merr(ENOMEM); - for (uint32_t i = 0; i < nvgroups; i++) { - /* Negative offset implies overlapping vblock was not accessed */ - metadatav[i].offset = -1; - } + max_key = *split_key; for (uint32_t i = 0; i < nvgroups; i++) { uint16_t start, end; struct vgroup_split_metadata *metadata = metadatav + i; @@ -617,22 +595,44 @@ vblocks_split( start = vgmap_vbidx_out_start(ks, i); end = vgmap_vbidx_out_end(ks, i); + /* Negative offset implies overlapping vblock was not accessed */ + metadata->offset = -1; if (move_left || move_right) { + log_debug("move_left=%d move_right=%d", move_left, move_right); metadata->vblk_idx = move_right ? start : end; metadata->overlaps = false; } else { - metadata->vblk_idx = get_vblk_split_index(ks, start, end, split_key, - &metadata->overlaps); + metadata->vblk_idx = + get_vblk_split_index(ks, start, end, split_key, &metadata->overlaps); + if (metadata->overlaps) { + struct key_obj curr_key; + const struct vblock_desc *vbd; + + vbd = kvset_get_nth_vblock_desc(ks, metadata->vblk_idx); + + key2kobj( + &curr_key, vbd->vbd_mblkdesc->map_base + vbd->vbd_max_koff, vbd->vbd_max_klen); + + fmt_hexp( + buf, sizeof(buf), curr_key.ko_sfx, curr_key.ko_sfx_len, "0x", 4, "-", "\0"); + log_debug("vgidx=%u vblkidx=%u maxkey=%s", i, metadata->vblk_idx, buf); + if (key_obj_cmp(&max_key, &curr_key) < 0) + max_key = curr_key; + } } assert(metadata->vblk_idx >= start && metadata->vblk_idx <= end + 1); } - max_key = *split_key; - find_max_key_among_overlapping_vblocks(nvgroups, metadatav, ks, &max_key); + fmt_hexp(buf, sizeof(buf), split_key->ko_sfx, split_key->ko_sfx_len, "0x", 4, "-", "\0"); + log_debug("split key: %s", buf); + fmt_hexp(buf, sizeof(buf), max_key.ko_sfx, max_key.ko_sfx_len, "0x", 4, "-", "\0"); + log_debug("max key: %s", buf); - err = mark_vgroup_accesses(nvgroups, metadatav, ks, split_key, &max_key); - if (ev(err)) - goto out; + if (!(move_left || move_right)) { + err = mark_vgroup_accesses(nvgroups, metadatav, ks, split_key, &max_key); + if (ev(err)) + goto out; + } for (uint32_t i = 0; i < nvgroups; i++) { uint32_t vbcnt = 0; @@ -647,124 +647,145 @@ vblocks_split( end = vgmap_vbidx_out_end(ks, i); split = metadata->vblk_idx; - log_debug("start=%u end=%u split=%u overlaps=%d offset=%jd overlapping_access=%d", start, end, split, metadata->overlaps, metadata->offset, overlapping_access); - - /* Add the vblocks in [boundary, end] to the right kvset */ - boundary = split; - for (uint16_t j = boundary; j <= end; j++) { - uint32_t alen; - uint64_t mbid; + log_debug( + "start=%u end=%u split=%u overlaps=%d offset=%jd overlapping_access=%d", start, end, + split, metadata->overlaps, metadata->offset, overlapping_access); + + if (!move_left) { + /* Add the vblocks in [boundary, end] to the right kvset */ + boundary = overlapping_access ? split : split + 1; + for (uint16_t j = boundary; j <= end; j++) { + uint32_t alen; + uint64_t mbid; + + mbid = kvset_get_nth_vblock_id(ks, j); + + if (j == split && overlapping_access) { + off_t off; + uint64_t clone_mbid; + struct mblock_props props; + + /* We want clone more than enough data in the event the + * offset doesn't fall right on a page boundary. + */ + off = metadata->offset; + off = off < PAGE_SIZE ? 0 : roundup(off - PAGE_SIZE, PAGE_SIZE); + + err = mpool_mblock_clone(ks->ks_mp, mbid, off, 0, &clone_mbid); + if (!err) { + err = blk_list_append(&blks_right->vblks, clone_mbid); + if (!err) + err = blk_list_append(result->ks[RIGHT].blks_commit, clone_mbid); + } + + if (err) + goto out; - mbid = kvset_get_nth_vblock_id(ks, j); + log_debug("Cloned mblock (0x%" PRIx64 ") starting at offset %jd", mbid, off); - if (j == split && overlapping_access) { - off_t off; - uint64_t clone_mbid; - struct mblock_props props; + err = mpool_mblock_props_get(ks->ks_mp, clone_mbid, &props); + if (ev(err)) + goto out; - off = metadata->offset; - off = off < PAGE_SIZE ? 0 : roundup(off - PAGE_SIZE, PAGE_SIZE); + perfc_rwc++; + if (perfc_ison(pc, PERFC_RA_CNCOMP_RBYTES) || + perfc_ison(pc, PERFC_RA_CNCOMP_WBYTES)) + perfc_rwb += props.mpr_write_len - off; + + alen = props.mpr_alloc_cap - VBLOCK_FOOTER_LEN; + log_debug( + "right: cloned id=0x%lx vgidx=%u vblkidx=%u alen=%u", clone_mbid, i, j, + alen); + } else { + err = blk_list_append(&blks_right->vblks, mbid); + if (err) + goto out; - err = mpool_mblock_clone(ks->ks_mp, mbid, off, 0, &clone_mbid); - if (!err) { - err = blk_list_append(&blks_right->vblks, clone_mbid); - if (!err) - err = blk_list_append(result->ks[RIGHT].blks_commit, clone_mbid); + alen = kvset_get_nth_vblock_alen(ks, j); + log_debug( + "right: moved id=0x%lx vgidx=%u vblkidx=%u alen=%u", mbid, i, j, alen); } - if (err) - goto out; - - log_debug("Cloned mblock (0x%" PRIx64 ") starting at offset %jd", mbid, off); - - err = mpool_mblock_props_get(ks->ks_mp, clone_mbid, &props); - if (ev(err)) - goto out; + vbcnt++; + blks_right->bl_vtotal += alen; + } - perfc_rwc++; - if (perfc_ison(pc, PERFC_RA_CNCOMP_RBYTES) || - perfc_ison(pc, PERFC_RA_CNCOMP_WBYTES)) - perfc_rwb += props.mpr_write_len - off; + if (vbcnt > 0) { + vbidx_right += vbcnt; - alen = props.mpr_alloc_cap - VBLOCK_FOOTER_LEN; - } else { - err = blk_list_append(&blks_right->vblks, mbid); + err = vgmap_vbidx_set(vgmap_src, end, vgmap_right, vbidx_right - 1, vgidx_right); if (err) goto out; - alen = kvset_get_nth_vblock_alen(ks, j); + vgidx_right++; } - - vbcnt++; - blks_right->bl_vtotal += alen; - } - - if (vbcnt > 0) { - vbidx_right += vbcnt; - - err = vgmap_vbidx_set(vgmap_src, end, vgmap_right, vbidx_right - 1, vgidx_right); - if (err) - goto out; - - vgidx_right++; } - /* Add vblocks in [start, boundary] to the left kvset - */ - vbcnt = 0; /* reset vbcnt for the right kvset */ - boundary = overlapping_access ? split : split - 1; - for (uint16_t j = start; j <= boundary; j++) { - uint32_t alen; - uint64_t mbid; - - mbid = kvset_get_nth_vblock_id(ks, j); - - if (j == split) { - off_t off; - uint32_t wlen; - struct mblock_props props; + if (!move_right) { + /* Add vblocks in [start, boundary] to the left kvset */ + vbcnt = 0; /* reset vbcnt for the left kvset */ + boundary = split; + for (uint16_t j = start; j <= boundary; j++) { + uint32_t alen; + uint64_t mbid; + + mbid = kvset_get_nth_vblock_id(ks, j); + + if (j == split && overlapping_access) { + off_t off; + uint32_t wlen; + struct mblock_props props; + + /* Offset must be page aligned. Punching the rest of the + * vblock from the page aligned offset up to the vblock + * footer. + */ + off = roundup(metadata->offset, PAGE_SIZE); + wlen = kvset_get_nth_vblock_wlen(ks, j) - off; + + err = mpool_mblock_punch(ks->ks_mp, mbid, off, wlen); + if (ev(err)) + goto out; - /* Offset must be page aligned. Punching the rest of the vblock - * from the page aligned offset up to the vblock footer. - */ - off = roundup(metadata->offset, PAGE_SIZE); - wlen = kvset_get_nth_vblock_wlen(ks, j) - off; + log_debug( + "Punched mblock (0x%" PRIx64 ") starting at offset %jd for %u bytes", mbid, + off, wlen); - err = mpool_mblock_punch(ks->ks_mp, mbid, off, wlen); - if (ev(err)) - goto out; + err = mpool_mblock_props_get(ks->ks_mp, mbid, &props); + if (ev(err)) + goto out; - log_debug("Punched mblock (0x%" PRIx64 ") starting at offset %jd for %u bytes", - mbid, off, wlen); + alen = props.mpr_alloc_cap - VBLOCK_FOOTER_LEN; + log_debug( + "left: punched id=0x%lx vgidx=%u vblkidx=%u alen=%u", mbid, i, j, alen); + } else { + alen = kvset_get_nth_vblock_alen(ks, j); + log_debug("left: moved id=0x%lx vgidx=%u vblkidx=%u alen=%u", mbid, i, j, alen); + } - err = mpool_mblock_props_get(ks->ks_mp, mbid, &props); + err = blk_list_append(&blks_left->vblks, mbid); if (ev(err)) goto out; - alen = props.mpr_alloc_cap - VBLOCK_FOOTER_LEN; - } else { - alen = kvset_get_nth_vblock_alen(ks, j); + vbcnt++; + blks_left->bl_vtotal += alen; } - err = blk_list_append(&blks_left->vblks, mbid); - if (ev(err)) - goto out; + if (vbcnt > 0) { + vbidx_left += vbcnt; - vbcnt++; - blks_left->bl_vtotal += alen; - } - - if (vbcnt > 0) { - vbidx_left += vbcnt; - - err = vgmap_vbidx_set(vgmap_src, boundary, vgmap_left, vbidx_left - 1, vgidx_left); - if (err) - goto out; + err = vgmap_vbidx_set(vgmap_src, boundary, vgmap_left, vbidx_left - 1, vgidx_left); + if (err) + goto out; - vgidx_left++; + vgidx_left++; + } } } + log_debug("left: vtotal=%lu vused=%lu", blks_left->bl_vtotal, blks_left->bl_vused); + log_debug("right: vtotal=%lu vused=%lu", blks_right->bl_vtotal, blks_right->bl_vused); + /* Sanity check, so that we don't fall into these asserts elsewhere later * on. */ From 20010dc4ce2a5ebff4e8589e08b14b1788ffb41b Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 13 Feb 2023 17:10:57 -0600 Subject: [PATCH 3/4] fix move right and read off the end of array --- lib/cn/kvset_split.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/cn/kvset_split.c b/lib/cn/kvset_split.c index 5f4db5eff..e78567a4b 100644 --- a/lib/cn/kvset_split.c +++ b/lib/cn/kvset_split.c @@ -653,7 +653,7 @@ vblocks_split( if (!move_left) { /* Add the vblocks in [boundary, end] to the right kvset */ - boundary = overlapping_access ? split : split + 1; + boundary = (overlapping_access || move_right) ? split : split + 1; for (uint16_t j = boundary; j <= end; j++) { uint32_t alen; uint64_t mbid; @@ -724,7 +724,7 @@ vblocks_split( if (!move_right) { /* Add vblocks in [start, boundary] to the left kvset */ vbcnt = 0; /* reset vbcnt for the left kvset */ - boundary = split; + boundary = min(split, end); for (uint16_t j = start; j <= boundary; j++) { uint32_t alen; uint64_t mbid; From 74e287592b19d0112ab926ba8af622d0d821a29f Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 14 Feb 2023 00:50:31 -0600 Subject: [PATCH 4/4] Works? --- lib/cn/kvset_split.c | 106 ++++++++++++++++++++++++------------------- 1 file changed, 59 insertions(+), 47 deletions(-) diff --git a/lib/cn/kvset_split.c b/lib/cn/kvset_split.c index e78567a4b..76b24d7dd 100644 --- a/lib/cn/kvset_split.c +++ b/lib/cn/kvset_split.c @@ -409,7 +409,7 @@ struct vgroup_split_metadata { * * Return values: * v >= start and overlap = false: left: [start, v - 1], right [v, end] - * v >= start and overlap = true : left: [start, punched(v)], right [clone(v), end] + * v >= start and overlap = true : left: [start, punch(v)], right [clone(v), end] * * NOTES: * v = start and overlap = false: All vblocks go to the right @@ -652,59 +652,67 @@ vblocks_split( split, metadata->overlaps, metadata->offset, overlapping_access); if (!move_left) { - /* Add the vblocks in [boundary, end] to the right kvset */ - boundary = (overlapping_access || move_right) ? split : split + 1; - for (uint16_t j = boundary; j <= end; j++) { + // Clone a vblock with an overlapping access + if (overlapping_access) { + off_t off; uint32_t alen; - uint64_t mbid; + struct mblock_props props; + uint64_t src_mbid, cloned_mbid; - mbid = kvset_get_nth_vblock_id(ks, j); + src_mbid = kvset_get_nth_vblock_id(ks, split); - if (j == split && overlapping_access) { - off_t off; - uint64_t clone_mbid; - struct mblock_props props; + /* We want clone more than enough data in the event the offset + * doesn't fall right on a page boundary. + */ + off = metadata->offset; + off = off < PAGE_SIZE ? 0 : roundup(off - PAGE_SIZE, PAGE_SIZE); + + err = mpool_mblock_clone(ks->ks_mp, src_mbid, off, 0, &cloned_mbid); + if (!err) { + err = blk_list_append(&blks_right->vblks, cloned_mbid); + if (!err) + err = blk_list_append(result->ks[RIGHT].blks_commit, cloned_mbid); + } - /* We want clone more than enough data in the event the - * offset doesn't fall right on a page boundary. - */ - off = metadata->offset; - off = off < PAGE_SIZE ? 0 : roundup(off - PAGE_SIZE, PAGE_SIZE); + if (err) + goto out; - err = mpool_mblock_clone(ks->ks_mp, mbid, off, 0, &clone_mbid); - if (!err) { - err = blk_list_append(&blks_right->vblks, clone_mbid); - if (!err) - err = blk_list_append(result->ks[RIGHT].blks_commit, clone_mbid); - } + log_debug("Cloned mblock (0x%" PRIx64 ") starting at offset %jd", src_mbid, off); - if (err) - goto out; + err = mpool_mblock_props_get(ks->ks_mp, cloned_mbid, &props); + if (ev(err)) + goto out; - log_debug("Cloned mblock (0x%" PRIx64 ") starting at offset %jd", mbid, off); + perfc_rwc++; + if (perfc_ison(pc, PERFC_RA_CNCOMP_RBYTES) || + perfc_ison(pc, PERFC_RA_CNCOMP_WBYTES)) + perfc_rwb += props.mpr_write_len - off; - err = mpool_mblock_props_get(ks->ks_mp, clone_mbid, &props); - if (ev(err)) - goto out; + alen = props.mpr_alloc_cap - VBLOCK_FOOTER_LEN; + log_debug( + "right: cloned id=0x%lx vgidx=%u vblkidx=%u alen=%u", cloned_mbid, i, split, + alen); + + vbcnt++; + blks_right->bl_vtotal += alen; + boundary = split + 1; + } else { + boundary = metadata->overlaps ? split + 1 : split; + } - perfc_rwc++; - if (perfc_ison(pc, PERFC_RA_CNCOMP_RBYTES) || - perfc_ison(pc, PERFC_RA_CNCOMP_WBYTES)) - perfc_rwb += props.mpr_write_len - off; + /* Add the vblocks in [boundary, end] to the right kvset */ + for (uint16_t j = boundary; j <= end; j++) { + uint32_t alen; + uint64_t mbid; - alen = props.mpr_alloc_cap - VBLOCK_FOOTER_LEN; - log_debug( - "right: cloned id=0x%lx vgidx=%u vblkidx=%u alen=%u", clone_mbid, i, j, - alen); - } else { - err = blk_list_append(&blks_right->vblks, mbid); - if (err) - goto out; + mbid = kvset_get_nth_vblock_id(ks, j); - alen = kvset_get_nth_vblock_alen(ks, j); - log_debug( - "right: moved id=0x%lx vgidx=%u vblkidx=%u alen=%u", mbid, i, j, alen); - } + err = blk_list_append(&blks_right->vblks, mbid); + if (err) + goto out; + + alen = kvset_get_nth_vblock_alen(ks, j); + log_debug("right: moved id=0x%lx vgidx=%u vblkidx=%u alen=%u", mbid, i, j, alen); vbcnt++; blks_right->bl_vtotal += alen; @@ -722,10 +730,13 @@ vblocks_split( } if (!move_right) { - /* Add vblocks in [start, boundary] to the left kvset */ + /* Add vblocks in [start, boundary) to the left kvset */ vbcnt = 0; /* reset vbcnt for the left kvset */ - boundary = min(split, end); - for (uint16_t j = start; j <= boundary; j++) { + boundary = + (overlapping_access || move_left || (metadata->overlaps && metadata->offset == -1)) + ? split + 1 + : split; + for (uint16_t j = start; j < boundary; j++) { uint32_t alen; uint64_t mbid; @@ -774,7 +785,8 @@ vblocks_split( if (vbcnt > 0) { vbidx_left += vbcnt; - err = vgmap_vbidx_set(vgmap_src, boundary, vgmap_left, vbidx_left - 1, vgidx_left); + err = vgmap_vbidx_set( + vgmap_src, boundary - 1, vgmap_left, vbidx_left - 1, vgidx_left); if (err) goto out;