diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 4fb925e8c981d8..fa8515598341ec 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -78,6 +78,32 @@ config BTRFS_ASSERT If unsure, say N. +config BTRFS_EXPERIMENTAL + bool "Btrfs experimental features" + depends on BTRFS_FS + default n + help + Enable experimental features. These features may not be stable enough + for end users. This is meant for btrfs developers or users who wish + to test the functionality and report problems. + + Current list: + + - extent map shrinker - performance problems with too frequent shrinks + + - send stream protocol v3 - fs-verity support + + - checksum offload mode - sysfs knob to affect when checksums are + calculated (at IO time, or in a thread) + + - raid-stripe-tree - additional mapping of extents to devices to + support RAID1* profiles on zoned devices, + RAID56 not yet supported + + - extent tree v2 - complex rework of extent tracking + + If unsure, say N. + config BTRFS_FS_REF_VERIFY bool "Btrfs with the ref verify tool compiled in" depends on BTRFS_FS diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 87617f2968bccf..3cfc440c636ca3 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -43,4 +43,5 @@ btrfs-$(CONFIG_FS_VERITY) += verity.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \ - tests/free-space-tree-tests.o tests/extent-map-tests.o + tests/free-space-tree-tests.o tests/extent-map-tests.o \ + tests/raid-stripe-tree-tests.o diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index fec5c6cde0a7f1..1f216d07eff65c 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -49,6 +49,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, bbio->end_io = end_io; bbio->private = private; atomic_set(&bbio->pending_ios, 1); + WRITE_ONCE(bbio->status, BLK_STS_OK); } /* @@ -113,41 +114,29 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio) } } -static void btrfs_orig_write_end_io(struct bio *bio); - -static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, - struct btrfs_bio *orig_bbio) -{ - /* - * For writes we tolerate nr_mirrors - 1 write failures, so we can't - * just blindly propagate a write failure here. Instead increment the - * error count in the original I/O context so that it is guaranteed to - * be larger than the error tolerance. - */ - if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { - struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; - struct btrfs_io_context *orig_bioc = orig_stripe->bioc; - - atomic_add(orig_bioc->max_errors, &orig_bioc->error); - } else { - orig_bbio->bio.bi_status = bbio->bio.bi_status; - } -} - void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { bbio->bio.bi_status = status; if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private; - if (bbio->bio.bi_status) - btrfs_bbio_propagate_error(bbio, orig_bbio); btrfs_cleanup_bio(bbio); bbio = orig_bbio; } - if (atomic_dec_and_test(&bbio->pending_ios)) + /* + * At this point, bbio always points to the original btrfs_bio. Save + * the first error in it. + */ + if (status != BLK_STS_OK) + cmpxchg(&bbio->status, BLK_STS_OK, status); + + if (atomic_dec_and_test(&bbio->pending_ios)) { + /* Load split bio's error which might be set above. */ + if (status == BLK_STS_OK) + bbio->bio.bi_status = READ_ONCE(bbio->status); __btrfs_bio_end_io(bbio); + } } static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) @@ -598,7 +587,7 @@ static bool should_async_write(struct btrfs_bio *bbio) { bool auto_csum_mode = true; -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices; enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode); diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index e4861234074588..e2fe16074ad655 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -79,6 +79,9 @@ struct btrfs_bio { /* File system that this I/O operates on. */ struct btrfs_fs_info *fs_info; + /* Save the first error status of split bio. */ + blk_status_t status; + /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 7980b2e33a9216..4427c1b835e8b2 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2797,7 +2797,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) * uncompressed data size, because the compression is only done * when writeback triggered and we don't know how much space we * are actually going to need, so we reserve the uncompressed - * size because the data may be uncompressible in the worst case. + * size because the data may be incompressible in the worst case. */ if (ret == 0) { bool used; @@ -3819,6 +3819,8 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, spin_lock(&cache->lock); if (cache->ro) space_info->bytes_readonly += num_bytes; + else if (btrfs_is_zoned(cache->fs_info)) + space_info->bytes_zone_unusable += num_bytes; cache->reserved -= num_bytes; space_info->bytes_reserved -= num_bytes; space_info->max_extent_size = 0; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index e152fde888fc9a..157fd3f4cb333a 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -577,7 +577,6 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state struct extent_state *other); void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split); -void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); void btrfs_evict_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); @@ -613,8 +612,7 @@ int btrfs_writepage_cow_fixup(struct folio *folio); int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type); int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, + u64 disk_bytenr, u64 disk_io_size, struct page **pages); ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, struct btrfs_ioctl_encoded_io_args *encoded); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 90aef2627ca27b..21d73ea594a999 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -545,8 +545,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, * subpage::readers and to unlock the page. */ if (fs_info->sectorsize < PAGE_SIZE) - btrfs_subpage_start_reader(fs_info, folio, cur, - add_size); + btrfs_folio_set_lock(fs_info, folio, cur, add_size); folio_put(folio); cur += add_size; } @@ -702,7 +701,7 @@ static void free_heuristic_ws(struct list_head *ws) kfree(workspace); } -static struct list_head *alloc_heuristic_ws(unsigned int level) +static struct list_head *alloc_heuristic_ws(void) { struct heuristic_ws *ws; @@ -744,9 +743,9 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { static struct list_head *alloc_workspace(int type, unsigned int level) { switch (type) { - case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level); + case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(); case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level); - case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(level); + case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(); case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level); default: /* @@ -1030,6 +1029,7 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping { int type = btrfs_compress_type(type_level); int level = btrfs_compress_level(type_level); + const unsigned long orig_len = *total_out; struct list_head *workspace; int ret; @@ -1037,6 +1037,8 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping workspace = get_workspace(type, level); ret = compression_compress_pages(type, workspace, mapping, start, folios, out_folios, total_in, total_out); + /* The total read-in bytes should be no larger than the input. */ + ASSERT(*total_in <= orig_len); put_workspace(type, workspace); return ret; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index b6563b6a333e5d..954034086d0d44 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -175,7 +175,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); -struct list_head *lzo_alloc_workspace(unsigned int level); +struct list_head *lzo_alloc_workspace(void); void lzo_free_workspace(struct list_head *ws); int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0cc919d15b1441..cf791e5cf229eb 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1508,26 +1508,26 @@ static noinline void unlock_up(struct btrfs_path *path, int level, */ static int read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, - struct extent_buffer **eb_ret, int level, int slot, + struct extent_buffer **eb_ret, int slot, const struct btrfs_key *key) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_tree_parent_check check = { 0 }; u64 blocknr; - u64 gen; - struct extent_buffer *tmp; - int ret; + struct extent_buffer *tmp = NULL; + int ret = 0; int parent_level; - bool unlock_up; + int err; + bool read_tmp = false; + bool tmp_locked = false; + bool path_released = false; - unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]); blocknr = btrfs_node_blockptr(*eb_ret, slot); - gen = btrfs_node_ptr_generation(*eb_ret, slot); parent_level = btrfs_header_level(*eb_ret); btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot); check.has_first_key = true; check.level = parent_level - 1; - check.transid = gen; + check.transid = btrfs_node_ptr_generation(*eb_ret, slot); check.owner_root = btrfs_root_id(root); /* @@ -1540,79 +1540,115 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { if (p->reada == READA_FORWARD_ALWAYS) - reada_for_search(fs_info, p, level, slot, key->objectid); + reada_for_search(fs_info, p, parent_level, slot, key->objectid); /* first we do an atomic uptodate check */ - if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { + if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) { /* * Do extra check for first_key, eb can be stale due to * being cached, read from scrub, or have multiple * parents (shared tree blocks). */ - if (btrfs_verify_level_key(tmp, - parent_level - 1, &check.first_key, gen)) { - free_extent_buffer(tmp); - return -EUCLEAN; + if (btrfs_verify_level_key(tmp, &check)) { + ret = -EUCLEAN; + goto out; } *eb_ret = tmp; - return 0; + tmp = NULL; + ret = 0; + goto out; } if (p->nowait) { - free_extent_buffer(tmp); - return -EAGAIN; + ret = -EAGAIN; + goto out; } - if (unlock_up) - btrfs_unlock_up_safe(p, level + 1); - - /* now we're allowed to do a blocking uptodate check */ - ret = btrfs_read_extent_buffer(tmp, &check); - if (ret) { - free_extent_buffer(tmp); + if (!p->skip_locking) { + btrfs_unlock_up_safe(p, parent_level + 1); + tmp_locked = true; + btrfs_tree_read_lock(tmp); btrfs_release_path(p); - return ret; + ret = -EAGAIN; + path_released = true; } - if (unlock_up) - ret = -EAGAIN; + /* Now we're allowed to do a blocking uptodate check. */ + err = btrfs_read_extent_buffer(tmp, &check); + if (err) { + ret = err; + goto out; + } + if (ret == 0) { + ASSERT(!tmp_locked); + *eb_ret = tmp; + tmp = NULL; + } goto out; } else if (p->nowait) { - return -EAGAIN; + ret = -EAGAIN; + goto out; } - if (unlock_up) { - btrfs_unlock_up_safe(p, level + 1); + if (!p->skip_locking) { + btrfs_unlock_up_safe(p, parent_level + 1); ret = -EAGAIN; - } else { - ret = 0; } if (p->reada != READA_NONE) - reada_for_search(fs_info, p, level, slot, key->objectid); + reada_for_search(fs_info, p, parent_level, slot, key->objectid); - tmp = read_tree_block(fs_info, blocknr, &check); + tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level); if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); + tmp = NULL; + goto out; + } + read_tmp = true; + + if (!p->skip_locking) { + ASSERT(ret == -EAGAIN); + tmp_locked = true; + btrfs_tree_read_lock(tmp); btrfs_release_path(p); - return PTR_ERR(tmp); + path_released = true; + } + + /* Now we're allowed to do a blocking uptodate check. */ + err = btrfs_read_extent_buffer(tmp, &check); + if (err) { + ret = err; + goto out; } + /* * If the read above didn't mark this buffer up to date, * it will never end up being up to date. Set ret to EIO now * and give up so that our caller doesn't loop forever * on our EAGAINs. */ - if (!extent_buffer_uptodate(tmp)) + if (!extent_buffer_uptodate(tmp)) { ret = -EIO; + goto out; + } -out: if (ret == 0) { + ASSERT(!tmp_locked); *eb_ret = tmp; - } else { - free_extent_buffer(tmp); - btrfs_release_path(p); + tmp = NULL; + } +out: + if (tmp) { + if (tmp_locked) + btrfs_tree_read_unlock(tmp); + if (read_tmp && ret && ret != -EAGAIN) + free_extent_buffer_stale(tmp); + else + free_extent_buffer(tmp); } + if (ret && !path_released) + btrfs_release_path(p); return ret; } @@ -2197,8 +2233,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, goto done; } - err = read_block_for_search(root, p, &b, level, slot, key); - if (err == -EAGAIN) + err = read_block_for_search(root, p, &b, slot, key); + if (err == -EAGAIN && !p->nowait) goto again; if (err) { ret = err; @@ -2324,8 +2360,8 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, goto done; } - err = read_block_for_search(root, p, &b, level, slot, key); - if (err == -EAGAIN) + err = read_block_for_search(root, p, &b, slot, key); + if (err == -EAGAIN && !p->nowait) goto again; if (err) { ret = err; @@ -2334,7 +2370,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, level = btrfs_header_level(b); btrfs_tree_read_lock(b); - b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq); + b = btrfs_tree_mod_log_rewind(fs_info, b, time_seq); if (!b) { ret = -ENOMEM; goto done; @@ -3863,6 +3899,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && + key.type != BTRFS_RAID_STRIPE_KEY && key.type != BTRFS_EXTENT_CSUM_KEY); if (btrfs_leaf_free_space(leaf) >= ins_len) @@ -4930,8 +4967,7 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, } next = c; - ret = read_block_for_search(root, path, &next, level, - slot, &key); + ret = read_block_for_search(root, path, &next, slot, &key); if (ret == -EAGAIN && !path->nowait) goto again; @@ -4974,8 +5010,7 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, if (!level) break; - ret = read_block_for_search(root, path, &next, level, - 0, &key); + ret = read_block_for_search(root, path, &next, 0, &key); if (ret == -EAGAIN && !path->nowait) goto again; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 115b90d29b1d8e..04586aa1191763 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -830,7 +830,6 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, qrecord->data_rsv = reserved; qrecord->data_rsv_refroot = generic_ref->ref_root; } - qrecord->bytenr = generic_ref->bytenr; qrecord->num_bytes = generic_ref->num_bytes; qrecord->old_roots = NULL; } @@ -860,11 +859,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, if (qrecord) { int ret; - ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord); + ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord, + head_ref->bytenr); if (ret) { /* Clean up if insertion fails or item exists. */ xa_release(&delayed_refs->dirty_extents, - qrecord->bytenr >> fs_info->sectorsize_bits); + head_ref->bytenr >> fs_info->sectorsize_bits); /* Caller responsible for freeing qrecord on error. */ if (ret < 0) return ERR_PTR(ret); @@ -1074,7 +1074,7 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_delayed_ref_node_cachep, node); if (qrecord_inserted) - return btrfs_qgroup_trace_extent_post(trans, record); + return btrfs_qgroup_trace_extent_post(trans, record, generic_ref->bytenr); return 0; free_record: diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 83d5cdd77f293e..ac8e97ed13f751 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -45,7 +45,7 @@ * * - Copy existing extents * - * This happens by re-using scrub facility, as scrub also iterates through + * This happens by reusing scrub facility, as scrub also iterates through * existing extents from commit root. * * Location: scrub_write_block_to_dev_replace() from @@ -641,6 +641,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return ret; down_write(&dev_replace->rwsem); + dev_replace->replace_task = current; switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -994,6 +995,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list); fs_devices->rw_devices++; + dev_replace->replace_task = NULL; up_write(&dev_replace->rwsem); btrfs_rm_dev_replace_blocked(fs_info); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 001c0c2f872c99..d3093eba54a553 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -27,7 +27,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle const char *name, int name_len) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *ptr; struct extent_buffer *leaf; @@ -35,7 +34,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); if (ret == -EEXIST) { struct btrfs_dir_item *di; - di = btrfs_match_dir_item_name(fs_info, path, name, name_len); + di = btrfs_match_dir_item_name(path, name, name_len); if (di) return ERR_PTR(-EEXIST); btrfs_extend_item(trans, path, data_size); @@ -190,7 +189,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir( if (ret > 0) return ERR_PTR(-ENOENT); - return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); + return btrfs_match_dir_item_name(path, name, name_len); } /* @@ -341,8 +340,7 @@ btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) break; - di = btrfs_match_dir_item_name(root->fs_info, path, - name->name, name->len); + di = btrfs_match_dir_item_name(path, name->name, name->len); if (di) return di; } @@ -378,8 +376,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, * this walks through all the entries in a dir item and finds one * for a specific name. */ -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, - const struct btrfs_path *path, +struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path, const char *name, int name_len) { struct btrfs_dir_item *dir_item; diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index 5f6dfafc91f1fe..28d69970bc709f 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -44,8 +44,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 dir, const char *name, u16 name_len, int mod); -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, - const struct btrfs_path *path, +struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path, const char *name, int name_len); diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index bd38df5647e35b..a7c3e221378db7 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -834,7 +834,7 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) return ret; } - ret = btrfs_write_check(iocb, from, ret); + ret = btrfs_write_check(iocb, ret); if (ret < 0) { btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto out; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4ad5db619b008d..bcf6e53bc2a74c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -917,8 +917,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, return ERR_PTR(ret); } -static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static struct btrfs_root *alloc_log_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; @@ -966,7 +965,7 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, { struct btrfs_root *log_root; - log_root = alloc_log_tree(trans, fs_info); + log_root = alloc_log_tree(fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); @@ -992,7 +991,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item; int ret; - log_root = alloc_log_tree(trans, fs_info); + log_root = alloc_log_tree(fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); @@ -1959,7 +1958,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) fs_info->qgroup_seq = 1; fs_info->qgroup_ulist = NULL; fs_info->qgroup_rescan_running = false; - fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; + fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; mutex_init(&fs_info->qgroup_rescan_lock); } @@ -2786,6 +2785,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) btrfs_init_scrub(fs_info); btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(fs_info); + btrfs_init_extent_map_shrinker_work(fs_info); rwlock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT_CACHED; @@ -2852,8 +2852,6 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (ret) return ret; - spin_lock_init(&fs_info->extent_map_shrinker_lock); - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) return ret; @@ -3202,8 +3200,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) return 0; } -int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, - const char *options) +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices) { u32 sectorsize; u32 nodesize; @@ -4294,6 +4291,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); + cancel_work_sync(&fs_info->em_shrinker_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 99af64d3f27781..127e31e0834709 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -52,8 +52,7 @@ struct extent_buffer *btrfs_find_create_tree_block( int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, const struct btrfs_super_block *disk_sb); -int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, - const char *options); +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices); void __cold close_ctree(struct btrfs_fs_info *fs_info); int btrfs_validate_super(const struct btrfs_fs_info *fs_info, const struct btrfs_super_block *sb, int mirror_num); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d9f511babd89ab..e29024c66edbf9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3144,7 +3144,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, break; } - /* Quick path didn't find the EXTEMT/METADATA_ITEM */ + /* Quick path didn't find the EXTENT/METADATA_ITEM */ if (path->slots[0] - extent_slot > 5) break; extent_slot--; @@ -5270,7 +5270,7 @@ struct walk_control { * corrupted file systems must have been caught before calling this function. */ static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc, - struct extent_buffer *eb, u64 refs, u64 flags, int slot) + struct extent_buffer *eb, u64 flags, int slot) { struct btrfs_key key; u64 generation; @@ -5384,7 +5384,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; /* If we don't need to visit this node don't reada. */ - if (!visit_node_for_delete(root, wc, eb, refs, flags, slot)) + if (!visit_node_for_delete(root, wc, eb, flags, slot)) continue; reada: btrfs_readahead_node_child(eb, slot); @@ -5737,8 +5737,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, /* If we don't have to walk into this node skip it. */ if (!visit_node_for_delete(root, wc, path->nodes[level], - wc->refs[level - 1], wc->flags[level - 1], - path->slots[level])) + wc->flags[level - 1], path->slots[level])) goto skip; /* diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 309a8ae4843454..1beaba23253242 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -190,7 +190,7 @@ static void process_one_folio(struct btrfs_fs_info *fs_info, btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len); if (folio != locked_folio && (page_ops & PAGE_UNLOCK)) - btrfs_folio_end_writer_lock(fs_info, folio, start, len); + btrfs_folio_end_lock(fs_info, folio, start, len); } static void __process_folios_contig(struct address_space *mapping, @@ -262,22 +262,23 @@ static noinline int lock_delalloc_folios(struct inode *inode, for (i = 0; i < found_folios; i++) { struct folio *folio = fbatch.folios[i]; - u32 len = end + 1 - start; + u64 range_start; + u32 range_len; if (folio == locked_folio) continue; - if (btrfs_folio_start_writer_lock(fs_info, folio, start, - len)) - goto out; - + folio_lock(folio); if (!folio_test_dirty(folio) || folio->mapping != mapping) { - btrfs_folio_end_writer_lock(fs_info, folio, start, - len); + folio_unlock(folio); goto out; } + range_start = max_t(u64, folio_pos(folio), start); + range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + end + 1) - range_start; + btrfs_folio_set_lock(fs_info, folio, range_start, range_len); - processed_end = folio_pos(folio) + folio_size(folio) - 1; + processed_end = range_start + range_len - 1; } folio_batch_release(&fbatch); cond_resched(); @@ -437,7 +438,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le if (!btrfs_is_subpage(fs_info, folio->mapping)) folio_unlock(folio); else - btrfs_subpage_end_reader(fs_info, folio, start, len); + btrfs_folio_end_lock(fs_info, folio, start, len); } /* @@ -494,7 +495,7 @@ static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) return; ASSERT(folio_test_private(folio)); - btrfs_subpage_start_reader(fs_info, folio, folio_pos(folio), PAGE_SIZE); + btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE); } /* @@ -1101,6 +1102,45 @@ int btrfs_read_folio(struct file *file, struct folio *folio) return ret; } +static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bitmap, + u64 start, u32 len) +{ + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + const u64 folio_start = folio_pos(folio); + unsigned int start_bit; + unsigned int nbits; + + ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE); + start_bit = (start - folio_start) >> fs_info->sectorsize_bits; + nbits = len >> fs_info->sectorsize_bits; + ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits)); + bitmap_set(delalloc_bitmap, start_bit, nbits); +} + +static bool find_next_delalloc_bitmap(struct folio *folio, + unsigned long *delalloc_bitmap, u64 start, + u64 *found_start, u32 *found_len) +{ + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + const u64 folio_start = folio_pos(folio); + const unsigned int bitmap_size = fs_info->sectors_per_page; + unsigned int start_bit; + unsigned int first_zero; + unsigned int first_set; + + ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE); + + start_bit = (start - folio_start) >> fs_info->sectorsize_bits; + first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit); + if (first_set >= bitmap_size) + return false; + + *found_start = folio_start + (first_set << fs_info->sectorsize_bits); + first_zero = find_next_zero_bit(delalloc_bitmap, bitmap_size, first_set); + *found_len = (first_zero - first_set) << fs_info->sectorsize_bits; + return true; +} + /* * helper for extent_writepage(), doing all of the delayed allocation setup. * @@ -1120,6 +1160,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); const u64 page_start = folio_pos(folio); const u64 page_end = page_start + folio_size(folio) - 1; + unsigned long delalloc_bitmap = 0; /* * Save the last found delalloc end. As the delalloc end can go beyond * page boundary, thus we cannot rely on subpage bitmap to locate the @@ -1130,6 +1171,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, u64 delalloc_end = page_end; u64 delalloc_to_write = 0; int ret = 0; + int bit; /* Save the dirty bitmap as our submission bitmap will be a subset of it. */ if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { @@ -1139,6 +1181,12 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, bio_ctrl->submit_bitmap = 1; } + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + u64 start = page_start + (bit << fs_info->sectorsize_bits); + + btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); + } + /* Lock all (subpage) delalloc ranges inside the folio first. */ while (delalloc_start < page_end) { delalloc_end = page_end; @@ -1147,9 +1195,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, delalloc_start = delalloc_end + 1; continue; } - btrfs_folio_set_writer_lock(fs_info, folio, delalloc_start, - min(delalloc_end, page_end) + 1 - - delalloc_start); + set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start, + min(delalloc_end, page_end) + 1 - delalloc_start); last_delalloc_end = delalloc_end; delalloc_start = delalloc_end + 1; } @@ -1174,7 +1221,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, found_len = last_delalloc_end + 1 - found_start; found = true; } else { - found = btrfs_subpage_find_writer_locked(fs_info, folio, + found = find_next_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start, &found_start, &found_len); } if (!found) @@ -1313,7 +1360,7 @@ static int submit_one_sector(struct btrfs_inode *inode, * a folio for a range already written to disk. */ btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); - btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1); + btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); /* * Above call should set the whole folio with writeback flag, even * just for a single subpage sector. @@ -1390,8 +1437,6 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, goto out; submitted_io = true; } - - btrfs_folio_assert_not_dirty(fs_info, folio, start, len); out: /* * If we didn't submitted any sector (>= i_size), folio dirty get @@ -1475,7 +1520,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl * Only unlock ranges that are submitted. As there can be some async * submitted ranges inside the folio. */ - btrfs_folio_end_writer_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); + btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); ASSERT(ret <= 0); return ret; } @@ -2115,7 +2160,27 @@ static int extent_write_cache_pages(struct address_space *mapping, continue; } - if (wbc->sync_mode != WB_SYNC_NONE) { + /* + * For subpage case, compression can lead to mixed + * writeback and dirty flags, e.g: + * 0 32K 64K 96K 128K + * | |//////||/////| |//| + * + * In above case, [32K, 96K) is asynchronously submitted + * for compression, and [124K, 128K) needs to be written back. + * + * If we didn't wait wrtiteback for page 64K, [128K, 128K) + * won't be submitted as the page still has writeback flag + * and will be skipped in the next check. + * + * This mixed writeback and dirty case is only possible for + * subpage case. + * + * TODO: Remove this check after migrating compression to + * regular submission. + */ + if (wbc->sync_mode != WB_SYNC_NONE || + btrfs_is_subpage(inode_to_fs_info(inode), mapping)) { if (folio_test_writeback(folio)) submit_write_bio(bio_ctrl, 0); folio_wait_writeback(folio); @@ -2233,7 +2298,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f cur, cur_len, !ret); mapping_set_error(mapping, ret); } - btrfs_folio_end_writer_lock(fs_info, folio, cur, cur_len); + btrfs_folio_end_lock(fs_info, folio, cur, cur_len); if (ret < 0) found_error = true; next_page: @@ -2317,7 +2382,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, * to drop the page. */ static bool try_release_extent_state(struct extent_io_tree *tree, - struct folio *folio, gfp_t mask) + struct folio *folio) { u64 start = folio_pos(folio); u64 end = start + PAGE_SIZE - 1; @@ -2428,7 +2493,7 @@ bool try_release_extent_mapping(struct folio *folio, gfp_t mask) cond_resched(); } } - return try_release_extent_state(io_tree, folio, mask); + return try_release_extent_state(io_tree, folio); } static void __free_extent_buffer(struct extent_buffer *eb) @@ -2442,7 +2507,7 @@ static int extent_buffer_under_io(const struct extent_buffer *eb) test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); } -static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio) +static bool folio_range_has_eb(struct folio *folio) { struct btrfs_subpage *subpage; @@ -2452,12 +2517,6 @@ static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *foli subpage = folio_get_private(folio); if (atomic_read(&subpage->eb_refs)) return true; - /* - * Even there is no eb refs here, we may still have - * end_folio_read() call relying on page::private. - */ - if (atomic_read(&subpage->readers)) - return true; } return false; } @@ -2516,7 +2575,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * We can only detach the folio private if there are no other ebs in the * page range and no unfinished IO. */ - if (!folio_range_has_eb(fs_info, folio)) + if (!folio_range_has_eb(folio)) btrfs_detach_subpage(fs_info, folio); spin_unlock(&folio->mapping->i_private_lock); @@ -3121,7 +3180,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } /* * Now all pages of that extent buffer is unmapped, set UNMAPPED flag, - * so it can be cleaned up without utlizing page->mapping. + * so it can be cleaned up without utilizing page->mapping. */ set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); @@ -4221,7 +4280,6 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level) { struct btrfs_tree_parent_check check = { - .has_first_key = 0, .level = level, .transid = gen }; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 25d191f1ac10f4..d58d6fc40da1fc 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -77,10 +77,13 @@ static u64 range_end(u64 start, u64 len) return start + len; } -static void dec_evictable_extent_maps(struct btrfs_inode *inode) +static void remove_em(struct btrfs_inode *inode, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; + rb_erase(&em->rb_node, &inode->extent_tree.root); + RB_CLEAR_NODE(&em->rb_node); + if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root))) percpu_counter_dec(&fs_info->evictable_extent_maps); } @@ -333,7 +336,6 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_map_tree *tree = &inode->extent_tree; struct extent_map *merge = NULL; struct rb_node *rb; @@ -365,10 +367,8 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) em->flags |= EXTENT_FLAG_MERGED; validate_extent_map(fs_info, em); - rb_erase(&merge->rb_node, &tree->root); - RB_CLEAR_NODE(&merge->rb_node); + remove_em(inode, merge); free_extent_map(merge); - dec_evictable_extent_maps(inode); } } @@ -380,12 +380,10 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) merge_ondisk_extents(em, merge); validate_extent_map(fs_info, em); - rb_erase(&merge->rb_node, &tree->root); - RB_CLEAR_NODE(&merge->rb_node); em->generation = max(em->generation, merge->generation); em->flags |= EXTENT_FLAG_MERGED; + remove_em(inode, merge); free_extent_map(merge); - dec_evictable_extent_maps(inode); } } @@ -582,12 +580,10 @@ void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) lockdep_assert_held_write(&tree->lock); WARN_ON(em->flags & EXTENT_FLAG_PINNED); - rb_erase(&em->rb_node, &tree->root); if (!(em->flags & EXTENT_FLAG_LOGGING)) list_del_init(&em->list); - RB_CLEAR_NODE(&em->rb_node); - dec_evictable_extent_maps(inode); + remove_em(inode, em); } static void replace_extent_mapping(struct btrfs_inode *inode, @@ -1116,13 +1112,12 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, struct btrfs_em_shrink_ctx { long nr_to_scan; long scanned; - u64 last_ino; - u64 last_root; }; static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx) { - const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info); struct extent_map_tree *tree = &inode->extent_tree; long nr_dropped = 0; struct rb_node *node; @@ -1195,7 +1190,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c * lock. This is to avoid slowing other tasks trying to take the * lock. */ - if (need_resched() || rwlock_needbreak(&tree->lock)) + if (need_resched() || rwlock_needbreak(&tree->lock) || + btrfs_fs_closing(fs_info)) break; node = next; } @@ -1207,19 +1203,21 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_inode *inode; long nr_dropped = 0; - u64 min_ino = ctx->last_ino + 1; + u64 min_ino = fs_info->em_shrinker_last_ino + 1; inode = btrfs_find_first_inode(root, min_ino); while (inode) { nr_dropped += btrfs_scan_inode(inode, ctx); min_ino = btrfs_ino(inode) + 1; - ctx->last_ino = btrfs_ino(inode); + fs_info->em_shrinker_last_ino = btrfs_ino(inode); btrfs_add_delayed_iput(inode); - if (ctx->scanned >= ctx->nr_to_scan) + if (ctx->scanned >= ctx->nr_to_scan || + btrfs_fs_closing(inode->root->fs_info)) break; cond_resched(); @@ -1235,52 +1233,43 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx * inode if there is one or we will find out this was the last * one and move to the next root. */ - ctx->last_root = btrfs_root_id(root); + fs_info->em_shrinker_last_root = btrfs_root_id(root); } else { /* * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so * that when processing the next root we start from its first inode. */ - ctx->last_ino = 0; - ctx->last_root = btrfs_root_id(root) + 1; + fs_info->em_shrinker_last_ino = 0; + fs_info->em_shrinker_last_root = btrfs_root_id(root) + 1; } return nr_dropped; } -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) +static void btrfs_extent_map_shrinker_worker(struct work_struct *work) { + struct btrfs_fs_info *fs_info; struct btrfs_em_shrink_ctx ctx; u64 start_root_id; u64 next_root_id; bool cycled = false; long nr_dropped = 0; - ctx.scanned = 0; - ctx.nr_to_scan = nr_to_scan; + fs_info = container_of(work, struct btrfs_fs_info, em_shrinker_work); - /* - * In case we have multiple tasks running this shrinker, make the next - * one start from the next inode in case it starts before we finish. - */ - spin_lock(&fs_info->extent_map_shrinker_lock); - ctx.last_ino = fs_info->extent_map_shrinker_last_ino; - fs_info->extent_map_shrinker_last_ino++; - ctx.last_root = fs_info->extent_map_shrinker_last_root; - spin_unlock(&fs_info->extent_map_shrinker_lock); + ctx.scanned = 0; + ctx.nr_to_scan = atomic64_read(&fs_info->em_shrinker_nr_to_scan); - start_root_id = ctx.last_root; - next_root_id = ctx.last_root; + start_root_id = fs_info->em_shrinker_last_root; + next_root_id = fs_info->em_shrinker_last_root; if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); - trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, - nr, ctx.last_root, - ctx.last_ino); + trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr); } - while (ctx.scanned < ctx.nr_to_scan) { + while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) { struct btrfs_root *root; unsigned long count; @@ -1294,8 +1283,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) spin_unlock(&fs_info->fs_roots_radix_lock); if (start_root_id > 0 && !cycled) { next_root_id = 0; - ctx.last_root = 0; - ctx.last_ino = 0; + fs_info->em_shrinker_last_root = 0; + fs_info->em_shrinker_last_ino = 0; cycled = true; continue; } @@ -1314,29 +1303,40 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) btrfs_put_root(root); } - /* - * In case of multiple tasks running this extent map shrinking code this - * isn't perfect but it's simple and silences things like KCSAN. It's - * not possible to know which task made more progress because we can - * cycle back to the first root and first inode if it's not the first - * time the shrinker ran, see the above logic. Also a task that started - * later may finish ealier than another task and made less progress. So - * make this simple and update to the progress of the last task that - * finished, with the occasional possiblity of having two consecutive - * runs of the shrinker process the same inodes. - */ - spin_lock(&fs_info->extent_map_shrinker_lock); - fs_info->extent_map_shrinker_last_ino = ctx.last_ino; - fs_info->extent_map_shrinker_last_root = ctx.last_root; - spin_unlock(&fs_info->extent_map_shrinker_lock); - if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); - trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, - nr, ctx.last_root, - ctx.last_ino); + trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr); } - return nr_dropped; + atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0); +} + +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) +{ + /* + * Do nothing if the shrinker is already running. In case of high memory + * pressure we can have a lot of tasks calling us and all passing the + * same nr_to_scan value, but in reality we may need only to free + * nr_to_scan extent maps (or less). In case we need to free more than + * that, we will be called again by the fs shrinker, so no worries about + * not doing enough work to reclaim memory from extent maps. + * We can also be repeatedly called with the same nr_to_scan value + * simply because the shrinker runs asynchronously and multiple calls + * to this function are made before the shrinker does enough progress. + * + * That's why we set the atomic counter to nr_to_scan only if its + * current value is zero, instead of incrementing the counter by + * nr_to_scan. + */ + if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0) + return; + + queue_work(system_unbound_wq, &fs_info->em_shrinker_work); +} + +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info) +{ + atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0); + INIT_WORK(&fs_info->em_shrinker_work, btrfs_extent_map_shrinker_worker); } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 5154a8f1d26c94..cd123b266b6416 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -189,6 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, int btrfs_replace_extent_map_range(struct btrfs_inode *inode, struct extent_map *new_em, bool modified); -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan); +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan); +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c index df7f09f3b02e06..b80c07ad8c5e71 100644 --- a/fs/btrfs/fiemap.c +++ b/fs/btrfs/fiemap.c @@ -186,7 +186,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * we have in the cache is the last delalloc range we * found while the file extent item we found can be * either for a whole delalloc range we previously - * emmitted or only a part of that range. + * emitted or only a part of that range. * * We have two cases here: * @@ -194,13 +194,13 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * cached extent's end. In this case just ignore the * current file extent item because we don't want to * overlap with previous ranges that may have been - * emmitted already; + * emitted already; * * 2) The file extent item starts behind the currently * cached extent but its end offset goes beyond the * end offset of the cached extent. We don't want to * overlap with a previous range that may have been - * emmitted already, so we emit the currently cached + * emitted already, so we emit the currently cached * extent and then partially store the current file * extent item's range in the cache, for the subrange * going the cached extent's end to the end of the diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4fb521d91b0612..033f85ea8c9d6a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -124,12 +124,14 @@ static void btrfs_drop_pages(struct btrfs_fs_info *fs_info, * - Update inode size for past EOF write */ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, size_t write_bytes, + loff_t pos, size_t write_bytes, struct extent_state **cached, bool noreserve) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; int i; + const int num_pages = (round_up(pos + write_bytes, PAGE_SIZE) - + round_down(pos, PAGE_SIZE)) >> PAGE_SHIFT; u64 num_bytes; u64 start_pos; u64 end_of_last_block; @@ -856,36 +858,42 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, */ static int prepare_uptodate_page(struct inode *inode, struct page *page, u64 pos, - bool force_uptodate) + u64 len, bool force_uptodate) { struct folio *folio = page_folio(page); + u64 clamp_start = max_t(u64, pos, folio_pos(folio)); + u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); int ret = 0; - if (((pos & (PAGE_SIZE - 1)) || force_uptodate) && - !PageUptodate(page)) { - ret = btrfs_read_folio(NULL, folio); - if (ret) - return ret; - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - return -EIO; - } + if (folio_test_uptodate(folio)) + return 0; - /* - * Since btrfs_read_folio() will unlock the folio before it - * returns, there is a window where btrfs_release_folio() can be - * called to release the page. Here we check both inode - * mapping and PagePrivate() to make sure the page was not - * released. - * - * The private flag check is essential for subpage as we need - * to store extra bitmap using folio private. - */ - if (page->mapping != inode->i_mapping || !folio_test_private(folio)) { - unlock_page(page); - return -EAGAIN; - } + if (!force_uptodate && + IS_ALIGNED(clamp_start, PAGE_SIZE) && + IS_ALIGNED(clamp_end, PAGE_SIZE)) + return 0; + + ret = btrfs_read_folio(NULL, folio); + if (ret) + return ret; + folio_lock(folio); + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); + return -EIO; + } + + /* + * Since btrfs_read_folio() will unlock the folio before it returns, + * there is a window where btrfs_release_folio() can be called to + * release the page. Here we check both inode mapping and page + * private to make sure the page was not released. + * + * The private flag check is essential for subpage as we need to store + * extra bitmap using folio private. + */ + if (page->mapping != inode->i_mapping || !folio_test_private(folio)) { + folio_unlock(folio); + return -EAGAIN; } return 0; } @@ -947,12 +955,8 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages, goto fail; } - if (i == 0) - ret = prepare_uptodate_page(inode, pages[i], pos, - force_uptodate); - if (!ret && i == num_pages - 1) - ret = prepare_uptodate_page(inode, pages[i], - pos + write_bytes, false); + ret = prepare_uptodate_page(inode, pages[i], pos, write_bytes, + force_uptodate); if (ret) { put_page(pages[i]); if (!nowait && ret == -EAGAIN) { @@ -1140,7 +1144,7 @@ static void update_time_for_write(struct inode *inode) inode_inc_iversion(inode); } -int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count) +int btrfs_write_check(struct kiocb *iocb, size_t count) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -1218,7 +1222,7 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) if (ret <= 0) goto out; - ret = btrfs_write_check(iocb, i, ret); + ret = btrfs_write_check(iocb, ret); if (ret < 0) goto out; @@ -1242,7 +1246,6 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) offset); size_t num_pages; size_t reserve_bytes; - size_t dirty_pages; size_t copied; size_t dirty_sectors; size_t num_sectors; @@ -1361,11 +1364,8 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) if (copied == 0) { force_page_uptodate = true; dirty_sectors = 0; - dirty_pages = 0; } else { force_page_uptodate = false; - dirty_pages = DIV_ROUND_UP(copied + offset, - PAGE_SIZE); } if (num_sectors > dirty_sectors) { @@ -1375,13 +1375,10 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes, true); } else { - u64 __pos; - - __pos = round_down(pos, - fs_info->sectorsize) + - (dirty_pages << PAGE_SHIFT); + u64 release_start = round_up(pos + copied, + fs_info->sectorsize); btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, __pos, + data_reserved, release_start, release_bytes, true); } } @@ -1390,7 +1387,7 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) fs_info->sectorsize); ret = btrfs_dirty_pages(BTRFS_I(inode), pages, - dirty_pages, pos, copied, + pos, copied, &cached_state, only_release_metadata); /* @@ -1470,7 +1467,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (ret || encoded->len == 0) goto out; - ret = btrfs_write_check(iocb, from, encoded->len); + ret = btrfs_write_check(iocb, encoded->len); if (ret < 0) goto out; diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index 912254e653cf96..c2ce0ae94a9c17 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -35,7 +35,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded); int btrfs_release_file(struct inode *inode, struct file *file); int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, size_t write_bytes, + loff_t pos, size_t write_bytes, struct extent_state **cached, bool noreserve); int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end); int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, @@ -44,7 +44,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode); bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state, u64 *delalloc_start_ret, u64 *delalloc_end_ret); -int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count); +int btrfs_write_check(struct kiocb *iocb, size_t count); ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i); #endif diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f4bcb25306606a..0d2db205b9f629 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1457,8 +1457,7 @@ static int __btrfs_write_out_cache(struct inode *inode, io_ctl_zero_remaining_pages(io_ctl); /* Everything is written out, now we dirty the pages in the file. */ - ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, - io_ctl->num_pages, 0, i_size_read(inode), + ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, 0, i_size_read(inode), &cached_state, false); if (ret) goto out_nospc; diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 79f64e383eddf8..79a1a3d6f04d18 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -263,10 +263,10 @@ enum { BTRFS_FEATURE_INCOMPAT_ZONED | \ BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA) -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* * Features under developmen like Extent tree v2 support is enabled - * only under CONFIG_BTRFS_DEBUG. + * only under CONFIG_BTRFS_EXPERIMENTAL */ #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ @@ -317,6 +317,8 @@ struct btrfs_dev_replace { struct percpu_counter bio_counter; wait_queue_head_t replace_wait; + + struct task_struct *replace_task; }; /* @@ -633,9 +635,10 @@ struct btrfs_fs_info { s32 delalloc_batch; struct percpu_counter evictable_extent_maps; - spinlock_t extent_map_shrinker_lock; - u64 extent_map_shrinker_last_root; - u64 extent_map_shrinker_last_ino; + u64 em_shrinker_last_root; + u64 em_shrinker_last_ino; + atomic64_t em_shrinker_nr_to_scan; + struct work_struct em_shrinker_work; /* Protected by 'trans_lock'. */ struct list_head dirty_cowonly_roots; @@ -876,12 +879,9 @@ struct btrfs_fs_info { #endif }; -#define page_to_inode(_page) (BTRFS_I(_Generic((_page), \ - struct page *: (_page))->mapping->host)) #define folio_to_inode(_folio) (BTRFS_I(_Generic((_folio), \ struct folio *: (_folio))->mapping->host)) -#define page_to_fs_info(_page) (page_to_inode(_page)->root->fs_info) #define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info) #define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5618ca02934a04..35f89d14c110d1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -646,7 +646,7 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode, * If being used directly, you must have already checked we're allowed to cow * the range by getting true from can_cow_file_range_inline(). */ -static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset, +static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 size, size_t compressed_size, int compress_type, struct folio *compressed_folio, @@ -736,7 +736,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, return 1; lock_extent(&inode->io_tree, offset, end, &cached); - ret = __cow_file_range_inline(inode, offset, size, compressed_size, + ret = __cow_file_range_inline(inode, size, compressed_size, compress_type, compressed_folio, update_i_size); if (ret > 0) { @@ -832,32 +832,16 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, return 0; } /* - * Special check for subpage. + * Only enable sector perfect compression for experimental builds. * - * We lock the full page then run each delalloc range in the page, thus - * for the following case, we will hit some subpage specific corner case: + * This is a big feature change for subpage cases, and can hit + * different corner cases, so only limit this feature for + * experimental build for now. * - * 0 32K 64K - * | |///////| |///////| - * \- A \- B - * - * In above case, both range A and range B will try to unlock the full - * page [0, 64K), causing the one finished later will have page - * unlocked already, triggering various page lock requirement BUG_ON()s. - * - * So here we add an artificial limit that subpage compression can only - * if the range is fully page aligned. - * - * In theory we only need to ensure the first page is fully covered, but - * the tailing partial page will be locked until the full compression - * finishes, delaying the write of other range. - * - * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range - * first to prevent any submitted async extent to unlock the full page. - * By this, we can ensure for subpage case that only the last async_cow - * will unlock the full page. + * ETA for moving this out of experimental builds is 6.15. */ - if (fs_info->sectorsize < PAGE_SIZE) { + if (fs_info->sectorsize < PAGE_SIZE && + !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(end + 1)) return 0; @@ -902,7 +886,8 @@ static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 e ret = PTR_ERR(folio); continue; } - folio_clear_dirty_for_io(folio); + btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start, + end + 1 - start); folio_put(folio); } return ret; @@ -1001,17 +986,6 @@ static void compress_file_range(struct btrfs_work *work) (start > 0 || end + 1 < inode->disk_i_size)) goto cleanup_and_bail_uncompressed; - /* - * For subpage case, we require full page alignment for the sector - * aligned range. - * Thus we must also check against @actual_end, not just @end. - */ - if (blocksize < PAGE_SIZE) { - if (!PAGE_ALIGNED(start) || - !PAGE_ALIGNED(round_up(actual_end, blocksize))) - goto cleanup_and_bail_uncompressed; - } - total_compressed = min_t(unsigned long, total_compressed, BTRFS_MAX_UNCOMPRESSED); total_in = 0; @@ -3094,34 +3068,6 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } - if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { - BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ - - btrfs_inode_safe_disk_i_size_write(inode, 0); - if (freespace_inode) - trans = btrfs_join_transaction_spacecache(root); - else - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - trans->block_rsv = &inode->block_rsv; - ret = btrfs_update_inode_fallback(trans, inode); - if (ret) /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, ret); - - ret = btrfs_insert_raid_extent(trans, ordered_extent); - if (ret) - btrfs_abort_transaction(trans, ret); - - goto out; - } - - clear_bits |= EXTENT_LOCKED; - lock_extent(io_tree, start, end, &cached_state); - if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else @@ -3135,8 +3081,31 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) trans->block_rsv = &inode->block_rsv; ret = btrfs_insert_raid_extent(trans, ordered_extent); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + /* Logic error */ + ASSERT(list_empty(&ordered_extent->list)); + if (!list_empty(&ordered_extent->list)) { + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } + + btrfs_inode_safe_disk_i_size_write(inode, 0); + ret = btrfs_update_inode_fallback(trans, inode); + if (ret) { + /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, ret); + } + goto out; + } + + clear_bits |= EXTENT_LOCKED; + lock_extent(io_tree, start, end, &cached_state); if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; @@ -6026,7 +5995,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) * offset. This means that new entries created during readdir * are *guaranteed* to be seen in the future by that readdir. * This has broken buggy programs which operate on names as - * they're returned by readdir. Until we re-use freed offsets + * they're returned by readdir. Until we reuse freed offsets * we have this hack to stop new entries from being returned * under the assumption that they'll never reach this huge * offset. @@ -6768,8 +6737,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, return ret; } -static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, - struct folio *folio) +static int read_inline_extent(struct btrfs_path *path, struct folio *folio) { struct btrfs_file_extent_item *fi; void *kaddr; @@ -6967,7 +6935,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE); ASSERT(em->len == fs_info->sectorsize); - ret = read_inline_extent(inode, path, folio); + ret = read_inline_extent(path, folio); if (ret < 0) goto out; goto insert; @@ -8975,28 +8943,6 @@ static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, return finish_open_simple(file, ret); } -void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - struct folio *folio; - u32 len; - - ASSERT(end + 1 - start <= U32_MAX); - len = end + 1 - start; - while (index <= end_index) { - folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); - ASSERT(!IS_ERR(folio)); /* folios should be in the extent_io_tree */ - - /* This is for data, which doesn't yet support larger folio. */ - ASSERT(folio_order(folio) == 0); - btrfs_folio_set_writeback(fs_info, folio, start, len); - folio_put(folio); - index++; - } -} - int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type) { @@ -9135,8 +9081,8 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) } int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, struct page **pages) + u64 disk_bytenr, u64 disk_io_size, + struct page **pages) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_encoded_read_private priv = { @@ -9206,7 +9152,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, goto out; } - ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, disk_io_size, pages); if (ret) goto out; @@ -9559,7 +9505,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (encoded->unencoded_len == encoded->len && encoded->unencoded_offset == 0 && can_cow_file_range_inline(inode, start, encoded->len, orig_count)) { - ret = __cow_file_range_inline(inode, start, encoded->len, + ret = __cow_file_range_inline(inode, encoded->len, orig_count, compression, folios[0], true); if (ret <= 0) { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 226c91fe31a707..28b9b7fda5780b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4058,8 +4058,7 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info, return 0; } -static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, - void __user *arg) +static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4812,7 +4811,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_QUOTA_RESCAN_STATUS: return btrfs_ioctl_quota_rescan_status(fs_info, argp); case BTRFS_IOC_QUOTA_RESCAN_WAIT: - return btrfs_ioctl_quota_rescan_wait(fs_info, argp); + return btrfs_ioctl_quota_rescan_wait(fs_info); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(fs_info, argp); case BTRFS_IOC_GET_SUPPORTED_FEATURES: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 6a0b7abb5bd9aa..9a7a7b7233054e 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -161,21 +161,6 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) return 0; } -/* - * Try-lock for write. - * - * Return 1 if the rwlock has been taken, 0 otherwise - */ -int btrfs_try_tree_write_lock(struct extent_buffer *eb) -{ - if (down_write_trylock(&eb->lock)) { - btrfs_set_eb_lock_owner(eb, current->pid); - trace_btrfs_try_tree_write_lock(eb); - return 1; - } - return 0; -} - /* * Release read lock. */ diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 3c15c75e05829e..46c8be2afab1c6 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -180,7 +180,6 @@ static inline void btrfs_tree_read_lock(struct extent_buffer *eb) void btrfs_tree_read_unlock(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); -int btrfs_try_tree_write_lock(struct extent_buffer *eb); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 72856f6775f7f1..a45bc11f866531 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -80,7 +80,7 @@ void lzo_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *lzo_alloc_workspace(unsigned int level) +struct list_head *lzo_alloc_workspace(void) { struct workspace *workspace; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 1332ec59c5396a..a6f92836c9b113 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -226,8 +226,7 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, return qgroup; } -static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup *qgroup) +static void __del_qgroup_rb(struct btrfs_qgroup *qgroup) { struct btrfs_qgroup_list *list; @@ -258,7 +257,7 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) return -ENOENT; rb_erase(&qgroup->node, &fs_info->qgroup_tree); - __del_qgroup_rb(fs_info, qgroup); + __del_qgroup_rb(qgroup); return 0; } @@ -469,7 +468,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) /* * If a qgroup exists for a subvolume ID, it is possible * that subvolume has been deleted, in which case - * re-using that ID would lead to incorrect accounting. + * reusing that ID would lead to incorrect accounting. * * Ensure that we skip any such subvol ids. * @@ -643,7 +642,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) while ((n = rb_first(&fs_info->qgroup_tree))) { qgroup = rb_entry(n, struct btrfs_qgroup, node); rb_erase(n, &fs_info->qgroup_tree); - __del_qgroup_rb(fs_info, qgroup); + __del_qgroup_rb(qgroup); btrfs_sysfs_del_one_qgroup(fs_info, qgroup); kfree(qgroup); } @@ -1407,7 +1406,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) fs_info->quota_root = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; - fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; + fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; spin_unlock(&fs_info->qgroup_lock); btrfs_free_qgroup_config(fs_info); @@ -2001,27 +2000,27 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, * Return <0 for insertion failure, caller can free @record safely. */ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record) + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record, + u64 bytenr) { struct btrfs_qgroup_extent_record *existing, *ret; - const unsigned long index = (record->bytenr >> fs_info->sectorsize_bits); + const unsigned long index = (bytenr >> fs_info->sectorsize_bits); if (!btrfs_qgroup_full_accounting(fs_info)) return 1; #if BITS_PER_LONG == 32 - if (record->bytenr >= MAX_LFS_FILESIZE) { + if (bytenr >= MAX_LFS_FILESIZE) { btrfs_err_rl(fs_info, "qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit", - record->bytenr); + bytenr); btrfs_err_32bit_limit(fs_info); return -EOVERFLOW; } #endif - lockdep_assert_held(&delayed_refs->lock); - trace_btrfs_qgroup_trace_extent(fs_info, record); + trace_btrfs_qgroup_trace_extent(fs_info, record, bytenr); xa_lock(&delayed_refs->dirty_extents); existing = xa_load(&delayed_refs->dirty_extents, index); @@ -2066,12 +2065,17 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, * transaction committing, but not now as qgroup accounting will be wrong again. */ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, - struct btrfs_qgroup_extent_record *qrecord) + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr) { - struct btrfs_backref_walk_ctx ctx = { 0 }; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_backref_walk_ctx ctx = { + .bytenr = bytenr, + .fs_info = fs_info, + }; int ret; - if (!btrfs_qgroup_full_accounting(trans->fs_info)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; /* * We are always called in a context where we are already holding a @@ -2094,16 +2098,13 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, */ ASSERT(trans != NULL); - if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) + if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) return 0; - ctx.bytenr = qrecord->bytenr; - ctx.fs_info = trans->fs_info; - ret = btrfs_find_all_roots(&ctx, true); if (ret < 0) { - qgroup_mark_inconsistent(trans->fs_info); - btrfs_warn(trans->fs_info, + qgroup_mark_inconsistent(fs_info); + btrfs_warn(fs_info, "error accounting new delayed refs extent (err code: %d), quota inconsistent", ret); return 0; @@ -2138,7 +2139,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; - struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; const unsigned long index = (bytenr >> fs_info->sectorsize_bits); int ret; @@ -2148,26 +2149,21 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (!record) return -ENOMEM; - if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, index, GFP_NOFS)) { + if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) { kfree(record); return -ENOMEM; } - delayed_refs = &trans->transaction->delayed_refs; - record->bytenr = bytenr; record->num_bytes = num_bytes; - record->old_roots = NULL; - spin_lock(&delayed_refs->lock); - ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); - spin_unlock(&delayed_refs->lock); + ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record, bytenr); if (ret) { /* Clean up if insertion fails or item exists. */ xa_release(&delayed_refs->dirty_extents, index); kfree(record); return 0; } - return btrfs_qgroup_trace_extent_post(trans, record); + return btrfs_qgroup_trace_extent_post(trans, record, bytenr); } /* @@ -2652,7 +2648,6 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, if (!extent_buffer_uptodate(root_eb)) { struct btrfs_tree_parent_check check = { - .has_first_key = false, .transid = root_gen, .level = root_level }; @@ -3043,14 +3038,16 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) delayed_refs = &trans->transaction->delayed_refs; qgroup_to_skip = delayed_refs->qgroup_to_skip; xa_for_each(&delayed_refs->dirty_extents, index, record) { + const u64 bytenr = (((u64)index) << fs_info->sectorsize_bits); + num_dirty_extents++; - trace_btrfs_qgroup_account_extents(fs_info, record); + trace_btrfs_qgroup_account_extents(fs_info, record, bytenr); if (!ret && !(fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { struct btrfs_backref_walk_ctx ctx = { 0 }; - ctx.bytenr = record->bytenr; + ctx.bytenr = bytenr; ctx.fs_info = fs_info; /* @@ -3092,7 +3089,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) ulist_del(record->old_roots, qgroup_to_skip, 0); } - ret = btrfs_qgroup_account_extent(trans, record->bytenr, + ret = btrfs_qgroup_account_extent(trans, bytenr, record->num_bytes, record->old_roots, new_roots); @@ -4196,13 +4193,20 @@ static int try_flush_qgroup(struct btrfs_root *root) return 0; } - btrfs_run_delayed_iputs(root->fs_info); - btrfs_wait_on_delayed_iputs(root->fs_info); ret = btrfs_start_delalloc_snapshot(root, true); if (ret < 0) goto out; btrfs_wait_ordered_extents(root, U64_MAX, NULL); + /* + * After waiting for ordered extents run delayed iputs in order to free + * space from unlinked files before committing the current transaction, + * as ordered extents may have been holding the last reference of an + * inode and they add a delayed iput when they complete. + */ + btrfs_run_delayed_iputs(root->fs_info); + btrfs_wait_on_delayed_iputs(root->fs_info); + ret = btrfs_commit_current_transaction(root); out: clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); @@ -4687,8 +4691,7 @@ void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root) * BOTH POINTERS ARE BEFORE TREE SWAP * @last_snapshot: last snapshot generation of the subvolume tree */ -int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *subvol_root, +int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, struct btrfs_block_group *bg, struct extent_buffer *subvol_parent, int subvol_slot, struct extent_buffer *reloc_parent, int reloc_slot, @@ -4894,17 +4897,6 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) xa_destroy(&trans->delayed_refs.dirty_extents); } -void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes) -{ - if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) - return; - - if (!is_fstree(root)) - return; - - btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA); -} - int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, const struct btrfs_squota_delta *delta) { diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 98adf4ec7b0172..e233cc79af185a 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -121,11 +121,18 @@ struct btrfs_inode; #define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1ULL << 63) #define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1ULL << 62) +#define BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT (3) + /* * Record a dirty extent, and info qgroup to update quota on it */ struct btrfs_qgroup_extent_record { - u64 bytenr; + /* + * The bytenr of the extent is given by its index in the dirty_extents + * xarray of struct btrfs_delayed_ref_root left shifted by + * fs_info->sectorsize_bits. + */ + u64 num_bytes; /* @@ -343,9 +350,11 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); int btrfs_qgroup_trace_extent_nolock( struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record); + struct btrfs_qgroup_extent_record *record, + u64 bytenr); int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, - struct btrfs_qgroup_extent_record *qrecord); + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr); int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, @@ -430,8 +439,7 @@ void btrfs_qgroup_init_swapped_blocks( struct btrfs_qgroup_swapped_blocks *swapped_blocks); void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root); -int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *subvol_root, +int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, struct btrfs_block_group *bg, struct extent_buffer *subvol_parent, int subvol_slot, struct extent_buffer *reloc_parent, int reloc_slot, @@ -440,7 +448,6 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb); void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info); -void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes); int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, const struct btrfs_squota_delta *delta); diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 4c859b550f6c33..569273e42d852b 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -13,6 +13,50 @@ #include "volumes.h" #include "print-tree.h" +static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_key *oldkey, + u64 newlen, u64 frontpad) +{ + struct btrfs_root *stripe_root = trans->fs_info->stripe_root; + struct btrfs_stripe_extent *extent; + struct extent_buffer *leaf; + int slot; + size_t item_size; + int ret; + struct btrfs_key newkey = { + .objectid = oldkey->objectid + frontpad, + .type = BTRFS_RAID_STRIPE_KEY, + .offset = newlen, + }; + + ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY); + ret = btrfs_duplicate_item(trans, stripe_root, path, &newkey); + if (ret) + return ret; + + leaf = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size(leaf, slot); + extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); + + for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) { + struct btrfs_raid_stride *stride = &extent->strides[i]; + u64 phys; + + phys = btrfs_raid_stride_physical(leaf, stride); + btrfs_set_raid_stride_physical(leaf, stride, phys + frontpad); + } + + btrfs_mark_buffer_dirty(trans, leaf); + + /* delete the old item, after we've inserted a new one. */ + path->slots[0]--; + ret = btrfs_del_item(trans, stripe_root, path); + + return ret; +} + int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -36,23 +80,24 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le while (1) { key.objectid = start; key.type = BTRFS_RAID_STRIPE_KEY; - key.offset = length; + key.offset = 0; ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); if (ret < 0) break; - if (ret > 0) { - ret = 0; - if (path->slots[0] == 0) - break; + + if (path->slots[0] == btrfs_header_nritems(path->nodes[0])) path->slots[0]--; - } leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); found_start = key.objectid; found_end = found_start + key.offset; + ret = 0; + + if (key.type != BTRFS_RAID_STRIPE_KEY) + break; /* That stripe ends before we start, we're done. */ if (found_end <= start) @@ -61,7 +106,42 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le trace_btrfs_raid_extent_delete(fs_info, start, end, found_start, found_end); - ASSERT(found_start >= start && found_end <= end); + /* + * The stripe extent starts before the range we want to delete: + * + * |--- RAID Stripe Extent ---| + * |--- keep ---|--- drop ---| + * + * This means we have to duplicate the tree item, truncate the + * length to the new size and then re-insert the item. + */ + if (found_start < start) { + u64 diff = start - found_start; + + ret = btrfs_partially_delete_raid_extent(trans, path, + &key, + diff, 0); + break; + } + + /* + * The stripe extent ends after the range we want to delete: + * + * |--- RAID Stripe Extent ---| + * |--- drop ---|--- keep ---| + * + * This means we have to duplicate the tree item, truncate the + * length to the new size and then re-insert the item. + */ + if (found_end > end) { + u64 diff = found_end - end; + + ret = btrfs_partially_delete_raid_extent(trans, path, + &key, + diff, diff); + break; + } + ret = btrfs_del_item(trans, stripe_root, path); if (ret) break; @@ -108,8 +188,9 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans, return ret; } -static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, - struct btrfs_io_context *bioc) +EXPORT_FOR_TESTS +int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_key stripe_key; @@ -233,7 +314,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, found_end = found_logical + found_length; if (found_logical > end) { - ret = -ENOENT; + ret = -ENODATA; goto out; } @@ -279,10 +360,10 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, } /* If we're here, we haven't found the requested devid in the stripe. */ - ret = -ENOENT; + ret = -ENODATA; out: if (ret > 0) - ret = -ENOENT; + ret = -ENODATA; if (ret && ret != -EIO && !stripe->rst_search_commit_root) { btrfs_debug(fs_info, "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h index 1ac1c21aac2f9b..54183642177869 100644 --- a/fs/btrfs/raid-stripe-tree.h +++ b/fs/btrfs/raid-stripe-tree.h @@ -28,6 +28,11 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_extent *ordered_extent); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_io_context *bioc); +#endif + static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info, u64 map_type) { diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 39bec672df0cc0..cdd373c277848e 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1272,8 +1272,7 @@ static inline void bio_list_put(struct bio_list *bio_list) static void assert_rbio(struct btrfs_raid_bio *rbio) { - if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) || - !IS_ENABLED(CONFIG_BTRFS_ASSERT)) + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; /* diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f3834f8d26b456..bf267bdfa8f811 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1244,7 +1244,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, * The real subtree rescan is delayed until we have new * CoW on the subtree root node before transaction commit. */ - ret = btrfs_qgroup_add_swapped_blocks(trans, dest, + ret = btrfs_qgroup_add_swapped_blocks(dest, rc->block_group, parent, slot, path->nodes[level], path->slots[level], last_snapshot); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3a34274280746c..204c928beaf9cb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1656,8 +1656,7 @@ static u32 stripe_length(const struct scrub_stripe *stripe) stripe->bg->start + stripe->bg->length - stripe->logical); } -static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, - struct scrub_stripe *stripe) +static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; @@ -1704,8 +1703,18 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, &stripe_len, &bioc, &io_stripe, &mirror); btrfs_put_bioc(bioc); if (err < 0) { - set_bit(i, &stripe->io_error_bitmap); - set_bit(i, &stripe->error_bitmap); + if (err != -ENODATA) { + /* + * Earlier btrfs_get_raid_extent_offset() + * returned -ENODATA, which means there's + * no entry for the corresponding range + * in the stripe tree. But if it's in + * the extent tree, then it's a preallocated + * extent and not an error. + */ + set_bit(i, &stripe->io_error_bitmap); + set_bit(i, &stripe->error_bitmap); + } continue; } @@ -1743,7 +1752,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) { - scrub_submit_extent_sector_read(sctx, stripe); + scrub_submit_extent_sector_read(stripe); return; } @@ -1954,7 +1963,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ASSERT(sctx->raid56_data_stripes); /* - * For data stripe search, we cannot re-use the same extent/csum paths, + * For data stripe search, we cannot reuse the same extent/csum paths, * as the data stripe bytenr may be smaller than previous extent. Thus * we have to use our own extent/csum paths. */ @@ -2103,7 +2112,6 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, */ static int scrub_simple_mirror(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct btrfs_chunk_map *map, u64 logical_start, u64 logical_length, struct btrfs_device *device, u64 physical, int mirror_num) @@ -2222,7 +2230,7 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx, * just RAID1, so we can reuse scrub_simple_mirror() to scrub * this stripe. */ - ret = scrub_simple_mirror(sctx, bg, map, cur_logical, + ret = scrub_simple_mirror(sctx, bg, cur_logical, BTRFS_STRIPE_LEN, device, cur_physical, mirror_num); if (ret) @@ -2256,7 +2264,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* Offset inside the chunk */ u64 offset; u64 stripe_logical; - int stop_loop = 0; /* Extent_path should be released by now. */ ASSERT(sctx->extent_path.nodes[0] == NULL); @@ -2307,7 +2314,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * Only @physical and @mirror_num needs to calculated using * @stripe_index. */ - ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length, + ret = scrub_simple_mirror(sctx, bg, bg->start, bg->length, scrub_dev, map->stripes[stripe_index].physical, stripe_index + 1); offset = 0; @@ -2362,7 +2369,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * We can reuse scrub_simple_mirror() here, as the repair part * is still based on @mirror_num. */ - ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN, + ret = scrub_simple_mirror(sctx, bg, logical, BTRFS_STRIPE_LEN, scrub_dev, physical, 1); if (ret < 0) goto out; @@ -2370,14 +2377,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, logical += increment; physical += BTRFS_STRIPE_LEN; spin_lock(&sctx->stat_lock); - if (stop_loop) - sctx->stat.last_physical = - map->stripes[stripe_index].physical + dev_stripe_len; - else - sctx->stat.last_physical = physical; + sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); - if (stop_loop) - break; } out: ret2 = flush_scrub_stripes(sctx); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index b068469871f8e5..03b31b1c39be29 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -980,9 +980,7 @@ static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) return ret; } -typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index, - struct fs_path *p, - void *ctx); +typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx); /* * Helper function to iterate the entries in ONE btrfs_inode_ref or @@ -1007,8 +1005,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, u32 name_len; char *start; int ret = 0; - int num = 0; - int index; u64 dir; unsigned long name_off; unsigned long elem_size; @@ -1043,13 +1039,11 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, iref = (struct btrfs_inode_ref *)(ptr + cur); name_len = btrfs_inode_ref_name_len(eb, iref); name_off = (unsigned long)(iref + 1); - index = btrfs_inode_ref_index(eb, iref); dir = found_key->offset; } else { extref = (struct btrfs_inode_extref *)(ptr + cur); name_len = btrfs_inode_extref_name_len(eb, extref); name_off = (unsigned long)&extref->name; - index = btrfs_inode_extref_index(eb, extref); dir = btrfs_inode_extref_parent(eb, extref); } @@ -1094,10 +1088,9 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, } cur += elem_size + name_len; - ret = iterate(num, dir, index, p, ctx); + ret = iterate(dir, p, ctx); if (ret) goto out; - num++; } out: @@ -1227,8 +1220,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, return ret; } -static int __copy_first_ref(int num, u64 dir, int index, - struct fs_path *p, void *ctx) +static int __copy_first_ref(u64 dir, struct fs_path *p, void *ctx) { int ret; struct fs_path *pt = ctx; @@ -3768,7 +3760,6 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, struct recorded_ref *parent_ref, const bool is_orphan) { - struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key di_key; @@ -3797,7 +3788,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, goto out; } - di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name, + di = btrfs_match_dir_item_name(path, parent_ref->name, parent_ref->name_len); if (!di) { ret = 0; @@ -4708,8 +4699,7 @@ static int record_ref_in_tree(struct rb_root *root, struct list_head *refs, return ret; } -static int record_new_ref_if_needed(int num, u64 dir, int index, - struct fs_path *name, void *ctx) +static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { int ret = 0; struct send_ctx *sctx = ctx; @@ -4738,8 +4728,7 @@ static int record_new_ref_if_needed(int num, u64 dir, int index, return ret; } -static int record_deleted_ref_if_needed(int num, u64 dir, int index, - struct fs_path *name, void *ctx) +static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { int ret = 0; struct send_ctx *sctx = ctx; @@ -5677,7 +5666,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * Note that send_buf is a mapping of send_buf_pages, so this is really * reading into send_buf. */ - ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset, + ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), disk_bytenr, disk_num_bytes, sctx->send_buf_pages + (data_offset >> PAGE_SHIFT)); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index b07f4aa6687836..9309886c5ea17b 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -16,7 +16,7 @@ struct btrfs_ioctl_send_args; #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" /* Conditional support for the upcoming protocol version. */ -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL #define BTRFS_SEND_STREAM_VERSION 3 #else #define BTRFS_SEND_STREAM_VERSION 2 diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d5a9cd8a4fd8d7..255e85f78313ca 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1279,7 +1279,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) * If we are freeing inodes, we want to make sure all delayed iputs have * completed, because they could have been on an inode with i_nlink == 0, and * thus have been truncated and freed up space. But again this space is not - * immediately re-usable, it comes in the form of a delayed ref, which must be + * immediately reusable, it comes in the form of a delayed ref, which must be * run and then the transaction must be committed. * * COMMIT_TRANS @@ -1488,8 +1488,7 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); } -static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +static void wait_reserve_ticket(struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { @@ -1547,7 +1546,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, case BTRFS_RESERVE_FLUSH_DATA: case BTRFS_RESERVE_FLUSH_ALL: case BTRFS_RESERVE_FLUSH_ALL_STEAL: - wait_reserve_ticket(fs_info, space_info, ticket); + wait_reserve_ticket(space_info, ticket); break; case BTRFS_RESERVE_FLUSH_LIMIT: priority_reclaim_metadata_space(fs_info, space_info, ticket, @@ -1984,8 +1983,7 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info) return unalloc < data_chunk_size; } -static void do_reclaim_sweep(const struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, int raid) +static void do_reclaim_sweep(struct btrfs_space_info *space_info, int raid) { struct btrfs_block_group *bg; int thresh_pct; @@ -2081,6 +2079,6 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info) if (!btrfs_should_periodic_reclaim(space_info)) continue; for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) - do_reclaim_sweep(fs_info, space_info, raid); + do_reclaim_sweep(space_info, raid); } } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index fe4d719d506bf5..d4cab3c557425d 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -140,12 +140,10 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, return ERR_PTR(-ENOMEM); spin_lock_init(&ret->lock); - if (type == BTRFS_SUBPAGE_METADATA) { + if (type == BTRFS_SUBPAGE_METADATA) atomic_set(&ret->eb_refs, 0); - } else { - atomic_set(&ret->readers, 0); - atomic_set(&ret->writers, 0); - } + else + atomic_set(&ret->nr_locked, 0); return ret; } @@ -221,62 +219,6 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, __start_bit; \ }) -void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); - const int nbits = len >> fs_info->sectorsize_bits; - unsigned long flags; - - - btrfs_subpage_assert(fs_info, folio, start, len); - - spin_lock_irqsave(&subpage->lock, flags); - /* - * Even though it's just for reading the page, no one should have - * locked the subpage range. - */ - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); - bitmap_set(subpage->bitmaps, start_bit, nbits); - atomic_add(nbits, &subpage->readers); - spin_unlock_irqrestore(&subpage->lock, flags); -} - -void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); - const int nbits = len >> fs_info->sectorsize_bits; - unsigned long flags; - bool is_data; - bool last; - - btrfs_subpage_assert(fs_info, folio, start, len); - is_data = is_data_inode(BTRFS_I(folio->mapping->host)); - - spin_lock_irqsave(&subpage->lock, flags); - - /* The range should have already been locked. */ - ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits)); - ASSERT(atomic_read(&subpage->readers) >= nbits); - - bitmap_clear(subpage->bitmaps, start_bit, nbits); - last = atomic_sub_and_test(nbits, &subpage->readers); - - /* - * For data we need to unlock the page if the last read has finished. - * - * And please don't replace @last with atomic_sub_and_test() call - * inside if () condition. - * As we want the atomic_sub_and_test() to be always executed. - */ - if (is_data && last) - folio_unlock(folio); - spin_unlock_irqrestore(&subpage->lock, flags); -} - static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) { u64 orig_start = *start; @@ -295,28 +237,8 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) orig_start + orig_len) - *start; } -static void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); - const int nbits = (len >> fs_info->sectorsize_bits); - unsigned long flags; - int ret; - - btrfs_subpage_assert(fs_info, folio, start, len); - - spin_lock_irqsave(&subpage->lock, flags); - ASSERT(atomic_read(&subpage->readers) == 0); - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); - bitmap_set(subpage->bitmaps, start_bit, nbits); - ret = atomic_add_return(nbits, &subpage->writers); - ASSERT(ret == nbits); - spin_unlock_irqrestore(&subpage->lock, flags); -} - -static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); @@ -334,9 +256,9 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf * extent_clear_unlock_delalloc() for compression path. * * This @locked_page is locked by plain lock_page(), thus its - * subpage::writers is 0. Handle them in a special way. + * subpage::locked is 0. Handle them in a special way. */ - if (atomic_read(&subpage->writers) == 0) { + if (atomic_read(&subpage->nr_locked) == 0) { spin_unlock_irqrestore(&subpage->lock, flags); return true; } @@ -345,39 +267,12 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf clear_bit(bit, subpage->bitmaps); cleared++; } - ASSERT(atomic_read(&subpage->writers) >= cleared); - last = atomic_sub_and_test(cleared, &subpage->writers); + ASSERT(atomic_read(&subpage->nr_locked) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->nr_locked); spin_unlock_irqrestore(&subpage->lock, flags); return last; } -/* - * Lock a folio for delalloc page writeback. - * - * Return -EAGAIN if the page is not properly initialized. - * Return 0 with the page locked, and writer counter updated. - * - * Even with 0 returned, the page still need extra check to make sure - * it's really the correct page, as the caller is using - * filemap_get_folios_contig(), which can race with page invalidating. - */ -int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { - folio_lock(folio); - return 0; - } - folio_lock(folio); - if (!folio_test_private(folio) || !folio_get_private(folio)) { - folio_unlock(folio); - return -EAGAIN; - } - btrfs_subpage_clamp_range(folio, &start, &len); - btrfs_subpage_start_writer(fs_info, folio, start, len); - return 0; -} - /* * Handle different locked folios: * @@ -394,8 +289,8 @@ int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, * bitmap, reduce the writer lock number, and unlock the page if that's * the last locked range. */ -void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); @@ -408,24 +303,24 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, /* * For subpage case, there are two types of locked page. With or - * without writers number. + * without locked number. * - * Since we own the page lock, no one else could touch subpage::writers + * Since we own the page lock, no one else could touch subpage::locked * and we are safe to do several atomic operations without spinlock. */ - if (atomic_read(&subpage->writers) == 0) { - /* No writers, locked by plain lock_page(). */ + if (atomic_read(&subpage->nr_locked) == 0) { + /* No subpage lock, locked by plain lock_page(). */ folio_unlock(folio); return; } btrfs_subpage_clamp_range(folio, &start, &len); - if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len)) + if (btrfs_subpage_end_and_test_lock(fs_info, folio, start, len)) folio_unlock(folio); } -void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, - struct folio *folio, unsigned long bitmap) +void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, + struct folio *folio, unsigned long bitmap) { struct btrfs_subpage *subpage = folio_get_private(folio); const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked; @@ -439,8 +334,8 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, return; } - if (atomic_read(&subpage->writers) == 0) { - /* No writers, locked by plain lock_page(). */ + if (atomic_read(&subpage->nr_locked) == 0) { + /* No subpage lock, locked by plain lock_page(). */ folio_unlock(folio); return; } @@ -450,8 +345,8 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, if (test_and_clear_bit(bit + start_bit, subpage->bitmaps)) cleared++; } - ASSERT(atomic_read(&subpage->writers) >= cleared); - last = atomic_sub_and_test(cleared, &subpage->writers); + ASSERT(atomic_read(&subpage->nr_locked) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->nr_locked); spin_unlock_irqrestore(&subpage->lock, flags); if (last) folio_unlock(folio); @@ -776,8 +671,8 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, * This populates the involved subpage ranges so that subpage helpers can * properly unlock them. */ -void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage; unsigned long flags; @@ -796,58 +691,11 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, /* Target range should not yet be locked. */ ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); bitmap_set(subpage->bitmaps, start_bit, nbits); - ret = atomic_add_return(nbits, &subpage->writers); + ret = atomic_add_return(nbits, &subpage->nr_locked); ASSERT(ret <= fs_info->sectors_per_page); spin_unlock_irqrestore(&subpage->lock, flags); } -/* - * Find any subpage writer locked range inside @folio, starting at file offset - * @search_start. The caller should ensure the folio is locked. - * - * Return true and update @found_start_ret and @found_len_ret to the first - * writer locked range. - * Return false if there is no writer locked range. - */ -bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 search_start, - u64 *found_start_ret, u32 *found_len_ret) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const u32 sectors_per_page = fs_info->sectors_per_page; - const unsigned int len = PAGE_SIZE - offset_in_page(search_start); - const unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, - locked, search_start, len); - const unsigned int locked_bitmap_start = sectors_per_page * btrfs_bitmap_nr_locked; - const unsigned int locked_bitmap_end = locked_bitmap_start + sectors_per_page; - unsigned long flags; - int first_zero; - int first_set; - bool found = false; - - ASSERT(folio_test_locked(folio)); - spin_lock_irqsave(&subpage->lock, flags); - first_set = find_next_bit(subpage->bitmaps, locked_bitmap_end, start_bit); - if (first_set >= locked_bitmap_end) - goto out; - - found = true; - - *found_start_ret = folio_pos(folio) + - ((first_set - locked_bitmap_start) << fs_info->sectorsize_bits); - /* - * Since @first_set is ensured to be smaller than locked_bitmap_end - * here, @found_start_ret should be inside the folio. - */ - ASSERT(*found_start_ret < folio_pos(folio) + PAGE_SIZE); - - first_zero = find_next_zero_bit(subpage->bitmaps, locked_bitmap_end, first_set); - *found_len_ret = (first_zero - first_set) << fs_info->sectorsize_bits; -out: - spin_unlock_irqrestore(&subpage->lock, flags); - return found; -} - #define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ { \ const int sectors_per_page = fs_info->sectors_per_page; \ diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 4b85d91d0e18b0..428fa9389fd49e 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -45,14 +45,6 @@ enum { struct btrfs_subpage { /* Common members for both data and metadata pages */ spinlock_t lock; - /* - * Both data and metadata needs to track how many readers are for the - * page. - * Data relies on @readers to unlock the page when last reader finished. - * While metadata doesn't need page unlock, it needs to prevent - * page::private get cleared before the last end_page_read(). - */ - atomic_t readers; union { /* * Structures only used by metadata @@ -62,8 +54,12 @@ struct btrfs_subpage { */ atomic_t eb_refs; - /* Structures only used by data */ - atomic_t writers; + /* + * Structures only used by data, + * + * How many sectors inside the page is locked. + */ + atomic_t nr_locked; }; unsigned long bitmaps[]; }; @@ -95,23 +91,12 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage); void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); -void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); - -int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, - struct folio *folio, unsigned long bitmap); -bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 search_start, - u64 *found_start_ret, u32 *found_len_ret); - +void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); +void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); +void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, + struct folio *folio, unsigned long bitmap); /* * Template for subpage related operations. * diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 98fa0f382480a2..fa25db4aacf996 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -28,7 +28,6 @@ #include #include #include -#include #include "messages.h" #include "delayed-inode.h" #include "ctree.h" @@ -340,6 +339,15 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) fallthrough; case Opt_compress: case Opt_compress_type: + /* + * Provide the same semantics as older kernels that don't use fs + * context, specifying the "compress" option clears + * "force-compress" without the need to pass + * "compress-force=[no|none]" before specifying "compress". + */ + if (opt != Opt_compress_force && opt != Opt_compress_force_type) + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + if (opt == Opt_compress || opt == Opt_compress_force) { ctx->compress_type = BTRFS_COMPRESS_ZLIB; ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; @@ -937,8 +945,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec } static int btrfs_fill_super(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - void *data) + struct btrfs_fs_devices *fs_devices) { struct inode *inode; struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -962,7 +969,7 @@ static int btrfs_fill_super(struct super_block *sb, return err; } - err = open_ctree(sb, fs_devices, (char *)data); + err = open_ctree(sb, fs_devices); if (err) { btrfs_err(fs_info, "open_ctree failed"); return err; @@ -1885,7 +1892,7 @@ static int btrfs_get_tree_super(struct fs_context *fc) snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id); btrfs_sb(sb)->bdev_holder = &btrfs_fs_type; - ret = btrfs_fill_super(sb, fs_devices, NULL); + ret = btrfs_fill_super(sb, fs_devices); } if (ret) { @@ -2402,13 +2409,7 @@ static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_contro trace_btrfs_extent_map_shrinker_count(fs_info, nr); - /* - * Only report the real number for DEBUG builds, as there are reports of - * serious performance degradation caused by too frequent shrinks. - */ - if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) - return nr; - return 0; + return nr; } static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc) @@ -2416,16 +2417,10 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan); struct btrfs_fs_info *fs_info = btrfs_sb(sb); - /* - * We may be called from any task trying to allocate memory and we don't - * want to slow it down with scanning and dropping extent maps. It would - * also cause heavy lock contention if many tasks concurrently enter - * here. Therefore only allow kswapd tasks to scan and drop extent maps. - */ - if (!current_is_kswapd()) - return 0; + btrfs_free_extent_maps(fs_info, nr_to_scan); - return btrfs_free_extent_maps(fs_info, nr_to_scan); + /* The extent map shrinker runs asynchronously, so always return 0. */ + return 0; } static const struct super_operations btrfs_super_ops = { diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 03926ad467c919..b843308e2bc6f6 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1390,7 +1390,7 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, btrfs_bg_reclaim_threshold_store); -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL static ssize_t btrfs_offload_csum_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1450,7 +1450,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), BTRFS_ATTR_PTR(, temp_fsid), -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL BTRFS_ATTR_PTR(, offload_csum), #endif NULL, diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index ce50847e1e01a4..e607b5d52fb1d2 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -29,6 +29,7 @@ const char *test_error[] = { [TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group", [TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map", [TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map", + [TEST_ALLOC_IO_CONTEXT] = "cannot allocate io context", }; static const struct super_operations btrfs_test_super_ops = { @@ -291,6 +292,9 @@ int btrfs_run_sanity_tests(void) ret = btrfs_test_free_space_tree(sectorsize, nodesize); if (ret) goto out; + ret = btrfs_test_raid_stripe_tree(sectorsize, nodesize); + if (ret) + goto out; } } ret = btrfs_test_extent_map(); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index dc2f2ab15fa5c3..b524ecf2f452b6 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -24,6 +24,7 @@ enum { TEST_ALLOC_BLOCK_GROUP, TEST_ALLOC_EXTENT_MAP, TEST_ALLOC_CHUNK_MAP, + TEST_ALLOC_IO_CONTEXT, }; extern const char *test_error[]; @@ -37,6 +38,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize); int btrfs_test_inodes(u32 sectorsize, u32 nodesize); int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); +int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize); int btrfs_test_extent_map(void); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c new file mode 100644 index 00000000000000..5f1f1342b29172 --- /dev/null +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -0,0 +1,540 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 Western Digital Corporation or its affiliates. + */ + +#include +#include "../fs.h" +#include "../disk-io.h" +#include "../transaction.h" +#include "../volumes.h" +#include "../raid-stripe-tree.h" +#include "btrfs-tests.h" + +#define RST_TEST_NUM_DEVICES (2) +#define RST_TEST_RAID1_TYPE (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1) + +typedef int (*test_func_t)(struct btrfs_trans_handle *trans); + +static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_devices, + u64 devid) +{ + struct btrfs_device *dev; + + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + if (dev->devid == devid) + return dev; + } + + return NULL; +} + +/* + * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then + * delete the 1st 32K, making the new start address 1M+32K. + */ +static int test_front_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, + &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, SZ_32K); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + SZ_32K); + goto out; + } + + len = SZ_32K; + ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_32K, &len, + map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical + SZ_32K, logical + SZ_32K + len); + goto out; + } + + if (io_stripe.physical != logical + SZ_32K) { + test_err("invalid physical address, expected %llu, got %llu", + logical + SZ_32K, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_32K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_32K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, + &io_stripe); + if (!ret) { + ret = -EINVAL; + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + logical, logical + SZ_32K); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); + out: + return ret; +} + +/* + * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then + * truncate the stripe extent down to 32K. + */ +static int test_tail_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + if (!io_stripe.dev) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, + &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical + SZ_32K, logical + SZ_64K); + goto out; + } + + len = SZ_32K; + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu, got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_32K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_32K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, len); + if (ret) + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + len); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then + * overwrite the whole range giving it new physical address at an offset of 1G. + * The intent of this test is to exercise the 'update_raid_extent_item()' + * function called be btrfs_insert_one_raid_extent(). + */ +static int test_create_update_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + if (!io_stripe.dev) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = SZ_1G + logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("updating RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical + SZ_1G) { + test_err("invalid physical address, expected %llu, got %llu", + logical + SZ_1G, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, len); + if (ret) + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + len); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a simple 64K RST write on a 2 disk RAID1 at a logical address of 1M. + * The "physical" copy on device 0 is at 1M, on device 1 it is at 1G+1M. + */ +static int test_simple_create_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + bioc->map_type = map_type; + bioc->size = SZ_64K; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + if (!io_stripe.dev) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, len); + if (ret) + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + len); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +static const test_func_t tests[] = { + test_simple_create_delete, + test_create_update_delete, + test_tail_delete, + test_front_delete, +}; + +static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) +{ + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root = NULL; + int ret; + + fs_info = btrfs_alloc_dummy_fs_info(sectorsize, nodesize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + ret = -ENOMEM; + goto out; + } + + root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); + ret = PTR_ERR(root); + goto out; + } + btrfs_set_super_compat_ro_flags(root->fs_info->super_copy, + BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE); + root->root_key.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + fs_info->stripe_root = root; + root->fs_info->tree_root = root; + + root->node = alloc_test_extent_buffer(root->fs_info, nodesize); + if (IS_ERR(root->node)) { + test_std_err(TEST_ALLOC_EXTENT_BUFFER); + ret = PTR_ERR(root->node); + goto out; + } + btrfs_set_header_level(root->node, 0); + btrfs_set_header_nritems(root->node, 0); + root->alloc_bytenr += 2 * nodesize; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_device *dev; + + dev = btrfs_alloc_dummy_device(fs_info); + if (IS_ERR(dev)) { + test_err("cannot allocate device"); + ret = PTR_ERR(dev); + goto out; + } + dev->devid = i; + } + + btrfs_init_dummy_trans(&trans, root->fs_info); + ret = test(&trans); + if (ret) + goto out; + +out: + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); + + return ret; +} + +int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize) +{ + int ret = 0; + + test_msg("running raid-stripe-tree tests"); + for (int i = 0; i < ARRAY_SIZE(tests); i++) { + ret = run_test(tests[i], sectorsize, nodesize); + if (ret) { + test_err("test-case %ps failed with %d\n", tests[i], ret); + goto out; + } + } + +out: + return ret; +} diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index dd9ce9b9f69e38..184fa5c0062abf 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -33,7 +33,7 @@ struct btrfs_path; */ #define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1) -/* Radix-tree tag for roots that are part of the trasaction. */ +/* Radix-tree tag for roots that are part of the transaction. */ #define BTRFS_ROOT_TRANS_TAG 0 enum btrfs_trans_state { diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 7b50263723bc1a..148d8cefa40eef 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -2183,8 +2183,8 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) return 0; } -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid) +int btrfs_verify_level_key(struct extent_buffer *eb, + const struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; int found_level; @@ -2192,16 +2192,16 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, int ret; found_level = btrfs_header_level(eb); - if (found_level != level) { + if (found_level != check->level) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: tree level check failed\n"); btrfs_err(fs_info, "tree level mismatch detected, bytenr=%llu level expected=%u has=%u", - eb->start, level, found_level); + eb->start, check->level, found_level); return -EIO; } - if (!first_key) + if (!check->has_first_key) return 0; /* @@ -2226,15 +2226,15 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, btrfs_node_key_to_cpu(eb, &found_key, 0); else btrfs_item_key_to_cpu(eb, &found_key, 0); - ret = btrfs_comp_cpu_keys(first_key, &found_key); + ret = btrfs_comp_cpu_keys(&check->first_key, &found_key); if (ret) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: tree first key check failed\n"); btrfs_err(fs_info, "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", - eb->start, parent_transid, first_key->objectid, - first_key->type, first_key->offset, + eb->start, check->transid, check->first_key.objectid, + check->first_key.type, check->first_key.offset, found_key.objectid, found_key.type, found_key.offset); } diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 01669cfa6578de..db67f96cbe4bc1 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -69,7 +69,7 @@ int btrfs_check_node(struct extent_buffer *node); int btrfs_check_chunk_valid(struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 logical); int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid); +int btrfs_verify_level_key(struct extent_buffer *eb, + const struct btrfs_tree_parent_check *check); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9637c7cdc0cf92..c8d6587688b3dd 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -6204,7 +6204,6 @@ static int log_delayed_deletions_full(struct btrfs_trans_handle *trans, static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_log_ctx *ctx, const struct list_head *delayed_del_list, const struct btrfs_delayed_item *first, const struct btrfs_delayed_item **last_ret) @@ -6265,7 +6264,7 @@ static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, if (ret < 0) { return ret; } else if (ret == 0) { - ret = batch_delete_dir_index_items(trans, inode, path, ctx, + ret = batch_delete_dir_index_items(trans, inode, path, delayed_del_list, curr, &last); if (ret) diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index b382a4c443d427..1ac2678fc4caa8 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -909,7 +909,6 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, * is freed (its refcount is decremented). */ struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, struct extent_buffer *eb, u64 time_seq) { diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index 6308c577a4a4ab..1c12566040db52 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -41,7 +41,6 @@ int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op); int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb); struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, struct extent_buffer *eb, u64 time_seq); struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8f340ad1d93845..84e861dcb35008 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -732,6 +732,114 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb) return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; } +/* + * We can have very weird soft links passed in. + * One example is "/proc/self/fd/", which can be a soft link to + * a block device. + * + * But it's never a good idea to use those weird names. + * Here we check if the path (not following symlinks) is a good one inside + * "/dev/". + */ +static bool is_good_dev_path(const char *dev_path) +{ + struct path path = { .mnt = NULL, .dentry = NULL }; + char *path_buf = NULL; + char *resolved_path; + bool is_good = false; + int ret; + + if (!dev_path) + goto out; + + path_buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!path_buf) + goto out; + + /* + * Do not follow soft link, just check if the original path is inside + * "/dev/". + */ + ret = kern_path(dev_path, 0, &path); + if (ret) + goto out; + resolved_path = d_path(&path, path_buf, PATH_MAX); + if (IS_ERR(resolved_path)) + goto out; + if (strncmp(resolved_path, "/dev/", strlen("/dev/"))) + goto out; + is_good = true; +out: + kfree(path_buf); + path_put(&path); + return is_good; +} + +static int get_canonical_dev_path(const char *dev_path, char *canonical) +{ + struct path path = { .mnt = NULL, .dentry = NULL }; + char *path_buf = NULL; + char *resolved_path; + int ret; + + if (!dev_path) { + ret = -EINVAL; + goto out; + } + + path_buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!path_buf) { + ret = -ENOMEM; + goto out; + } + + ret = kern_path(dev_path, LOOKUP_FOLLOW, &path); + if (ret) + goto out; + resolved_path = d_path(&path, path_buf, PATH_MAX); + ret = strscpy(canonical, resolved_path, PATH_MAX); +out: + kfree(path_buf); + path_put(&path); + return ret; +} + +static bool is_same_device(struct btrfs_device *device, const char *new_path) +{ + struct path old = { .mnt = NULL, .dentry = NULL }; + struct path new = { .mnt = NULL, .dentry = NULL }; + char *old_path = NULL; + bool is_same = false; + int ret; + + if (!device->name) + goto out; + + old_path = kzalloc(PATH_MAX, GFP_NOFS); + if (!old_path) + goto out; + + rcu_read_lock(); + ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX); + rcu_read_unlock(); + if (ret < 0) + goto out; + + ret = kern_path(old_path, LOOKUP_FOLLOW, &old); + if (ret) + goto out; + ret = kern_path(new_path, LOOKUP_FOLLOW, &new); + if (ret) + goto out; + if (path_equal(&old, &new)) + is_same = true; +out: + kfree(old_path); + path_put(&old); + path_put(&new); + return is_same; +} + /* * Add new device to list of registered devices * @@ -852,7 +960,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, MAJOR(path_devt), MINOR(path_devt), current->comm, task_pid_nr(current)); - } else if (!device->name || strcmp(device->name->str, path)) { + } else if (!device->name || !is_same_device(device, path)) { /* * When FS is already mounted. * 1. If you are here and if the device->name is NULL that @@ -1382,12 +1490,23 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, bool new_device_added = false; struct btrfs_device *device = NULL; struct file *bdev_file; + char *canonical_path = NULL; u64 bytenr; dev_t devt; int ret; lockdep_assert_held(&uuid_mutex); + if (!is_good_dev_path(path)) { + canonical_path = kmalloc(PATH_MAX, GFP_KERNEL); + if (canonical_path) { + ret = get_canonical_dev_path(path, canonical_path); + if (ret < 0) { + kfree(canonical_path); + canonical_path = NULL; + } + } + } /* * Avoid an exclusive open here, as the systemd-udev may initiate the * device scan which may race with the user's mount or mkfs command, @@ -1432,7 +1551,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, goto free_disk_super; } - device = device_list_add(path, disk_super, &new_device_added); + device = device_list_add(canonical_path ? : path, disk_super, + &new_device_added); if (!IS_ERR(device) && new_device_added) btrfs_free_stale_devices(device->devt, device); @@ -1441,6 +1561,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, error_bdev_put: fput(bdev_file); + kfree(canonical_path); return device; } @@ -2720,8 +2841,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE); if (seeding_dev) { - btrfs_clear_sb_rdonly(sb); - /* GFP_KERNEL allocation must not be under device_list_mutex */ seed_devices = btrfs_init_sprout(fs_info); if (IS_ERR(seed_devices)) { @@ -2864,8 +2983,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); error_trans: - if (seeding_dev) - btrfs_set_sb_rdonly(sb); if (trans) btrfs_end_transaction(trans); error_free_zone: @@ -5309,7 +5426,7 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; - /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */ + /* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. */ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, ctl->stripe_size) + ctl->nparity, @@ -5841,24 +5958,6 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, return len; } -int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) -{ - struct btrfs_chunk_map *map; - int ret = 0; - - if (!btrfs_fs_incompat(fs_info, RAID56)) - return 0; - - map = btrfs_get_chunk_map(fs_info, logical, len); - - if (!WARN_ON(IS_ERR(map))) { - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - ret = 1; - btrfs_free_chunk_map(map); - } - return ret; -} - static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) @@ -5919,9 +6018,9 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, return preferred_mirror; } -static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, - u64 logical, - u16 total_stripes) +EXPORT_FOR_TESTS +struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + u64 logical, u16 total_stripes) { struct btrfs_io_context *bioc; @@ -6480,13 +6579,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, max_len = btrfs_max_io_len(map, map_offset, &io_geom); *length = min_t(u64, map->chunk_len - map_offset, max_len); - down_read(&dev_replace->rwsem); + if (dev_replace->replace_task != current) + down_read(&dev_replace->rwsem); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); /* * Hold the semaphore for read during the whole operation, write is * requested at commit time but must wait. */ - if (!dev_replace_is_ongoing) + if (!dev_replace_is_ongoing && dev_replace->replace_task != current) up_read(&dev_replace->rwsem); switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { @@ -6626,7 +6727,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, bioc->mirror_num = io_geom.mirror_num; out: - if (dev_replace_is_ongoing) { + if (dev_replace_is_ongoing && dev_replace->replace_task != current) { lockdep_assert_held(&dev_replace->rwsem); /* Unlock and let waiting writers proceed */ up_read(&dev_replace->rwsem); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 4481575dd70f35..3a416b1bc24cb0 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -306,7 +306,7 @@ enum btrfs_read_policy { BTRFS_NR_READ_POLICY, }; -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* * Checksum mode - offload it to workqueues or do it synchronously in * btrfs_submit_chunk(). @@ -430,7 +430,7 @@ struct btrfs_fs_devices { /* Policy used to read the mirrored stripes. */ enum btrfs_read_policy read_policy; -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* Checksum mode - offload it or do it synchronously. */ enum btrfs_offload_csum_mode offload_csum_mode; #endif @@ -741,8 +741,6 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans); void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev); void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev); void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev); -int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, - u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map); @@ -840,4 +838,9 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + u64 logical, u16 total_stripes); +#endif + #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index ce464cd8e0ac79..bc18710d1dcfbb 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -85,7 +85,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, { struct btrfs_dir_item *di = NULL; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; size_t name_len = strlen(name); int ret = 0; @@ -143,14 +142,14 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, */ ret = 0; btrfs_assert_tree_write_locked(path->nodes[0]); - di = btrfs_match_dir_item_name(fs_info, path, name, name_len); + di = btrfs_match_dir_item_name(path, name, name_len); if (!di && !(flags & XATTR_REPLACE)) { ret = -ENOSPC; goto out; } } else if (ret == -EEXIST) { ret = 0; - di = btrfs_match_dir_item_name(fs_info, path, name, name_len); + di = btrfs_match_dir_item_name(path, name, name_len); ASSERT(di); /* logic error */ } else if (ret) { goto out; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 100abc00b794ca..ddf0d5a448a741 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -194,7 +194,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, pg_off = offset_in_page(start); cur_len = btrfs_calc_input_length(orig_end, start); data_in = kmap_local_folio(in_folio, pg_off); - start += PAGE_SIZE; + start += cur_len; workspace->strm.next_in = data_in; workspace->strm.avail_in = cur_len; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 69d03feea4e0ec..dbcbf754d2846d 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1973,7 +1973,7 @@ int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, if (block_group->meta_write_pointer > eb->start) return -EBUSY; - /* If for_sync, this hole will be filled with trasnsaction commit. */ + /* If for_sync, this hole will be filled with transaction commit. */ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) return -EAGAIN; return -EBUSY; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 866607fd3e5884..5232b56d589259 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -111,6 +111,8 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; struct list_head *pos, *next; + ASSERT(timer == &wsm.timer); + spin_lock(&wsm.lock); if (list_empty(&wsm.lru_list)) { @@ -495,7 +497,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, /* Check if we need more input */ if (workspace->in_buf.pos == workspace->in_buf.size) { - tot_in += PAGE_SIZE; + tot_in += workspace->in_buf.size; kunmap_local(workspace->in_buf.src); workspace->in_buf.src = NULL; folio_put(in_folio); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index af6b3827fb1d01..4df93ca9b7a8d4 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1706,9 +1706,10 @@ DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data, DECLARE_EVENT_CLASS(btrfs_qgroup_extent, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_qgroup_extent_record *rec), + const struct btrfs_qgroup_extent_record *rec, + u64 bytenr), - TP_ARGS(fs_info, rec), + TP_ARGS(fs_info, rec, bytenr), TP_STRUCT__entry_btrfs( __field( u64, bytenr ) @@ -1716,7 +1717,7 @@ DECLARE_EVENT_CLASS(btrfs_qgroup_extent, ), TP_fast_assign_btrfs(fs_info, - __entry->bytenr = rec->bytenr; + __entry->bytenr = bytenr; __entry->num_bytes = rec->num_bytes; ), @@ -1727,17 +1728,19 @@ DECLARE_EVENT_CLASS(btrfs_qgroup_extent, DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_qgroup_extent_record *rec), + const struct btrfs_qgroup_extent_record *rec, + u64 bytenr), - TP_ARGS(fs_info, rec) + TP_ARGS(fs_info, rec, bytenr) ); DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_trace_extent, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_qgroup_extent_record *rec), + const struct btrfs_qgroup_extent_record *rec, + u64 bytenr), - TP_ARGS(fs_info, rec) + TP_ARGS(fs_info, rec, bytenr) ); TRACE_EVENT(qgroup_num_dirty_extents, @@ -2341,7 +2344,6 @@ DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_unlock_blocking); DEFINE_BTRFS_LOCK_EVENT(btrfs_set_lock_blocking_read); DEFINE_BTRFS_LOCK_EVENT(btrfs_set_lock_blocking_write); DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_read_lock); -DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_write_lock); DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_lock_atomic); DECLARE_EVENT_CLASS(btrfs__space_info_update, @@ -2553,10 +2555,9 @@ TRACE_EVENT(btrfs_extent_map_shrinker_count, TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, - TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr, - u64 last_root_id, u64 last_ino), + TP_PROTO(const struct btrfs_fs_info *fs_info, long nr), - TP_ARGS(fs_info, nr_to_scan, nr, last_root_id, last_ino), + TP_ARGS(fs_info, nr), TP_STRUCT__entry_btrfs( __field( long, nr_to_scan ) @@ -2566,10 +2567,11 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, ), TP_fast_assign_btrfs(fs_info, - __entry->nr_to_scan = nr_to_scan; + __entry->nr_to_scan = \ + atomic64_read(&fs_info->em_shrinker_nr_to_scan); __entry->nr = nr; - __entry->last_root_id = last_root_id; - __entry->last_ino = last_ino; + __entry->last_root_id = fs_info->em_shrinker_last_root; + __entry->last_ino = fs_info->em_shrinker_last_ino; ), TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", @@ -2579,10 +2581,9 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit, - TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr, - u64 last_root_id, u64 last_ino), + TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr), - TP_ARGS(fs_info, nr_dropped, nr, last_root_id, last_ino), + TP_ARGS(fs_info, nr_dropped, nr), TP_STRUCT__entry_btrfs( __field( long, nr_dropped ) @@ -2594,8 +2595,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit, TP_fast_assign_btrfs(fs_info, __entry->nr_dropped = nr_dropped; __entry->nr = nr; - __entry->last_root_id = last_root_id; - __entry->last_ino = last_ino; + __entry->last_root_id = fs_info->em_shrinker_last_root; + __entry->last_ino = fs_info->em_shrinker_last_ino; ), TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",