From fe46c644a46728cc40ae3c0d2f100fc64d789a50 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 10 Sep 2024 09:55:01 +0200 Subject: [PATCH 01/80] btrfs: don't take dev_replace rwsem on task already holding it Running fstests btrfs/011 with MKFS_OPTIONS="-O rst" to force the usage of the RAID stripe-tree, we get the following splat from lockdep: BTRFS info (device sdd): dev_replace from /dev/sdd (devid 1) to /dev/sdb started ============================================ WARNING: possible recursive locking detected 6.11.0-rc3-btrfs-for-next #599 Not tainted -------------------------------------------- btrfs/2326 is trying to acquire lock: ffff88810f215c98 (&fs_info->dev_replace.rwsem){++++}-{3:3}, at: btrfs_map_block+0x39f/0x2250 but task is already holding lock: ffff88810f215c98 (&fs_info->dev_replace.rwsem){++++}-{3:3}, at: btrfs_map_block+0x39f/0x2250 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&fs_info->dev_replace.rwsem); lock(&fs_info->dev_replace.rwsem); *** DEADLOCK *** May be due to missing lock nesting notation 1 lock held by btrfs/2326: #0: ffff88810f215c98 (&fs_info->dev_replace.rwsem){++++}-{3:3}, at: btrfs_map_block+0x39f/0x2250 stack backtrace: CPU: 1 UID: 0 PID: 2326 Comm: btrfs Not tainted 6.11.0-rc3-btrfs-for-next #599 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Call Trace: dump_stack_lvl+0x5b/0x80 __lock_acquire+0x2798/0x69d0 ? __pfx___lock_acquire+0x10/0x10 ? __pfx___lock_acquire+0x10/0x10 lock_acquire+0x19d/0x4a0 ? btrfs_map_block+0x39f/0x2250 ? __pfx_lock_acquire+0x10/0x10 ? find_held_lock+0x2d/0x110 ? lock_is_held_type+0x8f/0x100 down_read+0x8e/0x440 ? btrfs_map_block+0x39f/0x2250 ? __pfx_down_read+0x10/0x10 ? do_raw_read_unlock+0x44/0x70 ? _raw_read_unlock+0x23/0x40 btrfs_map_block+0x39f/0x2250 ? btrfs_dev_replace_by_ioctl+0xd69/0x1d00 ? btrfs_bio_counter_inc_blocked+0xd9/0x2e0 ? __kasan_slab_alloc+0x6e/0x70 ? __pfx_btrfs_map_block+0x10/0x10 ? __pfx_btrfs_bio_counter_inc_blocked+0x10/0x10 ? kmem_cache_alloc_noprof+0x1f2/0x300 ? mempool_alloc_noprof+0xed/0x2b0 btrfs_submit_chunk+0x28d/0x17e0 ? __pfx_btrfs_submit_chunk+0x10/0x10 ? bvec_alloc+0xd7/0x1b0 ? bio_add_folio+0x171/0x270 ? __pfx_bio_add_folio+0x10/0x10 ? __kasan_check_read+0x20/0x20 btrfs_submit_bio+0x37/0x80 read_extent_buffer_pages+0x3df/0x6c0 btrfs_read_extent_buffer+0x13e/0x5f0 read_tree_block+0x81/0xe0 read_block_for_search+0x4bd/0x7a0 ? __pfx_read_block_for_search+0x10/0x10 btrfs_search_slot+0x78d/0x2720 ? __pfx_btrfs_search_slot+0x10/0x10 ? lock_is_held_type+0x8f/0x100 ? kasan_save_track+0x14/0x30 ? __kasan_slab_alloc+0x6e/0x70 ? kmem_cache_alloc_noprof+0x1f2/0x300 btrfs_get_raid_extent_offset+0x181/0x820 ? __pfx_lock_acquire+0x10/0x10 ? __pfx_btrfs_get_raid_extent_offset+0x10/0x10 ? down_read+0x194/0x440 ? __pfx_down_read+0x10/0x10 ? do_raw_read_unlock+0x44/0x70 ? _raw_read_unlock+0x23/0x40 btrfs_map_block+0x5b5/0x2250 ? __pfx_btrfs_map_block+0x10/0x10 scrub_submit_initial_read+0x8fe/0x11b0 ? __pfx_scrub_submit_initial_read+0x10/0x10 submit_initial_group_read+0x161/0x3a0 ? lock_release+0x20e/0x710 ? __pfx_submit_initial_group_read+0x10/0x10 ? __pfx_lock_release+0x10/0x10 scrub_simple_mirror.isra.0+0x3eb/0x580 scrub_stripe+0xe4d/0x1440 ? lock_release+0x20e/0x710 ? __pfx_scrub_stripe+0x10/0x10 ? __pfx_lock_release+0x10/0x10 ? do_raw_read_unlock+0x44/0x70 ? _raw_read_unlock+0x23/0x40 scrub_chunk+0x257/0x4a0 scrub_enumerate_chunks+0x64c/0xf70 ? __mutex_unlock_slowpath+0x147/0x5f0 ? __pfx_scrub_enumerate_chunks+0x10/0x10 ? 
bit_wait_timeout+0xb0/0x170 ? __up_read+0x189/0x700 ? scrub_workers_get+0x231/0x300 ? up_write+0x490/0x4f0 btrfs_scrub_dev+0x52e/0xcd0 ? create_pending_snapshots+0x230/0x250 ? __pfx_btrfs_scrub_dev+0x10/0x10 btrfs_dev_replace_by_ioctl+0xd69/0x1d00 ? lock_acquire+0x19d/0x4a0 ? __pfx_btrfs_dev_replace_by_ioctl+0x10/0x10 ? lock_release+0x20e/0x710 ? btrfs_ioctl+0xa09/0x74f0 ? __pfx_lock_release+0x10/0x10 ? do_raw_spin_lock+0x11e/0x240 ? __pfx_do_raw_spin_lock+0x10/0x10 btrfs_ioctl+0xa14/0x74f0 ? lock_acquire+0x19d/0x4a0 ? find_held_lock+0x2d/0x110 ? __pfx_btrfs_ioctl+0x10/0x10 ? lock_release+0x20e/0x710 ? do_sigaction+0x3f0/0x860 ? __pfx_do_vfs_ioctl+0x10/0x10 ? do_raw_spin_lock+0x11e/0x240 ? lockdep_hardirqs_on_prepare+0x270/0x3e0 ? _raw_spin_unlock_irq+0x28/0x50 ? do_sigaction+0x3f0/0x860 ? __pfx_do_sigaction+0x10/0x10 ? __x64_sys_rt_sigaction+0x18e/0x1e0 ? __pfx___x64_sys_rt_sigaction+0x10/0x10 ? __x64_sys_close+0x7c/0xd0 __x64_sys_ioctl+0x137/0x190 do_syscall_64+0x71/0x140 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f0bd1114f9b Code: Unable to access opcode bytes at 0x7f0bd1114f71. RSP: 002b:00007ffc8a8c3130 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f0bd1114f9b RDX: 00007ffc8a8c35e0 RSI: 00000000ca289435 RDI: 0000000000000003 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000007 R10: 0000000000000008 R11: 0000000000000246 R12: 00007ffc8a8c6c85 R13: 00000000398e72a0 R14: 0000000000004361 R15: 0000000000000004 This happens because on RAID stripe-tree filesystems we recurse back into btrfs_map_block() on scrub to perform the logical to device physical mapping. But as the device replace task is already holding the dev_replace::rwsem we deadlock. So don't take the dev_replace::rwsem in case our task is the task performing the device replace. 
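For reference, here is the essence of the fix below, condensed from the diff (handling of the other replace states and error paths omitted):

    /* btrfs_dev_replace_start(): remember which task owns the replace. */
    down_write(&dev_replace->rwsem);
    dev_replace->replace_task = current;
    ...
    /* btrfs_map_block(): skip the read lock if we are that owner, since
     * the rwsem is already held by this task. */
    if (dev_replace->replace_task != current)
            down_read(&dev_replace->rwsem);

A recursive down_read() on an rwsem is unsafe in general: if a writer queues up between the two read acquisitions, the second down_read() blocks behind it while the first read lock is still held, and the task deadlocks against itself; this is exactly the scenario lockdep flags above.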
Suggested-by: Filipe Manana Signed-off-by: Johannes Thumshirn Reviewed-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 2 ++ fs/btrfs/fs.h | 2 ++ fs/btrfs/volumes.c | 8 +++++--- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 83d5cdd77f293e..604399e59a3d10 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -641,6 +641,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return ret; down_write(&dev_replace->rwsem); + dev_replace->replace_task = current; switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -994,6 +995,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list); fs_devices->rw_devices++; + dev_replace->replace_task = NULL; up_write(&dev_replace->rwsem); btrfs_rm_dev_replace_blocked(fs_info); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 79f64e383eddf8..cbfb225858a59f 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -317,6 +317,8 @@ struct btrfs_dev_replace { struct percpu_counter bio_counter; wait_queue_head_t replace_wait; + + struct task_struct *replace_task; }; /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8f340ad1d93845..995b0647f53816 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6480,13 +6480,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, max_len = btrfs_max_io_len(map, map_offset, &io_geom); *length = min_t(u64, map->chunk_len - map_offset, max_len); - down_read(&dev_replace->rwsem); + if (dev_replace->replace_task != current) + down_read(&dev_replace->rwsem); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); /* * Hold the semaphore for read during the whole operation, write is * requested at commit time but must wait. */ - if (!dev_replace_is_ongoing) + if (!dev_replace_is_ongoing && dev_replace->replace_task != current) up_read(&dev_replace->rwsem); switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { @@ -6626,7 +6628,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, bioc->mirror_num = io_geom.mirror_num; out: - if (dev_replace_is_ongoing) { + if (dev_replace_is_ongoing && dev_replace->replace_task != current) { lockdep_assert_held(&dev_replace->rwsem); /* Unlock and let waiting writers proceed */ up_read(&dev_replace->rwsem); From 5b3b62a19dd44cd76016b10c4abf4b9e9953599c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 16 Sep 2024 17:55:42 +0930 Subject: [PATCH 02/80] btrfs: make assert_rbio() to only check CONFIG_BTRFS_ASSERT According to the description, CONFIG_BTRFS_DEBUG is only for extra debug info, meanwhile sanity checks should be managed by CONFIG_BTRFS_ASSERT. There is no need to check both to enable assert_rbio(). Just remove the check for CONFIG_BTRFS_DEBUG. 
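For context, IS_ENABLED() from <linux/kconfig.h> evaluates to a compile-time constant (1 if the option is set to y or m, 0 otherwise), so the remaining check costs nothing at runtime:

    /* With CONFIG_BTRFS_ASSERT unset this becomes if (1) return; and the
     * compiler eliminates the whole body of assert_rbio(). */
    if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
            return;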
Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 39bec672df0cc0..cdd373c277848e 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1272,8 +1272,7 @@ static inline void bio_list_put(struct bio_list *bio_list) static void assert_rbio(struct btrfs_raid_bio *rbio) { - if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) || - !IS_ENABLED(CONFIG_BTRFS_ASSERT)) + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; /* From 8f83607db707148b3a6c5237e37fd6bb6d47141d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 16 Sep 2024 18:18:25 +0930 Subject: [PATCH 03/80] btrfs: split out CONFIG_BTRFS_EXPERIMENTAL from CONFIG_BTRFS_DEBUG Currently CONFIG_BTRFS_DEBUG is not only for the extra debugging output, but also for experimental features. This makes it hard to distinguish planned but not yet stable features from those purely designed for debugging. This patch splits the following features into CONFIG_BTRFS_EXPERIMENTAL: - Extent map shrinker This seems to be the first one to exit experimental. - Extent tree v2 This seems to be the last one to graduate from experimental. - Raid stripe tree - Csum offload mode - Send protocol v3 Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Kconfig | 26 ++++++++++++++++++++++++++ fs/btrfs/bio.c | 2 +- fs/btrfs/fs.h | 4 ++-- fs/btrfs/send.h | 2 +- fs/btrfs/super.c | 6 +++--- fs/btrfs/sysfs.c | 4 ++-- fs/btrfs/volumes.h | 4 ++-- 7 files changed, 37 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 4fb925e8c981d8..fa8515598341ec 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -78,6 +78,32 @@ config BTRFS_ASSERT If unsure, say N. +config BTRFS_EXPERIMENTAL + bool "Btrfs experimental features" + depends on BTRFS_FS + default n + help + Enable experimental features. These features may not be stable enough + for end users. This is meant for btrfs developers or users who wish + to test the functionality and report problems. + + Current list: + + - extent map shrinker - performance problems with too frequent shrinks + + - send stream protocol v3 - fs-verity support + + - checksum offload mode - sysfs knob to affect when checksums are + calculated (at IO time, or in a thread) + + - raid-stripe-tree - additional mapping of extents to devices to + support RAID1* profiles on zoned devices, + RAID56 not yet supported + + - extent tree v2 - complex rework of extent tracking + + If unsure, say N.
+ config BTRFS_FS_REF_VERIFY bool "Btrfs with the ref verify tool compiled in" depends on BTRFS_FS diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index fec5c6cde0a7f1..5d3f8bd406d9f1 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -598,7 +598,7 @@ static bool should_async_write(struct btrfs_bio *bbio) { bool auto_csum_mode = true; -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices; enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index cbfb225858a59f..785ec15c1b849b 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -263,10 +263,10 @@ enum { BTRFS_FEATURE_INCOMPAT_ZONED | \ BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA) -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* * Features under developmen like Extent tree v2 support is enabled - * only under CONFIG_BTRFS_DEBUG. + * only under CONFIG_BTRFS_EXPERIMENTAL */ #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index b07f4aa6687836..9309886c5ea17b 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -16,7 +16,7 @@ struct btrfs_ioctl_send_args; #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" /* Conditional support for the upcoming protocol version. */ -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL #define BTRFS_SEND_STREAM_VERSION 3 #else #define BTRFS_SEND_STREAM_VERSION 2 diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 98fa0f382480a2..e8a5bf4af91893 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2403,10 +2403,10 @@ static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_contro trace_btrfs_extent_map_shrinker_count(fs_info, nr); /* - * Only report the real number for DEBUG builds, as there are reports of - * serious performance degradation caused by too frequent shrinks. + * Only report the real number for EXPERIMENTAL builds, as there are + * reports of serious performance degradation caused by too frequent shrinks. */ - if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) + if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) return nr; return 0; } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 03926ad467c919..b843308e2bc6f6 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1390,7 +1390,7 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, btrfs_bg_reclaim_threshold_store); -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL static ssize_t btrfs_offload_csum_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1450,7 +1450,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), BTRFS_ATTR_PTR(, temp_fsid), -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL BTRFS_ATTR_PTR(, offload_csum), #endif NULL, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 4481575dd70f35..26e35fc1c8fdab 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -306,7 +306,7 @@ enum btrfs_read_policy { BTRFS_NR_READ_POLICY, }; -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* * Checksum mode - offload it to workqueues or do it synchronously in * btrfs_submit_chunk(). @@ -430,7 +430,7 @@ struct btrfs_fs_devices { /* Policy used to read the mirrored stripes. 
*/ enum btrfs_read_policy read_policy; -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* Checksum mode - offload it or do it synchronously. */ enum btrfs_offload_csum_mode offload_csum_mode; #endif From 18c2be4bae8fb4b481dca83281944daafdfff6ec Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 6 Sep 2024 14:14:55 +0930 Subject: [PATCH 04/80] btrfs: zlib: make the compression path to handle sector size < page size Inside zlib_compress_folios(), each time we switch the input page cache, @start is increased by PAGE_SIZE. But for the incoming compression support for sector size < page size (previously we supported compression only when the range was fully page aligned), this does not handle the following case: 0 32K 64K 96K | |///////////||///////////| @start has the initial value 32K, indicating the start filepos of the to-be-compressed range. When grabbing the first page as input, we always call "start += PAGE_SIZE;". But since @start starts at 32K, it is increased by 64K to 96K for the next range, causing an incorrect input range and corruption for future subpage compression. Fix it by only increasing @start by the input length. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/zlib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 100abc00b794ca..ddf0d5a448a741 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -194,7 +194,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, pg_off = offset_in_page(start); cur_len = btrfs_calc_input_length(orig_end, start); data_in = kmap_local_folio(in_folio, pg_off); - start += PAGE_SIZE; + start += cur_len; workspace->strm.next_in = data_in; workspace->strm.avail_in = cur_len; } From 17a51a035735a81b6f1eef81e4faba899c54f65e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 6 Sep 2024 13:40:30 +0930 Subject: [PATCH 05/80] btrfs: zstd: make the compression path to handle sector size < page size Inside zstd_compress_folios(), after exhausting one input page, we need to switch to the next page as input. However when counting the total input bytes (@tot_in), we always increase it by PAGE_SIZE. For the following case, this produces an incorrect value: 0 32K 64K 96K | |///////////||///////////| After compressing range [32K, 64K), we switch to the next page and increase @tot_in by 64K, even though we only read 32K. This causes @total_in to return a value larger than the input length. Fix it by only increasing @tot_in by the input size. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/zstd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 866607fd3e5884..15f8a83165a38b 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -495,7 +495,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, /* Check if we need more input */ if (workspace->in_buf.pos == workspace->in_buf.size) { - tot_in += PAGE_SIZE; + tot_in += workspace->in_buf.size; kunmap_local(workspace->in_buf.src); workspace->in_buf.src = NULL; folio_put(in_folio); From 6eb293fcefc4458351c0707296014251fc852f06 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 6 Sep 2024 14:27:56 +0930 Subject: [PATCH 06/80] btrfs: compression: add an ASSERT() to ensure the read-in length is sane There are already two bugs (one in zlib, one in zstd) where the compression path did not handle sector size < page size cases well.
These bugs resulted in @total_in being larger than the to-be-compressed range length, so there is enough reason to add an ASSERT() in btrfs_compress_folios() to make sure the total read-in length doesn't exceed the input length. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/compression.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 90aef2627ca27b..6e9c4a5e0d51af 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1030,6 +1030,7 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping { int type = btrfs_compress_type(type_level); int level = btrfs_compress_level(type_level); + const unsigned long orig_len = *total_out; struct list_head *workspace; int ret; @@ -1037,6 +1038,8 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping workspace = get_workspace(type, level); ret = compression_compress_pages(type, workspace, mapping, start, folios, out_folios, total_in, total_out); + /* The total read-in bytes should be no larger than the input. */ + ASSERT(*total_in <= orig_len); put_workspace(type, workspace); return ret; } From 50e2162c755ae0e84d3713da279201e70ae6b914 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 9 Sep 2024 08:22:06 +0930 Subject: [PATCH 07/80] btrfs: wait for writeback if sector size is smaller than page size [PROBLEM] If sector perfect compression is enabled for the sector size < page size case, the following layout can lead to dirty ranges not being written back: 0 32K 64K 96K 128K | |///////||//////| |/| 124K In the above example, the page size is 64K, and we need to write back the above two pages. - Submit for page 0 (main thread) We found delalloc range [32K, 96K), which can be compressed. So we queue an async range for [32K, 96K). This means the page unlock/clearing dirty/setting writeback will all happen in a workqueue context. - The compression is done, and the compressed range is submitted (workqueue) Since the compression is done asynchronously, it can finish before the main thread submits page 64K. Now the whole range [32K, 96K), involving two pages, will be marked writeback. - Submit for page 64K (main thread) extent_write_cache_pages() sees wbc->sync_mode == WB_SYNC_NONE, so it skips the writeback wait, unlocks the page and exits. This means the dirty range [124K, 128K) will never be submitted, until the next writeback happens for page 64K. This can never happen with previous kernels because: - For the sector size == page size case Since one page is one sector, if a page is marked writeback it will not have the dirty flag. So this corner case is never hit. - For the sector size < page size case We never do subpage compression; a range can only be submitted for compression if it is fully page aligned. This change makes the subpage behavior mostly the same as the non-subpage cases. [ENHANCEMENT] Instead of relying on the WB_SYNC_NONE check only, always wait for the writeback flag if it's a subpage case.
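Condensed from the diff below, the resulting check in extent_write_cache_pages() is:

    if (wbc->sync_mode != WB_SYNC_NONE ||
        btrfs_is_subpage(inode_to_fs_info(inode), mapping)) {
            /* Flush any pending bio and wait, even for WB_SYNC_NONE,
             * since a subpage folio can be writeback and dirty at once. */
            if (folio_test_writeback(folio))
                    submit_write_bio(bio_ctrl, 0);
            folio_wait_writeback(folio);
    }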
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 309a8ae4843454..a06d7033a2c163 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2115,7 +2115,27 @@ static int extent_write_cache_pages(struct address_space *mapping, continue; } - if (wbc->sync_mode != WB_SYNC_NONE) { + /* + * For the subpage case, compression can lead to mixed + * writeback and dirty flags, e.g.: + * 0 32K 64K 96K 128K + * | |//////||/////| |//| + * + * In the above case, [32K, 96K) is asynchronously submitted + * for compression, and [124K, 128K) needs to be written back. + * + * If we didn't wait for writeback on page 64K, [124K, 128K) + * won't be submitted as the page still has the writeback flag + * and will be skipped in the next check. + * + * This mixed writeback and dirty case is only possible for + * the subpage case. + * + * TODO: Remove this check after migrating compression to + * regular submission. + */ + if (wbc->sync_mode != WB_SYNC_NONE || + btrfs_is_subpage(inode_to_fs_info(inode), mapping)) { if (folio_test_writeback(folio)) submit_write_bio(bio_ctrl, 0); folio_wait_writeback(folio); From 2923eaf680dbe2eb92fd73a5307cf6a26ca5f623 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 10 Sep 2024 16:12:57 +0930 Subject: [PATCH 08/80] btrfs: make extent_range_clear_dirty_for_io() to handle sector size < page size cases For btrfs with sector size < page size (e.g. 4K sector size, 64K page size) and sector perfect compression support enabled, the following dirty range can lead to problems: 0 32K 64K 96K 128K | |///////||//////| |/| 124K In the above case, if we start writeback for that inode, the last dirty range [124K, 128K) will not be submitted and will cause reserved space leakage: - Start writeback for page 0 We find the range [32K, 96K) is suitable for compression, and queue it into a workqueue to do the delayed compression and submission. - Compression happens for range [32K, 96K) Function extent_range_clear_dirty_for_io() is called, however it only does full page handling, not considering the extra bitmaps for subpage cases. That function will clear the dirty flag for both page 0 and page 64K. - Writeback for the inode is done Because page 64K has its dirty flag cleared, it will not be considered as a writeback target. This means the range [124K, 128K) will not be submitted, and the reserved space for it will be leaked. Fix this problem by using the subpage helper to clear the dirty flag. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5618ca02934a04..0490c4b0088b53 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -902,7 +902,8 @@ static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 e ret = PTR_ERR(folio); continue; } - folio_clear_dirty_for_io(folio); + btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start, + end + 1 - start); folio_put(folio); } return ret; From 5ce74716d68cc917c846bb563a743bc874dce9ab Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 11 Sep 2024 08:52:36 +0930 Subject: [PATCH 09/80] btrfs: do not assume the full page range is not dirty in extent_writepage_io() The function extent_writepage_io() will submit the dirty sectors inside the page for the write.
But recently, to co-operate with the incoming subpage compression enhancement, a new bitmap, btrfs_bio_ctrl::submit_bitmap, was introduced so that only a subset of the dirty range is submitted. This is because we can have the following cases with 64K page size: 0 16K 32K 48K 64K | |/////////| |/| 52K For range [16K, 32K), we queue the dirty range for compression, which is run in a delayed workqueue. Then for range [48K, 52K), we go through the regular submission path. In that case, our btrfs_bio_ctrl::submit_bitmap will exclude the range [16K, 32K). The dirty flag for the range [16K, 32K) is only cleared when the compression is done, by the extent_clear_unlock_delalloc() call inside submit_one_async_extent(). This patch fixes the false alert by removing the btrfs_folio_assert_not_dirty() check, since it's no longer correct for subpage compression cases. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a06d7033a2c163..7d0528b572173d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1390,8 +1390,6 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, goto out; submitted_io = true; } - - btrfs_folio_assert_not_dirty(fs_info, folio, start, len); out: /* * If we didn't submitted any sector (>= i_size), folio dirty get From 938449b74562e0faab6acb54edc5b04af6e9fef6 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 16 Sep 2024 08:03:00 +0930 Subject: [PATCH 10/80] btrfs: move the delalloc range bitmap search into extent_io.c Currently for subpage (sector size < page size) cases, we reuse the subpage locked bitmap to find out all the delalloc ranges we have locked, and run all those found ranges. However such reuse is not perfect, e.g.: 0 32K 64K 96K 128K | |////////||///////| |////| 120K For the above range, writepage_delalloc() for page 0 will handle the range [32K, 96K); note that a delalloc range can go beyond the page boundary. But writepage_delalloc() for page 64K will only handle range [120K, 128K), as the previous run on page 0 has already handled range [64K, 96K). Meanwhile for the writeback we expect range [64K, 96K) to also be locked; this leads to a mismatch between the locked bitmap and the delalloc range. This is not causing problems yet, but it's still inconsistent behavior. So instead of relying on the subpage locked bitmap, move the delalloc range search to use a local @delalloc_bitmap, so that we can remove the existing btrfs_subpage_find_writer_locked().
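As a worked example of the bitmap arithmetic the new helpers use (numbers assume a hypothetical 64K page with 4K sectors, i.e. 16 bits per folio): a delalloc range starting 40K into the folio and 12K long occupies bits 10 through 12:

    /* From set_delalloc_bitmap() in the diff below. */
    start_bit = (start - folio_start) >> fs_info->sectorsize_bits; /* 40K >> 12 = 10 */
    nbits = len >> fs_info->sectorsize_bits;                       /* 12K >> 12 = 3 */
    bitmap_set(delalloc_bitmap, start_bit, nbits);                 /* sets bits 10-12 */

find_next_delalloc_bitmap() then walks the same bitmap with find_next_bit()/find_next_zero_bit() to hand back each contiguous run as a (found_start, found_len) pair.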
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 44 ++++++++++++++++++++++++++++++++++++++++- fs/btrfs/subpage.c | 47 -------------------------------------------- fs/btrfs/subpage.h | 4 ---- 3 files changed, 43 insertions(+), 52 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7d0528b572173d..8f6ef810afee11 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1101,6 +1101,45 @@ int btrfs_read_folio(struct file *file, struct folio *folio) return ret; } +static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bitmap, + u64 start, u32 len) +{ + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + const u64 folio_start = folio_pos(folio); + unsigned int start_bit; + unsigned int nbits; + + ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE); + start_bit = (start - folio_start) >> fs_info->sectorsize_bits; + nbits = len >> fs_info->sectorsize_bits; + ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits)); + bitmap_set(delalloc_bitmap, start_bit, nbits); +} + +static bool find_next_delalloc_bitmap(struct folio *folio, + unsigned long *delalloc_bitmap, u64 start, + u64 *found_start, u32 *found_len) +{ + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + const u64 folio_start = folio_pos(folio); + const unsigned int bitmap_size = fs_info->sectors_per_page; + unsigned int start_bit; + unsigned int first_zero; + unsigned int first_set; + + ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE); + + start_bit = (start - folio_start) >> fs_info->sectorsize_bits; + first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit); + if (first_set >= bitmap_size) + return false; + + *found_start = folio_start + (first_set << fs_info->sectorsize_bits); + first_zero = find_next_zero_bit(delalloc_bitmap, bitmap_size, first_set); + *found_len = (first_zero - first_set) << fs_info->sectorsize_bits; + return true; +} + /* * helper for extent_writepage(), doing all of the delayed allocation setup. * @@ -1120,6 +1159,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); const u64 page_start = folio_pos(folio); const u64 page_end = page_start + folio_size(folio) - 1; + unsigned long delalloc_bitmap = 0; /* * Save the last found delalloc end. 
As the delalloc end can go beyond * page boundary, thus we cannot rely on subpage bitmap to locate the @@ -1147,6 +1187,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, delalloc_start = delalloc_end + 1; continue; } + set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start, + min(delalloc_end, page_end) + 1 - delalloc_start); btrfs_folio_set_writer_lock(fs_info, folio, delalloc_start, min(delalloc_end, page_end) + 1 - delalloc_start); @@ -1174,7 +1216,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, found_len = last_delalloc_end + 1 - found_start; found = true; } else { - found = btrfs_subpage_find_writer_locked(fs_info, folio, + found = find_next_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start, &found_start, &found_len); } if (!found) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index fe4d719d506bf5..17bc53a8df0140 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -801,53 +801,6 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, spin_unlock_irqrestore(&subpage->lock, flags); } -/* - * Find any subpage writer locked range inside @folio, starting at file offset - * @search_start. The caller should ensure the folio is locked. - * - * Return true and update @found_start_ret and @found_len_ret to the first - * writer locked range. - * Return false if there is no writer locked range. - */ -bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 search_start, - u64 *found_start_ret, u32 *found_len_ret) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const u32 sectors_per_page = fs_info->sectors_per_page; - const unsigned int len = PAGE_SIZE - offset_in_page(search_start); - const unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, - locked, search_start, len); - const unsigned int locked_bitmap_start = sectors_per_page * btrfs_bitmap_nr_locked; - const unsigned int locked_bitmap_end = locked_bitmap_start + sectors_per_page; - unsigned long flags; - int first_zero; - int first_set; - bool found = false; - - ASSERT(folio_test_locked(folio)); - spin_lock_irqsave(&subpage->lock, flags); - first_set = find_next_bit(subpage->bitmaps, locked_bitmap_end, start_bit); - if (first_set >= locked_bitmap_end) - goto out; - - found = true; - - *found_start_ret = folio_pos(folio) + - ((first_set - locked_bitmap_start) << fs_info->sectorsize_bits); - /* - * Since @first_set is ensured to be smaller than locked_bitmap_end - * here, @found_start_ret should be inside the folio. 
- */ - ASSERT(*found_start_ret < folio_pos(folio) + PAGE_SIZE); - - first_zero = find_next_zero_bit(subpage->bitmaps, locked_bitmap_end, first_set); - *found_len_ret = (first_zero - first_set) << fs_info->sectorsize_bits; -out: - spin_unlock_irqrestore(&subpage->lock, flags); - return found; -} - #define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ { \ const int sectors_per_page = fs_info->sectors_per_page; \ diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 4b85d91d0e18b0..d16fbddeda68f2 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -108,10 +108,6 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long bitmap); -bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 search_start, - u64 *found_start_ret, u32 *found_len_ret); - /* * Template for subpage related operations. * From 81b8cc51c38a03dfed2804358e14e648431588fb Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 16 Sep 2024 08:12:40 +0930 Subject: [PATCH 11/80] btrfs: mark all dirty sectors as locked inside writepage_delalloc() Currently we only mark sectors as locked if there is a *NEW* delalloc range for it. But NEW delalloc range is not the same as dirty sectors we want to submit, e.g: 0 32K 64K 96K 128K | |////////||///////| |////| 120K For above 64K page size case, writepage_delalloc() for page 0 will find and lock the delalloc range [32K, 96K), which is beyond the page boundary. Then when writepage_delalloc() is called for the page 64K, since [64K, 96K) is already locked, only [120K, 128K) will be locked. This means, although range [64K, 96K) is dirty and will be submitted later by extent_writepage_io(), it will not be marked as locked. This is fine for now, as we call btrfs_folio_end_writer_lock_bitmap() to free every non-compressed sector, and compression is only allowed for full page range. But this is not safe for future sector perfect compression support, as this can lead to double folio unlock: Thread A | Thread B ---------------------------------------+-------------------------------- | submit_one_async_extent() | |- extent_clear_unlock_delalloc() extent_writepage() | |- btrfs_folio_end_writer_lock() |- btrfs_folio_end_writer_lock_bitmap()| |- btrfs_subpage_end_and_test_writer() | | | |- atomic_sub_and_test() | | | /* Now the atomic value is 0 */ |- if (atomic_read() == 0) | | |- folio_unlock() | |- folio_unlock() The root cause is the above range [64K, 96K) is dirtied and should also be locked but it isn't. So to make everything more consistent and prepare for the incoming sector perfect compression, mark all dirty sectors as locked. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8f6ef810afee11..1ad3ea6f42e24b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1170,6 +1170,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, u64 delalloc_end = page_end; u64 delalloc_to_write = 0; int ret = 0; + int bit; /* Save the dirty bitmap as our submission bitmap will be a subset of it. 
*/ if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { @@ -1179,6 +1180,12 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, bio_ctrl->submit_bitmap = 1; } + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + u64 start = page_start + (bit << fs_info->sectorsize_bits); + + btrfs_folio_set_writer_lock(fs_info, folio, start, fs_info->sectorsize); + } + /* Lock all (subpage) delalloc ranges inside the folio first. */ while (delalloc_start < page_end) { delalloc_end = page_end; @@ -1189,9 +1196,6 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, } set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start, min(delalloc_end, page_end) + 1 - delalloc_start); - btrfs_folio_set_writer_lock(fs_info, folio, delalloc_start, - min(delalloc_end, page_end) + 1 - - delalloc_start); last_delalloc_end = delalloc_end; delalloc_start = delalloc_end + 1; } From 63267137f165c1886b787658bc19eeef254fc416 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 6 Sep 2024 12:22:56 +0930 Subject: [PATCH 12/80] btrfs: allow compression even if the range is not page aligned Previously for btrfs with sector size smaller than page size (subpage), we only allowed compression if the range was fully page aligned. This was to work around the asynchronous submission of compressed ranges, which delays the page unlock and writeback into a workqueue; furthermore, asynchronous submission can lock multiple sector ranges across the page boundary. Such asynchronous submission makes it very hard to co-operate with other regular writes. With the recent changes to the subpage folio unlock path, asynchronous submission of compressed pages can now co-operate with regular submission, so enable sector perfect compression if it's an experimental build. The ETA for moving this feature out of experimental is 6.15, and I hope all remaining corner cases can be exposed before that. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 41 +++++++---------------------------------- 1 file changed, 7 insertions(+), 34 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0490c4b0088b53..83ee93fddaf53d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -832,32 +832,16 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, return 0; } /* - * Special check for subpage. + * Only enable sector perfect compression for experimental builds. * - * We lock the full page then run each delalloc range in the page, thus - * for the following case, we will hit some subpage specific corner case: + * This is a big feature change for subpage cases, and can hit + * different corner cases, so limit this feature to + * experimental builds for now. * - * 0 32K 64K - * | |///////| |///////| - * \- A \- B - * - * In above case, both range A and range B will try to unlock the full - * page [0, 64K), causing the one finished later will have page - * unlocked already, triggering various page lock requirement BUG_ON()s. - * - * So here we add an artificial limit that subpage compression can only - * if the range is fully page aligned. - * - * In theory we only need to ensure the first page is fully covered, but - * the tailing partial page will be locked until the full compression - * finishes, delaying the write of other range. - * - * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range - * first to prevent any submitted async extent to unlock the full page.
- * By this, we can ensure for subpage case that only the last async_cow - * will unlock the full page. + * ETA for moving this out of experimental builds is 6.15. */ - if (fs_info->sectorsize < PAGE_SIZE) { + if (fs_info->sectorsize < PAGE_SIZE && + !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(end + 1)) return 0; @@ -1002,17 +986,6 @@ static void compress_file_range(struct btrfs_work *work) (start > 0 || end + 1 < inode->disk_i_size)) goto cleanup_and_bail_uncompressed; - /* - * For subpage case, we require full page alignment for the sector - * aligned range. - * Thus we must also check against @actual_end, not just @end. - */ - if (blocksize < PAGE_SIZE) { - if (!PAGE_ALIGNED(start) || - !PAGE_ALIGNED(round_up(actual_end, blocksize))) - goto cleanup_and_bail_uncompressed; - } - total_compressed = min_t(unsigned long, total_compressed, BTRFS_MAX_UNCOMPRESSED); total_in = 0; From f7f6d8ea91b3ae9a24ce1c070bfb45e618b1fb77 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 24 Sep 2024 12:52:17 +0930 Subject: [PATCH 13/80] btrfs: avoid unnecessary device path update for the same device [PROBLEM] It is very common for udev to trigger device scans, and every time a mounted btrfs device gets re-scanned from a different soft link, we get unnecessary device path updates; this is especially common for LVM based storage: # lvs scratch1 test -wi-ao---- 10.00g scratch2 test -wi-a----- 10.00g scratch3 test -wi-a----- 10.00g scratch4 test -wi-a----- 10.00g scratch5 test -wi-a----- 10.00g test test -wi-a----- 10.00g # mkfs.btrfs -f /dev/test/scratch1 # mount /dev/test/scratch1 /mnt/btrfs # dmesg -c [ 205.705234] BTRFS: device fsid 7be2602f-9e35-4ecf-a6ff-9e91d2c182c9 devid 1 transid 6 /dev/mapper/test-scratch1 (253:4) scanned by mount (1154) [ 205.710864] BTRFS info (device dm-4): first mount of filesystem 7be2602f-9e35-4ecf-a6ff-9e91d2c182c9 [ 205.711923] BTRFS info (device dm-4): using crc32c (crc32c-intel) checksum algorithm [ 205.713856] BTRFS info (device dm-4): using free-space-tree [ 205.722324] BTRFS info (device dm-4): checking UUID tree So far so good, but even if we just touch any soft link of "dm-4", we get quite a few unnecessary device path updates. # touch /dev/mapper/test-scratch1 # dmesg -c [ 469.295796] BTRFS info: devid 1 device path /dev/mapper/test-scratch1 changed to /dev/dm-4 scanned by (udev-worker) (1221) [ 469.300494] BTRFS info: devid 1 device path /dev/dm-4 changed to /dev/mapper/test-scratch1 scanned by (udev-worker) (1221) Such device path renames are unnecessary and can lead to random path changes due to udev races. [CAUSE] Inside device_list_add(), we are using a very primitive way of checking whether the device path has changed: strcmp(). That can never handle links well, no matter whether they are hard or soft links. So every different link to the same device will be treated as a different device, causing the unnecessary device path updates. [FIX] Introduce a helper, is_same_device(), and use path_equal() to properly detect the same block device. This way different soft links won't trigger the rename race.
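The difference is easy to see on the example above (a sketch only; the real helper is in the diff below):

    /* strcmp() sees two different strings and forces a path update: */
    strcmp("/dev/mapper/test-scratch1", "/dev/dm-4");  /* != 0 */

    /* is_same_device() instead resolves both names and compares the
     * resulting (mount, dentry) pairs, so any two links to the same
     * block device are considered equal: */
    kern_path(old_path, LOOKUP_FOLLOW, &old);
    kern_path(new_path, LOOKUP_FOLLOW, &new);
    if (path_equal(&old, &new))
            is_same = true;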
Reviewed-by: Filipe Manana Link: https://bugzilla.suse.com/show_bug.cgi?id=1230641 Reported-by: Fabian Vogt Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 995b0647f53816..af38b04eff2c67 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -732,6 +732,42 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb) return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; } +static bool is_same_device(struct btrfs_device *device, const char *new_path) +{ + struct path old = { .mnt = NULL, .dentry = NULL }; + struct path new = { .mnt = NULL, .dentry = NULL }; + char *old_path = NULL; + bool is_same = false; + int ret; + + if (!device->name) + goto out; + + old_path = kzalloc(PATH_MAX, GFP_NOFS); + if (!old_path) + goto out; + + rcu_read_lock(); + ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX); + rcu_read_unlock(); + if (ret < 0) + goto out; + + ret = kern_path(old_path, LOOKUP_FOLLOW, &old); + if (ret) + goto out; + ret = kern_path(new_path, LOOKUP_FOLLOW, &new); + if (ret) + goto out; + if (path_equal(&old, &new)) + is_same = true; +out: + kfree(old_path); + path_put(&old); + path_put(&new); + return is_same; +} + /* * Add new device to list of registered devices * @@ -852,7 +888,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, MAJOR(path_devt), MINOR(path_devt), current->comm, task_pid_nr(current)); - } else if (!device->name || strcmp(device->name->str, path)) { + } else if (!device->name || !is_same_device(device, path)) { /* * When FS is already mounted. * 1. If you are here and if the device->name is NULL that From 4f0ed68cbb6ff89d2331211cb04cd4d5fb4d6970 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 24 Sep 2024 14:27:07 +0930 Subject: [PATCH 14/80] btrfs: canonicalize the device path before adding it [PROBLEM] Currently btrfs accepts any file path for its device, resulting in some weird situations: # ./mount_by_fd /dev/test/scratch1 /mnt/btrfs/ The program has the following source code: #include <fcntl.h> #include <stdio.h> #include <sys/mount.h> int main(int argc, char *argv[]) { int fd = open(argv[1], O_RDWR); char path[256]; snprintf(path, sizeof(path), "/proc/self/fd/%d", fd); return mount(path, argv[2], "btrfs", 0, NULL); } Then we can have the following weird device path: BTRFS: device fsid 2378be81-fe12-46d2-a9e8-68cf08dd98d5 devid 1 transid 7 /proc/self/fd/3 (253:2) scanned by mount_by_fd (18440) Normally it's not a big deal, and later udev can trigger a device path rename. But if udev doesn't trigger, the device path "/proc/self/fd/3" will show up in mtab. [CAUSE] The filename "/proc/self/fd/3" refers to the opened file descriptor 3. In the above case, it's exactly the device we want to open, i.e. it points to "/dev/test/scratch1", which is another symlink pointing to "/dev/dm-2". Inside the kernel we resolve the mount source using LOOKUP_FOLLOW, which follows the symbolic link and grabs the proper block device. But inside btrfs we also save the filename into btrfs_device::name, and utilize that member to report our mount source, which leads to the above situation. [FIX] Instead of unconditionally trusting the path, check if the original file (not following the symbolic link) is inside "/dev/"; if not, manually look up the path to its final destination, and use that as our device path.
This allows us to still use symbolic links, like "/dev/mapper/test-scratch" from LVM2, which is required for fstests runs with LVM2 setup. And for really weird names, like the above case, we solve it to "/dev/dm-2" instead. Reviewed-by: Filipe Manana Link: https://bugzilla.suse.com/show_bug.cgi?id=1230641 Reported-by: Fabian Vogt Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 87 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index af38b04eff2c67..9ec761a280fe3a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -732,6 +732,78 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb) return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; } +/* + * We can have very weird soft links passed in. + * One example is "/proc/self/fd/", which can be a soft link to + * a block device. + * + * But it's never a good idea to use those weird names. + * Here we check if the path (not following symlinks) is a good one inside + * "/dev/". + */ +static bool is_good_dev_path(const char *dev_path) +{ + struct path path = { .mnt = NULL, .dentry = NULL }; + char *path_buf = NULL; + char *resolved_path; + bool is_good = false; + int ret; + + if (!dev_path) + goto out; + + path_buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!path_buf) + goto out; + + /* + * Do not follow soft link, just check if the original path is inside + * "/dev/". + */ + ret = kern_path(dev_path, 0, &path); + if (ret) + goto out; + resolved_path = d_path(&path, path_buf, PATH_MAX); + if (IS_ERR(resolved_path)) + goto out; + if (strncmp(resolved_path, "/dev/", strlen("/dev/"))) + goto out; + is_good = true; +out: + kfree(path_buf); + path_put(&path); + return is_good; +} + +static int get_canonical_dev_path(const char *dev_path, char *canonical) +{ + struct path path = { .mnt = NULL, .dentry = NULL }; + char *path_buf = NULL; + char *resolved_path; + int ret; + + if (!dev_path) { + ret = -EINVAL; + goto out; + } + + path_buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!path_buf) { + ret = -ENOMEM; + goto out; + } + + ret = kern_path(dev_path, LOOKUP_FOLLOW, &path); + if (ret) + goto out; + resolved_path = d_path(&path, path_buf, PATH_MAX); + ret = strscpy(canonical, resolved_path, PATH_MAX); +out: + kfree(path_buf); + path_put(&path); + return ret; +} + static bool is_same_device(struct btrfs_device *device, const char *new_path) { struct path old = { .mnt = NULL, .dentry = NULL }; @@ -1418,12 +1490,23 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, bool new_device_added = false; struct btrfs_device *device = NULL; struct file *bdev_file; + char *canonical_path = NULL; u64 bytenr; dev_t devt; int ret; lockdep_assert_held(&uuid_mutex); + if (!is_good_dev_path(path)) { + canonical_path = kmalloc(PATH_MAX, GFP_KERNEL); + if (canonical_path) { + ret = get_canonical_dev_path(path, canonical_path); + if (ret < 0) { + kfree(canonical_path); + canonical_path = NULL; + } + } + } /* * Avoid an exclusive open here, as the systemd-udev may initiate the * device scan which may race with the user's mount or mkfs command, @@ -1468,7 +1551,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, goto free_disk_super; } - device = device_list_add(path, disk_super, &new_device_added); + device = device_list_add(canonical_path ? 
: path, disk_super, + &new_device_added); if (!IS_ERR(device) && new_device_added) btrfs_free_stale_devices(device->devt, device); @@ -1477,6 +1561,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, error_bdev_put: fput(bdev_file); + kfree(canonical_path); return device; } From 90785160bc147c8b92168228f44f357435e986d2 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 27 Sep 2024 12:30:05 +0200 Subject: [PATCH 15/80] btrfs: remove code duplication in ordered extent finishing Remove the duplicated transaction joining, block reserve setting and raid extent inserting in btrfs_finish_ordered_extent(). While at it, also abort the transaction in case inserting a RAID stripe-tree entry fails. Suggested-by: Naohiro Aota Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/inode.c | 48 +++++++++++++++++++----------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 83ee93fddaf53d..699cd85796c1c6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3068,34 +3068,6 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } - if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { - BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ - - btrfs_inode_safe_disk_i_size_write(inode, 0); - if (freespace_inode) - trans = btrfs_join_transaction_spacecache(root); - else - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - trans->block_rsv = &inode->block_rsv; - ret = btrfs_update_inode_fallback(trans, inode); - if (ret) /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, ret); - - ret = btrfs_insert_raid_extent(trans, ordered_extent); - if (ret) - btrfs_abort_transaction(trans, ret); - - goto out; - } - - clear_bits |= EXTENT_LOCKED; - lock_extent(io_tree, start, end, &cached_state); - if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else @@ -3109,8 +3081,26 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) trans->block_rsv = &inode->block_rsv; ret = btrfs_insert_raid_extent(trans, ordered_extent); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + /* Logic error */ + BUG_ON(!list_empty(&ordered_extent->list)); + + btrfs_inode_safe_disk_i_size_write(inode, 0); + ret = btrfs_update_inode_fallback(trans, inode); + if (ret) { + /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, ret); + } + goto out; + } + + clear_bits |= EXTENT_LOCKED; + lock_extent(io_tree, start, end, &cached_state); if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; From d217a8f63b9867ba7153519be096fe1c0ee60906 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 24 Sep 2024 17:12:32 +0100 Subject: [PATCH 16/80] btrfs: qgroups: remove bytenr field from struct btrfs_qgroup_extent_record Now that we track qgroup extent records in a xarray we don't need to have a "bytenr" field in struct btrfs_qgroup_extent_record, since we can get it from the index of the record in the xarray. So remove the field and grab the bytenr from either the index key or any other place where it's available (delayed refs). 
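The index <-> bytenr mapping is a plain shift by fs_info->sectorsize_bits, visible at both ends of the diff below:

    /* Storing: the record is indexed by its bytenr in sector units. */
    const unsigned long index = (bytenr >> fs_info->sectorsize_bits);

    /* Loading: the bytenr is recovered from the xarray index. */
    const u64 bytenr = (((u64)index) << fs_info->sectorsize_bits);

Extent bytenrs are always aligned to the sector size, so the shift loses no information and the field is redundant.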
This reduces the size of struct btrfs_qgroup_extent_record from 40 bytes down to 32 bytes, meaning that we now can store 128 instances of this structure instead of 102 per 4K page. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 8 ++++---- fs/btrfs/qgroup.c | 31 +++++++++++++++++-------------- fs/btrfs/qgroup.h | 13 ++++++++++--- include/trace/events/btrfs.h | 17 ++++++++++------- 4 files changed, 41 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 115b90d29b1d8e..04586aa1191763 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -830,7 +830,6 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, qrecord->data_rsv = reserved; qrecord->data_rsv_refroot = generic_ref->ref_root; } - qrecord->bytenr = generic_ref->bytenr; qrecord->num_bytes = generic_ref->num_bytes; qrecord->old_roots = NULL; } @@ -860,11 +859,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, if (qrecord) { int ret; - ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord); + ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord, + head_ref->bytenr); if (ret) { /* Clean up if insertion fails or item exists. */ xa_release(&delayed_refs->dirty_extents, - qrecord->bytenr >> fs_info->sectorsize_bits); + head_ref->bytenr >> fs_info->sectorsize_bits); /* Caller responsible for freeing qrecord on error. */ if (ret < 0) return ERR_PTR(ret); @@ -1074,7 +1074,7 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_delayed_ref_node_cachep, node); if (qrecord_inserted) - return btrfs_qgroup_trace_extent_post(trans, record); + return btrfs_qgroup_trace_extent_post(trans, record, generic_ref->bytenr); return 0; free_record: diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 1332ec59c5396a..a06596e21d1e46 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2001,27 +2001,28 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, * Return <0 for insertion failure, caller can free @record safely. */ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record) + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record, + u64 bytenr) { struct btrfs_qgroup_extent_record *existing, *ret; - const unsigned long index = (record->bytenr >> fs_info->sectorsize_bits); + const unsigned long index = (bytenr >> fs_info->sectorsize_bits); if (!btrfs_qgroup_full_accounting(fs_info)) return 1; #if BITS_PER_LONG == 32 - if (record->bytenr >= MAX_LFS_FILESIZE) { + if (bytenr >= MAX_LFS_FILESIZE) { btrfs_err_rl(fs_info, "qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit", - record->bytenr); + bytenr); btrfs_err_32bit_limit(fs_info); return -EOVERFLOW; } #endif lockdep_assert_held(&delayed_refs->lock); - trace_btrfs_qgroup_trace_extent(fs_info, record); + trace_btrfs_qgroup_trace_extent(fs_info, record, bytenr); xa_lock(&delayed_refs->dirty_extents); existing = xa_load(&delayed_refs->dirty_extents, index); @@ -2066,7 +2067,8 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, * transaction committing, but not now as qgroup accounting will be wrong again. 
*/ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, - struct btrfs_qgroup_extent_record *qrecord) + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr) { struct btrfs_backref_walk_ctx ctx = { 0 }; int ret; @@ -2097,7 +2099,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) return 0; - ctx.bytenr = qrecord->bytenr; + ctx.bytenr = bytenr; ctx.fs_info = trans->fs_info; ret = btrfs_find_all_roots(&ctx, true); @@ -2154,12 +2156,11 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, } delayed_refs = &trans->transaction->delayed_refs; - record->bytenr = bytenr; record->num_bytes = num_bytes; record->old_roots = NULL; spin_lock(&delayed_refs->lock); - ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); + ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record, bytenr); spin_unlock(&delayed_refs->lock); if (ret) { /* Clean up if insertion fails or item exists. */ @@ -2167,7 +2168,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, kfree(record); return 0; } - return btrfs_qgroup_trace_extent_post(trans, record); + return btrfs_qgroup_trace_extent_post(trans, record, bytenr); } /* @@ -3043,14 +3044,16 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) delayed_refs = &trans->transaction->delayed_refs; qgroup_to_skip = delayed_refs->qgroup_to_skip; xa_for_each(&delayed_refs->dirty_extents, index, record) { + const u64 bytenr = (((u64)index) << fs_info->sectorsize_bits); + num_dirty_extents++; - trace_btrfs_qgroup_account_extents(fs_info, record); + trace_btrfs_qgroup_account_extents(fs_info, record, bytenr); if (!ret && !(fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { struct btrfs_backref_walk_ctx ctx = { 0 }; - ctx.bytenr = record->bytenr; + ctx.bytenr = bytenr; ctx.fs_info = fs_info; /* @@ -3092,7 +3095,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) ulist_del(record->old_roots, qgroup_to_skip, 0); } - ret = btrfs_qgroup_account_extent(trans, record->bytenr, + ret = btrfs_qgroup_account_extent(trans, bytenr, record->num_bytes, record->old_roots, new_roots); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 98adf4ec7b0172..836e9f59ec8421 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -125,7 +125,12 @@ struct btrfs_inode; * Record a dirty extent, and info qgroup to update quota on it */ struct btrfs_qgroup_extent_record { - u64 bytenr; + /* + * The bytenr of the extent is given by its index in the dirty_extents + * xarray of struct btrfs_delayed_ref_root left shifted by + * fs_info->sectorsize_bits. 
+ */ + u64 num_bytes; /* @@ -343,9 +348,11 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); int btrfs_qgroup_trace_extent_nolock( struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record); + struct btrfs_qgroup_extent_record *record, + u64 bytenr); int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, - struct btrfs_qgroup_extent_record *qrecord); + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr); int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index af6b3827fb1d01..8d2ff32fb3b04a 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1706,9 +1706,10 @@ DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data, DECLARE_EVENT_CLASS(btrfs_qgroup_extent, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_qgroup_extent_record *rec), + const struct btrfs_qgroup_extent_record *rec, + u64 bytenr), - TP_ARGS(fs_info, rec), + TP_ARGS(fs_info, rec, bytenr), TP_STRUCT__entry_btrfs( __field( u64, bytenr ) @@ -1716,7 +1717,7 @@ DECLARE_EVENT_CLASS(btrfs_qgroup_extent, ), TP_fast_assign_btrfs(fs_info, - __entry->bytenr = rec->bytenr; + __entry->bytenr = bytenr; __entry->num_bytes = rec->num_bytes; ), @@ -1727,17 +1728,19 @@ DECLARE_EVENT_CLASS(btrfs_qgroup_extent, DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_qgroup_extent_record *rec), + const struct btrfs_qgroup_extent_record *rec, + u64 bytenr), - TP_ARGS(fs_info, rec) + TP_ARGS(fs_info, rec, bytenr) ); DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_trace_extent, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_qgroup_extent_record *rec), + const struct btrfs_qgroup_extent_record *rec, + u64 bytenr), - TP_ARGS(fs_info, rec) + TP_ARGS(fs_info, rec, bytenr) ); TRACE_EVENT(qgroup_num_dirty_extents, From bc100464ff8061f56d262530176306fe9eeb4c9f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 24 Sep 2024 17:23:32 +0100 Subject: [PATCH 17/80] btrfs: store fs_info in a local variable at btrfs_qgroup_trace_extent_post() Instead of extracting fs_info from the transaction multiple times, store it in a local variable and use it.
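Since the previous patch the record no longer stores its bytenr; it is encoded in the record's slot in the dirty_extents xarray, which is why callers such as btrfs_qgroup_trace_extent_post() now receive it as an explicit parameter. A minimal sketch of that mapping (hypothetical helper names; the kernel code open-codes these shifts, as the hunks above show):

static inline unsigned long qrecord_index(const struct btrfs_fs_info *fs_info, u64 bytenr)
{
	/* Extent bytenrs are sector aligned, so the shift drops only zero bits. */
	return bytenr >> fs_info->sectorsize_bits;
}

static inline u64 qrecord_bytenr(const struct btrfs_fs_info *fs_info, unsigned long index)
{
	/* Inverse mapping: recover the extent start address from the slot. */
	return (u64)index << fs_info->sectorsize_bits;
}

Because the conversion round-trips losslessly, dropping the bytenr field loses no information.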
Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a06596e21d1e46..b2bd94469a62bb 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2070,10 +2070,14 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord, u64 bytenr) { - struct btrfs_backref_walk_ctx ctx = { 0 }; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_backref_walk_ctx ctx = { + .bytenr = bytenr, + .fs_info = fs_info, + }; int ret; - if (!btrfs_qgroup_full_accounting(trans->fs_info)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; /* * We are always called in a context where we are already holding a @@ -2096,16 +2100,13 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, */ ASSERT(trans != NULL); - if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) + if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) return 0; - ctx.bytenr = bytenr; - ctx.fs_info = trans->fs_info; - ret = btrfs_find_all_roots(&ctx, true); if (ret < 0) { - qgroup_mark_inconsistent(trans->fs_info); - btrfs_warn(trans->fs_info, + qgroup_mark_inconsistent(fs_info); + btrfs_warn(fs_info, "error accounting new delayed refs extent (err code: %d), quota inconsistent", ret); return 0; From 7d2835e229f74b49ed92083a9c7975bd9eef9ef8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 24 Sep 2024 17:37:28 +0100 Subject: [PATCH 18/80] btrfs: remove unnecessary delayed refs locking at btrfs_qgroup_trace_extent() There's no need to hold the delayed refs spinlock when calling btrfs_qgroup_trace_extent_nolock() from btrfs_qgroup_trace_extent(), since it doesn't change anything in delayed refs and it only changes the xarray used to track qgroup extent records, which is protected by the xarray's lock. Holding the lock is only adding unnecessary lock contention with other tasks that actually need to take the lock to add/remove/change delayed references. So remove the locking. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b2bd94469a62bb..7b9095aff2c42e 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2021,7 +2021,6 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, } #endif - lockdep_assert_held(&delayed_refs->lock); trace_btrfs_qgroup_trace_extent(fs_info, record, bytenr); xa_lock(&delayed_refs->dirty_extents); @@ -2160,9 +2159,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, record->num_bytes = num_bytes; record->old_roots = NULL; - spin_lock(&delayed_refs->lock); ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record, bytenr); - spin_unlock(&delayed_refs->lock); if (ret) { /* Clean up if insertion fails or item exists. */ xa_release(&delayed_refs->dirty_extents, index); From ad0bb2c0ba3b610095fbb415834cbb300c5f4674 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 24 Sep 2024 17:43:56 +0100 Subject: [PATCH 19/80] btrfs: always use delayed_refs local variable at btrfs_qgroup_trace_extent() Instead of dereferencing the delayed refs from the transaction multiple times, store it early in the local variable and then always use the variable. 
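The lock removal in the previous patch is safe because every access to the dirty_extents xarray serializes on the xarray's own spinlock. A simplified sketch of the update path in btrfs_qgroup_trace_extent_nolock() (the existing-record merge and error handling are omitted):

struct btrfs_qgroup_extent_record *existing;

/* The xarray's internal lock, not delayed_refs->lock, protects the slots. */
xa_lock(&delayed_refs->dirty_extents);
existing = xa_load(&delayed_refs->dirty_extents, index);
if (!existing)
	__xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC);
xa_unlock(&delayed_refs->dirty_extents);

Holding the delayed refs spinlock on top of this protected nothing and only added contention.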
Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 7b9095aff2c42e..1c2e030bb5bed3 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2140,7 +2140,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; - struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; const unsigned long index = (bytenr >> fs_info->sectorsize_bits); int ret; @@ -2150,12 +2150,11 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (!record) return -ENOMEM; - if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, index, GFP_NOFS)) { + if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) { kfree(record); return -ENOMEM; } - delayed_refs = &trans->transaction->delayed_refs; record->num_bytes = num_bytes; record->old_roots = NULL; From c8421bcc14201c2020b740094b00640456a47432 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 24 Sep 2024 17:50:34 +0100 Subject: [PATCH 20/80] btrfs: remove pointless initialization at btrfs_qgroup_trace_extent() The qgroup record was allocated with kzalloc(), so it's pointless to set its old_roots member to NULL. Remove the assignment. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 1c2e030bb5bed3..9df6cf56a58bf5 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2156,7 +2156,6 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, } record->num_bytes = num_bytes; - record->old_roots = NULL; ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record, bytenr); if (ret) { From e7f6492a70ef10db91302fe246ad2f5ef23ed41e Mon Sep 17 00:00:00 2001 From: Riyan Dhiman Date: Thu, 26 Sep 2024 13:20:34 +0530 Subject: [PATCH 21/80] btrfs: remove redundant stop_loop variable in scrub_stripe() The variable stop_loop was originally introduced in commit 625f1c8dc66d7 ("Btrfs: improve the loop of scrub_stripe"). It was initialized to 0 in commit 3b080b2564287 ("Btrfs: scrub raid56 stripes in the right way"). However, in a later commit 18d30ab961497 ("btrfs: scrub: use scrub_simple_mirror() to handle RAID56 data stripe scrub"), the code that modified stop_loop was removed, making the variable redundant. Currently, stop_loop is only initialized with 0 and is never used or modified within the scrub_stripe() function. As a result, this patch removes the stop_loop variable to clean up the code and eliminate unnecessary redundancy. This change has no impact on functionality, as stop_loop was never utilized in any meaningful way in the final version of the code. 
Reviewed-by: Filipe Manana Signed-off-by: Riyan Dhiman Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3a34274280746c..43431065d98189 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2256,7 +2256,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* Offset inside the chunk */ u64 offset; u64 stripe_logical; - int stop_loop = 0; /* Extent_path should be released by now. */ ASSERT(sctx->extent_path.nodes[0] == NULL); @@ -2370,14 +2369,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, logical += increment; physical += BTRFS_STRIPE_LEN; spin_lock(&sctx->stat_lock); - if (stop_loop) - sctx->stat.last_physical = - map->stripes[stripe_index].physical + dev_stripe_len; - else - sctx->stat.last_physical = physical; + sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); - if (stop_loop) - break; } out: ret2 = flush_scrub_stripes(sctx); From 21ac0bfbc09270264dbaf65a6977881f63c87d5d Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Tue, 24 Sep 2024 10:31:35 +0800 Subject: [PATCH 22/80] btrfs: remove unused page_to_inode and page_to_fs_info macros These macros are no longer used after the "btrfs: Cleaned up folio->page conversion" patch series [1] was applied, so remove them. [1]: https://patchwork.kernel.org/project/linux-btrfs/cover/20240828182908.3735344-1-lizetao1@huawei.com/ Reviewed-by: Neal Gompa Signed-off-by: Youling Tang Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/fs.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 785ec15c1b849b..85e6a644aff251 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -878,12 +878,9 @@ struct btrfs_fs_info { #endif }; -#define page_to_inode(_page) (BTRFS_I(_Generic((_page), \ struct page *: (_page))->mapping->host)) #define folio_to_inode(_folio) (BTRFS_I(_Generic((_folio), \ struct folio *: (_folio))->mapping->host)) -#define page_to_fs_info(_page) (page_to_inode(_page)->root->fs_info) #define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info) #define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \ From 0647aa37ce905946cd7149966a728e14b445a99c Mon Sep 17 00:00:00 2001 From: Shen Lichuan Date: Tue, 24 Sep 2024 11:09:44 +0800 Subject: [PATCH 23/80] btrfs: correct typos in multiple comments across various files Fix some confusing spelling errors that were recently identified; the details are as follows: block-group.c: 2800: uncompressible ==> incompressible extent-tree.c: 3131: EXTEMT ==> EXTENT extent_io.c: 3124: utlizing ==> utilizing extent_map.c: 1323: ealier ==> earlier extent_map.c: 1325: possiblity ==> possibility fiemap.c: 189: emmitted ==> emitted fiemap.c: 197: emmitted ==> emitted fiemap.c: 203: emmitted ==> emitted transaction.h: 36: trasaction ==> transaction volumes.c: 5312: filesysmte ==> filesystem zoned.c: 1977: trasnsaction ==> transaction Signed-off-by: Shen Lichuan Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 +- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/extent_map.c | 4 ++-- fs/btrfs/fiemap.c | 6 +++--- fs/btrfs/inode.c | 2 +- fs/btrfs/qgroup.c | 2 +- fs/btrfs/scrub.c | 2 +- fs/btrfs/space-info.c | 2 +- fs/btrfs/transaction.h | 2 +- fs/btrfs/volumes.c | 2 +- fs/btrfs/zoned.c | 2 +- 13 files changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/block-group.c
b/fs/btrfs/block-group.c index 7980b2e33a9216..5c5539eb82d349 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2797,7 +2797,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) * uncompressed data size, because the compression is only done * when writeback triggered and we don't know how much space we * are actually going to need, so we reserve the uncompressed - * size because the data may be uncompressible in the worst case. + * size because the data may be incompressible in the worst case. */ if (ret == 0) { bool used; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 604399e59a3d10..ac8e97ed13f751 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -45,7 +45,7 @@ * * - Copy existing extents * - * This happens by re-using scrub facility, as scrub also iterates through + * This happens by reusing scrub facility, as scrub also iterates through * existing extents from commit root. * * Location: scrub_write_block_to_dev_replace() from diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d9f511babd89ab..79373f0ab6cee3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3144,7 +3144,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, break; } - /* Quick path didn't find the EXTEMT/METADATA_ITEM */ + /* Quick path didn't find the EXTENT/METADATA_ITEM */ if (path->slots[0] - extent_slot > 5) break; extent_slot--; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1ad3ea6f42e24b..9fbc83c76b94dc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3185,7 +3185,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } /* * Now all pages of that extent buffer is unmapped, set UNMAPPED flag, - * so it can be cleaned up without utlizing page->mapping. + * so it can be cleaned up without utilizing page->mapping. */ set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 25d191f1ac10f4..9e1a8af6629cff 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1320,9 +1320,9 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) * not possible to know which task made more progress because we can * cycle back to the first root and first inode if it's not the first * time the shrinker ran, see the above logic. Also a task that started - * later may finish ealier than another task and made less progress. So + * later may finish earlier than another task and made less progress. So * make this simple and update to the progress of the last task that - * finished, with the occasional possiblity of having two consecutive + * finished, with the occasional possibility of having two consecutive * runs of the shrinker process the same inodes. */ spin_lock(&fs_info->extent_map_shrinker_lock); diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c index df7f09f3b02e06..b80c07ad8c5e71 100644 --- a/fs/btrfs/fiemap.c +++ b/fs/btrfs/fiemap.c @@ -186,7 +186,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * we have in the cache is the last delalloc range we * found while the file extent item we found can be * either for a whole delalloc range we previously - * emmitted or only a part of that range. + * emitted or only a part of that range. * * We have two cases here: * @@ -194,13 +194,13 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * cached extent's end. 
In this case just ignore the * current file extent item because we don't want to * overlap with previous ranges that may have been - * emmitted already; + * emitted already; * * 2) The file extent item starts behind the currently * cached extent but its end offset goes beyond the * end offset of the cached extent. We don't want to * overlap with a previous range that may have been - * emmitted already, so we emit the currently cached + * emitted already, so we emit the currently cached * extent and then partially store the current file * extent item's range in the cache, for the subrange * going the cached extent's end to the end of the diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 699cd85796c1c6..464620381ca709 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5990,7 +5990,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) * offset. This means that new entries created during readdir * are *guaranteed* to be seen in the future by that readdir. * This has broken buggy programs which operate on names as - * they're returned by readdir. Until we re-use freed offsets + * they're returned by readdir. Until we reuse freed offsets * we have this hack to stop new entries from being returned * under the assumption that they'll never reach this huge * offset. diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9df6cf56a58bf5..f68a2639058949 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -469,7 +469,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) /* * If a qgroup exists for a subvolume ID, it is possible * that subvolume has been deleted, in which case - * re-using that ID would lead to incorrect accounting. + * reusing that ID would lead to incorrect accounting. * * Ensure that we skip any such subvol ids. * diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 43431065d98189..e141132b5c8dfa 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1954,7 +1954,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ASSERT(sctx->raid56_data_stripes); /* - * For data stripe search, we cannot re-use the same extent/csum paths, + * For data stripe search, we cannot reuse the same extent/csum paths, * as the data stripe bytenr may be smaller than previous extent. Thus * we have to use our own extent/csum paths. */ diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d5a9cd8a4fd8d7..ee23fae73f4736 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1279,7 +1279,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) * If we are freeing inodes, we want to make sure all delayed iputs have * completed, because they could have been on an inode with i_nlink == 0, and * thus have been truncated and freed up space. But again this space is not - * immediately re-usable, it comes in the form of a delayed ref, which must be + * immediately reusable, it comes in the form of a delayed ref, which must be * run and then the transaction must be committed. * * COMMIT_TRANS diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index dd9ce9b9f69e38..184fa5c0062abf 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -33,7 +33,7 @@ struct btrfs_path; */ #define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1) -/* Radix-tree tag for roots that are part of the trasaction. */ +/* Radix-tree tag for roots that are part of the transaction. 
*/ #define BTRFS_ROOT_TRANS_TAG 0 enum btrfs_trans_state { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9ec761a280fe3a..6352c8c61e0aeb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5430,7 +5430,7 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; - /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */ + /* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. */ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, ctl->stripe_size) + ctl->nparity, diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 69d03feea4e0ec..dbcbf754d2846d 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1973,7 +1973,7 @@ int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, if (block_group->meta_write_pointer > eb->start) return -EBUSY; - /* If for_sync, this hole will be filled with trasnsaction commit. */ + /* If for_sync, this hole will be filled with transaction commit. */ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) return -EAGAIN; return -EBUSY; From 9c74f2c6bddd44ad8cd6b9cf93e2b68ba564b642 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 2 Oct 2024 12:11:48 +0200 Subject: [PATCH 24/80] btrfs: tests: add selftests for raid-stripe-tree Add first stash of very basic self tests for the RAID stripe-tree. More test cases will follow exercising the tree. Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/Makefile | 3 +- fs/btrfs/raid-stripe-tree.c | 5 +- fs/btrfs/raid-stripe-tree.h | 5 + fs/btrfs/tests/btrfs-tests.c | 4 + fs/btrfs/tests/btrfs-tests.h | 2 + fs/btrfs/tests/raid-stripe-tree-tests.c | 317 ++++++++++++++++++++++++ fs/btrfs/volumes.c | 6 +- fs/btrfs/volumes.h | 5 + 8 files changed, 341 insertions(+), 6 deletions(-) create mode 100644 fs/btrfs/tests/raid-stripe-tree-tests.c diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 87617f2968bccf..3cfc440c636ca3 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -43,4 +43,5 @@ btrfs-$(CONFIG_FS_VERITY) += verity.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \ - tests/free-space-tree-tests.o tests/extent-map-tests.o + tests/free-space-tree-tests.o tests/extent-map-tests.o \ + tests/raid-stripe-tree-tests.o diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 4c859b550f6c33..b7787a8e4af295 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -108,8 +108,9 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans, return ret; } -static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, - struct btrfs_io_context *bioc) +EXPORT_FOR_TESTS +int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_key stripe_key; diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h index 1ac1c21aac2f9b..54183642177869 100644 --- a/fs/btrfs/raid-stripe-tree.h +++ b/fs/btrfs/raid-stripe-tree.h @@ -28,6 +28,11 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_extent *ordered_extent); +#ifdef 
CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_io_context *bioc); +#endif + static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info, u64 map_type) { diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index ce50847e1e01a4..e607b5d52fb1d2 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -29,6 +29,7 @@ const char *test_error[] = { [TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group", [TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map", [TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map", + [TEST_ALLOC_IO_CONTEXT] = "cannot allocate io context", }; static const struct super_operations btrfs_test_super_ops = { @@ -291,6 +292,9 @@ int btrfs_run_sanity_tests(void) ret = btrfs_test_free_space_tree(sectorsize, nodesize); if (ret) goto out; + ret = btrfs_test_raid_stripe_tree(sectorsize, nodesize); + if (ret) + goto out; } } ret = btrfs_test_extent_map(); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index dc2f2ab15fa5c3..b524ecf2f452b6 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -24,6 +24,7 @@ enum { TEST_ALLOC_BLOCK_GROUP, TEST_ALLOC_EXTENT_MAP, TEST_ALLOC_CHUNK_MAP, + TEST_ALLOC_IO_CONTEXT, }; extern const char *test_error[]; @@ -37,6 +38,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize); int btrfs_test_inodes(u32 sectorsize, u32 nodesize); int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); +int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize); int btrfs_test_extent_map(void); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c new file mode 100644 index 00000000000000..b8013ab13c4374 --- /dev/null +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 Western Digital Corporation or its affiliates. + */ + +#include <linux/sizes.h> +#include "../fs.h" +#include "../disk-io.h" +#include "../transaction.h" +#include "../volumes.h" +#include "../raid-stripe-tree.h" +#include "btrfs-tests.h" + +#define RST_TEST_NUM_DEVICES (2) +#define RST_TEST_RAID1_TYPE (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1) + +typedef int (*test_func_t)(struct btrfs_trans_handle *trans); + +static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_devices, + u64 devid) +{ + struct btrfs_device *dev; + + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + if (dev->devid == devid) + return dev; + } + + return NULL; +} + +/* + * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then + * overwrite the whole range giving it a new physical address at an offset of 1G. + * The intent of this test is to exercise the 'update_raid_extent_item()' + * function called by btrfs_insert_one_raid_extent().
+ */ +static int test_create_update_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + if (!io_stripe.dev) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = SZ_1G + logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("updating RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical + SZ_1G) { + test_err("invalid physical address, expected %llu, got %llu", + logical + SZ_1G, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, len); + if (ret) + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + len); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a simple 64K RST write on a 2 disk RAID1 at a logical address of 1M. + * The "physical" copy on device 0 is at 1M, on device 1 it is at 1G+1M. 
+ */ +static int test_simple_create_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + bioc->map_type = map_type; + bioc->size = SZ_64K; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + if (!io_stripe.dev) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, len); + if (ret) + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + len); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +static const test_func_t tests[] = { + test_simple_create_delete, + test_create_update_delete, +}; + +static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) +{ + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root = NULL; + int ret; + + fs_info = btrfs_alloc_dummy_fs_info(sectorsize, nodesize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + ret = -ENOMEM; + goto out; + } + + root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); + ret = PTR_ERR(root); + goto out; + } + btrfs_set_super_compat_ro_flags(root->fs_info->super_copy, + BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE); + root->root_key.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + fs_info->stripe_root = root; + root->fs_info->tree_root = root; + + root->node = alloc_test_extent_buffer(root->fs_info, nodesize); + if (IS_ERR(root->node)) { + test_std_err(TEST_ALLOC_EXTENT_BUFFER); + ret = PTR_ERR(root->node); + goto out; + } + btrfs_set_header_level(root->node, 0); + btrfs_set_header_nritems(root->node, 0); + root->alloc_bytenr += 2 * nodesize; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_device *dev; + + dev = btrfs_alloc_dummy_device(fs_info); + if (IS_ERR(dev)) { + test_err("cannot allocate device"); + ret = PTR_ERR(dev); + goto out; + } + dev->devid = i; + } + + btrfs_init_dummy_trans(&trans, root->fs_info); + ret = test(&trans); + if (ret) + goto out; + +out: + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); + + return ret; +} + +int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize) +{ + int ret = 0; + + test_msg("running raid-stripe-tree 
tests"); + for (int i = 0; i < ARRAY_SIZE(tests); i++) { + ret = run_test(tests[i], sectorsize, nodesize); + if (ret) { + test_err("test-case %ps failed with %d\n", tests[i], ret); + goto out; + } + } + +out: + return ret; +} diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6352c8c61e0aeb..fda94f37ce6b71 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6040,9 +6040,9 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, return preferred_mirror; } -static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, - u64 logical, - u16 total_stripes) +EXPORT_FOR_TESTS +struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + u64 logical, u16 total_stripes) { struct btrfs_io_context *bioc; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 26e35fc1c8fdab..3ebb3c2732b0f8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -840,4 +840,9 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + u64 logical, u16 total_stripes); +#endif + #endif From 1bbafcc1813f574acc3001e57aa90ce437815e7e Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 3 Oct 2024 21:33:19 +0100 Subject: [PATCH 25/80] btrfs: remove unused btrfs_free_squota_rsv() btrfs_free_squota_rsv() was added in commit e85a0adacf17 ("btrfs: ensure releasing squota reserve on head refs") but has remained unused since then. Remove it as we don't seem to need it and was probably a leftover. Reviewed-by: Qu Wenruo Signed-off-by: Dr. David Alan Gilbert Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 11 ----------- fs/btrfs/qgroup.h | 1 - 2 files changed, 12 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f68a2639058949..cd062b1015fb84 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4893,17 +4893,6 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) xa_destroy(&trans->delayed_refs.dirty_extents); } -void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes) -{ - if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) - return; - - if (!is_fstree(root)) - return; - - btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA); -} - int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, const struct btrfs_squota_delta *delta) { diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 836e9f59ec8421..5ba2b181636b38 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -447,7 +447,6 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb); void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info); -void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes); int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, const struct btrfs_squota_delta *delta); From 3d8ac5535c8cee2dd04cc391dc88d9dfebbcf53f Mon Sep 17 00:00:00 2001 From: "Dr. 
David Alan Gilbert" Date: Thu, 3 Oct 2024 15:27:26 +0100 Subject: [PATCH 26/80] btrfs: remove unused btrfs_is_parity_mirror() btrfs_is_parity_mirror() has been unused since commit 4886ff7b50f6 ("btrfs: introduce a new helper to submit write bio for repair"). Remove it as the code was refactored and we don't need the helper anymore. Reviewed-by: Qu Wenruo Signed-off-by: Dr. David Alan Gilbert Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 18 ------------------ fs/btrfs/volumes.h | 2 -- 2 files changed, 20 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fda94f37ce6b71..dc9f54849f3901 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5962,24 +5962,6 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, return len; } -int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) -{ - struct btrfs_chunk_map *map; - int ret = 0; - - if (!btrfs_fs_incompat(fs_info, RAID56)) - return 0; - - map = btrfs_get_chunk_map(fs_info, logical, len); - - if (!WARN_ON(IS_ERR(map))) { - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - ret = 1; - btrfs_free_chunk_map(map); - } - return ret; -} - static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 3ebb3c2732b0f8..3a416b1bc24cb0 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -741,8 +741,6 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans); void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev); void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev); void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev); -int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, - u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map); From 7f928637858ddea5dea8e3261a936ec68808e30a Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 3 Oct 2024 15:27:27 +0100 Subject: [PATCH 27/80] btrfs: remove unused btrfs_try_tree_write_lock() btrfs_try_tree_write_lock() has been unused since commit 50b21d7a066f ("btrfs: submit a writeback bio per extent_buffer"). Remove it as we don't need it anymore. Reviewed-by: Christoph Hellwig Reviewed-by: Qu Wenruo Signed-off-by: Dr. David Alan Gilbert Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/locking.c | 15 --------------- fs/btrfs/locking.h | 1 - include/trace/events/btrfs.h | 1 - 3 files changed, 17 deletions(-) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 6a0b7abb5bd9aa..9a7a7b7233054e 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -161,21 +161,6 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) return 0; } -/* - * Try-lock for write. - * - * Return 1 if the rwlock has been taken, 0 otherwise - */ -int btrfs_try_tree_write_lock(struct extent_buffer *eb) -{ - if (down_write_trylock(&eb->lock)) { - btrfs_set_eb_lock_owner(eb, current->pid); - trace_btrfs_try_tree_write_lock(eb); - return 1; - } - return 0; -} - /* * Release read lock. 
*/ diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 3c15c75e05829e..46c8be2afab1c6 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -180,7 +180,6 @@ static inline void btrfs_tree_read_lock(struct extent_buffer *eb) void btrfs_tree_read_unlock(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); -int btrfs_try_tree_write_lock(struct extent_buffer *eb); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 8d2ff32fb3b04a..bd515415ea8b81 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -2344,7 +2344,6 @@ DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_unlock_blocking); DEFINE_BTRFS_LOCK_EVENT(btrfs_set_lock_blocking_read); DEFINE_BTRFS_LOCK_EVENT(btrfs_set_lock_blocking_write); DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_read_lock); -DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_write_lock); DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_lock_atomic); DECLARE_EVENT_CLASS(btrfs__space_info_update, From 3b7324c43d2789be894aa9c845f86d5639cba233 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 2 Oct 2024 08:47:48 +0930 Subject: [PATCH 28/80] btrfs: remove the dirty_pages local variable Inside btrfs_buffered_write(), we have a local variable @dirty_pages, recording the number of pages we dirtied in the current iteration. However we do not really need that variable, since it can be calculated from @pos and @copied. In fact there is already a problem inside the short copy path, where we use @dirty_pages to calculate the range we need to release. But that usage assumes sectorsize == PAGE_SIZE, which is no longer true. Instead of keeping @dirty_pages and causing incorrect usage, just calculate the number of dirtied pages inside btrfs_dirty_pages().
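The replacement calculation derives the page count purely from the byte range, as btrfs_dirty_pages() now does in the diff below:

/* Pages covered by [pos, pos + write_bytes), correct for any sectorsize. */
const int num_pages = (round_up(pos + write_bytes, PAGE_SIZE) -
		       round_down(pos, PAGE_SIZE)) >> PAGE_SHIFT;

For the short copy path, the same expression with @copied in place of @write_bytes yields the number of dirtied pages, which is why the separate variable (and its sectorsize == PAGE_SIZE assumption) can go.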
Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/file.c | 19 +++++++------------ fs/btrfs/file.h | 2 +- fs/btrfs/free-space-cache.c | 3 +-- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4fb521d91b0612..9555a3485670be 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -124,12 +124,14 @@ static void btrfs_drop_pages(struct btrfs_fs_info *fs_info, * - Update inode size for past EOF write */ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, size_t write_bytes, + loff_t pos, size_t write_bytes, struct extent_state **cached, bool noreserve) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; int i; + const int num_pages = (round_up(pos + write_bytes, PAGE_SIZE) - + round_down(pos, PAGE_SIZE)) >> PAGE_SHIFT; u64 num_bytes; u64 start_pos; u64 end_of_last_block; @@ -1242,7 +1244,6 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) offset); size_t num_pages; size_t reserve_bytes; - size_t dirty_pages; size_t copied; size_t dirty_sectors; size_t num_sectors; @@ -1361,11 +1362,8 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) if (copied == 0) { force_page_uptodate = true; dirty_sectors = 0; - dirty_pages = 0; } else { force_page_uptodate = false; - dirty_pages = DIV_ROUND_UP(copied + offset, - PAGE_SIZE); } if (num_sectors > dirty_sectors) { @@ -1375,13 +1373,10 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes, true); } else { - u64 __pos; - - __pos = round_down(pos, - fs_info->sectorsize) + - (dirty_pages << PAGE_SHIFT); + u64 release_start = round_up(pos + copied, + fs_info->sectorsize); btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, __pos, + data_reserved, release_start, release_bytes, true); } } @@ -1390,7 +1385,7 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) fs_info->sectorsize); ret = btrfs_dirty_pages(BTRFS_I(inode), pages, - dirty_pages, pos, copied, + pos, copied, &cached_state, only_release_metadata); /* diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index 912254e653cf96..c23d0bf425989c 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -35,7 +35,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded); int btrfs_release_file(struct inode *inode, struct file *file); int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, size_t write_bytes, + loff_t pos, size_t write_bytes, struct extent_state **cached, bool noreserve); int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end); int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f4bcb25306606a..0d2db205b9f629 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1457,8 +1457,7 @@ static int __btrfs_write_out_cache(struct inode *inode, io_ctl_zero_remaining_pages(io_ctl); /* Everything is written out, now we dirty the pages in the file. 
*/ - ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, - io_ctl->num_pages, 0, i_size_read(inode), + ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, 0, i_size_read(inode), &cached_state, false); if (ret) goto out_nospc; From a85e63f574322583ee00f193e510815630bc078c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 2 Oct 2024 08:47:49 +0930 Subject: [PATCH 29/80] btrfs: simplify the page uptodate preparation for prepare_pages() Currently inside prepare_pages(), we handle the leading and trailing page differently, and skip the middle pages (if any). This is to avoid reading pages which are fully covered by the dirty range. Refactor the code by moving all checks (alignment check, range check, force read check) into prepare_uptodate_page(), so that prepare_pages() only needs to iterate all the pages unconditionally. And since we're here, also update prepare_uptodate_page() to use the folio API rather than the old page API. Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/file.c | 64 +++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9555a3485670be..160d77f8eb6fac 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -858,36 +858,42 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, */ static int prepare_uptodate_page(struct inode *inode, struct page *page, u64 pos, - bool force_uptodate) + u64 len, bool force_uptodate) { struct folio *folio = page_folio(page); + u64 clamp_start = max_t(u64, pos, folio_pos(folio)); + u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); int ret = 0; - if (((pos & (PAGE_SIZE - 1)) || force_uptodate) && - !PageUptodate(page)) { - ret = btrfs_read_folio(NULL, folio); - if (ret) - return ret; - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - return -EIO; - } + if (folio_test_uptodate(folio)) + return 0; - /* - * Since btrfs_read_folio() will unlock the folio before it - * returns, there is a window where btrfs_release_folio() can be - * called to release the page. Here we check both inode - * mapping and PagePrivate() to make sure the page was not - * released. - * - * The private flag check is essential for subpage as we need - * to store extra bitmap using folio private. - */ - if (page->mapping != inode->i_mapping || !folio_test_private(folio)) { - unlock_page(page); - return -EAGAIN; - } + if (!force_uptodate && + IS_ALIGNED(clamp_start, PAGE_SIZE) && + IS_ALIGNED(clamp_end, PAGE_SIZE)) + return 0; + + ret = btrfs_read_folio(NULL, folio); + if (ret) + return ret; + folio_lock(folio); + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); + return -EIO; + } + + /* + * Since btrfs_read_folio() will unlock the folio before it returns, + * there is a window where btrfs_release_folio() can be called to + * release the page. Here we check both inode mapping and page + * private to make sure the page was not released. + * + * The private flag check is essential for subpage as we need to store + * extra bitmap using folio private.
+ */ + if (page->mapping != inode->i_mapping || !folio_test_private(folio)) { + folio_unlock(folio); + return -EAGAIN; } return 0; } @@ -949,12 +955,8 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages, goto fail; } - if (i == 0) - ret = prepare_uptodate_page(inode, pages[i], pos, - force_uptodate); - if (!ret && i == num_pages - 1) - ret = prepare_uptodate_page(inode, pages[i], - pos + write_bytes, false); + ret = prepare_uptodate_page(inode, pages[i], pos, write_bytes, + force_uptodate); if (ret) { put_page(pages[i]); if (!nowait && ret == -EAGAIN) { From df5af25b06cc5e72dd3950f4edf28e4f9b522524 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 4 Oct 2024 15:19:01 +0200 Subject: [PATCH 30/80] btrfs: handle empty list of NOCOW ordered extents with checksum list Currently we BUG_ON() in btrfs_finish_one_ordered() if we are finishing an ordered extent that is flagged as NOCOW, but its checksum list is not empty. This is clearly a logic error which we can recover from by aborting the transaction. For developer builds which enable CONFIG_BTRFS_ASSERT, also ASSERT() that the list is empty. Suggested-by: Filipe Manana Reviewed-by: Qu Wenruo Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 464620381ca709..b0ad46b734c3ca 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3088,7 +3088,12 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { /* Logic error */ - BUG_ON(!list_empty(&ordered_extent->list)); + ASSERT(list_empty(&ordered_extent->list)); + if (!list_empty(&ordered_extent->list)) { + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, inode); From 659f41efef6dac114ad4891df382d3c40c2472cc Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 7 Oct 2024 13:52:47 +0200 Subject: [PATCH 31/80] btrfs: return ENODATA in case RST lookup fails In case a lookup in the RAID stripe-tree fails, return ENODATA instead of ENOENT to better distinguish stripe-tree lookups from other code paths where we return ENOENT. Suggested-by: Josef Bacik Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index b7787a8e4af295..41970bbdb05f63 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -234,7 +234,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, found_end = found_logical + found_length; if (found_logical > end) { - ret = -ENOENT; + ret = -ENODATA; goto out; } @@ -280,10 +280,10 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, } /* If we're here, we haven't found the requested devid in the stripe.
*/ - ret = -ENOENT; + ret = -ENODATA; out: if (ret > 0) - ret = -ENOENT; + ret = -ENODATA; if (ret && ret != -EIO && !stripe->rst_search_commit_root) { btrfs_debug(fs_info, "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", From e081590d713f4ad6022594881d14662fff803ba9 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 7 Oct 2024 13:52:48 +0200 Subject: [PATCH 32/80] btrfs: scrub: skip initial RST lookup errors Performing the initial extent sector read on a RAID stripe-tree backed filesystem with pre-allocated extents will cause the RAID stripe-tree lookup code to return ENODATA, as pre-allocated extents do not have any on-disk bytes and thus no RAID stripe-tree entries. But the current scrub read code marks these extents as errors, because the lookup fails. If btrfs_map_block() returns -ENODATA, it means that the call to btrfs_get_raid_extent_offset() returned -ENODATA, because there is no entry for the corresponding range in the RAID stripe-tree. But as this range is in the extent tree it means we've hit a pre-allocated extent. In this case, don't mark the sector in the stripe's error bitmaps as faulty and carry on to the next. Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e141132b5c8dfa..52e09f30746232 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1704,8 +1704,18 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, &stripe_len, &bioc, &io_stripe, &mirror); btrfs_put_bioc(bioc); if (err < 0) { - set_bit(i, &stripe->io_error_bitmap); - set_bit(i, &stripe->error_bitmap); + if (err != -ENODATA) { + /* + * Earlier btrfs_get_raid_extent_offset() + * returned -ENODATA, which means there's + * no entry for the corresponding range + * in the stripe tree. But if it's in + * the extent tree, then it's a preallocated + * extent and not an error. + */ + set_bit(i, &stripe->io_error_bitmap); + set_bit(i, &stripe->error_bitmap); + } continue; } From 8162aaa7f0eb787199988df887a15bff671f50bc Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 7 Oct 2024 15:55:43 +0100 Subject: [PATCH 33/80] btrfs: qgroup: run delayed iputs after ordered extent completion When trying to flush qgroups in order to release space we run delayed iputs in order to release space from recently deleted files (their link count reached zero), and then we start delalloc and wait for any existing ordered extents to complete.
However there's a time window here where we end up not doing the final iput on a deleted file which could release necessary space: 1) An unlink operation starts; 2) During the unlink, or right before it completes, delalloc is flushed and an ordered extent is created; 3) When the ordered extent is created, the inode's ref count is incremented (with igrab() at alloc_ordered_extent()); 4) When the unlink finishes it doesn't drop the last reference on the inode and so it doesn't trigger inode eviction to delete all of the inode's items in its root and drop all references on its data extents; 5) Another task enters try_flush_qgroup() to try to release space; it runs all delayed iputs, but there's no delayed iput yet for that deleted file because the ordered extent hasn't completed yet; 6) Then at try_flush_qgroup() we wait for the ordered extent to complete and that results in adding a delayed iput at btrfs_put_ordered_extent() when called from btrfs_finish_one_ordered(); 7) Adding the delayed iput results in waking the cleaner kthread if it's not running already. However it may take some time for it to be scheduled, or it may be running but busy running auto defrag, dropping deleted snapshots or doing other work, so by the time we return from try_flush_qgroup() the space for the deleted file isn't released. Improve on this by running delayed iputs only after flushing delalloc and waiting for ordered extent completion. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index cd062b1015fb84..716f29717d5ae7 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4195,13 +4195,20 @@ static int try_flush_qgroup(struct btrfs_root *root) return 0; }
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 1 - fs/btrfs/extent_io.c | 2 +- fs/btrfs/inode.c | 22 ---------------------- 3 files changed, 1 insertion(+), 24 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index e152fde888fc9a..c514bab532fafd 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -577,7 +577,6 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state struct extent_state *other); void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split); -void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); void btrfs_evict_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9fbc83c76b94dc..d87dcafab53779 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1359,7 +1359,7 @@ static int submit_one_sector(struct btrfs_inode *inode, * a folio for a range already written to disk. */ btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); - btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1); + btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); /* * Above call should set the whole folio with writeback flag, even * just for a single subpage sector. diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b0ad46b734c3ca..b145045295b324 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8944,28 +8944,6 @@ static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, return finish_open_simple(file, ret); } -void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - struct folio *folio; - u32 len; - - ASSERT(end + 1 - start <= U32_MAX); - len = end + 1 - start; - while (index <= end_index) { - folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); - ASSERT(!IS_ERR(folio)); /* folios should be in the extent_io_tree */ - - /* This is for data, which doesn't yet support larger folio. */ - ASSERT(folio_order(folio) == 0); - btrfs_folio_set_writeback(fs_info, folio, start, len); - folio_put(folio); - index++; - } -} - int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type) { From bde9f20fea80326fcd3f4947c4758dd58ad2395a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:30:50 +0200 Subject: [PATCH 35/80] btrfs: zstd: assert the timer pointer in callback Make sure we got the right timer struct for the zstd workspace reclaim work. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/zstd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 15f8a83165a38b..5232b56d589259 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -111,6 +111,8 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; struct list_head *pos, *next; + ASSERT(timer == &wsm.timer); + spin_lock(&wsm.lock); if (list_empty(&wsm.lru_list)) { From 1d19c12c42a0dcce925f510208be6c67e9b415e9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:30:53 +0200 Subject: [PATCH 36/80] btrfs: drop unused parameter path from btrfs_tree_mod_log_rewind() The path parameter was used for our own locking, that got converted to rwsem eventually. Last usage in ac5887c8e013d6 ("btrfs: locking: remove all the blocking helpers"). Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 +- fs/btrfs/tree-mod-log.c | 1 - fs/btrfs/tree-mod-log.h | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0cc919d15b1441..b11ec86102e37b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2334,7 +2334,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, level = btrfs_header_level(b); btrfs_tree_read_lock(b); - b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq); + b = btrfs_tree_mod_log_rewind(fs_info, b, time_seq); if (!b) { ret = -ENOMEM; goto done; diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index b382a4c443d427..1ac2678fc4caa8 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -909,7 +909,6 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, * is freed (its refcount is decremented). */ struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, struct extent_buffer *eb, u64 time_seq) { diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index 6308c577a4a4ab..1c12566040db52 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -41,7 +41,6 @@ int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op); int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb); struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, struct extent_buffer *eb, u64 time_seq); struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq); From 3994109335dd08158ac9a3c63ac871ded13381a1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:30:59 +0200 Subject: [PATCH 37/80] btrfs: drop unused parameter ctx from batch_delete_dir_index_items() The ctx parameter is not used, we can drop it. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9637c7cdc0cf92..c8d6587688b3dd 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -6204,7 +6204,6 @@ static int log_delayed_deletions_full(struct btrfs_trans_handle *trans, static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_log_ctx *ctx, const struct list_head *delayed_del_list, const struct btrfs_delayed_item *first, const struct btrfs_delayed_item **last_ret) @@ -6265,7 +6264,7 @@ static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, if (ret < 0) { return ret; } else if (ret == 0) { - ret = batch_delete_dir_index_items(trans, inode, path, ctx, + ret = batch_delete_dir_index_items(trans, inode, path, delayed_del_list, curr, &last); if (ret) From 4995054c1ef2d63db9a2ca1f8374a04ce7a29c59 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:31:02 +0200 Subject: [PATCH 38/80] btrfs: drop unused parameter fs_info from wait_reserve_ticket() The parameter is not used, we can also reach it from the space info if needed in the future. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index ee23fae73f4736..3c7ccb3935cf07 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1488,8 +1488,7 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); } -static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +static void wait_reserve_ticket(struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { @@ -1547,7 +1546,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, case BTRFS_RESERVE_FLUSH_DATA: case BTRFS_RESERVE_FLUSH_ALL: case BTRFS_RESERVE_FLUSH_ALL_STEAL: - wait_reserve_ticket(fs_info, space_info, ticket); + wait_reserve_ticket(space_info, ticket); break; case BTRFS_RESERVE_FLUSH_LIMIT: priority_reclaim_metadata_space(fs_info, space_info, ticket, From 51f6c3ac6d02b4b36af7f916117c41b1d1f7e7b4 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:31:04 +0200 Subject: [PATCH 39/80] btrfs: drop unused parameter fs_info from do_reclaim_sweep() The parameter is unused and we can get it from space info if needed. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 3c7ccb3935cf07..255e85f78313ca 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1983,8 +1983,7 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info) return unalloc < data_chunk_size; } -static void do_reclaim_sweep(const struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, int raid) +static void do_reclaim_sweep(struct btrfs_space_info *space_info, int raid) { struct btrfs_block_group *bg; int thresh_pct; @@ -2080,6 +2079,6 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info) if (!btrfs_should_periodic_reclaim(space_info)) continue; for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) - do_reclaim_sweep(fs_info, space_info, raid); + do_reclaim_sweep(space_info, raid); } } From 8721e68d8458444e0da910fd3b99c53bf61df856 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:31:06 +0200 Subject: [PATCH 40/80] btrfs: send: drop unused parameter num from iterate_inode_ref_t callbacks None of the ref iteration callbacks needs the num parameter (this is for the directory item iteration), so we can drop it. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/send.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index b068469871f8e5..b8ffded33fef9a 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -980,7 +980,7 @@ static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) return ret; } -typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index, +typedef int (*iterate_inode_ref_t)(u64 dir, int index, struct fs_path *p, void *ctx); @@ -1007,7 +1007,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, u32 name_len; char *start; int ret = 0; - int num = 0; int index; u64 dir; unsigned long name_off; @@ -1094,10 +1093,9 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, } cur += elem_size + name_len; - ret = iterate(num, dir, index, p, ctx); + ret = iterate(dir, index, p, ctx); if (ret) goto out; - num++; } out: @@ -1227,7 +1225,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, return ret; } -static int __copy_first_ref(int num, u64 dir, int index, +static int __copy_first_ref(u64 dir, int index, struct fs_path *p, void *ctx) { int ret; @@ -4708,7 +4706,7 @@ static int record_ref_in_tree(struct rb_root *root, struct list_head *refs, return ret; } -static int record_new_ref_if_needed(int num, u64 dir, int index, +static int record_new_ref_if_needed(u64 dir, int index, struct fs_path *name, void *ctx) { int ret = 0; @@ -4738,7 +4736,7 @@ static int record_new_ref_if_needed(int num, u64 dir, int index, return ret; } -static int record_deleted_ref_if_needed(int num, u64 dir, int index, +static int record_deleted_ref_if_needed(u64 dir, int index, struct fs_path *name, void *ctx) { int ret = 0; From 3555feadd82e1b07deacb387b2d42fdc1860462b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:31:09 +0200 Subject: [PATCH 41/80] btrfs: send: drop unused parameter index from iterate_inode_ref_t callbacks None of the ref iteration callbacks needs the index parameter (this is for the directory item iteration), so we can drop it. 
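Taken together with the previous patch, the callback signature evolves as follows (all three forms can be seen in the diffs of patch 40 and this patch):

    /* Originally:       */ typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index, struct fs_path *p, void *ctx);
    /* After patch 40:   */ typedef int (*iterate_inode_ref_t)(u64 dir, int index, struct fs_path *p, void *ctx);
    /* After this patch: */ typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx);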
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/send.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index b8ffded33fef9a..3c35936d91b53a 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -980,9 +980,7 @@ static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) return ret; } -typedef int (*iterate_inode_ref_t)(u64 dir, int index, - struct fs_path *p, - void *ctx); +typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx); /* * Helper function to iterate the entries in ONE btrfs_inode_ref or @@ -1007,7 +1005,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, u32 name_len; char *start; int ret = 0; - int index; u64 dir; unsigned long name_off; unsigned long elem_size; @@ -1042,13 +1039,11 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, iref = (struct btrfs_inode_ref *)(ptr + cur); name_len = btrfs_inode_ref_name_len(eb, iref); name_off = (unsigned long)(iref + 1); - index = btrfs_inode_ref_index(eb, iref); dir = found_key->offset; } else { extref = (struct btrfs_inode_extref *)(ptr + cur); name_len = btrfs_inode_extref_name_len(eb, extref); name_off = (unsigned long)&extref->name; - index = btrfs_inode_extref_index(eb, extref); dir = btrfs_inode_extref_parent(eb, extref); } @@ -1093,7 +1088,7 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, } cur += elem_size + name_len; - ret = iterate(dir, index, p, ctx); + ret = iterate(dir, p, ctx); if (ret) goto out; } @@ -1225,8 +1220,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, return ret; } -static int __copy_first_ref(u64 dir, int index, - struct fs_path *p, void *ctx) +static int __copy_first_ref(u64 dir, struct fs_path *p, void *ctx) { int ret; struct fs_path *pt = ctx; @@ -4706,8 +4700,7 @@ static int record_ref_in_tree(struct rb_root *root, struct list_head *refs, return ret; } -static int record_new_ref_if_needed(u64 dir, int index, - struct fs_path *name, void *ctx) +static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { int ret = 0; struct send_ctx *sctx = ctx; @@ -4736,8 +4729,7 @@ static int record_new_ref_if_needed(u64 dir, int index, return ret; } -static int record_deleted_ref_if_needed(u64 dir, int index, - struct fs_path *name, void *ctx) +static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { int ret = 0; struct send_ctx *sctx = ctx; From 2aa366dbbb880c3b1bfb9634a66bfcba2788a19a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:31:15 +0200 Subject: [PATCH 42/80] btrfs: scrub: drop unused parameter sctx from scrub_submit_extent_sector_read() The parameter is unused and we can reach sctx from scrub stripe if needed. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 52e09f30746232..f21fc6cf07c0eb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1656,8 +1656,7 @@ static u32 stripe_length(const struct scrub_stripe *stripe) stripe->bg->start + stripe->bg->length - stripe->logical); } -static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, - struct scrub_stripe *stripe) +static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; @@ -1753,7 +1752,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) { - scrub_submit_extent_sector_read(sctx, stripe); + scrub_submit_extent_sector_read(stripe); return; } From e6c00f444faa494bb654012d58e64f2eb9b9f9aa Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:31:18 +0200 Subject: [PATCH 43/80] btrfs: drop unused parameter map from scrub_simple_mirror() The parameter map used to be passed to scrub_extent() until e02ee89baa66c4 ("btrfs: scrub: switch scrub_simple_mirror() to scrub_stripe infrastructure"), where the scrub implementation was completely reworked. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f21fc6cf07c0eb..204c928beaf9cb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2112,7 +2112,6 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, */ static int scrub_simple_mirror(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct btrfs_chunk_map *map, u64 logical_start, u64 logical_length, struct btrfs_device *device, u64 physical, int mirror_num) @@ -2231,7 +2230,7 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx, * just RAID1, so we can reuse scrub_simple_mirror() to scrub * this stripe. */ - ret = scrub_simple_mirror(sctx, bg, map, cur_logical, + ret = scrub_simple_mirror(sctx, bg, cur_logical, BTRFS_STRIPE_LEN, device, cur_physical, mirror_num); if (ret) @@ -2315,7 +2314,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * Only @physical and @mirror_num needs to calculated using * @stripe_index. */ - ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length, + ret = scrub_simple_mirror(sctx, bg, bg->start, bg->length, scrub_dev, map->stripes[stripe_index].physical, stripe_index + 1); offset = 0; @@ -2370,7 +2369,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * We can reuse scrub_simple_mirror() here, as the repair part * is still based on @mirror_num. */ - ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN, + ret = scrub_simple_mirror(sctx, bg, logical, BTRFS_STRIPE_LEN, scrub_dev, physical, 1); if (ret < 0) goto out; From f9bd5550419a9c3997ccdaa9642c64aee14b3fc3 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:31:20 +0200 Subject: [PATCH 44/80] btrfs: qgroup: drop unused parameter fs_info from __del_qgroup_rb() We don't need fs_info here, everything is reachable from qgroup. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 716f29717d5ae7..b2d421b4cd5a51 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -226,8 +226,7 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, return qgroup; } -static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup *qgroup) +static void __del_qgroup_rb(struct btrfs_qgroup *qgroup) { struct btrfs_qgroup_list *list; @@ -258,7 +257,7 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) return -ENOENT; rb_erase(&qgroup->node, &fs_info->qgroup_tree); - __del_qgroup_rb(fs_info, qgroup); + __del_qgroup_rb(qgroup); return 0; } @@ -643,7 +642,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) while ((n = rb_first(&fs_info->qgroup_tree))) { qgroup = rb_entry(n, struct btrfs_qgroup, node); rb_erase(n, &fs_info->qgroup_tree); - __del_qgroup_rb(fs_info, qgroup); + __del_qgroup_rb(qgroup); btrfs_sysfs_del_one_qgroup(fs_info, qgroup); kfree(qgroup); } From e3c79ce54818adb50f7fcdbd1dfff44ca7f57bf1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:31:23 +0200 Subject: [PATCH 45/80] btrfs: drop unused transaction parameter from btrfs_qgroup_add_swapped_blocks() The caller replace_path() runs under transaction but we don't need it in btrfs_qgroup_add_swapped_blocks(). Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 3 +-- fs/btrfs/qgroup.h | 3 +-- fs/btrfs/relocation.c | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b2d421b4cd5a51..0fcf6e91616eec 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4692,8 +4692,7 @@ void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root) * BOTH POINTERS ARE BEFORE TREE SWAP * @last_snapshot: last snapshot generation of the subvolume tree */ -int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *subvol_root, +int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, struct btrfs_block_group *bg, struct extent_buffer *subvol_parent, int subvol_slot, struct extent_buffer *reloc_parent, int reloc_slot, diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 5ba2b181636b38..9fbd008749fa37 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -437,8 +437,7 @@ void btrfs_qgroup_init_swapped_blocks( struct btrfs_qgroup_swapped_blocks *swapped_blocks); void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root); -int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *subvol_root, +int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, struct btrfs_block_group *bg, struct extent_buffer *subvol_parent, int subvol_slot, struct extent_buffer *reloc_parent, int reloc_slot, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f3834f8d26b456..bf267bdfa8f811 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1244,7 +1244,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, * The real subtree rescan is delayed until we have new * CoW on the subtree root node before transaction commit. 
*/ - ret = btrfs_qgroup_add_swapped_blocks(trans, dest, + ret = btrfs_qgroup_add_swapped_blocks(dest, rc->block_group, parent, slot, path->nodes[level], path->slots[level], last_snapshot);
From 3beb0db651f12fa13974217f99eee47ff7405a11 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:31:25 +0200 Subject: [PATCH 46/80] btrfs: lzo: drop unused parameter level from lzo_alloc_workspace() The LZO compression has only one level, we don't need to pass the parameter. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/compression.c | 2 +- fs/btrfs/compression.h | 2 +- fs/btrfs/lzo.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 6e9c4a5e0d51af..655e3212b4093b 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -746,7 +746,7 @@ static struct list_head *alloc_workspace(int type, unsigned int level) switch (type) { case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level); case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level); - case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(level); + case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(); case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level); default: /* diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index b6563b6a333e5d..954034086d0d44 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -175,7 +175,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); -struct list_head *lzo_alloc_workspace(unsigned int level); +struct list_head *lzo_alloc_workspace(void); void lzo_free_workspace(struct list_head *ws); int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 72856f6775f7f1..a45bc11f866531 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -80,7 +80,7 @@ void lzo_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *lzo_alloc_workspace(unsigned int level) +struct list_head *lzo_alloc_workspace(void) { struct workspace *workspace;
From 7e2275013afbc3f848f13dc8cf5bfae902fa64ff Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:31:28 +0200 Subject: [PATCH 47/80] btrfs: drop unused parameter argp from btrfs_ioctl_quota_rescan_wait() We don't need the user-passed parameter; rescan is a filesystem operation, so fs_info is sufficient.
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 226c91fe31a707..28b9b7fda5780b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4058,8 +4058,7 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info, return 0; } -static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, - void __user *arg) +static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4812,7 +4811,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_QUOTA_RESCAN_STATUS: return btrfs_ioctl_quota_rescan_status(fs_info, argp); case BTRFS_IOC_QUOTA_RESCAN_WAIT: - return btrfs_ioctl_quota_rescan_wait(fs_info, argp); + return btrfs_ioctl_quota_rescan_wait(fs_info); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(fs_info, argp); case BTRFS_IOC_GET_SUPPORTED_FEATURES: From 24f6fd602c9f887eea94ca9948bec4942f068af5 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:31:34 +0200 Subject: [PATCH 48/80] btrfs: drop unused parameter inode from read_inline_extent() We don't need the inode pointer to read inline extent, it's all accessible from the path pointer. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b145045295b324..1e7ba07092f21c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6737,8 +6737,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, return ret; } -static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, - struct folio *folio) +static int read_inline_extent(struct btrfs_path *path, struct folio *folio) { struct btrfs_file_extent_item *fi; void *kaddr; @@ -6936,7 +6935,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE); ASSERT(em->len == fs_info->sectorsize); - ret = read_inline_extent(inode, path, folio); + ret = read_inline_extent(path, folio); if (ret < 0) goto out; goto insert; From 05d2682ae5fa63d1b13bb74e95c1c536e36af3de Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:31:36 +0200 Subject: [PATCH 49/80] btrfs: drop unused parameter offset from __cow_file_range_inline() We don't need offset for inline extents, they always start from 0. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1e7ba07092f21c..261bf4924423dd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -646,7 +646,7 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode, * If being used directly, you must have already checked we're allowed to cow * the range by getting true from can_cow_file_range_inline(). 
*/ -static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset, +static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 size, size_t compressed_size, int compress_type, struct folio *compressed_folio, @@ -736,7 +736,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, return 1; lock_extent(&inode->io_tree, offset, end, &cached); - ret = __cow_file_range_inline(inode, offset, size, compressed_size, + ret = __cow_file_range_inline(inode, size, compressed_size, compress_type, compressed_folio, update_i_size); if (ret > 0) { @@ -9505,7 +9505,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (encoded->unencoded_len == encoded->len && encoded->unencoded_offset == 0 && can_cow_file_range_inline(inode, start, encoded->len, orig_count)) { - ret = __cow_file_range_inline(inode, start, encoded->len, + ret = __cow_file_range_inline(inode, encoded->len, orig_count, compression, folios[0], true); if (ret <= 0) { From 2c007d28f2997835e917c4d4e502b1d7e0bf2dcf Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:31:47 +0200 Subject: [PATCH 50/80] btrfs: drop unused parameter file_offset from btrfs_encoded_read_regular_fill_pages() The file_offset parameter used to be passed to encoded read struct but was removed in commit b665affe93d8 ("btrfs: remove unused members from struct btrfs_encoded_read_private"). Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 3 +-- fs/btrfs/inode.c | 6 +++--- fs/btrfs/send.c | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c514bab532fafd..157fd3f4cb333a 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -612,8 +612,7 @@ int btrfs_writepage_cow_fixup(struct folio *folio); int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type); int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, + u64 disk_bytenr, u64 disk_io_size, struct page **pages); ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, struct btrfs_ioctl_encoded_io_args *encoded); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 261bf4924423dd..35f89d14c110d1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9081,8 +9081,8 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) } int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, struct page **pages) + u64 disk_bytenr, u64 disk_io_size, + struct page **pages) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_encoded_read_private priv = { @@ -9152,7 +9152,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, goto out; } - ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, disk_io_size, pages); if (ret) goto out; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 3c35936d91b53a..efbe1c08a5adae 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5667,7 +5667,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * Note that send_buf is a mapping of send_buf_pages, so this is really * reading into send_buf. 
*/ - ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset, + ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), disk_bytenr, disk_num_bytes, sctx->send_buf_pages + (data_offset >> PAGE_SHIFT)); From 3c09d3b337850ac85d29da7f316b95d1b9598c6b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:31:49 +0200 Subject: [PATCH 51/80] btrfs: drop unused parameter iov_iter from btrfs_write_check() The parameter 'from' has never been used since commit b8d8e1fd570a ("btrfs: introduce btrfs_write_check()"), this is for buffered write. Direct io write needs it so it was probably an interface thing, but we can drop it. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/direct-io.c | 2 +- fs/btrfs/file.c | 6 +++--- fs/btrfs/file.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index bd38df5647e35b..a7c3e221378db7 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -834,7 +834,7 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) return ret; } - ret = btrfs_write_check(iocb, from, ret); + ret = btrfs_write_check(iocb, ret); if (ret < 0) { btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto out; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 160d77f8eb6fac..033f85ea8c9d6a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1144,7 +1144,7 @@ static void update_time_for_write(struct inode *inode) inode_inc_iversion(inode); } -int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count) +int btrfs_write_check(struct kiocb *iocb, size_t count) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -1222,7 +1222,7 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) if (ret <= 0) goto out; - ret = btrfs_write_check(iocb, i, ret); + ret = btrfs_write_check(iocb, ret); if (ret < 0) goto out; @@ -1467,7 +1467,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (ret || encoded->len == 0) goto out; - ret = btrfs_write_check(iocb, from, encoded->len); + ret = btrfs_write_check(iocb, encoded->len); if (ret < 0) goto out; diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index c23d0bf425989c..c2ce0ae94a9c17 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -44,7 +44,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode); bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state, u64 *delalloc_start_ret, u64 *delalloc_end_ret); -int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count); +int btrfs_write_check(struct kiocb *iocb, size_t count); ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i); #endif From afe6a708a22c2fb7bfa171bc4e35a523405c4d64 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:31:56 +0200 Subject: [PATCH 52/80] btrfs: drop unused parameter refs from visit_node_for_delete() The parameter duplicates what can be effectively obtained from wc->refs[level - 1] and this is what's actually used inside. Added in commit 2b73c7e761c4 ("btrfs: unify logic to decide if we need to walk down into a node during snapshot delete"). 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 79373f0ab6cee3..e29024c66edbf9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5270,7 +5270,7 @@ struct walk_control { * corrupted file systems must have been caught before calling this function. */ static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc, - struct extent_buffer *eb, u64 refs, u64 flags, int slot) + struct extent_buffer *eb, u64 flags, int slot) { struct btrfs_key key; u64 generation; @@ -5384,7 +5384,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; /* If we don't need to visit this node don't reada. */ - if (!visit_node_for_delete(root, wc, eb, refs, flags, slot)) + if (!visit_node_for_delete(root, wc, eb, flags, slot)) continue; reada: btrfs_readahead_node_child(eb, slot); @@ -5737,8 +5737,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, /* If we don't have to walk into this node skip it. */ if (!visit_node_for_delete(root, wc, path->nodes[level], - wc->refs[level - 1], wc->flags[level - 1], - path->slots[level])) + wc->flags[level - 1], path->slots[level])) goto skip; /* From ce8e39d069efd74830f86a5e71a475e13367a10a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:31:58 +0200 Subject: [PATCH 53/80] btrfs: drop unused parameter mask from try_release_extent_state() The mask parameter used for allocations got unified to GFP_NOFS and removed from relevant functions in 1d1268004430 ("btrfs: drop gfp from parameter extent state helpers"). Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d87dcafab53779..1dbe62e6f3ec68 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2381,7 +2381,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, * to drop the page. */ static bool try_release_extent_state(struct extent_io_tree *tree, - struct folio *folio, gfp_t mask) + struct folio *folio) { u64 start = folio_pos(folio); u64 end = start + PAGE_SIZE - 1; @@ -2492,7 +2492,7 @@ bool try_release_extent_mapping(struct folio *folio, gfp_t mask) cond_resched(); } } - return try_release_extent_state(io_tree, folio, mask); + return try_release_extent_state(io_tree, folio); } static void __free_extent_buffer(struct extent_buffer *eb) From 2f3f00995f0c61b52f579e8064f748f20aa880a3 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:32:05 +0200 Subject: [PATCH 54/80] btrfs: drop unused parameter fs_info from folio_range_has_eb() The parameter was added in 8ff8466d29efc2 ("btrfs: support subpage for extent buffer page release") for page but hasn't been used since, so we can drop it. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1dbe62e6f3ec68..09c0d18a7b5a16 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2506,7 +2506,7 @@ static int extent_buffer_under_io(const struct extent_buffer *eb) test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); } -static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio) +static bool folio_range_has_eb(struct folio *folio) { struct btrfs_subpage *subpage; @@ -2580,7 +2580,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * We can only detach the folio private if there are no other ebs in the * page range and no unfinished IO. */ - if (!folio_range_has_eb(fs_info, folio)) + if (!folio_range_has_eb(folio)) btrfs_detach_subpage(fs_info, folio); spin_unlock(&folio->mapping->i_private_lock); From 483665988244ab7870f3ca008571ab7b743868d1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:32:07 +0200 Subject: [PATCH 55/80] btrfs: drop unused parameter options from open_ctree() Since the new mount option parser in commit ad21f15b0f79 ("btrfs: switch to the new mount API") we don't pass the options like that anymore. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 +-- fs/btrfs/disk-io.h | 3 +-- fs/btrfs/super.c | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4ad5db619b008d..92c7dbeade1c92 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3202,8 +3202,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) return 0; } -int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, - const char *options) +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices) { u32 sectorsize; u32 nodesize; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 99af64d3f27781..127e31e0834709 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -52,8 +52,7 @@ struct extent_buffer *btrfs_find_create_tree_block( int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, const struct btrfs_super_block *disk_sb); -int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, - const char *options); +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices); void __cold close_ctree(struct btrfs_fs_info *fs_info); int btrfs_validate_super(const struct btrfs_fs_info *fs_info, const struct btrfs_super_block *sb, int mirror_num); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e8a5bf4af91893..a86fe54cd61557 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -962,7 +962,7 @@ static int btrfs_fill_super(struct super_block *sb, return err; } - err = open_ctree(sb, fs_devices, (char *)data); + err = open_ctree(sb, fs_devices); if (err) { btrfs_err(fs_info, "open_ctree failed"); return err; From 3fcafae6934190d7574ced447bd1eb59c7a8ff4e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:32:17 +0200 Subject: [PATCH 56/80] btrfs: drop unused parameter data from btrfs_fill_super() The only caller passes NULL, we can drop the parameter. This is since the new mount option parser done in 3bb17a25bcb09a ("btrfs: add get_tree callback for new mount API"). 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a86fe54cd61557..a0b463d17c2971 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -937,8 +937,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec } static int btrfs_fill_super(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - void *data) + struct btrfs_fs_devices *fs_devices) { struct inode *inode; struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -1885,7 +1884,7 @@ static int btrfs_get_tree_super(struct fs_context *fc) snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id); btrfs_sb(sb)->bdev_holder = &btrfs_fs_type; - ret = btrfs_fill_super(sb, fs_devices, NULL); + ret = btrfs_fill_super(sb, fs_devices); } if (ret) { From bea54c5113dd8c9e30fbbafad72b748f947de358 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:32:20 +0200 Subject: [PATCH 57/80] btrfs: drop unused parameter transaction from alloc_log_tree() The function got split in commit 6ab6ebb76042d3 ("btrfs: split alloc_log_tree()") and since then transaction parameter has been unused. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 92c7dbeade1c92..a4c8778dd65189 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -917,8 +917,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, return ERR_PTR(ret); } -static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static struct btrfs_root *alloc_log_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; @@ -966,7 +965,7 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, { struct btrfs_root *log_root; - log_root = alloc_log_tree(trans, fs_info); + log_root = alloc_log_tree(fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); @@ -992,7 +991,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item; int ret; - log_root = alloc_log_tree(trans, fs_info); + log_root = alloc_log_tree(fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); From 9c0ededb2f73a12751ba23a85222a76d82ede7ec Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:32:22 +0200 Subject: [PATCH 58/80] btrfs: drop unused parameter fs_info from btrfs_match_dir_item_name() Cascaded removal of fs_info that is not needed in several functions. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/dir-item.c | 11 ++++------- fs/btrfs/dir-item.h | 3 +-- fs/btrfs/send.c | 3 +-- fs/btrfs/xattr.c | 5 ++--- 4 files changed, 8 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 001c0c2f872c99..d3093eba54a553 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -27,7 +27,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle const char *name, int name_len) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *ptr; struct extent_buffer *leaf; @@ -35,7 +34,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); if (ret == -EEXIST) { struct btrfs_dir_item *di; - di = btrfs_match_dir_item_name(fs_info, path, name, name_len); + di = btrfs_match_dir_item_name(path, name, name_len); if (di) return ERR_PTR(-EEXIST); btrfs_extend_item(trans, path, data_size); @@ -190,7 +189,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir( if (ret > 0) return ERR_PTR(-ENOENT); - return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); + return btrfs_match_dir_item_name(path, name, name_len); } /* @@ -341,8 +340,7 @@ btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) break; - di = btrfs_match_dir_item_name(root->fs_info, path, - name->name, name->len); + di = btrfs_match_dir_item_name(path, name->name, name->len); if (di) return di; } @@ -378,8 +376,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, * this walks through all the entries in a dir item and finds one * for a specific name. */ -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, - const struct btrfs_path *path, +struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path, const char *name, int name_len) { struct btrfs_dir_item *dir_item; diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index 5f6dfafc91f1fe..28d69970bc709f 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -44,8 +44,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 dir, const char *name, u16 name_len, int mod); -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, - const struct btrfs_path *path, +struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path, const char *name, int name_len); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index efbe1c08a5adae..03b31b1c39be29 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3760,7 +3760,6 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, struct recorded_ref *parent_ref, const bool is_orphan) { - struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key di_key; @@ -3789,7 +3788,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, goto out; } - di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name, + di = btrfs_match_dir_item_name(path, parent_ref->name, parent_ref->name_len); if (!di) { ret = 0; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index ce464cd8e0ac79..bc18710d1dcfbb 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -85,7 +85,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, { struct btrfs_dir_item *di = NULL; struct btrfs_root *root = 
BTRFS_I(inode)->root; - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; size_t name_len = strlen(name); int ret = 0; @@ -143,14 +142,14 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, */ ret = 0; btrfs_assert_tree_write_locked(path->nodes[0]); - di = btrfs_match_dir_item_name(fs_info, path, name, name_len); + di = btrfs_match_dir_item_name(path, name, name_len); if (!di && !(flags & XATTR_REPLACE)) { ret = -ENOSPC; goto out; } } else if (ret == -EEXIST) { ret = 0; - di = btrfs_match_dir_item_name(fs_info, path, name, name_len); + di = btrfs_match_dir_item_name(path, name, name_len); ASSERT(di); /* logic error */ } else if (ret) { goto out;
From caebb14a33a2f1bc3f5b8240320aaeaa1e810f2e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:32:25 +0200 Subject: [PATCH 59/80] btrfs: drop unused parameter level from alloc_heuristic_ws() The compression heuristic pass does not need a level, so we can drop the parameter. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/compression.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 655e3212b4093b..abcf8ed06afc0f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -702,7 +702,7 @@ static void free_heuristic_ws(struct list_head *ws) kfree(workspace); } -static struct list_head *alloc_heuristic_ws(unsigned int level) +static struct list_head *alloc_heuristic_ws(void) { struct heuristic_ws *ws; @@ -744,7 +744,7 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { static struct list_head *alloc_workspace(int type, unsigned int level) { switch (type) { - case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level); + case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(); case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level); case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(); case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
From 9bbc899dc2fa89a84c7991d203180455aabac691 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 1 Oct 2024 17:03:32 +0900 Subject: [PATCH 60/80] btrfs: zoned: fix zone unusable accounting for freed reserved extent When btrfs reserves an extent and does not use it (e.g., due to an error), it calls btrfs_free_reserved_extent() to free the reserved extent. In the process, it calls btrfs_add_free_space() and then it accounts the region bytes as block_group->zone_unusable. However, it leaves the space_info->bytes_zone_unusable side not updated. As a result, ENOSPC can happen while a space_info reservation succeeded. The reservation is fine because the freed region is not added in space_info->bytes_zone_unusable, leaving that space as "free". On the other hand, the corresponding block group counts it as zone_unusable and its allocation pointer is not rewound, so we cannot allocate an extent from that block group. That will also negate space_info's async/sync reclaim process, and cause an ENOSPC error from the extent allocation process. Fix that by returning the space to space_info->bytes_zone_unusable. Ideally, since a bio is not submitted for this reserved region, we should return the space to free space and rewind the allocation pointer. But that needs a rework of the extent allocation handling, so let it work this way for now.
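To make the mismatch concrete, consider a hypothetical 1 MiB reservation on a zoned block group that is freed without being used: block_group->zone_unusable grows by 1 MiB, but before this fix space_info->bytes_zone_unusable does not. space_info therefore advertises 1 MiB more allocatable space than the block groups can actually serve, so a reservation can succeed while the subsequent extent allocation still fails with ENOSPC.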
Fixes: 169e0da91a21 ("btrfs: zoned: track unusable bytes for zones") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5c5539eb82d349..4427c1b835e8b2 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -3819,6 +3819,8 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, spin_lock(&cache->lock); if (cache->ro) space_info->bytes_readonly += num_bytes; + else if (btrfs_is_zoned(cache->fs_info)) + space_info->bytes_zone_unusable += num_bytes; cache->reserved -= num_bytes; space_info->bytes_reserved -= num_bytes; space_info->max_extent_size = 0; From 467b190c1204fb6342690ac912280d48752795e6 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Wed, 9 Oct 2024 22:52:06 +0900 Subject: [PATCH 61/80] btrfs: fix error propagation of split bios The purpose of btrfs_bbio_propagate_error() shall be propagating an error of split bio to its original btrfs_bio, and tell the error to the upper layer. However, it's not working well on some cases. * Case 1. Immediate (or quick) end_bio with an error When btrfs sends btrfs_bio to mirrored devices, btrfs calls btrfs_bio_end_io() when all the mirroring bios are completed. If that btrfs_bio was split, it is from btrfs_clone_bioset and its end_io function is btrfs_orig_write_end_io. For this case, btrfs_bbio_propagate_error() accesses the orig_bbio's bio context to increase the error count. That works well in most cases. However, if the end_io is called enough fast, orig_bbio's (remaining part after split) bio context may not be properly set at that time. Since the bio context is set when the orig_bbio (the last btrfs_bio) is sent to devices, that might be too late for earlier split btrfs_bio's completion. That will result in NULL pointer dereference. That bug is easily reproducible by running btrfs/146 on zoned devices [1] and it shows the following trace. [1] You need raid-stripe-tree feature as it create "-d raid0 -m raid1" FS. BUG: kernel NULL pointer dereference, address: 0000000000000020 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 UID: 0 PID: 13 Comm: kworker/u32:1 Not tainted 6.11.0-rc7-BTRFS-ZNS+ #474 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Workqueue: writeback wb_workfn (flush-btrfs-5) RIP: 0010:btrfs_bio_end_io+0xae/0xc0 [btrfs] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: wr 2, rd 0, flush 0, corrupt 0, gen 0 RSP: 0018:ffffc9000006f248 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff888005a7f080 RCX: ffffc9000006f1dc RDX: 0000000000000000 RSI: 000000000000000a RDI: ffff888005a7f080 RBP: ffff888011dfc540 R08: 0000000000000000 R09: 0000000000000001 R10: ffffffff82e508e0 R11: 0000000000000005 R12: ffff88800ddfbe58 R13: ffff888005a7f080 R14: ffff888005a7f158 R15: ffff888005a7f158 FS: 0000000000000000(0000) GS:ffff88803ea80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000020 CR3: 0000000002e22006 CR4: 0000000000370ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? __die_body.cold+0x19/0x26 ? page_fault_oops+0x13e/0x2b0 ? _printk+0x58/0x73 ? do_user_addr_fault+0x5f/0x750 ? exc_page_fault+0x76/0x240 ? asm_exc_page_fault+0x22/0x30 ? 
btrfs_bio_end_io+0xae/0xc0 [btrfs] ? btrfs_log_dev_io_error+0x7f/0x90 [btrfs] btrfs_orig_write_end_io+0x51/0x90 [btrfs] dm_submit_bio+0x5c2/0xa50 [dm_mod] ? find_held_lock+0x2b/0x80 ? blk_try_enter_queue+0x90/0x1e0 __submit_bio+0xe0/0x130 ? ktime_get+0x10a/0x160 ? lockdep_hardirqs_on+0x74/0x100 submit_bio_noacct_nocheck+0x199/0x410 btrfs_submit_bio+0x7d/0x150 [btrfs] btrfs_submit_chunk+0x1a1/0x6d0 [btrfs] ? lockdep_hardirqs_on+0x74/0x100 ? __folio_start_writeback+0x10/0x2c0 btrfs_submit_bbio+0x1c/0x40 [btrfs] submit_one_bio+0x44/0x60 [btrfs] submit_extent_folio+0x13f/0x330 [btrfs] ? btrfs_set_range_writeback+0xa3/0xd0 [btrfs] extent_writepage_io+0x18b/0x360 [btrfs] extent_write_locked_range+0x17c/0x340 [btrfs] ? __pfx_end_bbio_data_write+0x10/0x10 [btrfs] run_delalloc_cow+0x71/0xd0 [btrfs] btrfs_run_delalloc_range+0x176/0x500 [btrfs] ? find_lock_delalloc_range+0x119/0x260 [btrfs] writepage_delalloc+0x2ab/0x480 [btrfs] extent_write_cache_pages+0x236/0x7d0 [btrfs] btrfs_writepages+0x72/0x130 [btrfs] do_writepages+0xd4/0x240 ? find_held_lock+0x2b/0x80 ? wbc_attach_and_unlock_inode+0x12c/0x290 ? wbc_attach_and_unlock_inode+0x12c/0x290 __writeback_single_inode+0x5c/0x4c0 ? do_raw_spin_unlock+0x49/0xb0 writeback_sb_inodes+0x22c/0x560 __writeback_inodes_wb+0x4c/0xe0 wb_writeback+0x1d6/0x3f0 wb_workfn+0x334/0x520 process_one_work+0x1ee/0x570 ? lock_is_held_type+0xc6/0x130 worker_thread+0x1d1/0x3b0 ? __pfx_worker_thread+0x10/0x10 kthread+0xee/0x120 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x30/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Modules linked in: dm_mod btrfs blake2b_generic xor raid6_pq rapl CR2: 0000000000000020 * Case 2. Earlier completion of orig_bbio for mirrored btrfs_bios btrfs_bbio_propagate_error() assumes the end_io function for orig_bbio is called last among split bios. In that case, btrfs_orig_write_end_io() sets the bio->bi_status to BLK_STS_IOERR by seeing the bioc->error [2]. Otherwise, the increased orig_bio's bioc->error is not checked by anyone and BLK_STS_OK is returned to the upper layer. [2] Actually, this is not true. Because we only increase orig_bioc->errors by max_errors, the condition "atomic_read(&bioc->error) > bioc->max_errors" is still not met if only one split btrfs_bio fails. * Case 3. Later completion of orig_bbio for un-mirrored btrfs_bios In contrast to the above case, btrfs_bbio_propagate_error() does not work well if the un-mirrored orig_bbio is completed last. It sets orig_bbio->bio.bi_status to the btrfs_bio's error. But that is easily overwritten by orig_bbio's completion status. If the status is BLK_STS_OK, the upper layer would not know about the failure. * Solution Considering the above cases, we can only save the error status in the orig_bbio (the remaining part after the split) itself as it is always available. Also, the saved error status should be propagated when all the split btrfs_bios are finished (i.e., bbio->pending_ios == 0). This commit introduces "status" to btrfs_bbio and saves the first error of the split bios to the original btrfs_bio's "status" variable. When all the split bios are finished, the saved status is loaded into the original btrfs_bio's status. With this commit, btrfs/146 on zoned devices does not hit the NULL pointer dereference anymore.
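The heart of the fix is a lock-free "first error wins" update. A simplified sketch of the resulting completion path (the clone-bioset redirection and cleanup are elided; see the diff below for the full version):

    void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
    {
            /*
             * cmpxchg() only succeeds while bbio->status is still
             * BLK_STS_OK, so the first failing split bio wins and
             * later errors cannot overwrite it.
             */
            if (status != BLK_STS_OK)
                    cmpxchg(&bbio->status, BLK_STS_OK, status);

            /* The last completion publishes the saved status. */
            if (atomic_dec_and_test(&bbio->pending_ios)) {
                    if (status == BLK_STS_OK)
                            bbio->bio.bi_status = READ_ONCE(bbio->status);
                    __btrfs_bio_end_io(bbio);
            }
    }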
Fixes: 852eee62d31a ("btrfs: allow btrfs_submit_bio to split bios") CC: stable@vger.kernel.org # 6.6+ Reviewed-by: Qu Wenruo Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/bio.c | 37 +++++++++++++------------------------ fs/btrfs/bio.h | 3 +++ 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 5d3f8bd406d9f1..1f216d07eff65c 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -49,6 +49,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, bbio->end_io = end_io; bbio->private = private; atomic_set(&bbio->pending_ios, 1); + WRITE_ONCE(bbio->status, BLK_STS_OK); } /* @@ -113,41 +114,29 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio) } } -static void btrfs_orig_write_end_io(struct bio *bio); - -static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, - struct btrfs_bio *orig_bbio) -{ - /* - * For writes we tolerate nr_mirrors - 1 write failures, so we can't - * just blindly propagate a write failure here. Instead increment the - * error count in the original I/O context so that it is guaranteed to - * be larger than the error tolerance. - */ - if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { - struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; - struct btrfs_io_context *orig_bioc = orig_stripe->bioc; - - atomic_add(orig_bioc->max_errors, &orig_bioc->error); - } else { - orig_bbio->bio.bi_status = bbio->bio.bi_status; - } -} - void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { bbio->bio.bi_status = status; if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private; - if (bbio->bio.bi_status) - btrfs_bbio_propagate_error(bbio, orig_bbio); btrfs_cleanup_bio(bbio); bbio = orig_bbio; } - if (atomic_dec_and_test(&bbio->pending_ios)) + /* + * At this point, bbio always points to the original btrfs_bio. Save + * the first error in it. + */ + if (status != BLK_STS_OK) + cmpxchg(&bbio->status, BLK_STS_OK, status); + + if (atomic_dec_and_test(&bbio->pending_ios)) { + /* Load split bio's error which might be set above. */ + if (status == BLK_STS_OK) + bbio->bio.bi_status = READ_ONCE(bbio->status); __btrfs_bio_end_io(bbio); + } } static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index e4861234074588..e2fe16074ad655 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -79,6 +79,9 @@ struct btrfs_bio { /* File system that this I/O operates on. */ struct btrfs_fs_info *fs_info; + /* Save the first error status of split bio. */ + blk_status_t status; + /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. From b54ce3d7dbb386405f8cdbcf416497f1a2be064a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 14 Oct 2024 16:14:18 +0100 Subject: [PATCH 62/80] btrfs: clear force-compress on remount when compress mount option is given After the migration to use fs context for processing mount options we had a slight change in the semantics for remounting a filesystem that was mounted with compress-force. Before we could clear compress-force by passing only "-o compress[=algo]" during a remount, but after that change that does not work anymore, force-compress is still present and one needs to pass "-o compress-force=no,compress[=algo]" to the mount command. 
For example, when running on kernel 6.8+: $ mount -o compress-force=zlib:9 /dev/sdi /mnt/sdi $ mount | grep sdi /dev/sdi on /mnt/sdi type btrfs (rw,relatime,compress-force=zlib:9,discard=async,space_cache=v2,subvolid=5,subvol=/) $ mount -o remount,compress=zlib:5 /mnt/sdi $ mount | grep sdi /dev/sdi on /mnt/sdi type btrfs (rw,relatime,compress-force=zlib:5,discard=async,space_cache=v2,subvolid=5,subvol=/) On a 6.7 kernel (or older): $ mount -o compress-force=zlib:9 /dev/sdi /mnt/sdi $ mount | grep sdi /dev/sdi on /mnt/sdi type btrfs (rw,relatime,compress-force=zlib:9,discard=async,space_cache=v2,subvolid=5,subvol=/) $ mount -o remount,compress=zlib:5 /mnt/sdi $ mount | grep sdi /dev/sdi on /mnt/sdi type btrfs (rw,relatime,compress=zlib:5,discard=async,space_cache=v2,subvolid=5,subvol=/) So update btrfs_parse_param() to clear "compress-force" when "compress" is given, providing the same semantics as kernel 6.7 and older. Reported-by: Roman Mamedov Link: https://lore.kernel.org/linux-btrfs/20241014182416.13d0f8b0@nvm/ CC: stable@vger.kernel.org # 6.8+ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a0b463d17c2971..ae68abe7bb7d62 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -340,6 +340,15 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) fallthrough; case Opt_compress: case Opt_compress_type: + /* + * Provide the same semantics as older kernels that don't use fs + * context, specifying the "compress" option clears + * "force-compress" without the need to pass + * "compress-force=[no|none]" before specifying "compress". + */ + if (opt != Opt_compress_force && opt != Opt_compress_force_type) + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + if (opt == Opt_compress || opt == Opt_compress_force) { ctx->compress_type = BTRFS_COMPRESS_ZLIB; ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
From b18732bcfa7a386fa70633f7dc931103ed210b7a Mon Sep 17 00:00:00 2001 From: Robbie Ko Date: Tue, 15 Oct 2024 15:41:37 +0800 Subject: [PATCH 63/80] btrfs: reduce lock contention when eb cache miss for btree search When traversing the btree, if an extent buffer (eb) cache miss occurs, we switch to using the eb read lock and release all previous locks (including the parent lock) to reduce lock contention. If an eb cache miss occurs on a leaf and IO is needed, before this change we released locks only from level 2 and up and read the leaf's content from disk while holding a lock on its parent (level 1), causing unnecessary lock contention on the parent. After this change we release locks from level 1 and up, but we lock level 0 (the leaf itself) and read the leaf's content from disk. Because we have prepared the check parameters and hold the eb's read lock, we can ensure that no race occurs during the check that would cause unexpected errors.
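In essence, the miss path now takes the child's read lock and drops everything above it before doing any blocking read. A simplified sketch of the ordering, using the functions from the diff below (full error handling omitted):

    btrfs_unlock_up_safe(p, level + 1);          /* drop the parent lock and above */
    btrfs_tree_read_lock(tmp);                   /* hold only the child eb's read lock */
    btrfs_release_path(p);                       /* the path no longer pins any locks */
    err = btrfs_read_extent_buffer(tmp, &check); /* blocking IO with the parent unlocked */

Because the verification parameters in 'check' (owner root, level, first key, generation) were captured while the parent was still locked, the post-IO check cannot race with a concurrent modification even though the parent lock has been released.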
Reviewed-by: Filipe Manana Signed-off-by: Robbie Ko Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 101 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 70 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index b11ec86102e37b..4f34208126f785 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1515,12 +1515,14 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, struct btrfs_tree_parent_check check = { 0 }; u64 blocknr; u64 gen; - struct extent_buffer *tmp; - int ret; + struct extent_buffer *tmp = NULL; + int ret = 0; int parent_level; - bool unlock_up; + int err; + bool read_tmp = false; + bool tmp_locked = false; + bool path_released = false; - unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]); blocknr = btrfs_node_blockptr(*eb_ret, slot); gen = btrfs_node_ptr_generation(*eb_ret, slot); parent_level = btrfs_header_level(*eb_ret); @@ -1551,68 +1553,105 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, */ if (btrfs_verify_level_key(tmp, parent_level - 1, &check.first_key, gen)) { - free_extent_buffer(tmp); - return -EUCLEAN; + ret = -EUCLEAN; + goto out; } *eb_ret = tmp; - return 0; + tmp = NULL; + ret = 0; + goto out; } if (p->nowait) { - free_extent_buffer(tmp); - return -EAGAIN; + ret = -EAGAIN; + goto out; } - if (unlock_up) + if (!p->skip_locking) { btrfs_unlock_up_safe(p, level + 1); - - /* now we're allowed to do a blocking uptodate check */ - ret = btrfs_read_extent_buffer(tmp, &check); - if (ret) { - free_extent_buffer(tmp); + tmp_locked = true; + btrfs_tree_read_lock(tmp); btrfs_release_path(p); - return ret; + ret = -EAGAIN; + path_released = true; } - if (unlock_up) - ret = -EAGAIN; + /* Now we're allowed to do a blocking uptodate check. */ + err = btrfs_read_extent_buffer(tmp, &check); + if (err) { + ret = err; + goto out; + } + if (ret == 0) { + ASSERT(!tmp_locked); + *eb_ret = tmp; + tmp = NULL; + } goto out; } else if (p->nowait) { - return -EAGAIN; + ret = -EAGAIN; + goto out; } - if (unlock_up) { + if (!p->skip_locking) { btrfs_unlock_up_safe(p, level + 1); ret = -EAGAIN; - } else { - ret = 0; } if (p->reada != READA_NONE) reada_for_search(fs_info, p, level, slot, key->objectid); - tmp = read_tree_block(fs_info, blocknr, &check); + tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level); if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); + tmp = NULL; + goto out; + } + read_tmp = true; + + if (!p->skip_locking) { + ASSERT(ret == -EAGAIN); + tmp_locked = true; + btrfs_tree_read_lock(tmp); btrfs_release_path(p); - return PTR_ERR(tmp); + path_released = true; + } + + /* Now we're allowed to do a blocking uptodate check. */ + err = btrfs_read_extent_buffer(tmp, &check); + if (err) { + ret = err; + goto out; } + /* * If the read above didn't mark this buffer up to date, * it will never end up being up to date. Set ret to EIO now * and give up so that our caller doesn't loop forever * on our EAGAINs. 
*/ - if (!extent_buffer_uptodate(tmp)) + if (!extent_buffer_uptodate(tmp)) { ret = -EIO; + goto out; + } -out: if (ret == 0) { + ASSERT(!tmp_locked); *eb_ret = tmp; - } else { - free_extent_buffer(tmp); - btrfs_release_path(p); + tmp = NULL; + } +out: + if (tmp) { + if (tmp_locked) + btrfs_tree_read_unlock(tmp); + if (read_tmp && ret && ret != -EAGAIN) + free_extent_buffer_stale(tmp); + else + free_extent_buffer(tmp); } + if (ret && !path_released) + btrfs_release_path(p); return ret; } @@ -2198,7 +2237,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, } err = read_block_for_search(root, p, &b, level, slot, key); - if (err == -EAGAIN) + if (err == -EAGAIN && !p->nowait) goto again; if (err) { ret = err; @@ -2325,7 +2364,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, } err = read_block_for_search(root, p, &b, level, slot, key); - if (err == -EAGAIN) + if (err == -EAGAIN && !p->nowait) goto again; if (err) { ret = err; From 7e1135cd8b008e27d73220e4802638a9164a0a68 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 6 Sep 2024 16:00:43 +0100 Subject: [PATCH 64/80] btrfs: add and use helper to remove extent map from its inode's tree Move the common code to remove an extent map from its inode's tree into a helper function and use it, reducing duplicated code. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 9e1a8af6629cff..aef8e91352f560 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -77,10 +77,13 @@ static u64 range_end(u64 start, u64 len) return start + len; } -static void dec_evictable_extent_maps(struct btrfs_inode *inode) +static void remove_em(struct btrfs_inode *inode, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; + rb_erase(&em->rb_node, &inode->extent_tree.root); + RB_CLEAR_NODE(&em->rb_node); + if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root))) percpu_counter_dec(&fs_info->evictable_extent_maps); } @@ -333,7 +336,6 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_map_tree *tree = &inode->extent_tree; struct extent_map *merge = NULL; struct rb_node *rb; @@ -365,10 +367,8 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) em->flags |= EXTENT_FLAG_MERGED; validate_extent_map(fs_info, em); - rb_erase(&merge->rb_node, &tree->root); - RB_CLEAR_NODE(&merge->rb_node); + remove_em(inode, merge); free_extent_map(merge); - dec_evictable_extent_maps(inode); } } @@ -380,12 +380,10 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) merge_ondisk_extents(em, merge); validate_extent_map(fs_info, em); - rb_erase(&merge->rb_node, &tree->root); - RB_CLEAR_NODE(&merge->rb_node); em->generation = max(em->generation, merge->generation); em->flags |= EXTENT_FLAG_MERGED; + remove_em(inode, merge); free_extent_map(merge); - dec_evictable_extent_maps(inode); } } @@ -582,12 +580,10 @@ void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) lockdep_assert_held_write(&tree->lock); WARN_ON(em->flags & EXTENT_FLAG_PINNED); - rb_erase(&em->rb_node, &tree->root); if 
(!(em->flags & EXTENT_FLAG_LOGGING))
 		list_del_init(&em->list);
-	RB_CLEAR_NODE(&em->rb_node);
-	dec_evictable_extent_maps(inode);
+	remove_em(inode, em);
 }
 
 static void replace_extent_mapping(struct btrfs_inode *inode,

From d4eefbc90a9ebad8d0959fbf33b212fec984ffcd Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Thu, 29 Aug 2024 15:23:32 +0100
Subject: [PATCH 65/80] btrfs: make the extent map shrinker run asynchronously as a work queue job

Currently the extent map shrinker is run synchronously for kswapd tasks that end up calling the fs shrinker (fs/super.c:super_cache_scan()). This has some disadvantages, and for some heavy workloads with memory pressure it can cause delays and stalls that make a machine unresponsive for periods of time. This happens because:

1) We can have several kswapd tasks on machines with multiple NUMA zones, and running the extent map shrinker concurrently can cause high contention on some spin locks, namely the spin locks that protect the radix tree that tracks roots, the per root xarray that tracks open inodes and the list of delayed iputs. This not only delays the shrinker but also causes high CPU consumption and makes the task running the shrinker monopolize a core, resulting in the symptoms of an unresponsive system. This was noted in previous commits such as commit ae1e766f623f ("btrfs: only run the extent map shrinker from kswapd tasks");

2) The extent map shrinker's iteration over inodes can often be slow, even after changing the data structure that tracks open inodes for a root from a red black tree (up to kernel 6.10) to an xarray (kernel 6.10+). While the transition to the xarray made things a bit faster, it is still somewhat slow - for example, in a test scenario with 10000 inodes that have no extent maps loaded, the extent map shrinker took between 5ms and 8ms on a release (non-debug) kernel. Iterating over the extent maps of an inode can also be slow if the inode has many thousands of extent maps, since we use a red black tree to track and search extent maps. So having the extent map shrinker run synchronously adds extra delay for other things a kswapd task does.

So make the extent map shrinker run asynchronously, as a job for the system unbounded workqueue, just like what we do for data and metadata space reclaim jobs.
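A minimal sketch of the queueing discipline described above, using C11 atomics in place of the kernel's atomic64 and workqueue APIs (names are illustrative, and the worker is invoked synchronously here for brevity):

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_long nr_to_scan_pending;

	/* Stand-in for queue_work(): just note that a run was queued. */
	static void queue_shrinker_work(void)
	{
		printf("shrinker work queued\n");
	}

	/* Called from the fs shrinker; may run concurrently in many tasks. */
	static void request_shrink(long nr_to_scan)
	{
		long expected = 0;

		/*
		 * Only arm the counter if no run is pending; concurrent
		 * requests carry the same nr_to_scan and would be redundant.
		 */
		if (!atomic_compare_exchange_strong(&nr_to_scan_pending,
						    &expected, nr_to_scan))
			return;

		queue_shrinker_work();
	}

	/* Run by the single worker; clears the counter when done. */
	static void shrinker_worker(void)
	{
		long nr = atomic_load(&nr_to_scan_pending);

		printf("scanning up to %ld extent maps\n", nr);
		atomic_store(&nr_to_scan_pending, 0);
	}

	int main(void)
	{
		request_shrink(128);
		request_shrink(128);	/* ignored: a run is already pending */
		shrinker_worker();
		return 0;
	}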
Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 ++ fs/btrfs/extent_map.c | 51 ++++++++++++++++++++++++++++++++++++------- fs/btrfs/extent_map.h | 3 ++- fs/btrfs/fs.h | 2 ++ fs/btrfs/super.c | 13 +++-------- 5 files changed, 52 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a4c8778dd65189..e72c62e2baf5a0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2785,6 +2785,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) btrfs_init_scrub(fs_info); btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(fs_info); + btrfs_init_extent_map_shrinker_work(fs_info); rwlock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT_CACHED; @@ -4292,6 +4293,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); + cancel_work_sync(&fs_info->extent_map_shrinker_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index aef8e91352f560..c1c1273bf57228 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1118,7 +1118,8 @@ struct btrfs_em_shrink_ctx { static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx) { - const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info); struct extent_map_tree *tree = &inode->extent_tree; long nr_dropped = 0; struct rb_node *node; @@ -1191,7 +1192,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c * lock. This is to avoid slowing other tasks trying to take the * lock. 
*/ - if (need_resched() || rwlock_needbreak(&tree->lock)) + if (need_resched() || rwlock_needbreak(&tree->lock) || + btrfs_fs_closing(fs_info)) break; node = next; } @@ -1215,7 +1217,8 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx ctx->last_ino = btrfs_ino(inode); btrfs_add_delayed_iput(inode); - if (ctx->scanned >= ctx->nr_to_scan) + if (ctx->scanned >= ctx->nr_to_scan || + btrfs_fs_closing(inode->root->fs_info)) break; cond_resched(); @@ -1244,16 +1247,19 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx return nr_dropped; } -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) +static void btrfs_extent_map_shrinker_worker(struct work_struct *work) { + struct btrfs_fs_info *fs_info; struct btrfs_em_shrink_ctx ctx; u64 start_root_id; u64 next_root_id; bool cycled = false; long nr_dropped = 0; + fs_info = container_of(work, struct btrfs_fs_info, extent_map_shrinker_work); + ctx.scanned = 0; - ctx.nr_to_scan = nr_to_scan; + ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan); /* * In case we have multiple tasks running this shrinker, make the next @@ -1271,12 +1277,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); - trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, + trace_btrfs_extent_map_shrinker_scan_enter(fs_info, ctx.nr_to_scan, nr, ctx.last_root, ctx.last_ino); } - while (ctx.scanned < ctx.nr_to_scan) { + while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) { struct btrfs_root *root; unsigned long count; @@ -1334,5 +1340,34 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) ctx.last_ino); } - return nr_dropped; + atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0); +} + +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) +{ + /* + * Do nothing if the shrinker is already running. In case of high memory + * pressure we can have a lot of tasks calling us and all passing the + * same nr_to_scan value, but in reality we may need only to free + * nr_to_scan extent maps (or less). In case we need to free more than + * that, we will be called again by the fs shrinker, so no worries about + * not doing enough work to reclaim memory from extent maps. + * We can also be repeatedly called with the same nr_to_scan value + * simply because the shrinker runs asynchronously and multiple calls + * to this function are made before the shrinker does enough progress. + * + * That's why we set the atomic counter to nr_to_scan only if its + * current value is zero, instead of incrementing the counter by + * nr_to_scan. 
+ */ + if (atomic64_cmpxchg(&fs_info->extent_map_shrinker_nr_to_scan, 0, nr_to_scan) != 0) + return; + + queue_work(system_unbound_wq, &fs_info->extent_map_shrinker_work); +} + +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info) +{ + atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0); + INIT_WORK(&fs_info->extent_map_shrinker_work, btrfs_extent_map_shrinker_worker); } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 5154a8f1d26c94..cd123b266b6416 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -189,6 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, int btrfs_replace_extent_map_range(struct btrfs_inode *inode, struct extent_map *new_em, bool modified); -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan); +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan); +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 85e6a644aff251..678ce5259332bb 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -638,6 +638,8 @@ struct btrfs_fs_info { spinlock_t extent_map_shrinker_lock; u64 extent_map_shrinker_last_root; u64 extent_map_shrinker_last_ino; + atomic64_t extent_map_shrinker_nr_to_scan; + struct work_struct extent_map_shrinker_work; /* Protected by 'trans_lock'. */ struct list_head dirty_cowonly_roots; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ae68abe7bb7d62..f678be84b54159 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -28,7 +28,6 @@ #include #include #include -#include #include "messages.h" #include "delayed-inode.h" #include "ctree.h" @@ -2424,16 +2423,10 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan); struct btrfs_fs_info *fs_info = btrfs_sb(sb); - /* - * We may be called from any task trying to allocate memory and we don't - * want to slow it down with scanning and dropping extent maps. It would - * also cause heavy lock contention if many tasks concurrently enter - * here. Therefore only allow kswapd tasks to scan and drop extent maps. - */ - if (!current_is_kswapd()) - return 0; + btrfs_free_extent_maps(fs_info, nr_to_scan); - return btrfs_free_extent_maps(fs_info, nr_to_scan); + /* The extent map shrinker runs asynchronously, so always return 0. */ + return 0; } static const struct super_operations btrfs_super_ops = { From ad6f27ec83b4057574c71711790b9eb009e42d66 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 4 Sep 2024 17:03:43 +0100 Subject: [PATCH 66/80] btrfs: simplify tracking progress for the extent map shrinker Now that the extent map shrinker can only be run by a single task (as a work queue item) there is no need to keep the progress of the shrinker protected by a spinlock and passing the progress to trace events as parameters. So remove the lock and simplify the arguments for the trace events. 
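The reasoning can be shown with a minimal sketch (illustrative names, not kernel code): once a single, never-concurrent worker is the only writer, plain fields are safe without a lock.

	#include <stdio.h>

	/*
	 * Progress cursors, touched only by the single worker below, so they
	 * need no locking (this mirrors the last_root/last_ino fields).
	 */
	static unsigned long long last_root;
	static unsigned long long last_ino;

	/*
	 * The only function that ever touches the cursors. In the kernel
	 * this is a work item that is queued at most once at a time.
	 */
	static void shrinker_worker(void)
	{
		last_ino += 10;		/* pretend we scanned ten inodes */
		if (last_ino > 100) {
			last_root++;	/* move on to the next root */
			last_ino = 0;
		}
		printf("progress: root=%llu ino=%llu\n", last_root, last_ino);
	}

	int main(void)
	{
		for (int i = 0; i < 3; i++)
			shrinker_worker();	/* always serialized */
		return 0;
	}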
Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 -- fs/btrfs/extent_map.c | 55 ++++++++---------------------------- fs/btrfs/fs.h | 1 - include/trace/events/btrfs.h | 21 +++++++------- 4 files changed, 22 insertions(+), 57 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e72c62e2baf5a0..e524a0bedd164f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2852,8 +2852,6 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (ret) return ret; - spin_lock_init(&fs_info->extent_map_shrinker_lock); - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) return ret; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index c1c1273bf57228..767f0804f50468 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1112,8 +1112,6 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, struct btrfs_em_shrink_ctx { long nr_to_scan; long scanned; - u64 last_ino; - u64 last_root; }; static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx) @@ -1205,16 +1203,17 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_inode *inode; long nr_dropped = 0; - u64 min_ino = ctx->last_ino + 1; + u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1; inode = btrfs_find_first_inode(root, min_ino); while (inode) { nr_dropped += btrfs_scan_inode(inode, ctx); min_ino = btrfs_ino(inode) + 1; - ctx->last_ino = btrfs_ino(inode); + fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode); btrfs_add_delayed_iput(inode); if (ctx->scanned >= ctx->nr_to_scan || @@ -1234,14 +1233,14 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx * inode if there is one or we will find out this was the last * one and move to the next root. */ - ctx->last_root = btrfs_root_id(root); + fs_info->extent_map_shrinker_last_root = btrfs_root_id(root); } else { /* * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so * that when processing the next root we start from its first inode. */ - ctx->last_ino = 0; - ctx->last_root = btrfs_root_id(root) + 1; + fs_info->extent_map_shrinker_last_ino = 0; + fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1; } return nr_dropped; @@ -1261,25 +1260,13 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work) ctx.scanned = 0; ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan); - /* - * In case we have multiple tasks running this shrinker, make the next - * one start from the next inode in case it starts before we finish. 
- */ - spin_lock(&fs_info->extent_map_shrinker_lock); - ctx.last_ino = fs_info->extent_map_shrinker_last_ino; - fs_info->extent_map_shrinker_last_ino++; - ctx.last_root = fs_info->extent_map_shrinker_last_root; - spin_unlock(&fs_info->extent_map_shrinker_lock); - - start_root_id = ctx.last_root; - next_root_id = ctx.last_root; + start_root_id = fs_info->extent_map_shrinker_last_root; + next_root_id = fs_info->extent_map_shrinker_last_root; if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); - trace_btrfs_extent_map_shrinker_scan_enter(fs_info, ctx.nr_to_scan, - nr, ctx.last_root, - ctx.last_ino); + trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr); } while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) { @@ -1296,8 +1283,8 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work) spin_unlock(&fs_info->fs_roots_radix_lock); if (start_root_id > 0 && !cycled) { next_root_id = 0; - ctx.last_root = 0; - ctx.last_ino = 0; + fs_info->extent_map_shrinker_last_root = 0; + fs_info->extent_map_shrinker_last_ino = 0; cycled = true; continue; } @@ -1316,28 +1303,10 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work) btrfs_put_root(root); } - /* - * In case of multiple tasks running this extent map shrinking code this - * isn't perfect but it's simple and silences things like KCSAN. It's - * not possible to know which task made more progress because we can - * cycle back to the first root and first inode if it's not the first - * time the shrinker ran, see the above logic. Also a task that started - * later may finish earlier than another task and made less progress. So - * make this simple and update to the progress of the last task that - * finished, with the occasional possibility of having two consecutive - * runs of the shrinker process the same inodes. 
- */ - spin_lock(&fs_info->extent_map_shrinker_lock); - fs_info->extent_map_shrinker_last_ino = ctx.last_ino; - fs_info->extent_map_shrinker_last_root = ctx.last_root; - spin_unlock(&fs_info->extent_map_shrinker_lock); - if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); - trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, - nr, ctx.last_root, - ctx.last_ino); + trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr); } atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 678ce5259332bb..827af2b027df01 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -635,7 +635,6 @@ struct btrfs_fs_info { s32 delalloc_batch; struct percpu_counter evictable_extent_maps; - spinlock_t extent_map_shrinker_lock; u64 extent_map_shrinker_last_root; u64 extent_map_shrinker_last_ino; atomic64_t extent_map_shrinker_nr_to_scan; diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index bd515415ea8b81..879016b4e391cd 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -2555,10 +2555,9 @@ TRACE_EVENT(btrfs_extent_map_shrinker_count, TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, - TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr, - u64 last_root_id, u64 last_ino), + TP_PROTO(const struct btrfs_fs_info *fs_info, long nr), - TP_ARGS(fs_info, nr_to_scan, nr, last_root_id, last_ino), + TP_ARGS(fs_info, nr), TP_STRUCT__entry_btrfs( __field( long, nr_to_scan ) @@ -2568,10 +2567,11 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, ), TP_fast_assign_btrfs(fs_info, - __entry->nr_to_scan = nr_to_scan; + __entry->nr_to_scan = \ + atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan); __entry->nr = nr; - __entry->last_root_id = last_root_id; - __entry->last_ino = last_ino; + __entry->last_root_id = fs_info->extent_map_shrinker_last_root; + __entry->last_ino = fs_info->extent_map_shrinker_last_ino; ), TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", @@ -2581,10 +2581,9 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit, - TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr, - u64 last_root_id, u64 last_ino), + TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr), - TP_ARGS(fs_info, nr_dropped, nr, last_root_id, last_ino), + TP_ARGS(fs_info, nr_dropped, nr), TP_STRUCT__entry_btrfs( __field( long, nr_dropped ) @@ -2596,8 +2595,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit, TP_fast_assign_btrfs(fs_info, __entry->nr_dropped = nr_dropped; __entry->nr = nr; - __entry->last_root_id = last_root_id; - __entry->last_ino = last_ino; + __entry->last_root_id = fs_info->extent_map_shrinker_last_root; + __entry->last_ino = fs_info->extent_map_shrinker_last_ino; ), TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", From 94a09da2fe961570edbfd49f9abc266b615b8df7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 5 Sep 2024 11:31:49 +0100 Subject: [PATCH 67/80] btrfs: rename extent map shrinker members from struct btrfs_fs_info The names for the members of struct btrfs_fs_info related to the extent map shrinker are a bit too long, so rename them to be shorter by replacing the "extent_map_" prefix with the "em_" prefix. 
Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_map.c | 32 ++++++++++++++++---------------- fs/btrfs/fs.h | 8 ++++---- include/trace/events/btrfs.h | 10 +++++----- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e524a0bedd164f..5f4abdb09d6f7d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4291,7 +4291,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); - cancel_work_sync(&fs_info->extent_map_shrinker_work); + cancel_work_sync(&fs_info->em_shrinker_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 767f0804f50468..d58d6fc40da1fc 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1206,14 +1206,14 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_inode *inode; long nr_dropped = 0; - u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1; + u64 min_ino = fs_info->em_shrinker_last_ino + 1; inode = btrfs_find_first_inode(root, min_ino); while (inode) { nr_dropped += btrfs_scan_inode(inode, ctx); min_ino = btrfs_ino(inode) + 1; - fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode); + fs_info->em_shrinker_last_ino = btrfs_ino(inode); btrfs_add_delayed_iput(inode); if (ctx->scanned >= ctx->nr_to_scan || @@ -1233,14 +1233,14 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx * inode if there is one or we will find out this was the last * one and move to the next root. */ - fs_info->extent_map_shrinker_last_root = btrfs_root_id(root); + fs_info->em_shrinker_last_root = btrfs_root_id(root); } else { /* * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so * that when processing the next root we start from its first inode. 
*/ - fs_info->extent_map_shrinker_last_ino = 0; - fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1; + fs_info->em_shrinker_last_ino = 0; + fs_info->em_shrinker_last_root = btrfs_root_id(root) + 1; } return nr_dropped; @@ -1255,13 +1255,13 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work) bool cycled = false; long nr_dropped = 0; - fs_info = container_of(work, struct btrfs_fs_info, extent_map_shrinker_work); + fs_info = container_of(work, struct btrfs_fs_info, em_shrinker_work); ctx.scanned = 0; - ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan); + ctx.nr_to_scan = atomic64_read(&fs_info->em_shrinker_nr_to_scan); - start_root_id = fs_info->extent_map_shrinker_last_root; - next_root_id = fs_info->extent_map_shrinker_last_root; + start_root_id = fs_info->em_shrinker_last_root; + next_root_id = fs_info->em_shrinker_last_root; if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); @@ -1283,8 +1283,8 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work) spin_unlock(&fs_info->fs_roots_radix_lock); if (start_root_id > 0 && !cycled) { next_root_id = 0; - fs_info->extent_map_shrinker_last_root = 0; - fs_info->extent_map_shrinker_last_ino = 0; + fs_info->em_shrinker_last_root = 0; + fs_info->em_shrinker_last_ino = 0; cycled = true; continue; } @@ -1309,7 +1309,7 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work) trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr); } - atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0); + atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0); } void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) @@ -1329,14 +1329,14 @@ void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) * current value is zero, instead of incrementing the counter by * nr_to_scan. */ - if (atomic64_cmpxchg(&fs_info->extent_map_shrinker_nr_to_scan, 0, nr_to_scan) != 0) + if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0) return; - queue_work(system_unbound_wq, &fs_info->extent_map_shrinker_work); + queue_work(system_unbound_wq, &fs_info->em_shrinker_work); } void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info) { - atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0); - INIT_WORK(&fs_info->extent_map_shrinker_work, btrfs_extent_map_shrinker_worker); + atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0); + INIT_WORK(&fs_info->em_shrinker_work, btrfs_extent_map_shrinker_worker); } diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 827af2b027df01..79a1a3d6f04d18 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -635,10 +635,10 @@ struct btrfs_fs_info { s32 delalloc_batch; struct percpu_counter evictable_extent_maps; - u64 extent_map_shrinker_last_root; - u64 extent_map_shrinker_last_ino; - atomic64_t extent_map_shrinker_nr_to_scan; - struct work_struct extent_map_shrinker_work; + u64 em_shrinker_last_root; + u64 em_shrinker_last_ino; + atomic64_t em_shrinker_nr_to_scan; + struct work_struct em_shrinker_work; /* Protected by 'trans_lock'. 
*/ struct list_head dirty_cowonly_roots; diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 879016b4e391cd..4df93ca9b7a8d4 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -2568,10 +2568,10 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, TP_fast_assign_btrfs(fs_info, __entry->nr_to_scan = \ - atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan); + atomic64_read(&fs_info->em_shrinker_nr_to_scan); __entry->nr = nr; - __entry->last_root_id = fs_info->extent_map_shrinker_last_root; - __entry->last_ino = fs_info->extent_map_shrinker_last_ino; + __entry->last_root_id = fs_info->em_shrinker_last_root; + __entry->last_ino = fs_info->em_shrinker_last_ino; ), TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", @@ -2595,8 +2595,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit, TP_fast_assign_btrfs(fs_info, __entry->nr_dropped = nr_dropped; __entry->nr = nr; - __entry->last_root_id = fs_info->extent_map_shrinker_last_root; - __entry->last_ino = fs_info->extent_map_shrinker_last_ino; + __entry->last_root_id = fs_info->em_shrinker_last_root; + __entry->last_ino = fs_info->em_shrinker_last_ino; ), TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", From 5a6cadaa8143339a10e608e2755daf7234811f9d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 17 Sep 2024 12:03:11 +0100 Subject: [PATCH 68/80] btrfs: re-enable the extent map shrinker Now that the extent map shrinker can only be run by a single task and runs asynchronously as a work queue job, enable it as it can no longer cause stalls on tasks allocating memory and entering the extent map shrinker through the fs shrinker (implemented by btrfs_free_cached_objects()). This is crucial to prevent exhaustion of memory due to unbounded extent map creation, primarily with direct IO but also for buffered IO on files with holes. This problem, for the direct IO case, was first reported in the Link tag below. That report was added to a Link tag of the first patch that introduced the extent map shrinker, commit 956a17d9d050 ("btrfs: add a shrinker for extent maps"), however the Link tag disappeared somehow from the committed patch (but was included in the submitted patch to the mailing list), so adding it below for future reference. Link: https://lore.kernel.org/linux-btrfs/13f94633dcf04d29aaf1f0a43d42c55e@amazon.com/ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/super.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f678be84b54159..fa25db4aacf996 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2409,13 +2409,7 @@ static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_contro trace_btrfs_extent_map_shrinker_count(fs_info, nr); - /* - * Only report the real number for EXPERIMENTAL builds, as there are - * reports of serious performance degradation caused by too frequent shrinks. 
- */ - if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) - return nr; - return 0; + return nr; } static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc) From 0f9677adfc42495bebc7d0b57edf6f53817d3650 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 16 Oct 2024 10:45:47 +0100 Subject: [PATCH 69/80] btrfs: remove redundant level argument from read_block_for_search() The level parameter passed to read_block_for_search() always matches the level of the extent buffer passed in the "eb_ret" parameter, which we are also extracting into the "parent_level" local variable. So remove the level parameter and instead use the "parent_level" variable which in fact has a better name (it's the level of the parent node from which we are reading a child node/leaf). Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4f34208126f785..428c5650559acc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1508,7 +1508,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level, */ static int read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, - struct extent_buffer **eb_ret, int level, int slot, + struct extent_buffer **eb_ret, int slot, const struct btrfs_key *key) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -1542,7 +1542,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { if (p->reada == READA_FORWARD_ALWAYS) - reada_for_search(fs_info, p, level, slot, key->objectid); + reada_for_search(fs_info, p, parent_level, slot, key->objectid); /* first we do an atomic uptodate check */ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { @@ -1568,7 +1568,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, } if (!p->skip_locking) { - btrfs_unlock_up_safe(p, level + 1); + btrfs_unlock_up_safe(p, parent_level + 1); tmp_locked = true; btrfs_tree_read_lock(tmp); btrfs_release_path(p); @@ -1595,12 +1595,12 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, } if (!p->skip_locking) { - btrfs_unlock_up_safe(p, level + 1); + btrfs_unlock_up_safe(p, parent_level + 1); ret = -EAGAIN; } if (p->reada != READA_NONE) - reada_for_search(fs_info, p, level, slot, key->objectid); + reada_for_search(fs_info, p, parent_level, slot, key->objectid); tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level); if (IS_ERR(tmp)) { @@ -2236,7 +2236,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, goto done; } - err = read_block_for_search(root, p, &b, level, slot, key); + err = read_block_for_search(root, p, &b, slot, key); if (err == -EAGAIN && !p->nowait) goto again; if (err) { @@ -2363,7 +2363,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, goto done; } - err = read_block_for_search(root, p, &b, level, slot, key); + err = read_block_for_search(root, p, &b, slot, key); if (err == -EAGAIN && !p->nowait) goto again; if (err) { @@ -4969,8 +4969,7 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, } next = c; - ret = read_block_for_search(root, path, &next, level, - slot, &key); + ret = read_block_for_search(root, path, &next, slot, &key); if (ret == -EAGAIN && !path->nowait) goto again; @@ -5013,8 +5012,7 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct 
btrfs_path *path, if (!level) break; - ret = read_block_for_search(root, path, &next, level, - 0, &key); + ret = read_block_for_search(root, path, &next, 0, &key); if (ret == -EAGAIN && !path->nowait) goto again; From 7f066ce418766f6c71f54fb1521e04bb55147ee5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 16 Oct 2024 10:57:48 +0100 Subject: [PATCH 70/80] btrfs: simplify arguments for btrfs_verify_level_key() The only caller of btrfs_verify_level_key() is read_block_for_search() and it's passing 3 arguments to it that can be extracted from its on stack variable of type struct btrfs_tree_parent_check. So change btrfs_verify_level_key() to accept an argument of type struct btrfs_tree_parent_check instead of level, first key and parent transid arguments. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 3 +-- fs/btrfs/tree-checker.c | 16 ++++++++-------- fs/btrfs/tree-checker.h | 4 ++-- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 428c5650559acc..f68a9b586079b9 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1551,8 +1551,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, * being cached, read from scrub, or have multiple * parents (shared tree blocks). */ - if (btrfs_verify_level_key(tmp, - parent_level - 1, &check.first_key, gen)) { + if (btrfs_verify_level_key(tmp, &check)) { ret = -EUCLEAN; goto out; } diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 7b50263723bc1a..148d8cefa40eef 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -2183,8 +2183,8 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) return 0; } -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid) +int btrfs_verify_level_key(struct extent_buffer *eb, + const struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; int found_level; @@ -2192,16 +2192,16 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, int ret; found_level = btrfs_header_level(eb); - if (found_level != level) { + if (found_level != check->level) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: tree level check failed\n"); btrfs_err(fs_info, "tree level mismatch detected, bytenr=%llu level expected=%u has=%u", - eb->start, level, found_level); + eb->start, check->level, found_level); return -EIO; } - if (!first_key) + if (!check->has_first_key) return 0; /* @@ -2226,15 +2226,15 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, btrfs_node_key_to_cpu(eb, &found_key, 0); else btrfs_item_key_to_cpu(eb, &found_key, 0); - ret = btrfs_comp_cpu_keys(first_key, &found_key); + ret = btrfs_comp_cpu_keys(&check->first_key, &found_key); if (ret) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: tree first key check failed\n"); btrfs_err(fs_info, "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", - eb->start, parent_transid, first_key->objectid, - first_key->type, first_key->offset, + eb->start, check->transid, check->first_key.objectid, + check->first_key.type, check->first_key.offset, found_key.objectid, found_key.type, found_key.offset); } diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 01669cfa6578de..db67f96cbe4bc1 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -69,7 +69,7 @@ int btrfs_check_node(struct 
extent_buffer *node); int btrfs_check_chunk_valid(struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 logical); int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid); +int btrfs_verify_level_key(struct extent_buffer *eb, + const struct btrfs_tree_parent_check *check); #endif From 28cea0d122f06437ac42216c7d90e0d63e916208 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 16 Oct 2024 11:09:26 +0100 Subject: [PATCH 71/80] btrfs: remove redundant initializations for struct btrfs_tree_parent_check It's pointless to initialize the has_first_key field of the stack local btrfs_tree_parent_check structure at btrfs_tree_parent_check() and at btrfs_qgroup_trace_subtree() since all fields not explicitly initialized are zeroed out. In the case of the first function it's a bit odd because we are assigning 0 and the field is of type bool, however not incorrect since a 0 is converted to false. Just remove the explicit initializations due to their redundancy. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 1 - fs/btrfs/qgroup.c | 1 - 2 files changed, 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 09c0d18a7b5a16..0a0c84eb1c4270 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4285,7 +4285,6 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level) { struct btrfs_tree_parent_check check = { - .has_first_key = 0, .level = level, .transid = gen }; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 0fcf6e91616eec..c19436704eda8f 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2648,7 +2648,6 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, if (!extent_buffer_uptodate(root_eb)) { struct btrfs_tree_parent_check check = { - .has_first_key = false, .transid = root_gen, .level = root_level }; From 7e3a5cadbf82b37ced59a92cbf5d1967fe1a8ae6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 16 Oct 2024 11:13:24 +0100 Subject: [PATCH 72/80] btrfs: remove local generation variable from read_block_for_search() It's redundant to have the 'gen' variable since we already have the same value in the local btrfs_tree_parent_check structure. So remove it and instead use the structure's field. 
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f68a9b586079b9..148648ea1c8b95 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1514,7 +1514,6 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_tree_parent_check check = { 0 }; u64 blocknr; - u64 gen; struct extent_buffer *tmp = NULL; int ret = 0; int parent_level; @@ -1524,12 +1523,11 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, bool path_released = false; blocknr = btrfs_node_blockptr(*eb_ret, slot); - gen = btrfs_node_ptr_generation(*eb_ret, slot); parent_level = btrfs_header_level(*eb_ret); btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot); check.has_first_key = true; check.level = parent_level - 1; - check.transid = gen; + check.transid = btrfs_node_ptr_generation(*eb_ret, slot); check.owner_root = btrfs_root_id(root); /* @@ -1545,7 +1543,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, reada_for_search(fs_info, p, parent_level, slot, key->objectid); /* first we do an atomic uptodate check */ - if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { + if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) { /* * Do extra check for first_key, eb can be stale due to * being cached, read from scrub, or have multiple From 78a423df98afedc43b6a2249ea6174bbb7ecbcc0 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Tue, 15 Oct 2024 14:27:32 -0700 Subject: [PATCH 73/80] btrfs: do not clear read-only when adding sprout device If you follow the seed/sprout wiki, it suggests the following workflow: btrfstune -S 1 seed_dev mount seed_dev mnt btrfs device add sprout_dev mount -o remount,rw mnt The first mount mounts the FS readonly, which results in not setting BTRFS_FS_OPEN, and setting the readonly bit on the sb. The device add somewhat surprisingly clears the readonly bit on the sb (though the mount is still practically readonly, from the users perspective...). Finally, the remount checks the readonly bit on the sb against the flag and sees no change, so it does not run the code intended to run on ro->rw transitions, leaving BTRFS_FS_OPEN unset. As a result, when the cleaner_kthread runs, it sees no BTRFS_FS_OPEN and does no work. This results in leaking deleted snapshots until we run out of space. I propose fixing it at the first departure from what feels reasonable: when we clear the readonly bit on the sb during device add. A new fstest I have written reproduces the bug and confirms the fix. 
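The failure mode is easier to see in a minimal sketch (illustrative, not the VFS code): the ro->rw work keys off an observed transition, so clearing the read-only bit out of band makes the transition invisible to remount.

	#include <stdbool.h>
	#include <stdio.h>

	static bool sb_readonly = true;	/* mounted read-only (seed device) */
	static bool fs_open;		/* stand-in for BTRFS_FS_OPEN */

	/* Remount only performs the ro->rw work when it sees a transition. */
	static void remount(bool want_readonly)
	{
		if (sb_readonly && !want_readonly)
			fs_open = true;	/* ro -> rw: open the filesystem */
		sb_readonly = want_readonly;
	}

	int main(void)
	{
		/* Buggy flow: device add cleared the bit behind remount's back. */
		sb_readonly = false;	/* what btrfs_clear_sb_rdonly() did */
		remount(false);		/* sees "no change", skips ro->rw work */
		printf("fs_open=%d\n", fs_open); /* prints 0: cleaner never runs */
		return 0;
	}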
Reviewed-by: Qu Wenruo
Signed-off-by: Boris Burkov
Signed-off-by: David Sterba
---
 fs/btrfs/volumes.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dc9f54849f3901..84e861dcb35008 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2841,8 +2841,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE);
 
 	if (seeding_dev) {
-		btrfs_clear_sb_rdonly(sb);
-
 		/* GFP_KERNEL allocation must not be under device_list_mutex */
 		seed_devices = btrfs_init_sprout(fs_info);
 		if (IS_ERR(seed_devices)) {
@@ -2985,8 +2983,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	mutex_unlock(&fs_info->chunk_mutex);
 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 error_trans:
-	if (seeding_dev)
-		btrfs_set_sb_rdonly(sb);
 	if (trans)
 		btrfs_end_transaction(trans);
 error_free_zone:

From 8a40d126df6db42f5d224526e188716115ccfe01 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 10 Sep 2024 15:21:04 +0930
Subject: [PATCH 74/80] btrfs: qgroup: set a more sane default value for subtree drop threshold

Since commit 011b46c30476 ("btrfs: skip subtree scan if it's too high to avoid low stall in btrfs_commit_transaction()"), btrfs qgroup can automatically skip large subtree scans at the cost of marking the qgroup numbers inconsistent. It's designed to address the final performance problem of snapshot drop with qgroup enabled, but to be safe the default value is BTRFS_MAX_LEVEL, requiring a user space daemon to set a different value to make it work.

I'd say it's not a good idea to rely on a user space tool to set this value, especially when some operations (snapshot dropping) can be triggered immediately after mount, leaving a very small window in which to set that sysfs interface. So instead of disabling this new feature by default, enable it with a low threshold (3), so that a large subvolume tree drop at mount time won't cause a huge qgroup workload.
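A minimal sketch of the threshold check this default feeds, with illustrative names: subtrees rooted at or above the threshold level are not scanned, and the qgroup numbers are marked inconsistent instead.

	#include <stdbool.h>
	#include <stdio.h>

	#define DROP_SUBTREE_THRES_DEFAULT	3

	static bool qgroup_inconsistent;

	/* Decide whether to account a dropped subtree or skip the scan. */
	static void qgroup_trace_subtree(int root_level, int thres)
	{
		if (root_level >= thres) {
			/*
			 * Too high: scanning could stall the transaction
			 * commit, so skip it and flag the numbers instead.
			 */
			qgroup_inconsistent = true;
			printf("level %d: scan skipped\n", root_level);
			return;
		}
		printf("level %d: subtree scanned\n", root_level);
	}

	int main(void)
	{
		qgroup_trace_subtree(2, DROP_SUBTREE_THRES_DEFAULT); /* scanned */
		qgroup_trace_subtree(5, DROP_SUBTREE_THRES_DEFAULT); /* skipped */
		return 0;
	}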
CC: stable@vger.kernel.org # 6.1
Signed-off-by: Qu Wenruo
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 2 +-
 fs/btrfs/qgroup.c  | 2 +-
 fs/btrfs/qgroup.h  | 2 ++
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5f4abdb09d6f7d..bcf6e53bc2a74c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1958,7 +1958,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
 	fs_info->qgroup_seq = 1;
 	fs_info->qgroup_ulist = NULL;
 	fs_info->qgroup_rescan_running = false;
-	fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
+	fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT;
 	mutex_init(&fs_info->qgroup_rescan_lock);
 }
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index c19436704eda8f..a6f92836c9b113 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1406,7 +1406,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
 	fs_info->quota_root = NULL;
 	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
 	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
-	fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
+	fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT;
 	spin_unlock(&fs_info->qgroup_lock);
 
 	btrfs_free_qgroup_config(fs_info);
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 9fbd008749fa37..e233cc79af185a 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -121,6 +121,8 @@ struct btrfs_inode;
 #define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN	(1ULL << 63)
 #define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING	(1ULL << 62)
 
+#define BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT	(3)
+
 /*
  * Record a dirty extent, and info qgroup to update quota on it
  */

From cd9721f5806403360b68a886941cd28fecadd274 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Wed, 9 Oct 2024 09:37:03 +1030
Subject: [PATCH 75/80] btrfs: fix the delalloc range locking if sector size < page size

Inside lock_delalloc_folios(), there are several problems related to sector size < page size handling:

- Set the writer locks without checking if the folio is still valid
  We call btrfs_folio_start_writer_lock() just like it's folio_lock().
  But since the folio may not even be the folio of the current mapping,
  we can easily corrupt folio->private.

- The range is not clamped inside the page
  This means we can overwrite other bitmaps if the start/len is not
  properly handled, and trigger btrfs_subpage_assert().

- @processed_end is always rounded up to the page end
  If the delalloc range is not page aligned and we need to retry
  (returning -EAGAIN), then we will unlock up to the page end.
  Thankfully this is not a huge problem, as btrfs_folio_end_writer_lock()
  can now handle a range larger than the locked range, and only unlock
  what is already locked.

Fix all these problems by:

- Lock and check the folio first, then call btrfs_folio_set_writer_lock()
  So that if we got a folio not belonging to the inode, we won't touch
  folio->private.
- Properly truncate the range inside the page - Update @processed_end to the locked range end Fixes: 1e1de38792e0 ("btrfs: make process_one_page() to handle subpage locking") Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0a0c84eb1c4270..4cba6d78e63148 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -262,22 +262,23 @@ static noinline int lock_delalloc_folios(struct inode *inode, for (i = 0; i < found_folios; i++) { struct folio *folio = fbatch.folios[i]; - u32 len = end + 1 - start; + u64 range_start; + u32 range_len; if (folio == locked_folio) continue; - if (btrfs_folio_start_writer_lock(fs_info, folio, start, - len)) - goto out; - + folio_lock(folio); if (!folio_test_dirty(folio) || folio->mapping != mapping) { - btrfs_folio_end_writer_lock(fs_info, folio, start, - len); + folio_unlock(folio); goto out; } + range_start = max_t(u64, folio_pos(folio), start); + range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + end + 1) - range_start; + btrfs_folio_set_writer_lock(fs_info, folio, range_start, range_len); - processed_end = folio_pos(folio) + folio_size(folio) - 1; + processed_end = range_start + range_len - 1; } folio_batch_release(&fbatch); cond_resched(); From 3efe27c029ab05356ba27d593252c64fb359a198 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 9 Oct 2024 09:37:04 +1030 Subject: [PATCH 76/80] btrfs: remove unused btrfs_folio_start_writer_lock() This function is not really suitable to lock a folio, as it lacks the proper mapping checks, thus the locked folio may not even belong to btrfs. And due to the above reason, the last user inside lock_delalloc_folios() is already removed, and we can remove this function. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 47 ---------------------------------------------- fs/btrfs/subpage.h | 2 -- 2 files changed, 49 deletions(-) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 17bc53a8df0140..26d4d05dd165aa 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -295,26 +295,6 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) orig_start + orig_len) - *start; } -static void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); - const int nbits = (len >> fs_info->sectorsize_bits); - unsigned long flags; - int ret; - - btrfs_subpage_assert(fs_info, folio, start, len); - - spin_lock_irqsave(&subpage->lock, flags); - ASSERT(atomic_read(&subpage->readers) == 0); - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); - bitmap_set(subpage->bitmaps, start_bit, nbits); - ret = atomic_add_return(nbits, &subpage->writers); - ASSERT(ret == nbits); - spin_unlock_irqrestore(&subpage->lock, flags); -} - static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { @@ -351,33 +331,6 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf return last; } -/* - * Lock a folio for delalloc page writeback. - * - * Return -EAGAIN if the page is not properly initialized. - * Return 0 with the page locked, and writer counter updated. 
- * - * Even with 0 returned, the page still need extra check to make sure - * it's really the correct page, as the caller is using - * filemap_get_folios_contig(), which can race with page invalidating. - */ -int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { - folio_lock(folio); - return 0; - } - folio_lock(folio); - if (!folio_test_private(folio) || !folio_get_private(folio)) { - folio_unlock(folio); - return -EAGAIN; - } - btrfs_subpage_clamp_range(folio, &start, &len); - btrfs_subpage_start_writer(fs_info, folio, start, len); - return 0; -} - /* * Handle different locked folios: * diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index d16fbddeda68f2..e4076c5b06bc97 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -100,8 +100,6 @@ void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); -int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, From 9d6485664fade851cc4a736b757441735703616c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 9 Oct 2024 16:21:06 +1030 Subject: [PATCH 77/80] btrfs: unify to use writer locks for subpage locking Since commit d7172f52e993 ("btrfs: use per-buffer locking for extent_buffer reading"), metadata read no longer relies on the subpage reader locking. This means we do not need to maintain a different metadata/data split for locking, so we can convert the existing reader lock users by: - add_ra_bio_pages() Convert to btrfs_folio_set_writer_lock() - end_folio_read() Convert to btrfs_folio_end_writer_lock() - begin_folio_read() Convert to btrfs_folio_set_writer_lock() - folio_range_has_eb() Remove the subpage->readers checks, since it is always 0. - Remove btrfs_subpage_start_reader() and btrfs_subpage_end_reader() Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 3 +- fs/btrfs/extent_io.c | 10 ++----- fs/btrfs/subpage.c | 62 ++---------------------------------------- fs/btrfs/subpage.h | 13 --------- 4 files changed, 5 insertions(+), 83 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index abcf8ed06afc0f..e51d55bd1a29b9 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -545,8 +545,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, * subpage::readers and to unlock the page. 
*/ if (fs_info->sectorsize < PAGE_SIZE) - btrfs_subpage_start_reader(fs_info, folio, cur, - add_size); + btrfs_folio_set_writer_lock(fs_info, folio, cur, add_size); folio_put(folio); cur += add_size; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4cba6d78e63148..3eb9ed5d55f26a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -438,7 +438,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le if (!btrfs_is_subpage(fs_info, folio->mapping)) folio_unlock(folio); else - btrfs_subpage_end_reader(fs_info, folio, start, len); + btrfs_folio_end_writer_lock(fs_info, folio, start, len); } /* @@ -495,7 +495,7 @@ static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) return; ASSERT(folio_test_private(folio)); - btrfs_subpage_start_reader(fs_info, folio, folio_pos(folio), PAGE_SIZE); + btrfs_folio_set_writer_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE); } /* @@ -2517,12 +2517,6 @@ static bool folio_range_has_eb(struct folio *folio) subpage = folio_get_private(folio); if (atomic_read(&subpage->eb_refs)) return true; - /* - * Even there is no eb refs here, we may still have - * end_folio_read() call relying on page::private. - */ - if (atomic_read(&subpage->readers)) - return true; } return false; } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 26d4d05dd165aa..6c3d54a459564a 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -140,12 +140,10 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, return ERR_PTR(-ENOMEM); spin_lock_init(&ret->lock); - if (type == BTRFS_SUBPAGE_METADATA) { + if (type == BTRFS_SUBPAGE_METADATA) atomic_set(&ret->eb_refs, 0); - } else { - atomic_set(&ret->readers, 0); + else atomic_set(&ret->writers, 0); - } return ret; } @@ -221,62 +219,6 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, __start_bit; \ }) -void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); - const int nbits = len >> fs_info->sectorsize_bits; - unsigned long flags; - - - btrfs_subpage_assert(fs_info, folio, start, len); - - spin_lock_irqsave(&subpage->lock, flags); - /* - * Even though it's just for reading the page, no one should have - * locked the subpage range. - */ - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); - bitmap_set(subpage->bitmaps, start_bit, nbits); - atomic_add(nbits, &subpage->readers); - spin_unlock_irqrestore(&subpage->lock, flags); -} - -void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); - const int nbits = len >> fs_info->sectorsize_bits; - unsigned long flags; - bool is_data; - bool last; - - btrfs_subpage_assert(fs_info, folio, start, len); - is_data = is_data_inode(BTRFS_I(folio->mapping->host)); - - spin_lock_irqsave(&subpage->lock, flags); - - /* The range should have already been locked. 
*/
-	ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits));
-	ASSERT(atomic_read(&subpage->readers) >= nbits);
-
-	bitmap_clear(subpage->bitmaps, start_bit, nbits);
-	last = atomic_sub_and_test(nbits, &subpage->readers);
-
-	/*
-	 * For data we need to unlock the page if the last read has finished.
-	 *
-	 * And please don't replace @last with atomic_sub_and_test() call
-	 * inside if () condition.
-	 * As we want the atomic_sub_and_test() to be always executed.
-	 */
-	if (is_data && last)
-		folio_unlock(folio);
-	spin_unlock_irqrestore(&subpage->lock, flags);
-}
-
 static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
 {
 	u64 orig_start = *start;
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index e4076c5b06bc97..c150aba9318edd 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -45,14 +45,6 @@ enum {
 struct btrfs_subpage {
 	/* Common members for both data and metadata pages */
 	spinlock_t lock;
-	/*
-	 * Both data and metadata needs to track how many readers are for the
-	 * page.
-	 * Data relies on @readers to unlock the page when last reader finished.
-	 * While metadata doesn't need page unlock, it needs to prevent
-	 * page::private get cleared before the last end_page_read().
-	 */
-	atomic_t readers;
 	union {
 		/*
 		 * Structures only used by metadata
@@ -95,11 +87,6 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage);
 void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
 void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
 
-void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
-				struct folio *folio, u64 start, u32 len);
-void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
-			      struct folio *folio, u64 start, u32 len);
-
 void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
 				 struct folio *folio, u64 start, u32 len);
 void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
 				 struct folio *folio, u64 start, u32 len);

From 2be4c90c956eb7ce8e1b20d5389a821c53266248 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Wed, 9 Oct 2024 16:21:07 +1030
Subject: [PATCH 78/80] btrfs: rename btrfs_folio_(set|start|end)_writer_lock()

Since there is no user of reader locks, rename the writer locks to a
more generic name by removing the "_writer" part from the name. And
also rename btrfs_subpage::writers into btrfs_subpage::nr_locked.

Signed-off-by: Qu Wenruo
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/compression.c |  2 +-
 fs/btrfs/extent_io.c   | 14 ++++++-------
 fs/btrfs/subpage.c     | 46 +++++++++++++++++++++---------------------
 fs/btrfs/subpage.h     | 20 ++++++++++--------
 4 files changed, 43 insertions(+), 39 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index e51d55bd1a29b9..21d73ea594a999 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -545,7 +545,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 		 * subpage::readers and to unlock the page.
*/ if (fs_info->sectorsize < PAGE_SIZE) - btrfs_folio_set_writer_lock(fs_info, folio, cur, add_size); + btrfs_folio_set_lock(fs_info, folio, cur, add_size); folio_put(folio); cur += add_size; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3eb9ed5d55f26a..1beaba23253242 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -190,7 +190,7 @@ static void process_one_folio(struct btrfs_fs_info *fs_info, btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len); if (folio != locked_folio && (page_ops & PAGE_UNLOCK)) - btrfs_folio_end_writer_lock(fs_info, folio, start, len); + btrfs_folio_end_lock(fs_info, folio, start, len); } static void __process_folios_contig(struct address_space *mapping, @@ -276,7 +276,7 @@ static noinline int lock_delalloc_folios(struct inode *inode, range_start = max_t(u64, folio_pos(folio), start); range_len = min_t(u64, folio_pos(folio) + folio_size(folio), end + 1) - range_start; - btrfs_folio_set_writer_lock(fs_info, folio, range_start, range_len); + btrfs_folio_set_lock(fs_info, folio, range_start, range_len); processed_end = range_start + range_len - 1; } @@ -438,7 +438,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le if (!btrfs_is_subpage(fs_info, folio->mapping)) folio_unlock(folio); else - btrfs_folio_end_writer_lock(fs_info, folio, start, len); + btrfs_folio_end_lock(fs_info, folio, start, len); } /* @@ -495,7 +495,7 @@ static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) return; ASSERT(folio_test_private(folio)); - btrfs_folio_set_writer_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE); + btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE); } /* @@ -1184,7 +1184,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { u64 start = page_start + (bit << fs_info->sectorsize_bits); - btrfs_folio_set_writer_lock(fs_info, folio, start, fs_info->sectorsize); + btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); } /* Lock all (subpage) delalloc ranges inside the folio first. */ @@ -1520,7 +1520,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl * Only unlock ranges that are submitted. As there can be some async * submitted ranges inside the folio. 
*/ - btrfs_folio_end_writer_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); + btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); ASSERT(ret <= 0); return ret; } @@ -2298,7 +2298,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f cur, cur_len, !ret); mapping_set_error(mapping, ret); } - btrfs_folio_end_writer_lock(fs_info, folio, cur, cur_len); + btrfs_folio_end_lock(fs_info, folio, cur, cur_len); if (ret < 0) found_error = true; next_page: diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 6c3d54a459564a..d4cab3c557425d 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -143,7 +143,7 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, if (type == BTRFS_SUBPAGE_METADATA) atomic_set(&ret->eb_refs, 0); else - atomic_set(&ret->writers, 0); + atomic_set(&ret->nr_locked, 0); return ret; } @@ -237,8 +237,8 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) orig_start + orig_len) - *start; } -static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); @@ -256,9 +256,9 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf * extent_clear_unlock_delalloc() for compression path. * * This @locked_page is locked by plain lock_page(), thus its - * subpage::writers is 0. Handle them in a special way. + * subpage::locked is 0. Handle them in a special way. */ - if (atomic_read(&subpage->writers) == 0) { + if (atomic_read(&subpage->nr_locked) == 0) { spin_unlock_irqrestore(&subpage->lock, flags); return true; } @@ -267,8 +267,8 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf clear_bit(bit, subpage->bitmaps); cleared++; } - ASSERT(atomic_read(&subpage->writers) >= cleared); - last = atomic_sub_and_test(cleared, &subpage->writers); + ASSERT(atomic_read(&subpage->nr_locked) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->nr_locked); spin_unlock_irqrestore(&subpage->lock, flags); return last; } @@ -289,8 +289,8 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf * bitmap, reduce the writer lock number, and unlock the page if that's * the last locked range. */ -void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); @@ -303,24 +303,24 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, /* * For subpage case, there are two types of locked page. With or - * without writers number. + * without locked number. * - * Since we own the page lock, no one else could touch subpage::writers + * Since we own the page lock, no one else could touch subpage::locked * and we are safe to do several atomic operations without spinlock. */ - if (atomic_read(&subpage->writers) == 0) { - /* No writers, locked by plain lock_page(). */ + if (atomic_read(&subpage->nr_locked) == 0) { + /* No subpage lock, locked by plain lock_page(). 
*/
 		folio_unlock(folio);
 		return;
 	}
 	btrfs_subpage_clamp_range(folio, &start, &len);
-	if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len))
+	if (btrfs_subpage_end_and_test_lock(fs_info, folio, start, len))
 		folio_unlock(folio);
 }
 
-void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info,
-					struct folio *folio, unsigned long bitmap)
+void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
+				 struct folio *folio, unsigned long bitmap)
 {
 	struct btrfs_subpage *subpage = folio_get_private(folio);
 	const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked;
@@ -334,8 +334,8 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info,
 		return;
 	}
 
-	if (atomic_read(&subpage->writers) == 0) {
-		/* No writers, locked by plain lock_page(). */
+	if (atomic_read(&subpage->nr_locked) == 0) {
+		/* No subpage lock, locked by plain lock_page(). */
 		folio_unlock(folio);
 		return;
 	}
@@ -345,8 +345,8 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info,
 		if (test_and_clear_bit(bit + start_bit, subpage->bitmaps))
 			cleared++;
 	}
-	ASSERT(atomic_read(&subpage->writers) >= cleared);
-	last = atomic_sub_and_test(cleared, &subpage->writers);
+	ASSERT(atomic_read(&subpage->nr_locked) >= cleared);
+	last = atomic_sub_and_test(cleared, &subpage->nr_locked);
 	spin_unlock_irqrestore(&subpage->lock, flags);
 	if (last)
 		folio_unlock(folio);
@@ -671,8 +671,8 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
  * This populates the involved subpage ranges so that subpage helpers can
  * properly unlock them.
  */
-void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
-				 struct folio *folio, u64 start, u32 len)
+void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
+			  struct folio *folio, u64 start, u32 len)
 {
 	struct btrfs_subpage *subpage;
 	unsigned long flags;
@@ -691,7 +691,7 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
 	/* Target range should not yet be locked. */
 	ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
 	bitmap_set(subpage->bitmaps, start_bit, nbits);
-	ret = atomic_add_return(nbits, &subpage->writers);
+	ret = atomic_add_return(nbits, &subpage->nr_locked);
 	ASSERT(ret <= fs_info->sectors_per_page);
 	spin_unlock_irqrestore(&subpage->lock, flags);
 }
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index c150aba9318edd..428fa9389fd49e 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -54,8 +54,12 @@ struct btrfs_subpage {
 		 */
 		atomic_t eb_refs;
 
-		/* Structures only used by data */
-		atomic_t writers;
+		/*
+		 * Structures only used by data.
+		 *
+		 * How many sectors inside the page are locked.
+		 */
+		atomic_t nr_locked;
 	};
 	unsigned long bitmaps[];
 };
@@ -87,12 +91,12 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage);
 void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
 void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
 
-void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
-				 struct folio *folio, u64 start, u32 len);
-void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
-				 struct folio *folio, u64 start, u32 len);
-void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info,
-					struct folio *folio, unsigned long bitmap);
+void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
+			  struct folio *folio, u64 start, u32 len);
+void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
+			  struct folio *folio, u64 start, u32 len);
+void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
+				 struct folio *folio, unsigned long bitmap);

 /*
  * Template for subpage related operations.
  *

From 6eba2463c7df69c6e853d405859c45d1aee7afcf Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn
Date: Tue, 8 Oct 2024 15:42:51 +0200
Subject: [PATCH 79/80] btrfs: implement partial deletion of RAID stripe extents

In our CI system, the RAID stripe tree configuration sometimes fails with
the following ASSERT():

  assertion failed: found_start >= start && found_end <= end, in fs/btrfs/raid-stripe-tree.c:64

This ASSERT() triggers because, for the initial design of the RAID
stripe-tree, I had the "one ordered-extent equals one bio" rule of zoned
btrfs in mind. But this rule doesn't apply to a RAID stripe-tree based
system that is hosted on a regular device instead of a zoned storage
device.

So in case the range we want to delete starts in the middle of the
previous item, grab the item and "truncate" its length. That is, clone
the item, subtract the deleted portion from the key's offset, delete the
old item and insert the new one.

In case the range to delete ends in the middle of an item, we have to
adjust both the item's key and the stripe extents, and then re-insert
the modified clone into the tree after deleting the old stripe extent.
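To make the truncation arithmetic concrete, here is a minimal userspace
sketch (illustration only, not part of the patch; struct rst_key and
truncate_key() are hypothetical stand-ins for the key manipulation done
by the new btrfs_partially_delete_raid_extent() helper below). The
numbers mirror the self-tests added at the end of this series:

	#include <assert.h>
	#include <stdint.h>

	struct rst_key {
		uint64_t objectid;	/* logical start of the stripe extent */
		uint64_t offset;	/* length of the stripe extent */
	};

	/* Keep @newlen bytes of @old, skipping @frontpad bytes at the front. */
	static struct rst_key truncate_key(struct rst_key old, uint64_t newlen,
					   uint64_t frontpad)
	{
		struct rst_key new = {
			.objectid = old.objectid + frontpad,
			.offset = newlen,
		};
		/* The real helper also advances each raid stride's physical
		 * address by @frontpad. */
		return new;
	}

	int main(void)
	{
		/* A 64K stripe extent at logical 1M. */
		struct rst_key old = { .objectid = 0x100000, .offset = 0x10000 };

		/* Delete the tail [1M+32K, 1M+64K): keep the front half. */
		struct rst_key front = truncate_key(old, 0x8000, 0);
		assert(front.objectid == 0x100000 && front.offset == 0x8000);

		/* Delete the front [1M, 1M+32K): keep the tail half. */
		struct rst_key tail = truncate_key(old, 0x8000, 0x8000);
		assert(tail.objectid == 0x108000 && tail.offset == 0x8000);
		return 0;
	}

The point of applying the same frontpad to both the key's objectid and
every raid stride is that the kept tail moves by the deleted length both
logically and physically, so the clone stays self-consistent.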
Signed-off-by: Johannes Thumshirn --- fs/btrfs/ctree.c | 1 + fs/btrfs/raid-stripe-tree.c | 94 ++++++++++++++++++++++++++++++++++--- 2 files changed, 88 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 148648ea1c8b95..cf791e5cf229eb 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3899,6 +3899,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && + key.type != BTRFS_RAID_STRIPE_KEY && key.type != BTRFS_EXTENT_CSUM_KEY); if (btrfs_leaf_free_space(leaf) >= ins_len) diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 41970bbdb05f63..569273e42d852b 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -13,6 +13,50 @@ #include "volumes.h" #include "print-tree.h" +static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_key *oldkey, + u64 newlen, u64 frontpad) +{ + struct btrfs_root *stripe_root = trans->fs_info->stripe_root; + struct btrfs_stripe_extent *extent; + struct extent_buffer *leaf; + int slot; + size_t item_size; + int ret; + struct btrfs_key newkey = { + .objectid = oldkey->objectid + frontpad, + .type = BTRFS_RAID_STRIPE_KEY, + .offset = newlen, + }; + + ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY); + ret = btrfs_duplicate_item(trans, stripe_root, path, &newkey); + if (ret) + return ret; + + leaf = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size(leaf, slot); + extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); + + for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) { + struct btrfs_raid_stride *stride = &extent->strides[i]; + u64 phys; + + phys = btrfs_raid_stride_physical(leaf, stride); + btrfs_set_raid_stride_physical(leaf, stride, phys + frontpad); + } + + btrfs_mark_buffer_dirty(trans, leaf); + + /* delete the old item, after we've inserted a new one. */ + path->slots[0]--; + ret = btrfs_del_item(trans, stripe_root, path); + + return ret; +} + int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -36,23 +80,24 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le while (1) { key.objectid = start; key.type = BTRFS_RAID_STRIPE_KEY; - key.offset = length; + key.offset = 0; ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); if (ret < 0) break; - if (ret > 0) { - ret = 0; - if (path->slots[0] == 0) - break; + + if (path->slots[0] == btrfs_header_nritems(path->nodes[0])) path->slots[0]--; - } leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); found_start = key.objectid; found_end = found_start + key.offset; + ret = 0; + + if (key.type != BTRFS_RAID_STRIPE_KEY) + break; /* That stripe ends before we start, we're done. */ if (found_end <= start) @@ -61,7 +106,42 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le trace_btrfs_raid_extent_delete(fs_info, start, end, found_start, found_end); - ASSERT(found_start >= start && found_end <= end); + /* + * The stripe extent starts before the range we want to delete: + * + * |--- RAID Stripe Extent ---| + * |--- keep ---|--- drop ---| + * + * This means we have to duplicate the tree item, truncate the + * length to the new size and then re-insert the item. 
+	 */
+	if (found_start < start) {
+		u64 diff = start - found_start;
+
+		ret = btrfs_partially_delete_raid_extent(trans, path,
+							 &key,
+							 diff, 0);
+		break;
+	}
+
+	/*
+	 * The stripe extent ends after the range we want to delete:
+	 *
+	 * |--- RAID Stripe Extent ---|
+	 * |--- drop ---|--- keep ---|
+	 *
+	 * This means we have to duplicate the tree item, truncate the
+	 * length to the new size and then re-insert the item.
+	 */
+	if (found_end > end) {
+		u64 diff = found_end - end;
+
+		ret = btrfs_partially_delete_raid_extent(trans, path,
+							 &key,
+							 diff, diff);
+		break;
+	}
+
 	ret = btrfs_del_item(trans, stripe_root, path);
 	if (ret)
 		break;

From 7269e1c1ee168c6094f15a7673876ab2beadbca9 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn
Date: Tue, 8 Oct 2024 16:51:22 +0200
Subject: [PATCH 80/80] btrfs: implement self-tests for partial RAID stripe-tree delete

Implement self-tests for partial deletion of RAID stripe-tree entries.
These two new tests cover both the deletion of the front of a RAID
stripe-tree stripe extent and the truncation of an item to make it
smaller.

Signed-off-by: Johannes Thumshirn
---
 fs/btrfs/tests/raid-stripe-tree-tests.c | 223 ++++++++++++++++++++++++
 1 file changed, 223 insertions(+)

diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c
index b8013ab13c4374..5f1f1342b29172 100644
--- a/fs/btrfs/tests/raid-stripe-tree-tests.c
+++ b/fs/btrfs/tests/raid-stripe-tree-tests.c
@@ -29,6 +29,227 @@ static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_de
 	return NULL;
 }
 
+/*
+ * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then
+ * delete the 1st 32K, making the new start address 1M+32K.
+ */
+static int test_front_delete(struct btrfs_trans_handle *trans)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_io_context *bioc;
+	struct btrfs_io_stripe io_stripe = { 0 };
+	u64 map_type = RST_TEST_RAID1_TYPE;
+	u64 logical = SZ_1M;
+	u64 len = SZ_64K;
+	int ret;
+
+	bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES);
+	if (!bioc) {
+		test_std_err(TEST_ALLOC_IO_CONTEXT);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+	bioc->map_type = map_type;
+	bioc->size = len;
+
+	for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+		struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+		stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+		if (!stripe->dev) {
+			test_err("cannot find device with devid %d", i);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		stripe->physical = logical + i * SZ_1G;
+	}
+
+	ret = btrfs_insert_one_raid_extent(trans, bioc);
+	if (ret) {
+		test_err("inserting RAID extent failed: %d", ret);
+		goto out;
+	}
+
+	ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0,
+					   &io_stripe);
+	if (ret) {
+		test_err("lookup of RAID extent [%llu, %llu] failed", logical,
+			 logical + len);
+		goto out;
+	}
+
+	if (io_stripe.physical != logical) {
+		test_err("invalid physical address, expected %llu got %llu",
+			 logical, io_stripe.physical);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (len != SZ_64K) {
+		test_err("invalid stripe length, expected %llu got %llu",
+			 (u64)SZ_64K, len);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = btrfs_delete_raid_extent(trans, logical, SZ_32K);
+	if (ret) {
+		test_err("deleting RAID extent [%llu, %llu] failed", logical,
+			 logical + SZ_32K);
+		goto out;
+	}
+
+	len = SZ_32K;
+	ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_32K, &len,
+					   map_type, 0, &io_stripe);
+	if (ret)
{ + test_err("lookup of RAID extent [%llu, %llu] failed", + logical + SZ_32K, logical + SZ_32K + len); + goto out; + } + + if (io_stripe.physical != logical + SZ_32K) { + test_err("invalid physical address, expected %llu, got %llu", + logical + SZ_32K, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_32K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_32K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, + &io_stripe); + if (!ret) { + ret = -EINVAL; + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + logical, logical + SZ_32K); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); + out: + return ret; +} + +/* + * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then + * truncate the stripe extent down to 32K. + */ +static int test_tail_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + if (!io_stripe.dev) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, + &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical + SZ_32K, logical + SZ_64K); + goto out; + } + + len = SZ_32K; + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu, got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_32K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_32K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, len); + if (ret) + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + len); + +out: + btrfs_put_bioc(bioc); + return ret; +} + /* * Test a 64K RST write on a 2 disk RAID1 at a logical 
address of 1M and then * overwrite the whole range giving it new physical address at an offset of 1G. @@ -235,6 +456,8 @@ static int test_simple_create_delete(struct btrfs_trans_handle *trans) static const test_func_t tests[] = { test_simple_create_delete, test_create_update_delete, + test_tail_delete, + test_front_delete, }; static int run_test(test_func_t test, u32 sectorsize, u32 nodesize)