From 5ba34480dc222db47d7813009eb95a7543790dd3 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 16 Sep 2024 16:53:39 +0000 Subject: [PATCH 1/9] os-linux: Define RWF_ATOMIC Add a definition of RWF_ATOMIC when not available from uapi headers. RWF_ATOMIC is going to be part of Linux v6.11 Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240916165347.2226763-2-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- os/os-linux.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/os/os-linux.h b/os/os-linux.h index 4d150311a3..010a82913e 100644 --- a/os/os-linux.h +++ b/os/os-linux.h @@ -328,6 +328,10 @@ static inline int fio_set_sched_idle(void) #define RWF_NOWAIT 0x00000008 #endif +#ifndef RWF_ATOMIC +#define RWF_ATOMIC 0x00000040 +#endif + #ifndef RWF_WRITE_LIFE_SHIFT #define RWF_WRITE_LIFE_SHIFT 4 #define RWF_WRITE_LIFE_SHORT (1 << RWF_WRITE_LIFE_SHIFT) From 40f1fc11d4fc6dbbd3c0eb5852e24126dad5fd4e Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 16 Sep 2024 16:53:40 +0000 Subject: [PATCH 2/9] os: Reintroduce atomic write support Previously O_ATOMIC support was added in commit d01612f3ae25 ("Add support for O_ATOMIC"). But support was removed in commit a25ba6c64fe1 ("Get rid of O_ATOMIC"), as support was never added in the Linux kernel. Linux kernel 6.11 will add support for RWF_ATOMIC, which can be supported for various ioengines. See latest man pages for details. The plumbing was left in place for thread option oatomic, so that will be reused. Add a flag to say whether an engine supports atomic writes, and reject when oatomic is set for an engine which does not support atomic writes. This is a change in behaviour, as since commit a25ba6c64fe1 ("Get rid of O_ATOMIC"), this oatomic has been ignored. However, it is better to tell the user that their ioengine of choice does not support atomic writes. Today RWF_ATOMIC is only supported for direct-IO. In future it may be supported for buffered IO. As such, do not auto-set odirect=1 when oatomic==1. 
Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240916165347.2226763-3-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- init.c | 11 +++++++++++ ioengines.h | 2 ++ options.c | 2 ++ os/os-linux.h | 1 + 4 files changed, 16 insertions(+) diff --git a/init.c b/init.c index 414535ccd4..bad8b75b4e 100644 --- a/init.c +++ b/init.c @@ -855,6 +855,17 @@ static int fixup_options(struct thread_data *td) o->max_bs[DDIR_WRITE]); } + if (td->o.oatomic) { + if (!td_ioengine_flagged(td, FIO_ATOMICWRITES)) { + log_err("fio: engine does not support atomic writes\n"); + td->o.oatomic = 0; + ret |= 1; + } + + if (!td_write(td)) + td->o.oatomic = 0; + } + if (o->pre_read) { if (o->invalidate_cache) o->invalidate_cache = 0; diff --git a/ioengines.h b/ioengines.h index b9834fec3c..1531cd8977 100644 --- a/ioengines.h +++ b/ioengines.h @@ -96,6 +96,7 @@ enum { __FIO_RO_NEEDS_RW_OPEN, /* open files in rw mode even if we have a read job; only affects ioengines using generic_open_file */ __FIO_MULTI_RANGE_TRIM, /* ioengine supports trim with more than one range */ + __FIO_ATOMICWRITES, /* ioengine supports atomic writes */ __FIO_IOENGINE_F_LAST, /* not a real bit; used to count number of bits */ }; @@ -120,6 +121,7 @@ enum fio_ioengine_flags { FIO_SKIPPABLE_IOMEM_ALLOC = 1 << __FIO_SKIPPABLE_IOMEM_ALLOC, FIO_RO_NEEDS_RW_OPEN = 1 << __FIO_RO_NEEDS_RW_OPEN, FIO_MULTI_RANGE_TRIM = 1 << __FIO_MULTI_RANGE_TRIM, + FIO_ATOMICWRITES = 1 << __FIO_ATOMICWRITES, }; /* diff --git a/options.c b/options.c index 5a6b0a0667..95567de60b 100644 --- a/options.c +++ b/options.c @@ -2926,6 +2926,7 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_IO_TYPE, }, +#ifdef FIO_HAVE_RWF_ATOMIC { .name = "atomic", .lname = "Atomic I/O", @@ -2936,6 +2937,7 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_IO_TYPE, }, +#endif { .name = "buffered", .lname = "Buffered I/O", diff --git a/os/os-linux.h b/os/os-linux.h 
index 010a82913e..ead8295c44 100644 --- a/os/os-linux.h +++ b/os/os-linux.h @@ -62,6 +62,7 @@ #define FIO_HAVE_BYTEORDER_FUNCS #define FIO_HAVE_PWRITEV2 #define FIO_HAVE_SHM_ATTACH_REMOVED +#define FIO_HAVE_RWF_ATOMIC #ifdef MAP_HUGETLB #define FIO_HAVE_MMAP_HUGE From 80226c53d994e8a9206ed7acf4a4dc75a8691792 Mon Sep 17 00:00:00 2001 From: Alan Adamson Date: Mon, 16 Sep 2024 16:53:41 +0000 Subject: [PATCH 3/9] pvsync2: Support RWF_ATOMIC Set RWF_ATOMIC for writes and atomic==1. Signed-off-by: Alan Adamson jpg: Set FIO_ATOMICWRITES for pvsync2 Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240916165347.2226763-4-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- engines/sync.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/engines/sync.c b/engines/sync.c index b8be4eb341..729d8a71cf 100644 --- a/engines/sync.c +++ b/engines/sync.c @@ -175,9 +175,11 @@ static enum fio_q_status fio_pvsyncio2_queue(struct thread_data *td, if (io_u->ddir == DDIR_READ) ret = preadv2(f->fd, iov, 1, io_u->offset, flags); - else if (io_u->ddir == DDIR_WRITE) + else if (io_u->ddir == DDIR_WRITE) { + if (td->o.oatomic) + flags |= RWF_ATOMIC; ret = pwritev2(f->fd, iov, 1, io_u->offset, flags); - else if (io_u->ddir == DDIR_TRIM) { + } else if (io_u->ddir == DDIR_TRIM) { do_io_u_trim(td, io_u); return FIO_Q_COMPLETED; } else @@ -476,7 +478,8 @@ static struct ioengine_ops ioengine_pvrw2 = { .open_file = generic_open_file, .close_file = generic_close_file, .get_file_size = generic_get_file_size, - .flags = FIO_SYNCIO, + .flags = FIO_SYNCIO | + FIO_ATOMICWRITES, .options = options, .option_struct_size = sizeof(struct psyncv2_options), }; From a79319fe6efd138ae346aff28b528682fbed0a7c Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 16 Sep 2024 16:53:42 +0000 Subject: [PATCH 4/9] libaio: Support RWF_ATOMIC Set RWF_ATOMIC for writes and oatomic==1. Guard setting RWF_ATOMIC by FIO_HAVE_RWF_ATOMIC, as only linux supports RWF_ATOMIC. 
Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240916165347.2226763-5-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- engines/libaio.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/engines/libaio.c b/engines/libaio.c index aaccc7ce09..c2d437938e 100644 --- a/engines/libaio.c +++ b/engines/libaio.c @@ -110,6 +110,10 @@ static int fio_libaio_prep(struct thread_data *td, struct io_u *io_u) io_prep_pwrite(iocb, f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset); if (o->nowait) iocb->aio_rw_flags |= RWF_NOWAIT; +#ifdef FIO_HAVE_RWF_ATOMIC + if (td->o.oatomic) + iocb->aio_rw_flags |= RWF_ATOMIC; +#endif } else if (ddir_sync(io_u->ddir)) io_prep_fsync(iocb, f->fd); @@ -440,7 +444,8 @@ FIO_STATIC struct ioengine_ops ioengine = { .name = "libaio", .version = FIO_IOOPS_VERSION, .flags = FIO_ASYNCIO_SYNC_TRIM | - FIO_ASYNCIO_SETS_ISSUE_TIME, + FIO_ASYNCIO_SETS_ISSUE_TIME | + FIO_ATOMICWRITES, .init = fio_libaio_init, .post_init = fio_libaio_post_init, .prep = fio_libaio_prep, From b1552b6e655dba1b1d4957c04a19361654ab38ad Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 16 Sep 2024 16:53:43 +0000 Subject: [PATCH 5/9] io_uring: Support RWF_ATOMIC Set RWF_ATOMIC for writes and oatomic==1. 
Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240916165347.2226763-6-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- engines/io_uring.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/engines/io_uring.c b/engines/io_uring.c index 1d4a611843..96a042a888 100644 --- a/engines/io_uring.c +++ b/engines/io_uring.c @@ -392,6 +392,8 @@ static int fio_ioring_prep(struct thread_data *td, struct io_u *io_u) sqe->rw_flags = 0; if (o->nowait) sqe->rw_flags |= RWF_NOWAIT; + if (td->o.oatomic && io_u->ddir == DDIR_WRITE) + sqe->rw_flags |= RWF_ATOMIC; /* * Since io_uring can have a submission context (sqthread_poll) @@ -1582,7 +1584,8 @@ static struct ioengine_ops ioengine_uring = { .name = "io_uring", .version = FIO_IOOPS_VERSION, .flags = FIO_ASYNCIO_SYNC_TRIM | FIO_NO_OFFLOAD | - FIO_ASYNCIO_SETS_ISSUE_TIME, + FIO_ASYNCIO_SETS_ISSUE_TIME | + FIO_ATOMICWRITES, .init = fio_ioring_init, .post_init = fio_ioring_post_init, .io_u_init = fio_ioring_io_u_init, From 274a69eab08837e4e32185f2944e3b98dd5258c0 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 16 Sep 2024 16:53:44 +0000 Subject: [PATCH 6/9] tools/fiograph: Update for atomic support Add atomic support for the specific engines which support this option. This just means that "atomic" will show up as a special ioengine config option (if fio 'atomic' option is specified). 
Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240916165347.2226763-7-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- tools/fiograph/fiograph.conf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/fiograph/fiograph.conf b/tools/fiograph/fiograph.conf index 122f2baf29..757121806e 100644 --- a/tools/fiograph/fiograph.conf +++ b/tools/fiograph/fiograph.conf @@ -51,13 +51,13 @@ specific_options=https http_host http_user http_pass http_s3_key http_s3_ke specific_options=ime_psync ime_psyncv [ioengine_io_uring] -specific_options=hipri cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit fixedbufs registerfiles sqthread_poll sqthread_poll_cpu nonvectored nowait force_async +specific_options=hipri cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit fixedbufs registerfiles sqthread_poll sqthread_poll_cpu nonvectored nowait force_async atomic [ioengine_io_uring_cmd] specific_options=hipri cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit fixedbufs registerfiles sqthread_poll sqthread_poll_cpu nonvectored nowait force_async cmd_type md_per_io_size pi_act pi_chk apptag apptag_mask [ioengine_libaio] -specific_options=userspace_reap cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit nowait +specific_options=userspace_reap cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit nowait atomic [ioengine_libblkio] specific_options=libblkio_driver libblkio_path libblkio_pre_connect_props libblkio_num_entries libblkio_queue_size libblkio_pre_start_props hipri libblkio_vectored libblkio_write_zeroes_on_trim libblkio_wait_mode libblkio_force_enable_completion_eventfd @@ -99,7 +99,7 @@ specific_options=hostname bindname port verb specific_options=hipri readfua writefua sg_write_mode stream_id [ioengine_pvsync2] -specific_options=hipri hipri_percentage nowait sync psync vsync pvsync +specific_options=hipri hipri_percentage nowait sync psync vsync pvsync atomic [ioengine_xnvme] specific_options=hipri sqthread_poll 
xnvme_be xnvme_async xnvme_sync xnvme_admin xnvme_dev_nsid xnvme_iovec From cd3de3d7700294f28d52cc781d1ce3982c12ba7e Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 16 Sep 2024 16:53:45 +0000 Subject: [PATCH 7/9] doc: Document atomic command Now that the atomic command is formally supported, document it. Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240916165347.2226763-8-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- HOWTO.rst | 14 ++++++++++++++ fio.1 | 13 +++++++++++++ 2 files changed, 27 insertions(+) diff --git a/HOWTO.rst b/HOWTO.rst index a363206d23..c6c9b30600 100644 --- a/HOWTO.rst +++ b/HOWTO.rst @@ -2501,6 +2501,20 @@ with the caveat that when used on the command line, they must come after the For direct I/O, requests will only succeed if cache invalidation isn't required, file blocks are fully allocated and the disk request could be issued immediately. +.. option:: atomic=bool : [pvsync2] [libaio] [io_uring] + + This option means that writes are issued with torn-write protection, meaning + that for a power fail or kernel crash, all or none of the data from the write + will be stored, but never a mix of old and new data. Torn-write protection is + also known as atomic writes. + + This option sets the RWF_ATOMIC flag (supported from the 6.11 Linux kernel) on + a per-IO basis. + + Writes with RWF_ATOMIC set will be rejected by the kernel when the file does + not support torn-write protection. To learn a file's torn-write limits, issue + statx with STATX_WRITE_ATOMIC. + .. option:: fdp=bool : [io_uring_cmd] [xnvme] Enable Flexible Data Placement mode for write commands. diff --git a/fio.1 b/fio.1 index a4ab07ed8f..6e5bed9d06 100644 --- a/fio.1 +++ b/fio.1 @@ -2266,6 +2266,19 @@ cached data. Currently the RWF_NOWAIT flag does not supported for cached write. For direct I/O, requests will only succeed if cache invalidation isn't required, file blocks are fully allocated and the disk request could be issued immediately. 
.TP +.BI (pvsync2,libaio,io_uring)atomic \fR=\fPbool +This option means that writes are issued with torn-write protection, meaning +that for a power fail or kernel crash, all or none of the data from the write +will be stored, but never a mix of old and new data. Torn-write protection is +also known as atomic writes. + +This option sets the RWF_ATOMIC flag (supported from the 6.11 Linux kernel) on +a per-IO basis. + +Writes with RWF_ATOMIC set will be rejected by the kernel when the file does +not support torn-write protection. To learn a file's torn-write limits, issue +statx with STATX_WRITE_ATOMIC. +.TP .BI (io_uring_cmd,xnvme)fdp \fR=\fPbool Enable Flexible Data Placement mode for write commands. .TP From 2dd80ee4598bcd69d1413fd2f1205cbffb56d787 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 16 Sep 2024 16:53:46 +0000 Subject: [PATCH 8/9] fio: Support verify_write_sequence Add an option to disable verifying the write sequence number. By default, it is enabled. However disable for verify_only mode. Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240916165347.2226763-9-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- HOWTO.rst | 11 +++++++++++ fio.1 | 8 ++++++++ init.c | 3 +++ options.c | 11 +++++++++++ thread_options.h | 1 + verify.c | 7 ++++--- 6 files changed, 38 insertions(+), 3 deletions(-) diff --git a/HOWTO.rst b/HOWTO.rst index c6c9b30600..4f071484bd 100644 --- a/HOWTO.rst +++ b/HOWTO.rst @@ -4002,6 +4002,17 @@ Verification instead resets the file after the write phase and then replays I/Os for the verification phase. +.. option:: verify_write_sequence=bool + + Verify the header write sequence number. In a scenario with multiple jobs, + verification of the write sequence number may fail. Disabling this option + will mean that write sequence number checking is skipped. Doing that can be + useful for testing atomic writes, as it means that checksum verification can + still be attempted. 
When :option:`atomic` is enabled, checksum + verification is expected to succeed (while write sequence checking can still + fail). + Defaults to true. + .. option:: trim_percentage=int Number of verify blocks to discard/trim. diff --git a/fio.1 b/fio.1 index 6e5bed9d06..0fd0fb25f2 100644 --- a/fio.1 +++ b/fio.1 @@ -3726,6 +3726,14 @@ Enable experimental verification. Standard verify records I/O metadata for later use during the verification phase. Experimental verify instead resets the file after the write phase and then replays I/Os for the verification phase. .TP +.BI verify_write_sequence \fR=\fPbool +Verify the header write sequence number. In a scenario with multiple jobs, +verification of the write sequence number may fail. Disabling this option +will mean that write sequence number checking is skipped. Doing that can be +useful for testing atomic writes, as it means that checksum verification can +still be attempted. When \fBatomic\fR is enabled, checksum verification +is expected to succeed (while write sequence checking can still fail). +.TP .BI trim_percentage \fR=\fPint Number of verify blocks to discard/trim. 
.TP diff --git a/init.c b/init.c index bad8b75b4e..96a03d984b 100644 --- a/init.c +++ b/init.c @@ -853,6 +853,9 @@ static int fixup_options(struct thread_data *td) (o->max_bs[DDIR_WRITE] % o->verify_interval)) o->verify_interval = gcd(o->min_bs[DDIR_WRITE], o->max_bs[DDIR_WRITE]); + + if (td->o.verify_only) + o->verify_write_sequence = 0; } if (td->o.oatomic) { diff --git a/options.c b/options.c index 95567de60b..c35878f7bf 100644 --- a/options.c +++ b/options.c @@ -3397,6 +3397,17 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_VERIFY, }, + { + .name = "verify_write_sequence", + .lname = "Verify write sequence number", + .off1 = offsetof(struct thread_options, verify_write_sequence), + .type = FIO_OPT_BOOL, + .def = "1", + .help = "Verify header write sequence number", + .parent = "verify", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, #ifdef FIO_HAVE_TRIM { .name = "trim_percentage", diff --git a/thread_options.h b/thread_options.h index ee1e5b31bf..d0e0a4aea2 100644 --- a/thread_options.h +++ b/thread_options.h @@ -156,6 +156,7 @@ struct thread_options { unsigned int experimental_verify; unsigned int verify_state; unsigned int verify_state_save; + unsigned int verify_write_sequence; unsigned int use_thread; unsigned int unlink; unsigned int unlink_each_loop; diff --git a/verify.c b/verify.c index b2fede2471..f3d228ba7d 100644 --- a/verify.c +++ b/verify.c @@ -848,12 +848,13 @@ static int verify_header(struct io_u *io_u, struct thread_data *td, /* * For read-only workloads, the program cannot be certain of the * last numberio written to a block. Checking of numberio will be - * done only for workloads that write data. For verify_only, - * numberio check is skipped. + * done only for workloads that write data. For verify_only or + * any mode de-selecting verify_write_sequence, numberio check is + * skipped. 
*/ if (td_write(td) && (td_min_bs(td) == td_max_bs(td)) && !td->o.time_based) - if (!td->o.verify_only) + if (td->o.verify_write_sequence) if (hdr->numberio != io_u->numberio) { log_err("verify: bad header numberio %"PRIu16 ", wanted %"PRIu16, From f23208c8860de515f025af6ac34d6a227f277e67 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 16 Sep 2024 16:53:47 +0000 Subject: [PATCH 9/9] examples: Add example for atomic write verify Add an example for verifying atomic writes. Until now, atomic writes are only supported on Linux for block devices, so only give instructions for that. Currently support is being worked on for XFS and EXT4, and instructions can be updated in due course. Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240916165347.2226763-10-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- examples/atomic-verify.fio | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 examples/atomic-verify.fio diff --git a/examples/atomic-verify.fio b/examples/atomic-verify.fio new file mode 100644 index 0000000000..17bcd89f86 --- /dev/null +++ b/examples/atomic-verify.fio @@ -0,0 +1,36 @@ +# Data verification with atomic writes +# +# Some background on atomic writes: +# +# The main selling point of atomic writes is that it is guaranteed writes +# to storage will not be torn for a power failure or kernel crash. + +# Another aspect of atomic writes is that they handle racing writes and +# reads, such that a read racing with a write will see all the data from +# the write or none. Well, SCSI and NVMe guarantee this if using +# RWF_ATOMIC, but it is not formally stated as a feature of RWF_ATOMIC. +# +# Fio verify mode can be used to prove that atomic writes can make "safe" +# racing reads and writes. This is done by having many jobs in a xsum verify +# mode. 
In this way, xsums should be correct, although a job may be +# reading a data block written by another job; however +# verify_write_sequence must be disabled, as it cannot be helped that data +# blocks will be out of sequence with many jobs. +# +# Atomic write limits: +# For a block device, the max block size for atomic=1 is in +# /sys/block/sdXXX/queue/atomic_write_unit_max_bytes +# or this value can also be read with a statx syscall on the bdev file. + +[write-and-verify] +rw=randwrite +bs=4k +direct=1 +ioengine=libaio +iodepth=16 +verify=crc64 +atomic=1 +verify_write_sequence=0 +numjobs=10 +# Use /dev/XXX or filename +filename=/dev/XXX