From cac24cdd6bb3f8d99af61fbdcdb3f9edf44744e3 Mon Sep 17 00:00:00 2001
From: Lukasz Anforowicz <lukasza@chromium.org>
Date: Wed, 29 Jan 2025 17:05:55 +0000
Subject: [PATCH 1/5] Avoid `unsafe` in `ArrayString`'s `new` and `zero_filled`
 methods.

Before the changes in `fn new`, the type of `MaybeUninit::uninit()`
would be inferred to be `MaybeUninit<[MaybeUninit<...>; CAP]>` which
seems somewhat unexpected.  This commit rewrites this expression so that
no `unsafe` is needed.

Similarily, before the changes in `fn zero_filled`, the type of
`MaybeUninit::zeroed()` would be inferred to be
`MaybeUninit<[MaybeUninit<...>; CAP]>`.  This commit also rewrites this
expression so that no `unsafe` is needed.
---
 src/array_string.rs | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)
diff --git a/src/array_string.rs b/src/array_string.rs
index 227e01d..1920afe 100644
--- a/src/array_string.rs
+++ b/src/array_string.rs
@@ -64,8 +64,9 @@ impl<const CAP: usize> ArrayString<CAP>
     /// ```
     pub fn new() -> ArrayString<CAP> {
         assert_capacity_limit!(CAP);
-        unsafe {
-            ArrayString { xs: MaybeUninit::uninit().assume_init(), len: 0 }
+        ArrayString {
+            xs: [MaybeUninit::uninit(); CAP],
+            len: 0,
         }
     }
 
@@ -144,13 +145,9 @@ impl<const CAP: usize> ArrayString<CAP>
     #[inline]
     pub fn zero_filled() -> Self {
         assert_capacity_limit!(CAP);
-        // SAFETY: `assert_capacity_limit` asserts that `len` won't overflow and
-        // `zeroed` fully fills the array with nulls.
-        unsafe {
-            ArrayString {
-                xs: MaybeUninit::zeroed().assume_init(),
-                len: CAP as _
-            }
+        ArrayString {
+            xs: [MaybeUninit::zeroed(); CAP],
+            len: CAP as _,
         }
     }
 

From dc853a757ed4ec7373a1256d754efdcc26b32257 Mon Sep 17 00:00:00 2001
From: Lukasz Anforowicz <lukasza@chromium.org>
Date: Wed, 29 Jan 2025 17:22:24 +0000
Subject: [PATCH 2/5] Limit the extent of `unsafe` in
 `ArrayString::from_byte_string`.

The safe code results in the same, fast assembly code.
---
 src/array_string.rs | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/array_string.rs b/src/array_string.rs
index 1920afe..fac7329 100644
--- a/src/array_string.rs
+++ b/src/array_string.rs
@@ -125,11 +125,20 @@ impl<const CAP: usize> ArrayString<CAP>
         let len = str::from_utf8(b)?.len();
         debug_assert_eq!(len, CAP);
         let mut vec = Self::new();
+
+        // This seems to result in the same, fast assembly code as some
+        // `unsafe` transmutes and a call to `copy_to_nonoverlapping`.
+        // See https://godbolt.org/z/vhM1WePTK for more details.
+        for (dst, src) in vec.xs.iter_mut().zip(b.iter()) {
+            *dst = MaybeUninit::new(*src);
+        }
+
+        // SAFETY: Copying `CAP` bytes in the `for` loop above initializes
+        // all the bytes in `vec`.
         unsafe {
-            (b as *const [u8; CAP] as *const [MaybeUninit<u8>; CAP])
-                .copy_to_nonoverlapping(&mut vec.xs as *mut [MaybeUninit<u8>; CAP], 1);
             vec.set_len(CAP);
         }
+        
         Ok(vec)
     }
 

From eb02511eb87455d7508152cceb8074afdc43e23a Mon Sep 17 00:00:00 2001
From: Lukasz Anforowicz <lukasza@chromium.org>
Date: Wed, 29 Jan 2025 17:34:30 +0000
Subject: [PATCH 3/5] Limit the extent of `unsafe` in
 `ArrayString::try_push_str`.

The safe code should result in the same, fast assembly code.
---
 src/array_string.rs | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/src/array_string.rs b/src/array_string.rs
index fac7329..ab4a638 100644
--- a/src/array_string.rs
+++ b/src/array_string.rs
@@ -291,13 +291,24 @@ impl<const CAP: usize> ArrayString<CAP>
         if s.len() > self.capacity() - self.len() {
             return Err(CapacityError::new(s));
         }
+        let old_len = self.len();
+        let new_len = old_len + s.len();
+
+        // This loop is similar to the one in `from_byte_string` and therefore
+        // it is expected to  result in the same, fast assembly code as some
+        // `unsafe` transmutes and a call to `copy_to_nonoverlapping`.
+        let dst = &mut self.xs[old_len..new_len];
+        let src = s.as_bytes();
+        for (dst, src) in dst.iter_mut().zip(src.iter()) {
+            *dst = MaybeUninit::new(*src);
+        }
+
+        // SAFETY: Copying `CAP` bytes in the `for` loop above initializes
+        // all the bytes in `self.xs[old_len..new_len]`.
         unsafe {
-            let dst = self.as_mut_ptr().add(self.len());
-            let src = s.as_ptr();
-            ptr::copy_nonoverlapping(src, dst, s.len());
-            let newl = self.len() + s.len();
-            self.set_len(newl);
+            self.set_len(new_len);
         }
+
         Ok(())
     }
 

From 3320c1246921c290c6b6ef45ab5933b5ee3feb11 Mon Sep 17 00:00:00 2001
From: Lukasz Anforowicz <lukasza@chromium.org>
Date: Wed, 29 Jan 2025 17:40:19 +0000
Subject: [PATCH 4/5] More granular `unsafe` blocks in `ArrayString::try_push`.

---
 src/array_string.rs | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/array_string.rs b/src/array_string.rs
index ab4a638..165b3f2 100644
--- a/src/array_string.rs
+++ b/src/array_string.rs
@@ -235,16 +235,20 @@ impl<const CAP: usize> ArrayString<CAP>
     /// ```
     pub fn try_push(&mut self, c: char) -> Result<(), CapacityError<char>> {
         let len = self.len();
-        unsafe {
-            let ptr = self.as_mut_ptr().add(len);
-            let remaining_cap = self.capacity() - len;
-            match encode_utf8(c, ptr, remaining_cap) {
-                Ok(n) => {
+        let ptr: *mut MaybeUninit<u8> = self.xs[len..].as_mut_ptr();
+        let ptr = ptr as *mut u8;
+        let remaining_cap = self.capacity() - len;
+
+        // SAFETY: `ptr` points to `remaining_cap` bytes.
+        match unsafe { encode_utf8(c, ptr, remaining_cap) } {
+            Ok(n) => {
+                // SAFETY: `encode_utf8` promises that it initialized `n` bytes.
+                unsafe {
                     self.set_len(len + n);
-                    Ok(())
                 }
-                Err(_) => Err(CapacityError::new(c)),
+                Ok(())
             }
+            Err(_) => Err(CapacityError::new(c)),
         }
     }
 

From cd11fb566afa6ca0e1a03f3ca6e2e468e05ac025 Mon Sep 17 00:00:00 2001
From: Lukasz Anforowicz <lukasza@chromium.org>
Date: Wed, 29 Jan 2025 18:02:09 +0000
Subject: [PATCH 5/5] Add safety comments related to `ArrayString`'s `fn
 set_len` method.

This also opportunistically reduces the amount of `unsafe` in
`ArrayString::remove`.
---
 src/array_string.rs | 68 +++++++++++++++++++++++++++++++++------------
 1 file changed, 50 insertions(+), 18 deletions(-)

diff --git a/src/array_string.rs b/src/array_string.rs
index 165b3f2..471544b 100644
--- a/src/array_string.rs
+++ b/src/array_string.rs
@@ -7,7 +7,6 @@ use std::mem::MaybeUninit;
 use std::ops::{Deref, DerefMut};
 #[cfg(feature="std")]
 use std::path::Path;
-use std::ptr;
 use std::slice;
 use std::str;
 use std::str::FromStr;
@@ -35,7 +34,8 @@ use serde::{Serialize, Deserialize, Serializer, Deserializer};
 #[derive(Copy)]
 #[repr(C)]
 pub struct ArrayString<const CAP: usize> {
-    // the `len` first elements of the array are initialized
+    // the `len` first elements of the array are initialized and contain valid
+    // UTF-8
     len: LenUint,
     xs: [MaybeUninit<u8>; CAP],
 }
@@ -134,7 +134,8 @@ impl<const CAP: usize> ArrayString<CAP>
         }
 
         // SAFETY: Copying `CAP` bytes in the `for` loop above initializes
-        // all the bytes in `vec`.
+        // all the bytes in `vec`.  `str::from_utf8` call above promises
+        // that the bytes are valid UTF-8.
         unsafe {
             vec.set_len(CAP);
         }
@@ -242,7 +243,8 @@ impl<const CAP: usize> ArrayString<CAP>
         // SAFETY: `ptr` points to `remaining_cap` bytes.
         match unsafe { encode_utf8(c, ptr, remaining_cap) } {
             Ok(n) => {
-                // SAFETY: `encode_utf8` promises that it initialized `n` bytes.
+                // SAFETY: `encode_utf8` promises that it initialized `n` bytes
+                // and that it wrote valid UTF-8.
                 unsafe {
                     self.set_len(len + n);
                 }
@@ -308,7 +310,8 @@ impl<const CAP: usize> ArrayString<CAP>
         }
 
         // SAFETY: Copying `CAP` bytes in the `for` loop above initializes
-        // all the bytes in `self.xs[old_len..new_len]`.
+        // all the bytes in `self.xs[old_len..new_len]`.  We copy the bytes
+        // from `s: &'a str` so the bytes must be valid UTF-8.
         unsafe {
             self.set_len(new_len);
         }
@@ -337,9 +340,17 @@ impl<const CAP: usize> ArrayString<CAP>
             None => return None,
         };
         let new_len = self.len() - ch.len_utf8();
+
+        // SAFETY: Type invariant guarantees that `self.len()` bytes are
+        // initialized and valid UTF-8.  Therefore `new_len` bytes (less bytes)
+        // are also initialized.  And they are still valid UTF-8 because we cut
+        // on char boundary.
         unsafe {
+            debug_assert!(new_len <= self.len());
+            debug_assert!(self.is_char_boundary(new_len));
             self.set_len(new_len);
         }
+
         Some(ch)
     }
 
@@ -362,11 +373,17 @@ impl<const CAP: usize> ArrayString<CAP>
     pub fn truncate(&mut self, new_len: usize) {
         if new_len <= self.len() {
             assert!(self.is_char_boundary(new_len));
+
+            // SAFETY: Type invariant guarantees that `self.len()` bytes are
+            // initialized and form valid UTF-8.  `new_len` bytes are also
+            // initialized, because we checked above that `new_len <=
+            // self.len()`.  And `new_len` bytes are valid UTF-8, because we
+            // `assert!` above that `new_len` is at a char boundary.
+            //
+            // In libstd truncate is called on the underlying vector, which in
+            // turns drops each element.  Here we work with `u8` butes, so we
+            // don't have to worry about Drop, and we can just set the length.
             unsafe { 
-                // In libstd truncate is called on the underlying vector,
-                // which in turns drops each element.
-                // As we know we don't have to worry about Drop,
-                // we can just set the length (a la clear.)
                 self.set_len(new_len);
             }
         }
@@ -396,20 +413,25 @@ impl<const CAP: usize> ArrayString<CAP>
         };
 
         let next = idx + ch.len_utf8();
+        self.xs.copy_within(next.., idx);
+
+        // SAFETY: Type invariant guarantees that `self.len()` bytes are
+        // initialized and form valid UTF-8.  Therefore `new_len` bytes (less
+        // bytes) are also initialized.  We remove a whole UTF-8 char, so
+        // `new_len` bytes remain valid UTF-8.
         let len = self.len();
-        let ptr = self.as_mut_ptr();
+        let new_len = len - (next - idx);
         unsafe {
-            ptr::copy(
-                ptr.add(next),
-                ptr.add(idx),
-                len - next);
-            self.set_len(len - (next - idx));
+            debug_assert!(new_len <= self.len());
+            self.set_len(new_len);
         }
         ch
     }
 
     /// Make the string empty.
     pub fn clear(&mut self) {
+        // SAFETY: Empty slice is initialized by definition.  Empty string is
+        // valid UTF-8.
         unsafe {
             self.set_len(0);
         }
@@ -417,15 +439,25 @@ impl<const CAP: usize> ArrayString<CAP>
 
     /// Set the strings’s length.
     ///
-    /// This function is `unsafe` because it changes the notion of the
-    /// number of “valid” bytes in the string. Use with care.
-    ///
     /// This method uses *debug assertions* to check the validity of `length`
     /// and may use other debug assertions.
+    ///
+    /// # Safety
+    ///
+    /// The caller needs to guarantee that `length` bytes of the underlying
+    /// storage:
+    ///
+    /// * have been initialized
+    /// * encode valid UTF-8
     pub unsafe fn set_len(&mut self, length: usize) {
         // type invariant that capacity always fits in LenUint
         debug_assert!(length <= self.capacity());
+
         self.len = length as LenUint;
+
+        // type invariant that we contain a valid UTF-8 string
+        // (this is just an O(1) heuristic - full check would require O(N)).
+        debug_assert!(self.is_char_boundary(length));
     }
 
     /// Return a string slice of the whole `ArrayString`.