From 7205da93d19d7d801709fa196d7c57a941a048f2 Mon Sep 17 00:00:00 2001
From: Dan Stoza
Date: Wed, 28 Feb 2024 23:02:01 -0800
Subject: [PATCH] Add String::{to_lowercase,to_uppercase} (#669)

---
 crates/rune/src/modules/string.rs | 255 ++++++++++++++++++++++++++++++
 1 file changed, 255 insertions(+)

diff --git a/crates/rune/src/modules/string.rs b/crates/rune/src/modules/string.rs
index 76bd43fb2..3403a8e82 100644
--- a/crates/rune/src/modules/string.rs
+++ b/crates/rune/src/modules/string.rs
@@ -54,6 +54,8 @@ pub fn module() -> Result {
     module.function_meta(get)?;
     module.function_meta(parse_int)?;
     module.function_meta(parse_char)?;
+    module.function_meta(to_lowercase)?;
+    module.function_meta(to_uppercase)?;
 
     module.function_meta(add)?;
     module.function_meta(add_assign)?;
@@ -1103,5 +1105,258 @@ fn parse_char(s: &str) -> Result {
     str::parse::<char>(s)
 }
 
+/// Returns the lowercase equivalent of this string slice, as a new [`String`].
+///
+/// 'Lowercase' is defined according to the terms of the Unicode Derived Core Property
+/// `Lowercase`.
+///
+/// Since some characters can expand into multiple characters when changing
+/// the case, this function returns a [`String`] instead of modifying the
+/// parameter in-place.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```rune
+/// let s = "HELLO";
+///
+/// assert_eq!("hello", s.to_lowercase());
+/// ```
+///
+/// A tricky example, with sigma:
+///
+/// ```rune
+/// let sigma = "Σ";
+///
+/// assert_eq!("σ", sigma.to_lowercase());
+///
+/// // but at the end of a word, it's ς, not σ:
+/// let odysseus = "ὈΔΥΣΣΕΎΣ";
+///
+/// assert_eq!("ὀδυσσεύς", odysseus.to_lowercase());
+/// ```
+///
+/// Languages without case are not changed:
+///
+/// ```rune
+/// let new_year = "农历新年";
+///
+/// assert_eq!(new_year, new_year.to_lowercase());
+/// ```
+#[rune::function(instance)]
+fn to_lowercase(s: &str) -> VmResult<String> {
+    let mut lowercase = vm_try!(String::try_with_capacity(s.len()));
+    for (i, c) in s.char_indices() {
+        // Inlined code from std::str to handle upper-case sigma,
+        // since it is the only Unicode character that is context-dependent.
+        // See https://github.com/rust-lang/rust/issues/26035 for more context.
+        if c == 'Σ' {
+            vm_try!(lowercase.try_push_str(map_uppercase_sigma(s, i)));
+        } else {
+            vm_try!(lowercase.try_extend(c.to_lowercase()));
+        }
+    }
+
+    return VmResult::Ok(lowercase);
+
+    fn map_uppercase_sigma(from: &str, i: usize) -> &'static str {
+        // See https://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G33992
+        // for the definition of `Final_Sigma`.
+        debug_assert!('Σ'.len_utf8() == 2);
+        let is_word_final = case_ignorable_then_cased(from[..i].chars().rev())
+            && !case_ignorable_then_cased(from[i + 2..].chars());
+        if is_word_final {
+            "ς"
+        } else {
+            "σ"
+        }
+    }
+
+    fn case_ignorable_then_cased<I: Iterator<Item = char>>(mut iter: I) -> bool {
+        match iter.find(|&c| !unicode::case_ignorable::lookup(c)) {
+            Some(c) => unicode::cased::lookup(c),
+            None => false,
+        }
+    }
+}
+
+/// Returns the uppercase equivalent of this string slice, as a new [`String`].
+///
+/// 'Uppercase' is defined according to the terms of the Unicode Derived Core Property
+/// `Uppercase`.
+///
+/// Since some characters can expand into multiple characters when changing
+/// the case, this function returns a [`String`] instead of modifying the
+/// parameter in-place.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```rune
+/// let s = "hello";
+///
+/// assert_eq!("HELLO", s.to_uppercase());
+/// ```
+///
+/// Scripts without case are not changed:
+///
+/// ```rune
+/// let new_year = "农历新年";
+///
+/// assert_eq!(new_year, new_year.to_uppercase());
+/// ```
+///
+/// One character can become multiple:
+/// ```rune
+/// let s = "tschüß";
+///
+/// assert_eq!("TSCHÜSS", s.to_uppercase());
+/// ```
+#[rune::function(instance)]
+fn to_uppercase(s: &str) -> VmResult<String> {
+    let mut uppercase = vm_try!(String::try_with_capacity(s.len()));
+    vm_try!(uppercase.try_extend(s.chars().flat_map(|c| c.to_uppercase())));
+    VmResult::Ok(uppercase)
+}
+
+// Inlined code from core::unicode, since using it directly is marked as using an
+// unstable library feature
+mod unicode {
+    fn decode_prefix_sum(short_offset_run_header: u32) -> u32 {
+        short_offset_run_header & ((1 << 21) - 1)
+    }
+
+    fn decode_length(short_offset_run_header: u32) -> usize {
+        (short_offset_run_header >> 21) as usize
+    }
+
+    #[inline(always)]
+    fn skip_search<const SOR: usize, const OFFSETS: usize>(
+        needle: u32,
+        short_offset_runs: &[u32; SOR],
+        offsets: &[u8; OFFSETS],
+    ) -> bool {
+        // Note that this *cannot* be past the end of the array, as the last
+        // element is greater than std::char::MAX (the largest possible needle).
+        //
+        // So, we cannot have found it (i.e. Ok(idx) + 1 != length) and the correct
+        // location cannot be past it, so Err(idx) != length either.
+        //
+        // This means that we can avoid bounds checking for the accesses below, too.
+        let last_idx =
+            match short_offset_runs.binary_search_by_key(&(needle << 11), |header| header << 11) {
+                Ok(idx) => idx + 1,
+                Err(idx) => idx,
+            };
+
+        let mut offset_idx = decode_length(short_offset_runs[last_idx]);
+        let length = if let Some(next) = short_offset_runs.get(last_idx + 1) {
+            decode_length(*next) - offset_idx
+        } else {
+            offsets.len() - offset_idx
+        };
+        let prev = last_idx
+            .checked_sub(1)
+            .map(|prev| decode_prefix_sum(short_offset_runs[prev]))
+            .unwrap_or(0);
+
+        let total = needle - prev;
+        let mut prefix_sum = 0;
+        for _ in 0..(length - 1) {
+            let offset = offsets[offset_idx];
+            prefix_sum += offset as u32;
+            if prefix_sum > total {
+                break;
+            }
+            offset_idx += 1;
+        }
+        offset_idx % 2 == 1
+    }
+
+    #[rustfmt::skip]
+    pub mod case_ignorable {
+        static SHORT_OFFSET_RUNS: [u32; 35] = [
+            688, 44045149, 572528402, 576724925, 807414908, 878718981, 903913493, 929080568, 933275148,
+            937491230, 1138818560, 1147208189, 1210124160, 1222707713, 1235291428, 1260457643,
+            1264654383, 1499535675, 1507925040, 1566646003, 1629566000, 1650551536, 1658941263,
+            1671540720, 1688321181, 1700908800, 1709298023, 1717688832, 1738661888, 1763828398,
+            1797383403, 1805773008, 1809970171, 1819148289, 1824457200,
+        ];
+        static OFFSETS: [u8; 875] = [
+            39, 1, 6, 1, 11, 1, 35, 1, 1, 1, 71, 1, 4, 1, 1, 1, 4, 1, 2, 2, 0, 192, 4, 2, 4, 1, 9, 2,
+            1, 1, 251, 7, 207, 1, 5, 1, 49, 45, 1, 1, 1, 2, 1, 2, 1, 1, 44, 1, 11, 6, 10, 11, 1, 1, 35,
+            1, 10, 21, 16, 1, 101, 8, 1, 10, 1, 4, 33, 1, 1, 1, 30, 27, 91, 11, 58, 11, 4, 1, 2, 1, 24,
+            24, 43, 3, 44, 1, 7, 2, 6, 8, 41, 58, 55, 1, 1, 1, 4, 8, 4, 1, 3, 7, 10, 2, 13, 1, 15, 1,
+            58, 1, 4, 4, 8, 1, 20, 2, 26, 1, 2, 2, 57, 1, 4, 2, 4, 2, 2, 3, 3, 1, 30, 2, 3, 1, 11, 2,
+            57, 1, 4, 5, 1, 2, 4, 1, 20, 2, 22, 6, 1, 1, 58, 1, 2, 1, 1, 4, 8, 1, 7, 2, 11, 2, 30, 1,
+            61, 1, 12, 1, 50, 1, 3, 1, 55, 1, 1, 3, 5, 3, 1, 4, 7, 2, 11, 2, 29, 1, 58, 1, 2, 1, 6, 1,
+            5, 2, 20, 2, 28, 2, 57, 2, 4, 4, 8, 1, 20, 2, 29, 1, 72, 1, 7, 3, 1, 1, 90, 1, 2, 7, 11, 9,
+            98, 1, 2, 9, 9, 1, 1, 7, 73, 2, 27, 1, 1, 1, 1, 1, 55, 14, 1, 5, 1, 2, 5, 11, 1, 36, 9, 1,
+            102, 4, 1, 6, 1, 2, 2, 2, 25, 2, 4, 3, 16, 4, 13, 1, 2, 2, 6, 1, 15, 1, 94, 1, 0, 3, 0, 3,
+            29, 2, 30, 2, 30, 2, 64, 2, 1, 7, 8, 1, 2, 11, 3, 1, 5, 1, 45, 5, 51, 1, 65, 2, 34, 1, 118,
+            3, 4, 2, 9, 1, 6, 3, 219, 2, 2, 1, 58, 1, 1, 7, 1, 1, 1, 1, 2, 8, 6, 10, 2, 1, 39, 1, 8, 31,
+            49, 4, 48, 1, 1, 5, 1, 1, 5, 1, 40, 9, 12, 2, 32, 4, 2, 2, 1, 3, 56, 1, 1, 2, 3, 1, 1, 3,
+            58, 8, 2, 2, 64, 6, 82, 3, 1, 13, 1, 7, 4, 1, 6, 1, 3, 2, 50, 63, 13, 1, 34, 101, 0, 1, 1,
+            3, 11, 3, 13, 3, 13, 3, 13, 2, 12, 5, 8, 2, 10, 1, 2, 1, 2, 5, 49, 5, 1, 10, 1, 1, 13, 1,
+            16, 13, 51, 33, 0, 2, 113, 3, 125, 1, 15, 1, 96, 32, 47, 1, 0, 1, 36, 4, 3, 5, 5, 1, 93, 6,
+            93, 3, 0, 1, 0, 6, 0, 1, 98, 4, 1, 10, 1, 1, 28, 4, 80, 2, 14, 34, 78, 1, 23, 3, 103, 3, 3,
+            2, 8, 1, 3, 1, 4, 1, 25, 2, 5, 1, 151, 2, 26, 18, 13, 1, 38, 8, 25, 11, 46, 3, 48, 1, 2, 4,
+            2, 2, 17, 1, 21, 2, 66, 6, 2, 2, 2, 2, 12, 1, 8, 1, 35, 1, 11, 1, 51, 1, 1, 3, 2, 2, 5, 2,
+            1, 1, 27, 1, 14, 2, 5, 2, 1, 1, 100, 5, 9, 3, 121, 1, 2, 1, 4, 1, 0, 1, 147, 17, 0, 16, 3,
+            1, 12, 16, 34, 1, 2, 1, 169, 1, 7, 1, 6, 1, 11, 1, 35, 1, 1, 1, 47, 1, 45, 2, 67, 1, 21, 3,
+            0, 1, 226, 1, 149, 5, 0, 6, 1, 42, 1, 9, 0, 3, 1, 2, 5, 4, 40, 3, 4, 1, 165, 2, 0, 4, 0, 2,
+            80, 3, 70, 11, 49, 4, 123, 1, 54, 15, 41, 1, 2, 2, 10, 3, 49, 4, 2, 2, 2, 1, 4, 1, 10, 1,
+            50, 3, 36, 5, 1, 8, 62, 1, 12, 2, 52, 9, 10, 4, 2, 1, 95, 3, 2, 1, 1, 2, 6, 1, 2, 1, 157, 1,
+            3, 8, 21, 2, 57, 2, 3, 1, 37, 7, 3, 5, 195, 8, 2, 3, 1, 1, 23, 1, 84, 6, 1, 1, 4, 2, 1, 2,
+            238, 4, 6, 2, 1, 2, 27, 2, 85, 8, 2, 1, 1, 2, 106, 1, 1, 1, 2, 6, 1, 1, 101, 3, 2, 4, 1, 5,
+            0, 9, 1, 2, 0, 2, 1, 1, 4, 1, 144, 4, 2, 2, 4, 1, 32, 10, 40, 6, 2, 4, 8, 1, 9, 6, 2, 3, 46,
+            13, 1, 2, 0, 7, 1, 6, 1, 1, 82, 22, 2, 7, 1, 2, 1, 2, 122, 6, 3, 1, 1, 2, 1, 7, 1, 1, 72, 2,
+            3, 1, 1, 1, 0, 2, 11, 2, 52, 5, 5, 1, 1, 1, 0, 17, 6, 15, 0, 5, 59, 7, 9, 4, 0, 1, 63, 17,
+            64, 2, 1, 2, 0, 4, 1, 7, 1, 2, 0, 2, 1, 4, 0, 46, 2, 23, 0, 3, 9, 16, 2, 7, 30, 4, 148, 3,
+            0, 55, 4, 50, 8, 1, 14, 1, 22, 5, 1, 15, 0, 7, 1, 17, 2, 7, 1, 2, 1, 5, 5, 62, 33, 1, 160,
+            14, 0, 1, 61, 4, 0, 5, 0, 7, 109, 8, 0, 5, 0, 1, 30, 96, 128, 240, 0,
+        ];
+        pub fn lookup(c: char) -> bool {
+            super::skip_search(
+                c as u32,
+                &SHORT_OFFSET_RUNS,
+                &OFFSETS,
+            )
+        }
+    }
+
+    #[rustfmt::skip]
+    pub mod cased {
+        static SHORT_OFFSET_RUNS: [u32; 22] = [
+            4256, 115348384, 136322176, 144711446, 163587254, 320875520, 325101120, 350268208,
+            392231680, 404815649, 413205504, 421595008, 467733632, 484513952, 492924480, 497144832,
+            501339814, 578936576, 627171376, 639756544, 643952944, 649261450,
+        ];
+        static OFFSETS: [u8; 315] = [
+            65, 26, 6, 26, 47, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 195, 1, 4, 4, 208, 1, 36, 7, 2, 30, 5,
+            96, 1, 42, 4, 2, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 9,
+            41, 0, 38, 1, 1, 5, 1, 2, 43, 1, 4, 0, 86, 2, 6, 0, 9, 7, 43, 2, 3, 64, 192, 64, 0, 2, 6, 2,
+            38, 2, 6, 2, 8, 1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, 13,
+            5, 3, 1, 7, 116, 1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, 4,
+            1, 6, 4, 1, 2, 4, 5, 5, 4, 1, 17, 32, 3, 2, 0, 52, 0, 229, 6, 4, 3, 2, 12, 38, 1, 1, 5, 1,
+            0, 46, 18, 30, 132, 102, 3, 4, 1, 59, 5, 2, 1, 1, 1, 5, 24, 5, 1, 3, 0, 43, 1, 14, 6, 80, 0,
+            7, 12, 5, 0, 26, 6, 26, 0, 80, 96, 36, 4, 36, 116, 11, 1, 15, 1, 7, 1, 2, 1, 11, 1, 15, 1,
+            7, 1, 2, 0, 1, 2, 3, 1, 42, 1, 9, 0, 51, 13, 51, 0, 64, 0, 64, 0, 85, 1, 71, 1, 2, 2, 1, 2,
+            2, 2, 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, 1, 4, 1, 5, 1, 1, 3, 7, 1, 0, 2,
+            25, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 8, 0, 10, 1, 20, 6, 6,
+            0, 62, 0, 68, 0, 26, 6, 26, 6, 26, 0,
+        ];
+        pub fn lookup(c: char) -> bool {
+            super::skip_search(
+                c as u32,
+                &SHORT_OFFSET_RUNS,
+                &OFFSETS,
+            )
+        }
+    }
+}
+
 crate::__internal_impl_any!(::std::string, FromUtf8Error);
crate::__internal_impl_any!(::std::string, Utf8Error);