Skip to content

Commit

Permalink
Add String::{to_lowercase,to_uppercase} (#669)
Browse files Browse the repository at this point in the history
  • Loading branch information
dstoza authored Feb 29, 2024
1 parent 8e69882 commit 7205da9
Showing 1 changed file with 255 additions and 0 deletions.
255 changes: 255 additions & 0 deletions crates/rune/src/modules/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ pub fn module() -> Result<Module, ContextError> {
module.function_meta(get)?;
module.function_meta(parse_int)?;
module.function_meta(parse_char)?;
module.function_meta(to_lowercase)?;
module.function_meta(to_uppercase)?;

module.function_meta(add)?;
module.function_meta(add_assign)?;
Expand Down Expand Up @@ -1103,5 +1105,258 @@ fn parse_char(s: &str) -> Result<char, char::ParseCharError> {
str::parse::<char>(s)
}

/// Returns the lowercase equivalent of this string slice, as a new [`String`].
///
/// 'Lowercase' is defined according to the terms of the Unicode Derived Core Property
/// `Lowercase`.
///
/// Since some characters can expand into multiple characters when changing
/// the case, this function returns a [`String`] instead of modifying the
/// parameter in-place.
///
/// # Examples
///
/// Basic usage:
///
/// ```rune
/// let s = "HELLO";
///
/// assert_eq!("hello", s.to_lowercase());
/// ```
///
/// A tricky example, with sigma:
///
/// ```rune
/// let sigma = "Σ";
///
/// assert_eq!("σ", sigma.to_lowercase());
///
/// // but at the end of a word, it's ς, not σ:
/// let odysseus = "ὈΔΥΣΣΕΎΣ";
///
/// assert_eq!("ὀδυσσεύς", odysseus.to_lowercase());
/// ```
///
/// Languages without case are not changed:
///
/// ```rune
/// let new_year = "农历新年";
///
/// assert_eq!(new_year, new_year.to_lowercase());
/// ```
#[rune::function(instance)]
fn to_lowercase(s: &str) -> VmResult<String> {
let mut lowercase = vm_try!(String::try_with_capacity(s.len()));
for (i, c) in s.char_indices() {
// Inlined code to from std::str to handle upper-case sigma,
// since it is the only Unicode character that is context-dependent
// See https://github.com/rust-lang/rust/issues/26035 for more context
if c == 'Σ' {
vm_try!(lowercase.try_push_str(map_uppercase_sigma(s, i)));
} else {
vm_try!(lowercase.try_extend(c.to_lowercase()));
}
}

return VmResult::Ok(lowercase);

fn map_uppercase_sigma(from: &str, i: usize) -> &'static str {
// See https://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G33992
// for the definition of `Final_Sigma`.
debug_assert!('Σ'.len_utf8() == 2);
let is_word_final = case_ignorable_then_cased(from[..i].chars().rev())
&& !case_ignorable_then_cased(from[i + 2..].chars());
if is_word_final {
"ς"
} else {
"σ"
}
}

fn case_ignorable_then_cased<I: core::iter::Iterator<Item = char>>(mut iter: I) -> bool {
match iter.find(|&c| !unicode::case_ignorable::lookup(c)) {
Some(c) => unicode::cased::lookup(c),
None => false,
}
}
}

/// Returns the uppercase equivalent of this string slice, as a new [`String`].
///
/// 'Uppercase' is defined according to the terms of the Unicode Derived Core Property
/// `Uppercase`.
///
/// Since some characters can expand into multiple characters when changing
/// the case, this function returns a [`String`] instead of modifying the
/// parameter in-place.
///
/// # Examples
///
/// Basic usage:
///
/// ```rune
/// let s = "hello";
///
/// assert_eq!("HELLO", s.to_uppercase());
/// ```
///
/// Scripts without case are not changed:
///
/// ```rune
/// let new_year = "农历新年";
///
/// assert_eq!(new_year, new_year.to_uppercase());
/// ```
///
/// One character can become multiple:
/// ```rune
/// let s = "tschüß";
///
/// assert_eq!("TSCHÜSS", s.to_uppercase());
/// ```
#[rune::function(instance)]
fn to_uppercase(s: &str) -> VmResult<String> {
let mut uppercase = vm_try!(String::try_with_capacity(s.len()));
vm_try!(uppercase.try_extend(s.chars().flat_map(|c| c.to_uppercase())));
VmResult::Ok(uppercase)
}

// Inlined code from core::unicode, since using it directly is marked as using an
// unstable library feature
mod unicode {
fn decode_prefix_sum(short_offset_run_header: u32) -> u32 {
short_offset_run_header & ((1 << 21) - 1)
}

fn decode_length(short_offset_run_header: u32) -> usize {
(short_offset_run_header >> 21) as usize
}

#[inline(always)]
fn skip_search<const SOR: usize, const OFFSETS: usize>(
needle: u32,
short_offset_runs: &[u32; SOR],
offsets: &[u8; OFFSETS],
) -> bool {
// Note that this *cannot* be past the end of the array, as the last
// element is greater than std::char::MAX (the largest possible needle).
//
// So, we cannot have found it (i.e. Ok(idx) + 1 != length) and the correct
// location cannot be past it, so Err(idx) != length either.
//
// This means that we can avoid bounds checking for the accesses below, too.
let last_idx =
match short_offset_runs.binary_search_by_key(&(needle << 11), |header| header << 11) {
Ok(idx) => idx + 1,
Err(idx) => idx,
};

let mut offset_idx = decode_length(short_offset_runs[last_idx]);
let length = if let Some(next) = short_offset_runs.get(last_idx + 1) {
decode_length(*next) - offset_idx
} else {
offsets.len() - offset_idx
};
let prev = last_idx
.checked_sub(1)
.map(|prev| decode_prefix_sum(short_offset_runs[prev]))
.unwrap_or(0);

let total = needle - prev;
let mut prefix_sum = 0;
for _ in 0..(length - 1) {
let offset = offsets[offset_idx];
prefix_sum += offset as u32;
if prefix_sum > total {
break;
}
offset_idx += 1;
}
offset_idx % 2 == 1
}

#[rustfmt::skip]
pub mod case_ignorable {
static SHORT_OFFSET_RUNS: [u32; 35] = [
688, 44045149, 572528402, 576724925, 807414908, 878718981, 903913493, 929080568, 933275148,
937491230, 1138818560, 1147208189, 1210124160, 1222707713, 1235291428, 1260457643,
1264654383, 1499535675, 1507925040, 1566646003, 1629566000, 1650551536, 1658941263,
1671540720, 1688321181, 1700908800, 1709298023, 1717688832, 1738661888, 1763828398,
1797383403, 1805773008, 1809970171, 1819148289, 1824457200,
];
static OFFSETS: [u8; 875] = [
39, 1, 6, 1, 11, 1, 35, 1, 1, 1, 71, 1, 4, 1, 1, 1, 4, 1, 2, 2, 0, 192, 4, 2, 4, 1, 9, 2,
1, 1, 251, 7, 207, 1, 5, 1, 49, 45, 1, 1, 1, 2, 1, 2, 1, 1, 44, 1, 11, 6, 10, 11, 1, 1, 35,
1, 10, 21, 16, 1, 101, 8, 1, 10, 1, 4, 33, 1, 1, 1, 30, 27, 91, 11, 58, 11, 4, 1, 2, 1, 24,
24, 43, 3, 44, 1, 7, 2, 6, 8, 41, 58, 55, 1, 1, 1, 4, 8, 4, 1, 3, 7, 10, 2, 13, 1, 15, 1,
58, 1, 4, 4, 8, 1, 20, 2, 26, 1, 2, 2, 57, 1, 4, 2, 4, 2, 2, 3, 3, 1, 30, 2, 3, 1, 11, 2,
57, 1, 4, 5, 1, 2, 4, 1, 20, 2, 22, 6, 1, 1, 58, 1, 2, 1, 1, 4, 8, 1, 7, 2, 11, 2, 30, 1,
61, 1, 12, 1, 50, 1, 3, 1, 55, 1, 1, 3, 5, 3, 1, 4, 7, 2, 11, 2, 29, 1, 58, 1, 2, 1, 6, 1,
5, 2, 20, 2, 28, 2, 57, 2, 4, 4, 8, 1, 20, 2, 29, 1, 72, 1, 7, 3, 1, 1, 90, 1, 2, 7, 11, 9,
98, 1, 2, 9, 9, 1, 1, 7, 73, 2, 27, 1, 1, 1, 1, 1, 55, 14, 1, 5, 1, 2, 5, 11, 1, 36, 9, 1,
102, 4, 1, 6, 1, 2, 2, 2, 25, 2, 4, 3, 16, 4, 13, 1, 2, 2, 6, 1, 15, 1, 94, 1, 0, 3, 0, 3,
29, 2, 30, 2, 30, 2, 64, 2, 1, 7, 8, 1, 2, 11, 3, 1, 5, 1, 45, 5, 51, 1, 65, 2, 34, 1, 118,
3, 4, 2, 9, 1, 6, 3, 219, 2, 2, 1, 58, 1, 1, 7, 1, 1, 1, 1, 2, 8, 6, 10, 2, 1, 39, 1, 8, 31,
49, 4, 48, 1, 1, 5, 1, 1, 5, 1, 40, 9, 12, 2, 32, 4, 2, 2, 1, 3, 56, 1, 1, 2, 3, 1, 1, 3,
58, 8, 2, 2, 64, 6, 82, 3, 1, 13, 1, 7, 4, 1, 6, 1, 3, 2, 50, 63, 13, 1, 34, 101, 0, 1, 1,
3, 11, 3, 13, 3, 13, 3, 13, 2, 12, 5, 8, 2, 10, 1, 2, 1, 2, 5, 49, 5, 1, 10, 1, 1, 13, 1,
16, 13, 51, 33, 0, 2, 113, 3, 125, 1, 15, 1, 96, 32, 47, 1, 0, 1, 36, 4, 3, 5, 5, 1, 93, 6,
93, 3, 0, 1, 0, 6, 0, 1, 98, 4, 1, 10, 1, 1, 28, 4, 80, 2, 14, 34, 78, 1, 23, 3, 103, 3, 3,
2, 8, 1, 3, 1, 4, 1, 25, 2, 5, 1, 151, 2, 26, 18, 13, 1, 38, 8, 25, 11, 46, 3, 48, 1, 2, 4,
2, 2, 17, 1, 21, 2, 66, 6, 2, 2, 2, 2, 12, 1, 8, 1, 35, 1, 11, 1, 51, 1, 1, 3, 2, 2, 5, 2,
1, 1, 27, 1, 14, 2, 5, 2, 1, 1, 100, 5, 9, 3, 121, 1, 2, 1, 4, 1, 0, 1, 147, 17, 0, 16, 3,
1, 12, 16, 34, 1, 2, 1, 169, 1, 7, 1, 6, 1, 11, 1, 35, 1, 1, 1, 47, 1, 45, 2, 67, 1, 21, 3,
0, 1, 226, 1, 149, 5, 0, 6, 1, 42, 1, 9, 0, 3, 1, 2, 5, 4, 40, 3, 4, 1, 165, 2, 0, 4, 0, 2,
80, 3, 70, 11, 49, 4, 123, 1, 54, 15, 41, 1, 2, 2, 10, 3, 49, 4, 2, 2, 2, 1, 4, 1, 10, 1,
50, 3, 36, 5, 1, 8, 62, 1, 12, 2, 52, 9, 10, 4, 2, 1, 95, 3, 2, 1, 1, 2, 6, 1, 2, 1, 157, 1,
3, 8, 21, 2, 57, 2, 3, 1, 37, 7, 3, 5, 195, 8, 2, 3, 1, 1, 23, 1, 84, 6, 1, 1, 4, 2, 1, 2,
238, 4, 6, 2, 1, 2, 27, 2, 85, 8, 2, 1, 1, 2, 106, 1, 1, 1, 2, 6, 1, 1, 101, 3, 2, 4, 1, 5,
0, 9, 1, 2, 0, 2, 1, 1, 4, 1, 144, 4, 2, 2, 4, 1, 32, 10, 40, 6, 2, 4, 8, 1, 9, 6, 2, 3, 46,
13, 1, 2, 0, 7, 1, 6, 1, 1, 82, 22, 2, 7, 1, 2, 1, 2, 122, 6, 3, 1, 1, 2, 1, 7, 1, 1, 72, 2,
3, 1, 1, 1, 0, 2, 11, 2, 52, 5, 5, 1, 1, 1, 0, 17, 6, 15, 0, 5, 59, 7, 9, 4, 0, 1, 63, 17,
64, 2, 1, 2, 0, 4, 1, 7, 1, 2, 0, 2, 1, 4, 0, 46, 2, 23, 0, 3, 9, 16, 2, 7, 30, 4, 148, 3,
0, 55, 4, 50, 8, 1, 14, 1, 22, 5, 1, 15, 0, 7, 1, 17, 2, 7, 1, 2, 1, 5, 5, 62, 33, 1, 160,
14, 0, 1, 61, 4, 0, 5, 0, 7, 109, 8, 0, 5, 0, 1, 30, 96, 128, 240, 0,
];
pub fn lookup(c: char) -> bool {
super::skip_search(
c as u32,
&SHORT_OFFSET_RUNS,
&OFFSETS,
)
}
}

#[rustfmt::skip]
pub mod cased {
static SHORT_OFFSET_RUNS: [u32; 22] = [
4256, 115348384, 136322176, 144711446, 163587254, 320875520, 325101120, 350268208,
392231680, 404815649, 413205504, 421595008, 467733632, 484513952, 492924480, 497144832,
501339814, 578936576, 627171376, 639756544, 643952944, 649261450,
];
static OFFSETS: [u8; 315] = [
65, 26, 6, 26, 47, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 195, 1, 4, 4, 208, 1, 36, 7, 2, 30, 5,
96, 1, 42, 4, 2, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 9,
41, 0, 38, 1, 1, 5, 1, 2, 43, 1, 4, 0, 86, 2, 6, 0, 9, 7, 43, 2, 3, 64, 192, 64, 0, 2, 6, 2,
38, 2, 6, 2, 8, 1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, 13,
5, 3, 1, 7, 116, 1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, 4,
1, 6, 4, 1, 2, 4, 5, 5, 4, 1, 17, 32, 3, 2, 0, 52, 0, 229, 6, 4, 3, 2, 12, 38, 1, 1, 5, 1,
0, 46, 18, 30, 132, 102, 3, 4, 1, 59, 5, 2, 1, 1, 1, 5, 24, 5, 1, 3, 0, 43, 1, 14, 6, 80, 0,
7, 12, 5, 0, 26, 6, 26, 0, 80, 96, 36, 4, 36, 116, 11, 1, 15, 1, 7, 1, 2, 1, 11, 1, 15, 1,
7, 1, 2, 0, 1, 2, 3, 1, 42, 1, 9, 0, 51, 13, 51, 0, 64, 0, 64, 0, 85, 1, 71, 1, 2, 2, 1, 2,
2, 2, 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, 1, 4, 1, 5, 1, 1, 3, 7, 1, 0, 2,
25, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 8, 0, 10, 1, 20, 6, 6,
0, 62, 0, 68, 0, 26, 6, 26, 6, 26, 0,
];
pub fn lookup(c: char) -> bool {
super::skip_search(
c as u32,
&SHORT_OFFSET_RUNS,
&OFFSETS,
)
}
}
}

crate::__internal_impl_any!(::std::string, FromUtf8Error);
crate::__internal_impl_any!(::std::string, Utf8Error);

0 comments on commit 7205da9

Please sign in to comment.