diff --git a/utils/zerotrie/src/options.rs b/utils/zerotrie/src/options.rs
index af9e90fd87a..df37a4415d1 100644
--- a/utils/zerotrie/src/options.rs
+++ b/utils/zerotrie/src/options.rs
@@ -73,6 +73,15 @@ pub(crate) enum CaseSensitivity {
     IgnoreCase,
 }
 
+/// How to handle lookup for strings with mixed ASCII case. Only used in ignore-case tries.
+#[derive(Copy, Clone)]
+pub(crate) enum LookupStrictness {
+    /// Select strings that differ in case so long as their `to_ascii_lowercase` matches.
+    Normal,
+    /// Select strings only if they match exactly, including ASCII case.
+    Strict,
+}
+
 impl CaseSensitivity {
     #[cfg(feature = "serde")]
     const fn to_u8_flag(self) -> u8 {
@@ -89,6 +98,7 @@ pub(crate) struct ZeroTrieBuilderOptions {
     pub ascii_mode: AsciiMode,
     pub capacity_mode: CapacityMode,
     pub case_sensitivity: CaseSensitivity,
+    pub lookup_strictness: LookupStrictness,
 }
 
 impl ZeroTrieBuilderOptions {
@@ -113,6 +123,7 @@ impl ZeroTrieWithOptions for crate::ZeroTrieSimpleAscii {
         ascii_mode: AsciiMode::AsciiOnly,
         capacity_mode: CapacityMode::Normal,
         case_sensitivity: CaseSensitivity::Sensitive,
+        lookup_strictness: LookupStrictness::Normal,
     };
 }
 
@@ -129,6 +140,7 @@ impl ZeroTrieWithOptions for crate::ZeroAsciiIgnoreCaseTrie {
         ascii_mode: AsciiMode::AsciiOnly,
         capacity_mode: CapacityMode::Normal,
         case_sensitivity: CaseSensitivity::IgnoreCase,
+        lookup_strictness: LookupStrictness::Normal,
     };
 }
 
@@ -137,6 +149,16 @@ impl crate::ZeroAsciiIgnoreCaseTrie {
     pub(crate) const FLAGS: u8 = Self::OPTIONS.to_u8_flags();
 }
 
+/// Internal marker type carrying the strict-lookup options that power `get_strict`.
+pub(crate) struct ZeroAsciiIgnoreCaseStrictTrie;
+
+impl ZeroTrieWithOptions for ZeroAsciiIgnoreCaseStrictTrie {
+    const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions {
+        lookup_strictness: LookupStrictness::Strict,
+        ..crate::ZeroAsciiIgnoreCaseTrie::OPTIONS
+    };
+}
+
 /// Branch nodes could be either binary search or PHF.
 impl ZeroTrieWithOptions for crate::ZeroTriePerfectHash {
     const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions {
@@ -144,6 +166,7 @@ impl ZeroTrieWithOptions for crate::ZeroTriePerfectHash {
         ascii_mode: AsciiMode::BinarySpans,
         capacity_mode: CapacityMode::Normal,
         case_sensitivity: CaseSensitivity::Sensitive,
+        lookup_strictness: LookupStrictness::Normal,
     };
 }
 
@@ -159,6 +182,7 @@ impl ZeroTrieWithOptions for crate::ZeroTrieExtendedCapacity {
         ascii_mode: AsciiMode::BinarySpans,
         capacity_mode: CapacityMode::Extended,
         case_sensitivity: CaseSensitivity::Sensitive,
+        lookup_strictness: LookupStrictness::Normal,
     };
 }
 
diff --git a/utils/zerotrie/src/reader.rs b/utils/zerotrie/src/reader.rs
index eed1c80aaad..64ae6e1651f 100644
--- a/utils/zerotrie/src/reader.rs
+++ b/utils/zerotrie/src/reader.rs
@@ -322,6 +322,7 @@ pub(crate) fn get_parameterized(
         if let Some((c, temp)) = ascii.split_first() {
             if matches!(byte_type, NodeType::Ascii) {
                 let is_match = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase)
+                    && matches!(T::OPTIONS.lookup_strictness, LookupStrictness::Normal)
                 {
                     b.to_ascii_lowercase() == c.to_ascii_lowercase()
                 } else {
@@ -367,12 +368,28 @@ pub(crate) fn get_parameterized(
             if matches!(T::OPTIONS.phf_mode, PhfMode::BinaryOnly) || x < 16 {
                 // binary search
                 (search, trie) = trie.debug_split_at(x);
+                // TODO(#5584): Consider making all of these have the same order of elements
                 let bsearch_result =
                     if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) {
-                        search.binary_search_by_key(&c.to_ascii_lowercase(), |x| {
-                            x.to_ascii_lowercase()
-                        })
+                        if matches!(T::OPTIONS.lookup_strictness, LookupStrictness::Normal) {
+                            // Ordering: (A=a), (B=b), (C=c), ..., (Z=z)
+                            search.binary_search_by_key(&c.to_ascii_lowercase(), |x| {
+                                x.to_ascii_lowercase()
+                            })
+                        } else {
+                            // Ordering: A, a, B, b, C, c, ..., Z, z
+                            let c_lowercase = c.to_ascii_lowercase();
+                            search.binary_search_by(move |p| {
+                                let p_lowercase = p.to_ascii_lowercase();
+                                if c_lowercase == p_lowercase {
+                                    p.cmp(c)
+                                } else {
+                                    p_lowercase.cmp(&c_lowercase)
+                                }
+                            })
+                        }
                     } else {
+                        // Ordering: A, B, C, ..., Z, a, b, c, ..., z
                         search.binary_search(c)
                     };
                 i = bsearch_result.ok()?;
diff --git a/utils/zerotrie/src/zerotrie.rs b/utils/zerotrie/src/zerotrie.rs
index 21d6b430de2..e82c9262ec5 100644
--- a/utils/zerotrie/src/zerotrie.rs
+++ b/utils/zerotrie/src/zerotrie.rs
@@ -665,6 +665,47 @@ impl_zerotrie_subtype!(
     Vec::into_boxed_slice
 );
 
+impl<Store> ZeroAsciiIgnoreCaseTrie<Store>
+where
+    Store: AsRef<[u8]> + ?Sized,
+{
+    /// Queries the trie for a string, requiring that it matches case.
+    ///
+    /// Unlike `get`, a key differing from the stored string only in ASCII case yields `None`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use litemap::LiteMap;
+    /// use zerotrie::ZeroAsciiIgnoreCaseTrie;
+    ///
+    /// let mut map = LiteMap::new_vec();
+    /// map.insert(&b"foo"[..], 1);
+    /// map.insert(b"Bar", 2);
+    /// map.insert(b"Bingo", 3);
+    ///
+    /// let trie = ZeroAsciiIgnoreCaseTrie::try_from(&map)?;
+    ///
+    /// assert_eq!(trie.get(b"foo"), Some(1));
+    /// assert_eq!(trie.get(b"bar"), Some(2));
+    /// assert_eq!(trie.get(b"BaR"), Some(2));
+    /// assert_eq!(trie.get_strict(b"bar"), None);
+    /// assert_eq!(trie.get_strict(b"BaR"), None);
+    /// assert_eq!(trie.get_strict(b"Bar"), Some(2));
+    ///
+    /// # Ok::<_, zerotrie::ZeroTrieBuildError>(())
+    /// ```
+    pub fn get_strict<K>(&self, key: K) -> Option<usize>
+    where
+        K: AsRef<[u8]>,
+    {
+        reader::get_parameterized::<crate::options::ZeroAsciiIgnoreCaseStrictTrie>(
+            self.store.as_ref(),
+            key.as_ref(),
+        )
+    }
+}
+
 macro_rules! impl_dispatch {
     ($self:ident, $inner_fn:ident()) => {
         match $self.0 {