From c9faa0be6c084439610045b77fad7eda949ca2fc Mon Sep 17 00:00:00 2001 From: Addison Crump Date: Thu, 28 Sep 2023 03:30:59 +0200 Subject: [PATCH 1/5] allow for non-ASCII decoding --- src/legacy.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/legacy.rs b/src/legacy.rs index d55f3a1..6060ac5 100644 --- a/src/legacy.rs +++ b/src/legacy.rs @@ -63,11 +63,6 @@ pub fn demangle(s: &str) -> Result<(Demangle, &str), ()> { return Err(()); }; - // only work with ascii text - if inner.bytes().any(|c| c & 0x80 != 0) { - return Err(()); - } - let mut elements = 0; let mut chars = inner.chars(); let mut c = chars.next().ok_or(())?; @@ -87,8 +82,9 @@ pub fn demangle(s: &str) -> Result<(Demangle, &str), ()> { // `c` already contains the first character of this identifier, skip it and // all the other characters of this identifier, to reach the next element. - for _ in 0..len { + while len > 0 { c = chars.next().ok_or(())?; + len = len.checked_sub(c.len_utf8()).ok_or(())?; } elements += 1; From 3cc40e7757337267679309b023f3ca427259aede Mon Sep 17 00:00:00 2001 From: Addison Crump Date: Thu, 28 Sep 2023 03:44:08 +0200 Subject: [PATCH 2/5] fixup and test --- src/legacy.rs | 2 +- src/lib.rs | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/legacy.rs b/src/legacy.rs index 6060ac5..199b2ed 100644 --- a/src/legacy.rs +++ b/src/legacy.rs @@ -83,8 +83,8 @@ pub fn demangle(s: &str) -> Result<(Demangle, &str), ()> { // `c` already contains the first character of this identifier, skip it and // all the other characters of this identifier, to reach the next element. while len > 0 { - c = chars.next().ok_or(())?; len = len.checked_sub(c.len_utf8()).ok_or(())?; + c = chars.next().ok_or(())?; } elements += 1; diff --git a/src/lib.rs b/src/lib.rs index cafec2f..5cc19a5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -406,6 +406,10 @@ mod tests { t!("_ZN4testE", "test"); t_err!("_ZN4test"); t!("_ZN4test1a2bcE", "test::a::bc"); + t_err!("🐇"); + t!("_ZN4🐇E", "🐇"); + t_err!("_ZN4🐇"); + t!("_ZN4🐇1a2bcE", "🐇::a::bc"); } #[test] From 8fdefc250f9aa72b2e3f9be850ca990e2b42f4d2 Mon Sep 17 00:00:00 2001 From: Addison Crump Date: Thu, 28 Sep 2023 03:44:59 +0200 Subject: [PATCH 3/5] separate tests for clarity --- src/lib.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 5cc19a5..e86722f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -406,6 +406,10 @@ mod tests { t!("_ZN4testE", "test"); t_err!("_ZN4test"); t!("_ZN4test1a2bcE", "test::a::bc"); + } + + #[test] + fn demangle_emoji() { t_err!("🐇"); t!("_ZN4🐇E", "🐇"); t_err!("_ZN4🐇"); From e9f84c16ed4e72c4d8fb5ae41d8f563df3e61f0c Mon Sep 17 00:00:00 2001 From: Addison Crump Date: Fri, 29 Sep 2023 00:33:18 +0200 Subject: [PATCH 4/5] demangle_line support for non-ASCII symbols --- src/lib.rs | 147 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 83 insertions(+), 64 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index e86722f..3cd0375 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -74,26 +74,12 @@ enum DemangleStyle<'a> { V0(v0::Demangle<'a>), } -/// De-mangles a Rust symbol into a more readable version -/// -/// This function will take a **mangled** symbol and return a value. When printed, -/// the de-mangled version will be written. If the symbol does not look like -/// a mangled symbol, the original value will be written instead. -/// -/// # Examples -/// -/// ``` -/// use rustc_demangle::demangle; -/// -/// assert_eq!(demangle("_ZN4testE").to_string(), "test"); -/// assert_eq!(demangle("_ZN3foo3barE").to_string(), "foo::bar"); -/// assert_eq!(demangle("foo").to_string(), "foo"); -/// ``` -pub fn demangle(mut s: &str) -> Demangle { +fn demangle_common(s: &str) -> Option<(DemangleStyle, &str)> { // During ThinLTO LLVM may import and rename internal symbols, so strip out // those endings first as they're one of the last manglings applied to symbol // names. let llvm = ".llvm."; + let mut thinlto_stripped = s; if let Some(i) = s.find(llvm) { let candidate = &s[i + llvm.len()..]; let all_hex = candidate.chars().all(|c| match c { @@ -102,21 +88,14 @@ pub fn demangle(mut s: &str) -> Demangle { }); if all_hex { - s = &s[..i]; + thinlto_stripped = &s[..i]; } } - let mut suffix = ""; - let mut style = match legacy::demangle(s) { - Ok((d, s)) => { - suffix = s; - Some(DemangleStyle::Legacy(d)) - } - Err(()) => match v0::demangle(s) { - Ok((d, s)) => { - suffix = s; - Some(DemangleStyle::V0(d)) - } + match legacy::demangle(thinlto_stripped) { + Ok((d, suffix)) => Some((DemangleStyle::Legacy(d), suffix)), + Err(()) => match v0::demangle(thinlto_stripped) { + Ok((d, suffix)) => Some((DemangleStyle::V0(d), suffix)), // FIXME(eddyb) would it make sense to treat an unknown-validity // symbol (e.g. one that errored with `RecursedTooDeep`) as // v0-mangled, and have the error show up in the demangling? @@ -124,65 +103,105 @@ pub fn demangle(mut s: &str) -> Demangle { // will show up in the demangling, if hidden behind a backref) Err(v0::ParseError::Invalid) | Err(v0::ParseError::RecursedTooDeep) => None, }, - }; + } +} - // Output like LLVM IR adds extra period-delimited words. See if - // we are in that case and save the trailing words if so. - if !suffix.is_empty() { - if suffix.starts_with('.') && is_symbol_like(suffix) { - // Keep the suffix. - } else { - // Reset the suffix and invalidate the demangling. - suffix = ""; - style = None; +/// De-mangles a Rust symbol into a more readable version +/// +/// This function will take a **mangled** symbol and return a value. When printed, +/// the de-mangled version will be written. If the symbol does not look like +/// a mangled symbol, the original value will be written instead. +/// +/// # Examples +/// +/// ``` +/// use rustc_demangle::demangle; +/// +/// assert_eq!(demangle("_ZN4testE").to_string(), "test"); +/// assert_eq!(demangle("_ZN3foo3barE").to_string(), "foo::bar"); +/// assert_eq!(demangle("foo").to_string(), "foo"); +/// ``` +pub fn demangle(s: &str) -> Demangle { + if let Some((style, remainder)) = demangle_common(s) { + // Output like LLVM IR adds extra period-delimited words. See if + // we are in that case and save the trailing words if so. + if remainder.is_empty() || (remainder.starts_with('.') && is_llvm_suffix_like(remainder)) { + return Demangle { + style: Some(style), + original: s, + suffix: remainder, + }; } } - Demangle { - style, + return Demangle { + style: None, original: s, - suffix, + suffix: "", + }; +} + +#[cfg(feature = "std")] +fn demangle_partial(s: &str) -> (Demangle, &str) { + if let Some((style, remainder)) = demangle_common(s) { + // Note: suffix is ALWAYS empty because we do not compute the + // LLVM compatibility (nor do we care) + return ( + Demangle { + style: Some(style), + original: s, + suffix: "", + }, + remainder, + ); } + + ( + Demangle { + style: None, + original: s, + suffix: "", + }, + s, + ) } #[cfg(feature = "std")] fn demangle_line( - line: &str, + mut line: &str, output: &mut impl std::io::Write, include_hash: bool, ) -> std::io::Result<()> { - let mut head = 0; - while head < line.len() { + loop { // Move to the next potential match - let next_head = match (line[head..].find("_ZN"), line[head..].find("_R")) { - (Some(idx), None) | (None, Some(idx)) => head + idx, - (Some(idx1), Some(idx2)) => head + idx1.min(idx2), + let next_head = match (line.find("_ZN"), line.find("_R")) { + (Some(idx), None) | (None, Some(idx)) => idx, + (Some(idx1), Some(idx2)) => idx1.min(idx2), (None, None) => { // No more matches... line.len() } }; - output.write_all(line[head..next_head].as_bytes())?; - head = next_head; - // Find the non-matching character. - // - // If we do not find a character, then until the end of the line is the - // thing to demangle. - let match_end = line[head..] - .find(|ch: char| !(ch == '$' || ch == '.' || ch == '_' || ch.is_ascii_alphanumeric())) - .map(|idx| head + idx) - .unwrap_or(line.len()); - - let mangled = &line[head..match_end]; - head = head + mangled.len(); - if let Ok(demangled) = try_demangle(mangled) { + output.write_all(line[..next_head].as_bytes())?; + line = &line[next_head..]; + + if line.is_empty() { + break; + } + + let (demangled, remainder) = demangle_partial(line); + line = remainder; + + if demangled.style.is_some() { if include_hash { write!(output, "{}", demangled)?; } else { write!(output, "{:#}", demangled)?; } } else { - output.write_all(mangled.as_bytes())?; + // there are maybe valid symbols inside this fake one + output.write_all(&line.as_bytes()[..1])?; + line = &line[1..]; } } Ok(()) @@ -250,7 +269,7 @@ impl<'a> Demangle<'a> { } } -fn is_symbol_like(s: &str) -> bool { +fn is_llvm_suffix_like(s: &str) -> bool { s.chars().all(|c| { // Once `char::is_ascii_punctuation` and `char::is_ascii_alphanumeric` // have been stable for long enough, use those instead for clarity @@ -572,7 +591,7 @@ mod tests { #[cfg(feature = "std")] fn demangle_str(input: &str) -> String { let mut output = Vec::new(); - super::demangle_line(input, &mut output, false); + super::demangle_line(input, &mut output, false).unwrap(); String::from_utf8(output).unwrap() } From 3ed564f296636b1359c37acbcc9f8a69c1a28939 Mon Sep 17 00:00:00 2001 From: Addison Crump Date: Fri, 29 Sep 2023 00:41:50 +0200 Subject: [PATCH 5/5] test for demangle_str with emoji --- src/lib.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 3cd0375..15d2921 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -604,6 +604,15 @@ mod tests { ); } + #[test] + #[cfg(feature = "std")] + fn find_multiple_emoji() { + assert_eq!( + demangle_str("_ZN4🐇E.llvm moocow _ZN4🐇E.llvm"), + "🐇.llvm moocow 🐇.llvm" + ); + } + #[test] #[cfg(feature = "std")] fn interleaved_new_legacy() {