From cc6d6527dafee5ade0a2931d9ff90adc228a7f14 Mon Sep 17 00:00:00 2001 From: Jan Verbeek Date: Fri, 5 Nov 2021 14:16:09 +0100 Subject: [PATCH] Only escape unterminated BIDI control codes --- .github/workflows/ci.yaml | 1 + CHANGELOG.md | 2 +- README.md | 12 ++++ fuzz/Cargo.toml | 7 +++ fuzz/fuzz_targets/bidi.rs | 60 ++++++++++++++++++++ src/lib.rs | 116 ++++++++++++++++++++++++++++++++++---- src/unix.rs | 10 +++- src/windows.rs | 12 +++- 8 files changed, 205 insertions(+), 15 deletions(-) create mode 100644 fuzz/fuzz_targets/bidi.rs diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index c8f8557..e8c25d9 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -45,6 +45,7 @@ jobs: - run: cargo install cargo-fuzz # This is too short to catch subtle issues, but it hopefully catches glaring ones. - run: cargo +nightly fuzz run basic -- -max_len=32 -timeout=1 -max_total_time=60 + - run: cargo +nightly fuzz run bidi -- -max_len=32 -timeout=1 -max_total_time=60 - run: cargo +nightly fuzz run shell -- -max_len=32 -timeout=1 -max_total_time=180 - run: cargo +nightly fuzz run powershell -- -max_len=32 -timeout=1 -max_total_time=180 diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a1365e..08ef915 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,5 @@ ## v0.1.2 (unreleased) -- Escape dangerous control codes for bidirectional text. See also: [rustc CVE-2021-42574](https://blog.rust-lang.org/2021/11/01/cve-2021-42574.html). +- Escape dangerous control codes for bidirectional text. See also: [CVE-2021-42574](https://blog.rust-lang.org/2021/11/01/cve-2021-42574.html). ## v0.1.1 (2021-10-14) - Escape unicode control characters like `U+0085 NEXT LINE` and `U+2028 LINE SEPARATOR`. diff --git a/README.md b/README.md index 74321c3..75ce5fc 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,16 @@ assert_eq!("\u{200B}".maybe_quote().to_string(), "'\u{200B}'"); It still misleadingly looks like `''` when printed, but it's possible to copy and paste it and get the right result. +## Bidirectional unicode +A carefully-crafted string can move part of itself to the end of the line: +```console +$ wc $'filename\u202E\u2066 [This does not belong!]\u2069\u2066' +wc: 'filename': No such file or directory [This does not belong!] +``` +This is known as a [*Trojan Source*](https://trojansource.codes/) attack. It uses control codes for bidirectional text. + +`os_display` escapes those control codes if they're not properly terminated. + ## Feature flags By default you can only use the current platform's quoting style. That's appropriate most of the time. @@ -140,6 +150,8 @@ The Unix implementation has been [fuzzed](https://github.com/rust-fuzz/cargo-fuz The PowerShell implementation has been fuzzed against PowerShell Core 7.1.4 running on Linux. +Both implementations have been fuzzed to test their protection against Trojan Source attacks. + ## Acknowledgments This library is modeled after the quoting done by [Gnulib](https://www.gnu.org/software/gnulib/) as seen in the GNU coreutils. The behavior is not identical, however: - GNU uses octal escapes, like `\377` instead of `\xFF`. diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 9f75abf..e649bad 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -12,6 +12,7 @@ cargo-fuzz = true libfuzzer-sys = "0.4" once_cell = "1.8.0" unicode-width = "0.1.9" +unic-bidi = "0.9.0" [dependencies.os_display] path = ".." @@ -38,3 +39,9 @@ name = "powershell" path = "fuzz_targets/powershell.rs" test = false doc = false + +[[bin]] +name = "bidi" +path = "fuzz_targets/bidi.rs" +test = false +doc = false diff --git a/fuzz/fuzz_targets/bidi.rs b/fuzz/fuzz_targets/bidi.rs new file mode 100644 index 0000000..7c3eee1 --- /dev/null +++ b/fuzz/fuzz_targets/bidi.rs @@ -0,0 +1,60 @@ +#![no_main] +use libfuzzer_sys::fuzz_target; + +use os_display::Quoted; + +fn assert_bidi_safe(text: &str) { + // The prefix stops unic_bidi from RTL-ing the context if there are + // RTL codepoints inside text. Without the prefix the suffix may end up + // at the start. + + // I think that for certain combinations of terminal, text, and context it + // may be possible to mangle the context using only "normal" RTL codepoints. + // With VTE it only seems possible if the context contains RTL codepoints, + // but other terminals may follow different rules (e.g. Terminal.app). + + // The context mangling is unfortunate, but it's not nearly as dangerous as + // a proper trojan-source attack (it's weak and conspicuous), and I think + // it's flat-out impossible to prevent within this crate's constraints. + // Escaping all RTL codepoints renders text unreadable, and adding LTR markers + // means it can't be copied faithfully. + + let text = format!("a {} b", text); + + let info = unic_bidi::BidiInfo::new(&text, None); + assert_eq!(info.paragraphs.len(), 1); + let para = &info.paragraphs[0]; + let reordered = info.reorder_line(para, para.range.clone()); + + assert!(reordered.ends_with('b'), "{:?} → {:?}", text, reordered); + assert!(reordered.starts_with('a')); +} + +const WEIRD_CHARS: &[char] = &[ + // Explicitly called out in the paper. + '\u{202A}', '\u{202B}', '\u{202C}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}', + '\u{2069}', + // Other bidi. Not as dangerous (and ~equivalent to certain ordinary characters), but can't hurt to test. + '\u{061C}', '\u{200E}', '\u{200F}', +]; + +fuzz_target!(|data: &[u8]| { + let mut owned = Vec::new(); + for ch in data { + match *ch { + b'a'..=b'l' => owned.extend( + WEIRD_CHARS[(*ch - b'a') as usize] + .encode_utf8(&mut [0; 4]) + .as_bytes(), + ), + _ => owned.push(*ch), + } + } + let data = owned; + let unix = Quoted::unix_raw(&data).force(false).to_string(); + assert_bidi_safe(&unix); + if let Ok(text) = String::from_utf8(data) { + let windows = Quoted::windows(&text).force(false).to_string(); + assert_bidi_safe(&windows); + } +}); diff --git a/src/lib.rs b/src/lib.rs index a0db3e8..b878070 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -228,7 +228,7 @@ fn decode_utf16(units: impl IntoIterator) -> impl Iterator bool { - ch.is_control() || is_separator(ch) || is_dangerous_bidi(ch) + ch.is_control() || is_separator(ch) } /// U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR are currently the only @@ -241,22 +241,82 @@ fn is_separator(ch: char) -> bool { /// These two ranges in PropList.txt: /// LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE /// LEFT-TO-RIGHT ISOLATE..POP DIRECTIONAL ISOLATE -/// -/// ARABIC LETTER MARK and LEFT-TO-RIGHT MARK/RIGHT-TO-LEFT MARK don't appear -/// to be dangerous, and escaping them could mess up legitimate text. -/// -/// For more info, see this Rust CVE: -/// https://blog.rust-lang.org/2021/11/01/cve-2021-42574.html -/// -/// This should be checked again for Unicode 15.0, which will release on -/// September 11, 2022. -fn is_dangerous_bidi(ch: char) -> bool { +fn is_bidi(ch: char) -> bool { match ch { '\u{202A}'..='\u{202E}' | '\u{2066}'..='\u{2069}' => true, _ => false, } } +/// Check whether text uses bidi in a potentially problematic way. +/// +/// See https://trojansource.codes/ and +/// https://www.unicode.org/reports/tr9/tr9-42.html. +/// +/// If text fails this check then it's handled by write_escaped(), which +/// escapes these bidi control characters no matter what. +/// +/// We can safely assume that there are no newlines (or unicode separators) +/// in the text because those would get it sent to write_escaped() earlier. +/// In unicode terms, this is all a single paragraph. +#[inline(never)] +fn is_suspicious_bidi(text: &str) -> bool { + #[derive(Clone, Copy, PartialEq)] + enum Kind { + Formatting, + Isolate, + } + const STACK_SIZE: usize = 16; + // Can't use a Vec because of no_std + let mut stack: [Option; STACK_SIZE] = [None; STACK_SIZE]; + let mut pos = 0; + for ch in text.chars() { + match ch { + '\u{202A}' | '\u{202B}' | '\u{202D}' | '\u{202E}' => { + if pos >= STACK_SIZE { + // Suspicious amount of nesting. + return true; + } + stack[pos] = Some(Kind::Formatting); + pos += 1; + } + '\u{202C}' => { + if pos == 0 { + // Unpaired terminator. + // Not necessarily dangerous, but suspicious and + // could disrupt preceding text. + return true; + } + pos -= 1; + if stack[pos] != Some(Kind::Formatting) { + // Terminator doesn't match. + // UAX #9 says to pop the stack until we find a match. + // But we'll keep things simple and cautious. + return true; + } + } + '\u{2066}' | '\u{2067}' | '\u{2068}' => { + if pos >= STACK_SIZE { + return true; + } + stack[pos] = Some(Kind::Isolate); + pos += 1; + } + '\u{2069}' => { + if pos == 0 { + return true; + } + pos -= 1; + if stack[pos] != Some(Kind::Isolate) { + return true; + } + } + _ => (), + } + } + pos != 0 +} + #[cfg(feature = "native")] mod native { use super::*; @@ -344,7 +404,7 @@ mod tests { use super::*; - use std::string::ToString; + use std::string::{String, ToString}; const BOTH_ALWAYS: &[(&str, &str)] = &[ ("foo", "'foo'"), @@ -367,6 +427,11 @@ mod tests { ("\u{200B}a", "'\u{200B}a'"), ("a\u{200B}", "a\u{200B}"), ("\u{2000}", "'\u{2000}'"), + // Odd but safe bidi + ( + "\u{2067}\u{2066}abc\u{2069}\u{2066}def\u{2069}\u{2069}", + "'\u{2067}\u{2066}abc\u{2069}\u{2066}def\u{2069}\u{2069}'", + ), ]; const UNIX_ALWAYS: &[(&str, &str)] = &[ @@ -387,6 +452,7 @@ mod tests { ("\u{85}", r#"$'\xC2\x85'"#), ("\u{85}a", r#"$'\xC2\x85'$'a'"#), ("\u{2028}", r#"$'\xE2\x80\xA8'"#), + // Dangerous bidi ( "user\u{202E} \u{2066}// Check if admin\u{2069} \u{2066}", r#"$'user\xE2\x80\xAE \xE2\x81\xA6// Check if admin\xE2\x81\xA9 \xE2\x81\xA6'"#, @@ -409,6 +475,13 @@ mod tests { for &(orig, expected) in UNIX_RAW { assert_eq!(Quoted::unix_raw(orig).to_string(), expected); } + let bidi_ok = nest_bidi(16); + assert_eq!( + Quoted::unix(&bidi_ok).to_string(), + "'".to_string() + &bidi_ok + "'" + ); + let bidi_too_deep = nest_bidi(17); + assert!(Quoted::unix(&bidi_too_deep).to_string().starts_with('$')); } const WINDOWS_ALWAYS: &[(&str, &str)] = &[ @@ -450,6 +523,13 @@ mod tests { for &(orig, expected) in WINDOWS_RAW { assert_eq!(Quoted::windows_raw(orig).to_string(), expected); } + let bidi_ok = nest_bidi(16); + assert_eq!( + Quoted::windows(&bidi_ok).to_string(), + "'".to_string() + &bidi_ok + "'" + ); + let bidi_too_deep = nest_bidi(17); + assert!(Quoted::windows(&bidi_too_deep).to_string().contains('`')); } #[cfg(feature = "native")] @@ -511,4 +591,16 @@ mod tests { Path::new("foo").to_owned().quote(); Cow::Borrowed(Path::new("foo")).quote(); } + + fn nest_bidi(n: usize) -> String { + let mut out = String::new(); + for _ in 0..n { + out.push('\u{2066}'); + } + out.push('a'); + for _ in 0..n { + out.push('\u{2069}'); + } + out + } } diff --git a/src/unix.rs b/src/unix.rs index c5ce592..8efaae9 100644 --- a/src/unix.rs +++ b/src/unix.rs @@ -23,6 +23,7 @@ pub(crate) fn write(f: &mut Formatter<'_>, text: &str, force_quote: bool) -> fmt let mut is_single_safe = true; let mut is_double_safe = true; let mut requires_quote = force_quote; + let mut is_bidi = false; if !requires_quote { if let Some(first) = text.chars().next() { @@ -75,12 +76,19 @@ pub(crate) fn write(f: &mut Formatter<'_>, text: &str, force_quote: bool) -> fmt // This check goes stale when new whitespace codepoints are assigned. requires_quote = true; } + if crate::is_bidi(ch) { + is_bidi = true; + } if crate::requires_escape(ch) { return write_escaped(f, text.as_bytes()); } } } + if is_bidi && crate::is_suspicious_bidi(text) { + return write_escaped(f, text.as_bytes()); + } + if !requires_quote { f.write_str(text) } else if is_single_safe { @@ -155,7 +163,7 @@ pub(crate) fn write_escaped(f: &mut Formatter<'_>, text: &[u8]) -> fmt::Result { // and null bytes can't appear in arguments anyway, // so let's stay clear of that. // Some but not all shells have \e for \x1B. - ch if crate::requires_escape(ch) => { + ch if crate::requires_escape(ch) || crate::is_bidi(ch) => { // Most shells support \uXXXX escape codes, but busybox sh // doesn't, so we always encode the raw UTF-8. Bit unfortunate, // but GNU does the same. diff --git a/src/windows.rs b/src/windows.rs index 981658f..4feffba 100644 --- a/src/windows.rs +++ b/src/windows.rs @@ -24,6 +24,7 @@ pub(crate) fn write(f: &mut Formatter<'_>, text: &str, force_quote: bool) -> fmt let mut is_single_safe = true; let mut is_double_safe = true; let mut requires_quote = force_quote; + let mut is_bidi = false; if !requires_quote { if let Some(first) = text.chars().next() { @@ -94,12 +95,19 @@ pub(crate) fn write(f: &mut Formatter<'_>, text: &str, force_quote: bool) -> fmt is_single_safe = false; requires_quote = true; } + if crate::is_bidi(ch) { + is_bidi = true; + } if crate::requires_escape(ch) { return write_escaped(f, text.chars().map(Ok)); } } } + if is_bidi && crate::is_suspicious_bidi(text) { + return write_escaped(f, text.chars().map(Ok)); + } + if !requires_quote { f.write_str(text) } else if is_single_safe { @@ -163,7 +171,9 @@ pub(crate) fn write_escaped( '\x08' => f.write_str("`b")?, '\x0b' => f.write_str("`v")?, '\x0c' => f.write_str("`f")?, - ch if crate::requires_escape(ch) => write!(f, "`u{{{:02X}}}", ch as u32)?, + ch if crate::requires_escape(ch) || crate::is_bidi(ch) => { + write!(f, "`u{{{:02X}}}", ch as u32)? + } '`' => f.write_str("``")?, '$' => f.write_str("`$")?, ch if unicode::is_double_quote(ch) => {