Skip to content

Commit

Permalink
feat(stream): Add strip_bytes support
Browse files Browse the repository at this point in the history
This has more overhead as it has to track the UTF-8 parse state to tell
UTF-8 continuation bytes apart from single-byte control codes.
  • Loading branch information
epage committed Mar 7, 2023
1 parent d6dbbbc commit 7fadaf2
Show file tree
Hide file tree
Showing 5 changed files with 274 additions and 5 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions crates/anstyle-stream/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ pre-release-replacements = [

[dependencies]
anstyle = { version = "0.2.5", path = "../anstyle" }
anstyle-parse = { version = "0.1.0", path = "../anstyle-parse", default-features = false }
anstyle-parse = { version = "0.1.0", path = "../anstyle-parse" }
utf8parse = "0.2.0"

[dev-dependencies]
criterion = "0.4.0"
proptest = "1.1.0"
strip-ansi-escapes = "0.1.1"
anstyle-parse = { version = "0.1.0", path = "../anstyle-parse", default-features = false, features = ["utf8"] }

[[bench]]
name = "strip"
Expand Down
14 changes: 14 additions & 0 deletions crates/anstyle-stream/benches/strip.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ fn strip(c: &mut Criterion) {
stripped.0,
anstyle_stream::adapter::strip_str(content).to_string()
);
assert_eq!(
stripped.0,
String::from_utf8(
anstyle_stream::adapter::strip_bytes(content.as_bytes()).into_vec()
)
.unwrap()
);
}

let mut group = c.benchmark_group(name);
Expand Down Expand Up @@ -84,6 +91,13 @@ fn strip(c: &mut Criterion) {
})
});
}
group.bench_function("strip_bytes", |b| {
b.iter(|| {
let stripped = anstyle_stream::adapter::strip_bytes(content).into_vec();

black_box(stripped)
})
});
}
}

Expand Down
4 changes: 4 additions & 0 deletions crates/anstyle-stream/src/adapter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@

mod strip;

pub use strip::strip_bytes;
pub use strip::strip_str;
pub use strip::StripBytes;
pub use strip::StripBytesIter;
pub use strip::StripStr;
pub use strip::StripStrIter;
pub use strip::StrippedBytes;
pub use strip::StrippedStr;
256 changes: 253 additions & 3 deletions crates/anstyle-stream/src/adapter/strip.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,207 @@ fn is_utf8_continuation(b: u8) -> bool {
matches!(b, 0x80..=0xbf)
}

/// Strip ANSI escapes from bytes, returning the printable content
///
/// This can be used to take output from a program that includes escape sequences and write it
/// somewhere that does not easily support them, such as a log file.
///
/// The returned [`StrippedBytes`] iterates over the printable sub-slices of `data`; call
/// [`StrippedBytes::into_vec`] to gather them into a single buffer.
///
/// # Example
///
/// ```rust
/// let styled_text = "\x1b[32mfoo\x1b[m bar";
/// let plain_bytes = anstyle_stream::adapter::strip_bytes(styled_text.as_bytes()).into_vec();
/// assert_eq!(plain_bytes.as_slice(), &b"foo bar"[..]);
/// ```
#[inline]
pub fn strip_bytes(data: &[u8]) -> StrippedBytes<'_> {
StrippedBytes::new(data)
}

/// Iterator over the printable slices of a byte buffer
///
/// See [`strip_bytes`]
#[derive(Default)]
pub struct StrippedBytes<'s> {
// Input that has not been examined yet
bytes: &'s [u8],
// Escape-sequence parser state, carried from one `next` call to the following one
state: State,
// Decoder state for a multi-byte UTF-8 sequence that may be in progress
utf8parser: Utf8Parser,
}

impl<'s> StrippedBytes<'s> {
/// See [`strip_bytes`]
#[inline]
pub fn new(bytes: &'s [u8]) -> Self {
Self {
bytes,
state: State::Ground,
utf8parser: Default::default(),
}
}

/// Strip the next slice of bytes
///
/// Used when the content is in several non-contiguous slices
///
/// # Panic
///
/// May panic if it is not exhausted / empty
#[inline]
pub fn extend(&mut self, bytes: &'s [u8]) {
debug_assert!(
self.is_empty(),
"current bytes must be processed to ensure we end at the right state"
);
self.bytes = bytes;
}

/// Report the bytes has been exhausted
#[inline]
pub fn is_empty(&self) -> bool {
self.bytes.is_empty()
}

/// Create a [`Vec`] of the printable content
#[inline]
pub fn into_vec(self) -> Vec<u8> {
let mut stripped = Vec::with_capacity(self.bytes.len());
for printable in self {
stripped.extend(printable);
}
stripped
}
}

impl<'s> Iterator for StrippedBytes<'s> {
    type Item = &'s [u8];

    /// Yield the next run of printable bytes, or `None` when the current input has none left
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let Self {
            bytes,
            state,
            utf8parser,
        } = self;
        next_bytes(bytes, state, utf8parser)
    }
}

/// Incrementally strip non-contiguous data
///
/// Holds only the parser state; each segment is passed to [`StripBytes::strip_next`].
#[derive(Default)]
pub struct StripBytes {
// Escape-sequence parser state carried across segments
state: State,
// Decoder state for a multi-byte UTF-8 sequence that may span segments
utf8parser: Utf8Parser,
}

impl StripBytes {
    /// Create a stripper in its default (initial) state
    pub fn new() -> Self {
        Self::default()
    }

    /// Strip the next segment of data
    ///
    /// The returned iterator borrows this stripper mutably, so the state it reaches is kept
    /// for the following segment.
    pub fn strip_next<'s>(&'s mut self, bytes: &'s [u8]) -> StripBytesIter<'s> {
        let Self { state, utf8parser } = self;
        StripBytesIter {
            bytes,
            state,
            utf8parser,
        }
    }
}

/// Iterator over the printable slices of one segment
///
/// See [`StripBytes`]
pub struct StripBytesIter<'s> {
// Portion of the segment that has not been examined yet
bytes: &'s [u8],
// Parser state borrowed from the `StripBytes` that created this iterator
state: &'s mut State,
// UTF-8 decoder state borrowed from the `StripBytes` that created this iterator
utf8parser: &'s mut Utf8Parser,
}

impl<'s> Iterator for StripBytesIter<'s> {
    type Item = &'s [u8];

    /// Yield the next run of printable bytes from the current segment
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let remaining = &mut self.bytes;
        let state = &mut *self.state;
        let utf8parser = &mut *self.utf8parser;
        next_bytes(remaining, state, utf8parser)
    }
}

/// Skip any leading non-printable bytes in `bytes`, then split off the next printable run
///
/// `bytes` is advanced past everything that was examined, and `state` / `utf8parser` are
/// updated so a later call (possibly on a different slice) resumes where this one stopped.
///
/// Returns `None` when no printable bytes were found in what remained of `bytes`.
#[inline]
fn next_bytes<'s>(
bytes: &mut &'s [u8],
state: &mut State,
utf8parser: &mut Utf8Parser,
) -> Option<&'s [u8]> {
// Pass 1: locate the first printable byte, advancing the escape-sequence state on the way
let offset = bytes.iter().copied().position(|b| {
if *state == State::Utf8 {
// In `State::Utf8` nothing is skipped, so pass 2 keeps feeding the decoder
true
} else {
let (next_state, action) = state_change(*state, b);
// NOTE(review): `State::Anywhere` looks like a "no transition" marker returned by
// `state_change` -- confirm against its definition
if next_state != State::Anywhere {
*state = next_state;
}
is_printable_bytes(action, b)
}
});
// Drop the non-printable prefix (or everything, when no printable byte was found)
let (_, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
*bytes = next;

// Pass 2: locate the first byte that ends the printable run
let offset = bytes.iter().copied().position(|b| {
if *state == State::Utf8 {
// `add` returns `true` once the decoder reports a codepoint or an invalid sequence
if utf8parser.add(b) {
*state = State::Ground;
}
false
} else {
// Transitions are looked up from `State::Ground` here rather than from `*state`
let (next_state, action) = state_change(State::Ground, b);
if next_state != State::Anywhere {
*state = next_state;
}
if *state == State::Utf8 {
// This byte moved us into `State::Utf8`: feed it to the decoder and keep it in the run
utf8parser.add(b);
false
} else {
!is_printable_bytes(action, b)
}
}
});
// Everything before `offset` is printable; the rest is left for the next call
let (printable, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
*bytes = next;
if printable.is_empty() {
None
} else {
Some(printable)
}
}

/// Wrapper around [`utf8parse::Parser`] used to follow multi-byte UTF-8 sequences while stripping
#[derive(Default)]
pub struct Utf8Parser {
// Underlying decoder, fed one byte at a time through `add`
utf8_parser: utf8parse::Parser,
}

impl Utf8Parser {
    /// Feed one byte to the UTF-8 decoder
    ///
    /// Returns `true` when this byte made the decoder report either a complete codepoint or
    /// an invalid sequence, and `false` otherwise.
    fn add(&mut self, byte: u8) -> bool {
        let mut finished = false;
        // The receiver only lives for this statement, releasing its borrow of `finished`
        self.utf8_parser
            .advance(&mut VtUtf8Receiver(&mut finished), byte);
        finished
    }
}

/// [`utf8parse::Receiver`] that sets the borrowed flag whenever the decoder reports a result
struct VtUtf8Receiver<'a>(&'a mut bool);

impl<'a> utf8parse::Receiver for VtUtf8Receiver<'a> {
// A codepoint was decoded; its value is ignored, only the fact is recorded
fn codepoint(&mut self, _: char) {
*self.0 = true;
}

// An invalid sequence is recorded exactly like a decoded codepoint
fn invalid_sequence(&mut self) {
*self.0 = true;
}
}

/// Decide whether `byte`, given the parser `action` it produced, counts as printable output
///
/// Printed bytes and bytes that begin a UTF-8 sequence are printable, as are executed bytes
/// that are ASCII whitespace.
#[inline]
fn is_printable_bytes(action: Action, byte: u8) -> bool {
    // Continuations aren't included as they may also be control codes, requiring more context
    match action {
        Action::Print | Action::BeginUtf8 => true,
        Action::Execute => byte.is_ascii_whitespace(),
        _ => false,
    }
}

#[cfg(test)]
mod test {
use super::*;
Expand Down Expand Up @@ -203,7 +404,7 @@ mod test {
}

/// Model verifying incremental parsing
fn strip_chars(mut s: &str) -> String {
fn strip_char(mut s: &str) -> String {
let mut result = String::new();
let mut state = StripStr::new();
while !s.is_empty() {
Expand All @@ -219,6 +420,35 @@ mod test {
result
}

/// Model verifying incremental parsing
///
/// Feeds `s` to a single [`StripBytes`] one byte at a time and concatenates whatever comes out.
fn strip_byte(s: &[u8]) -> Vec<u8> {
    let mut stripper = StripBytes::default();
    let mut stripped = Vec::new();
    for single in s.chunks(1) {
        for printable in stripper.strip_next(single) {
            stripped.extend_from_slice(printable);
        }
    }
    stripped
}

#[test]
fn test_strip_bytes_multibyte() {
    // A single four-byte UTF-8 sequence (U+1134B); presumably chosen because its continuation
    // bytes could be mistaken for single-byte control codes without UTF-8 tracking
    let input: [u8; 4] = [0xF0, 0x91, 0x8D, 0x8B];
    let expected = parser_strip(&input);
    let stripped = strip_bytes(&input).into_vec();
    let actual = String::from_utf8(stripped).unwrap();
    assert_eq!(expected, actual);
}

/// Same input as `test_strip_bytes_multibyte`, but fed one byte at a time through the
/// incremental `strip_byte` model
#[test]
fn test_strip_byte_multibyte() {
    let bytes = [240, 145, 141, 139];
    let expected = parser_strip(&bytes);
    // `strip_byte` already returns an owned `Vec<u8>`; no extra `.to_vec()` copy is needed
    let actual = String::from_utf8(strip_byte(&bytes)).unwrap();
    assert_eq!(expected, actual);
}

proptest! {
#[test]
#[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
Expand All @@ -230,9 +460,29 @@ mod test {

#[test]
#[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
fn strip_chars_no_escapes(s in "\\PC*") {
fn strip_char_no_escapes(s in "\\PC*") {
let expected = parser_strip(s.as_bytes());
let actual = strip_char(&s);
assert_eq!(expected, actual);
}

#[test]
#[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
fn strip_bytes_no_escapes(s in "\\PC*") {
dbg!(&s);
dbg!(s.as_bytes());
let expected = parser_strip(s.as_bytes());
let actual = String::from_utf8(strip_bytes(s.as_bytes()).into_vec()).unwrap();
assert_eq!(expected, actual);
}

#[test]
#[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
fn strip_byte_no_escapes(s in "\\PC*") {
dbg!(&s);
dbg!(s.as_bytes());
let expected = parser_strip(s.as_bytes());
let actual = strip_chars(&s);
let actual = String::from_utf8(strip_byte(s.as_bytes()).to_vec()).unwrap();
assert_eq!(expected, actual);
}
}
Expand Down

0 comments on commit 7fadaf2

Please sign in to comment.