Replace scraper with a small custom library (#593)
* Add small custom scraper library

* Add control flow capabilities

* Fix lint
aumetra authored Sep 27, 2024
1 parent b5326c6 commit abfa422
Showing 11 changed files with 287 additions and 259 deletions.
333 changes: 86 additions & 247 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
@@ -66,6 +66,7 @@ members = [
     "lib/mrf-manifest",
     "lib/mrf-tool",
     "lib/post-process",
+    "lib/schaber",
     "lib/speedy-uuid",
     "lib/tick-tock-mock",
     "lib/tower-http-digest",
@@ -162,6 +163,7 @@ just-retry = { path = "lib/just-retry" }
 masto-id-convert = { path = "lib/masto-id-convert" }
 mrf-manifest = { path = "lib/mrf-manifest" }
 post-process = { path = "lib/post-process" }
+schaber = { path = "lib/schaber" }
 speedy-uuid = { path = "lib/speedy-uuid", features = ["serde"] }
 tick-tock-mock = { path = "lib/tick-tock-mock" }
 tower-http-digest = { path = "lib/tower-http-digest" }
2 changes: 1 addition & 1 deletion crates/kitsune-derive/impl/Cargo.toml
@@ -11,7 +11,7 @@ proc-macro = true
 [dependencies]
 proc-macro2 = "1.0.86"
 quote = "1.0.37"
-syn = { version = "2.0.78", features = ["full"] }
+syn = { version = "2.0.79", features = ["full"] }

 [lints]
 workspace = true
2 changes: 1 addition & 1 deletion crates/kitsune-embed/Cargo.toml
@@ -15,7 +15,7 @@ kitsune-derive = { workspace = true }
 kitsune-error = { workspace = true }
 kitsune-http-client = { workspace = true }
 lantern-client-sdk = { package = "client-sdk", git = "https://github.com/Lantern-chat/client-sdk-rs.git", rev = "efb4288d9b107b48609802193d57b29f7ae395a1", default-features = false }
-scraper = { version = "0.20.0", default-features = false }
+schaber = { workspace = true }
 smol_str = "0.3.1"

 [lints]
22 changes: 12 additions & 10 deletions crates/kitsune-embed/src/lib.rs
@@ -12,24 +12,26 @@ use kitsune_derive::kitsune_service;
 use kitsune_error::Result;
 use kitsune_http_client::Client as HttpClient;
 use lantern_client_sdk::models::EmbedWithExpire;
-use scraper::{Html, Selector};
+use schaber::Scraper;
 use smol_str::SmolStr;
-use std::sync::LazyLock;
+use std::{ops::ControlFlow, sync::LazyLock};

 pub use lantern_client_sdk::models::{Embed, EmbedType};

-static LINK_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
-    Selector::parse("a:not(.mention, .hashtag)").expect("[Bug] Failed to parse link HTML selector")
+static LINK_SCRAPER: LazyLock<Scraper> = LazyLock::new(|| {
+    Scraper::new("a:not(.mention, .hashtag)").expect("[Bug] Failed to parse link HTML selector")
 });

 fn first_link_from_fragment(fragment: &str) -> Option<String> {
-    let parsed_fragment = Html::parse_fragment(fragment);
+    let mut link = None;
+    LINK_SCRAPER
+        .process(fragment, |element| {
+            link = element.get_attribute("href");
+            ControlFlow::Break(())
+        })
+        .unwrap();

-    parsed_fragment
-        .select(&LINK_SELECTOR)
-        .next()
-        .and_then(|element| element.value().attr("href"))
-        .map(ToString::to_string)
+    link
 }

 #[kitsune_service]
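For orientation (not part of the diff): the rewritten helper keeps its old contract, returning the href of the first anchor that is neither a mention nor a hashtag, but it now streams the fragment through lol_html and stops at the first match via ControlFlow::Break instead of building a full DOM with scraper. A hedged illustration of the expected behavior; the call itself is hypothetical, since the function is private to the crate:

// The selector skips anchors with class "mention" or "hashtag";
// ControlFlow::Break(()) stops the scan at the first remaining match.
let fragment = r#"<a class="mention" href="/users/a">@a</a> <a href="https://example.com">link</a>"#;
assert_eq!(
    first_link_from_fragment(fragment),
    Some("https://example.com".to_string())
);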
13 changes: 13 additions & 0 deletions lib/schaber/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "schaber"
+authors.workspace = true
+edition.workspace = true
+version.workspace = true
+license = "MIT OR Apache-2.0"
+
+[dependencies]
+lol_html = "2.0.0"
+thiserror = "1.0.64"
+
+[lints]
+workspace = true
1 change: 1 addition & 0 deletions lib/schaber/LICENSE-APACHE-2.0
1 change: 1 addition & 0 deletions lib/schaber/LICENSE-MIT
89 changes: 89 additions & 0 deletions lib/schaber/src/lib.rs
@@ -0,0 +1,89 @@
+use lol_html::{
+    errors::{RewritingError, SelectorError},
+    html_content::Element,
+    ElementContentHandlers, HandlerResult, HtmlRewriter, Selector, Settings,
+};
+use std::{borrow::Cow, ops::ControlFlow, str::FromStr};
+use thiserror::Error;
+
+type Result<T, E = Error> = std::result::Result<T, E>;
+
+/// Ignore any content handler "errors", since we use these errors
+/// as our means of communicating control flow
+macro_rules! handle_error {
+    ($error_expr:expr) => {{
+        match { $error_expr } {
+            Err(::lol_html::errors::RewritingError::ContentHandlerError(..)) => return Ok(()),
+            other => other,
+        }
+    }};
+}
+
+#[derive(Debug, Error)]
+#[error("small sacrifice for the lol_html gods")]
+struct Sacrifice;
+
+#[derive(Debug, Error)]
+pub enum Error {
+    #[error(transparent)]
+    InvalidSelector(#[from] SelectorError),
+
+    #[error(transparent)]
+    RewriteError(#[from] RewritingError),
+}
+
+pub struct Scraper {
+    element_selector: Selector,
+}
+
+impl Scraper {
+    pub fn new(selector: &str) -> Result<Self> {
+        Ok(Self {
+            element_selector: Selector::from_str(selector)?,
+        })
+    }
+
+    pub fn process<I, H>(&self, input: I, mut handler: H) -> Result<()>
+    where
+        I: AsRef<[u8]>,
+        H: FnMut(&Element<'_, '_>) -> ControlFlow<()>,
+    {
+        #[inline]
+        fn handler_assert<F>(uwu: F) -> F
+        where
+            F: FnMut(&mut Element<'_, '_>) -> HandlerResult,
+        {
+            uwu
+        }
+
+        #[inline]
+        fn sink_assert<F>(uwu: F) -> F
+        where
+            F: FnMut(&[u8]),
+        {
+            uwu
+        }
+
+        let mut rewriter = HtmlRewriter::new(
+            Settings {
+                element_content_handlers: vec![(
+                    Cow::Borrowed(&self.element_selector),
+                    ElementContentHandlers::default().element(handler_assert(|el| {
+                        if handler(el).is_continue() {
+                            Ok(())
+                        } else {
+                            Err(Box::new(Sacrifice))
+                        }
+                    })),
+                )],
+                ..Settings::new()
+            },
+            sink_assert(|_| {}),
+        );
+
+        handle_error!(rewriter.write(input.as_ref()))?;
+        handle_error!(rewriter.end())?;
+
+        Ok(())
+    }
+}
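A note on the design, for readers unfamiliar with lol_html (Cloudflare's streaming HTML rewriter): the rewriter has no built-in early exit, so process signals a break by returning the synthetic Sacrifice error from the content handler, and handle_error! then swallows the resulting ContentHandlerError. The handler_assert/sink_assert functions exist only to pin down closure types for inference, and the output sink discards every byte because the rewriter is used purely for matching, never for rewriting. A minimal usage sketch of the new API; the collect_links helper is illustrative, not part of the commit:

use schaber::Scraper;
use std::ops::ControlFlow;

// Hypothetical helper: gather every href by returning
// ControlFlow::Continue so the scan visits all matches.
fn collect_links(html: &str) -> Vec<String> {
    let scraper = Scraper::new("a").unwrap();
    let mut links = Vec::new();

    scraper
        .process(html, |element| {
            if let Some(href) = element.get_attribute("href") {
                links.push(href);
            }
            ControlFlow::Continue(())
        })
        .unwrap();

    links
}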
25 changes: 25 additions & 0 deletions lib/schaber/tests/basic.rs
@@ -0,0 +1,25 @@
+use schaber::Scraper;
+use std::ops::ControlFlow;
+
+#[test]
+fn select_link() {
+    let html = r#"
+        <div id="hello">
+            <a href="http://druckbrudi.lab">
+                PRINT MORE BLÅHAJ CATEARS!
+            </a>
+        </div>
+    "#;
+
+    let mut link_url = None;
+    let scraper = Scraper::new("a").unwrap();
+
+    scraper
+        .process(html, |element| {
+            link_url = element.get_attribute("href");
+            ControlFlow::Break(())
+        })
+        .unwrap();
+
+    assert_eq!(link_url.as_deref(), Some("http://druckbrudi.lab"));
+}
56 changes: 56 additions & 0 deletions lib/schaber/tests/control_flow.rs
@@ -0,0 +1,56 @@
+use schaber::Scraper;
+use std::ops::ControlFlow;
+
+#[test]
+fn ends_after_break() {
+    let html = r#"
+        <div id="hello">
+            <a href="http://druckbrudi.lab">
+                PRINT MORE BLÅHAJ CATEARS!
+            </a>
+            <a href="http://evil.com">
+                This link shall not be seen!
+            </a>
+        </div>
+    "#;
+
+    let mut link_url = None;
+    let scraper = Scraper::new("a").unwrap();
+
+    scraper
+        .process(html, |element| {
+            link_url = element.get_attribute("href");
+            ControlFlow::Break(())
+        })
+        .unwrap();
+
+    assert_eq!(link_url.as_deref(), Some("http://druckbrudi.lab"));
+}
+
+#[test]
+fn continues_after_continue() {
+    let html = r#"
+        <div id="hello">
+            <a href="http://druckbrudi.lab">
+                PRINT MORE BLÅHAJ CATEARS!
+            </a>
+            <a href="https://good.org">
+                This link shall be seen!
+            </a>
+        </div>
+    "#;
+
+    let mut link_url = None;
+    let scraper = Scraper::new("a").unwrap();
+
+    scraper
+        .process(html, |element| {
+            link_url = element.get_attribute("href");
+            ControlFlow::Continue(())
+        })
+        .unwrap();
+
+    assert_eq!(link_url.as_deref(), Some("https://good.org"));
+}
