Skip to content

Commit

Permalink
Use magic number to determine file type (#5225)
Browse files Browse the repository at this point in the history
* Revert "Guess image mime type from file extension (fixes #5196) (#5212)"

This reverts commit 63ea99d.

* Use magic numbers to determine file type.

* fmt

* Don't wrap response in an option

* Regen Cargo.lock

* Clean-up + guess mime type from extension if server is unresponsive

* Move some things about.

* Some cleanup.

* Removing comment lines.

---------

Co-authored-by: Dessalines <tyhou13@gmx.com>
  • Loading branch information
2 people authored and Nutomic committed Dec 4, 2024
1 parent cd7759b commit 5769a33
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 18 deletions.
28 changes: 25 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions crates/api_common/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ actix-web = { workspace = true, optional = true }
enum-map = { workspace = true }
urlencoding = { workspace = true }
mime = { version = "0.3.17", optional = true }
mime_guess = "2.0.5"
infer = "0.16.0"
webpage = { version = "2.0", default-features = false, features = [
"serde",
], optional = true }
Expand Down
46 changes: 31 additions & 15 deletions crates/api_common/src/request.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use lemmy_utils::{
REQWEST_TIMEOUT,
VERSION,
};
use mime::Mime;
use mime::{Mime, TEXT_HTML};
use reqwest::{
header::{CONTENT_TYPE, RANGE},
Client,
Expand Down Expand Up @@ -62,38 +62,54 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResu
.send()
.await?;

let content_type: Option<Mime> = response
let mut content_type: Option<Mime> = response
.headers()
.get(CONTENT_TYPE)
.and_then(|h| h.to_str().ok())
.and_then(|h| h.parse().ok());
.and_then(|h| h.parse().ok())
// If we don't get a content_type from the response (e.g. if the server is down),
// then try to infer the content_type from the file extension.
.or(mime_guess::from_path(url.path()).first());

let opengraph_data = {
// if the content type is not text/html, we don't need to parse it
let is_html = content_type
.as_ref()
.map(|c| {
(c.type_() == mime::TEXT && c.subtype() == mime::HTML)
||
// application/xhtml+xml is a subset of HTML
(c.type_() == mime::APPLICATION && c.subtype() == "xhtml")
// application/xhtml+xml is a subset of HTML
let application_xhtml: Mime = "application/xhtml+xml".parse::<Mime>().unwrap_or(TEXT_HTML);
let allowed_mime_types = [TEXT_HTML.essence_str(), application_xhtml.essence_str()];
allowed_mime_types.contains(&c.essence_str())
})
.unwrap_or(false);
if !is_html {
Default::default()
} else {
.unwrap_or_default();

if is_html {
// Can't use .text() here, because it only checks the content header, not the actual bytes
// https://github.com/LemmyNet/lemmy/issues/1964
// So we want to do deep inspection of the actually returned bytes but need to be careful not
// spend too much time parsing binary data as HTML

// So we want to do deep inspection of the actually returned bytes but need to be careful
// not to spend too much time parsing binary data as HTML
// only take first bytes regardless of how many bytes the server returns
let html_bytes = collect_bytes_until_limit(response, bytes_to_fetch).await?;
extract_opengraph_data(&html_bytes, url)
.map_err(|e| info!("{e}"))
.unwrap_or_default()
} else {
let is_octet_type = content_type
.as_ref()
.map(|c| c.subtype() == "octet-stream")
.unwrap_or_default();

// Overwrite the content type if it's an octet type
if is_octet_type {
// Don't need to fetch as much data for this as we do with opengraph
let octet_bytes = collect_bytes_until_limit(response, 512).await?;
content_type =
infer::get(&octet_bytes).map_or(content_type, |t| t.mime_type().parse().ok());
}

Default::default()
}
};

Ok(LinkMetadata {
opengraph_data,
content_type: content_type.map(|c| c.to_string()),
Expand Down

0 comments on commit 5769a33

Please sign in to comment.