Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 382/ support X-Robots-Tag as a typed http header XRobotsTag #393

Open
wants to merge 25 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
ce6e2b7
add XRobotsTag, initial implementation
hafihaf123 Jan 12, 2025
ff26238
add value_string.rs
hafihaf123 Jan 15, 2025
caefce6
add more context with comments
hafihaf123 Jan 15, 2025
a7b8ebd
add ValidDate, custom rules
hafihaf123 Jan 15, 2025
f696c50
fix value_string.rs visibility issues
hafihaf123 Jan 15, 2025
78c2ba6
rename Iterator to ElementIter
hafihaf123 Jan 17, 2025
23c8fef
fix visibility issues
hafihaf123 Jan 17, 2025
36af384
change trait TryFrom<&[&str]> to private function from_iter
hafihaf123 Jan 17, 2025
4dacfcb
separate 'split_csv_str' function from 'from_comma_delimited'
hafihaf123 Jan 17, 2025
a57a00b
change bot_name field type to 'HeaderValueString' and indexing_rule f…
hafihaf123 Jan 17, 2025
d4fa1ad
implement FromStr for Element
hafihaf123 Jan 17, 2025
e66d95b
reformat with rustfmt
hafihaf123 Jan 17, 2025
6d0cf14
todo/ fix XRobotsTag::decode()
hafihaf123 Jan 17, 2025
6c350db
Merge branch 'plabayo:main' into issue-382/x-robots-tag
hafihaf123 Jan 20, 2025
2c2dcfa
Merge branch 'plabayo:main' into issue-382/x-robots-tag
hafihaf123 Jan 21, 2025
97230f5
add chrono crate to dependencies
hafihaf123 Jan 27, 2025
881e70c
Merge branch 'plabayo:main' into issue-382/x-robots-tag
hafihaf123 Jan 27, 2025
e003827
Merge remote-tracking branch 'origin/issue-382/x-robots-tag' into iss…
hafihaf123 Jan 27, 2025
2ea9085
rework API
hafihaf123 Jan 27, 2025
707a209
fix chrono dependency placement
hafihaf123 Jan 27, 2025
f280156
enhance code, add valid_date.rs
hafihaf123 Jan 27, 2025
92cd0cc
Merge branch 'plabayo:main' into issue-382/x-robots-tag
hafihaf123 Jan 27, 2025
bd571c4
add x_robots_tag.rs
hafihaf123 Jan 29, 2025
5933ec2
implement FromStr for ValidDate
hafihaf123 Jan 29, 2025
f10e6df
enhance code
hafihaf123 Jan 31, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions rama-http-types/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,12 @@ pub mod header {
"x-real-ip",
];

// non-std web-crawler info headers
//
// More information at
// <https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Robots-Tag>.
static_header!["x-robots-tag"];

/// Static Header Value that can be used as `User-Agent` or `Server` header.
pub static RAMA_ID_HEADER_VALUE: HeaderValue = HeaderValue::from_static(
const_format::formatcp!("{}/{}", rama_utils::info::NAME, rama_utils::info::VERSION),
Expand Down
4 changes: 4 additions & 0 deletions rama-http/src/headers/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,4 +102,8 @@ pub mod authorization {
pub use ::rama_http_types::headers::HeaderExt;

pub(crate) mod util;

mod x_robots_tag;
pub use x_robots_tag::XRobotsTag;

pub use util::quality_value::{Quality, QualityValue};
26 changes: 15 additions & 11 deletions rama-http/src/headers/util/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,28 @@ use crate::HeaderValue;
pub(crate) fn from_comma_delimited<'i, I, T, E>(values: &mut I) -> Result<E, Error>
where
I: Iterator<Item = &'i HeaderValue>,
T: ::std::str::FromStr,
E: ::std::iter::FromIterator<T>,
T: std::str::FromStr,
E: FromIterator<T>,
{
values
.flat_map(|value| {
value.to_str().into_iter().flat_map(|string| {
string
.split(',')
.filter_map(|x| match x.trim() {
"" => None,
y => Some(y),
})
.map(|x| x.parse().map_err(|_| Error::invalid()))
})
value
.to_str()
.into_iter()
.flat_map(|string| split_csv_str(string))
})
.collect()
}

pub(crate) fn split_csv_str<T: std::str::FromStr>(
string: &str,
) -> impl Iterator<Item = Result<T, Error>> + use<'_, T> {
string.split(',').filter_map(|x| match x.trim() {
"" => None,
y => Some(y.parse().map_err(|_| Error::invalid())),
})
}

/// Format an array into a comma-delimited string.
pub(crate) fn fmt_comma_delimited<T: fmt::Display>(
f: &mut fmt::Formatter,
Expand Down
2 changes: 2 additions & 0 deletions rama-http/src/headers/util/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
pub(crate) mod csv;
/// Internal utility functions for headers.
pub(crate) mod quality_value;

pub(crate) mod value_string;
81 changes: 81 additions & 0 deletions rama-http/src/headers/util/value_string.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
use std::{
fmt,
str::{self, FromStr},
};

use bytes::Bytes;
use http::header::HeaderValue;

use crate::headers::Error;

/// A value that is both a valid `HeaderValue` and a valid string.
///
/// `as_str` reads the wrapped bytes back as `&str` without re-validating
/// them, so every constructor of this type has to uphold that guarantee.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct HeaderValueString {
/// Care must be taken to only set this value when it is also
/// a valid `String`, since `as_str` will convert to a `&str`
/// in an unchecked manner.
value: HeaderValue,
}

impl HeaderValueString {
pub(crate) fn from_val(val: &HeaderValue) -> Result<Self, Error> {
if val.to_str().is_ok() {
Ok(HeaderValueString { value: val.clone() })
} else {
Err(Error::invalid())
}
}

pub(crate) fn from_string(src: String) -> Option<Self> {
// A valid `str` (the argument)...
let bytes = Bytes::from(src);
HeaderValue::from_maybe_shared(bytes)
.ok()
.map(|value| HeaderValueString { value })
}

pub(crate) fn from_static(src: &'static str) -> HeaderValueString {
// A valid `str` (the argument)...
HeaderValueString {
value: HeaderValue::from_static(src),
}
}

pub(crate) fn as_str(&self) -> &str {
// HeaderValueString is only created from HeaderValues
// that have validated they are also UTF-8 strings.
unsafe { str::from_utf8_unchecked(self.value.as_bytes()) }
}
}

impl fmt::Debug for HeaderValueString {
    /// Delegates to the `Debug` output of the wrapped string, honouring the
    /// flags carried by `f`.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let text: &str = self.as_str();
        <str as fmt::Debug>::fmt(text, f)
    }
}

impl fmt::Display for HeaderValueString {
    /// Delegates to the `Display` output of the wrapped string, honouring the
    /// flags (width, alignment, ...) carried by `f`.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let text: &str = self.as_str();
        <str as fmt::Display>::fmt(text, f)
    }
}

impl<'a> From<&'a HeaderValueString> for HeaderValue {
fn from(src: &'a HeaderValueString) -> HeaderValue {
src.value.clone()
}
}

/// Error returned when a string cannot be parsed into a `HeaderValueString`.
///
/// It carries no details. It implements `Display` and `std::error::Error`
/// so it can be boxed, propagated with `?` into generic error types, and
/// wrapped with additional context.
#[derive(Debug)]
pub struct FromStrError(());

impl fmt::Display for FromStrError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("string is not a valid header value")
    }
}

impl std::error::Error for FromStrError {}

impl FromStr for HeaderValueString {
type Err = FromStrError;

fn from_str(src: &str) -> Result<Self, Self::Err> {
// A valid `str` (the argument)...
src.parse()
.map(|value| HeaderValueString { value })
.map_err(|_| FromStrError(()))
}
}
91 changes: 91 additions & 0 deletions rama-http/src/headers/x_robots_tag/element.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
use crate::headers::util::csv::{fmt_comma_delimited, split_csv_str};
use crate::headers::util::value_string::HeaderValueString;
use crate::headers::x_robots_tag::rule::Rule;
use rama_core::error::{ErrorContext, OpaqueError};
use regex::Regex;
use std::fmt::Formatter;
use std::str::FromStr;

#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Element {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Depending on how you structure it, this actually has to be either:

struct Element { bot_name: Option<HeaderValueString>, rules: Vec<Rule> }

or

enum Element {
    BotName(HeaderValueString),
    Rule(Rule),
}

Because when a bot name is mentioned, it applies to all rules that follow it until another bot name is mentioned.

bot_name: Option<HeaderValueString>,
indexing_rules: Vec<Rule>,
}

impl Element {
    /// Creates an element with no bot name and no indexing rules.
    pub fn new() -> Self {
        Self {
            bot_name: None,
            indexing_rules: Vec::new(),
        }
    }

    /// Creates an element for `bot_name` with no indexing rules yet.
    pub fn with_bot_name(bot_name: HeaderValueString) -> Self {
        Self {
            bot_name: Some(bot_name),
            indexing_rules: Vec::new(),
        }
    }

    /// Appends `indexing_rule` after the rules already stored.
    pub fn add_indexing_rule(&mut self, indexing_rule: Rule) {
        self.indexing_rules.push(indexing_rule);
    }

    /// Returns the bot name of this element, if one was set.
    pub fn bot_name(&self) -> Option<&HeaderValueString> {
        self.bot_name.as_ref()
    }

    /// Returns the stored indexing rules in insertion order.
    pub fn indexing_rules(&self) -> &[Rule] {
        &self.indexing_rules
    }
}

// An argument-less `new()` should come with a matching `Default`, so the type
// works with `..Default::default()`, `unwrap_or_default()` and friends.
impl Default for Element {
    /// Equivalent to [`Element::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl std::fmt::Display for Element {
    /// Writes an optional `<bot name>: ` prefix followed by the indexing
    /// rules as rendered by `fmt_comma_delimited`.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        // The prefix is the only part that differs between the two shapes.
        if let Some(bot) = self.bot_name() {
            write!(f, "{bot}: ")?;
        }
        fmt_comma_delimited(f, self.indexing_rules().iter())
    }
}

impl FromStr for Element {
type Err = OpaqueError;

fn from_str(s: &str) -> Result<Self, Self::Err> {
let regex = Regex::new(r"^\s*([^:]+?):\s*(.+)$")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There should be no need for a regex here, it's a pretty linear process, so you should be able to easily parse out rules. E.g. something like ready until ':' or EOF, ':' => ...`.

.context("Failed to compile a regular expression")?;

let mut bot_name = None;
let mut rules_str = s;

if let Some(captures) = regex.captures(s) {
let bot_name_candidate = captures
.get(1)
.context("Failed to capture the target bot name")?
.as_str()
.trim();

if bot_name_candidate.parse::<Rule>().is_err() {
bot_name = HeaderValueString::from_string(bot_name_candidate.to_owned());
rules_str = captures
.get(2)
.context("Failed to capture the indexing rules")?
.as_str()
.trim();
}
}

let indexing_rules = split_csv_str(rules_str)
.collect::<Result<Vec<_>, _>>()
.context("Failed to parse the indexing rules")?;

Ok(Self {
bot_name,
indexing_rules,
})
}
}
19 changes: 19 additions & 0 deletions rama-http/src/headers/x_robots_tag/element_iter.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
use crate::headers::x_robots_tag::Element;

/// An iterator over the `XRobotsTag` header's elements.
///
/// Thin owning wrapper around `std::vec::IntoIter<Element>`.
#[derive(Debug, Clone)]
pub struct ElementIter(std::vec::IntoIter<Element>);

impl Iterator for ElementIter {
    type Item = Element;

    fn next(&mut self) -> Option<Self::Item> {
        self.0.next()
    }

    // Forwarded so consumers such as `collect` can pre-allocate; the default
    // implementation would only report `(0, None)`.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}

impl ElementIter {
    /// Wraps an owning iterator over elements.
    pub fn new(elements: std::vec::IntoIter<Element>) -> Self {
        Self(elements)
    }
}
72 changes: 72 additions & 0 deletions rama-http/src/headers/x_robots_tag/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
mod rule;

mod element;

mod element_iter;

mod valid_date;

// ----------------------------------------------- \\

use crate::headers::Header;
use element::Element;
use element_iter::ElementIter;
use http::{HeaderName, HeaderValue};
use std::fmt::Formatter;
use std::iter::Iterator;

/// Typed form of the `x-robots-tag` header: an ordered list of `Element`s.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct XRobotsTag(Vec<Element>);

impl Header for XRobotsTag {
fn name() -> &'static HeaderName {
&crate::header::X_ROBOTS_TAG
}

fn decode<'i, I>(values: &mut I) -> Result<Self, headers::Error>
where
Self: Sized,
I: Iterator<Item = &'i HeaderValue>,
{
todo!();
crate::headers::util::csv::from_comma_delimited(values).map(XRobotsTag) // wouldn't really work, need more complex logic
}

fn encode<E: Extend<HeaderValue>>(&self, values: &mut E) {
use std::fmt;
struct Format<F>(F);
impl<F> fmt::Display for Format<F>
where
F: Fn(&mut Formatter<'_>) -> fmt::Result,
{
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
self.0(f)
}
}
let s = format!(
"{}",
Format(|f: &mut Formatter<'_>| {
crate::headers::util::csv::fmt_comma_delimited(&mut *f, self.0.iter())
})
);
values.extend(Some(HeaderValue::from_str(&s).unwrap()))
}
}

impl FromIterator<Element> for XRobotsTag {
fn from_iter<T>(iter: T) -> Self
where
T: IntoIterator<Item = Element>,
{
XRobotsTag(iter.into_iter().collect())
}
}

impl IntoIterator for XRobotsTag {
type Item = Element;
type IntoIter = ElementIter;

fn into_iter(self) -> Self::IntoIter {
ElementIter::new(self.0.into_iter())
}
}
Loading
Loading