Skip to content

Commit

Permalink
Merge pull request #627 from thoth-pub/feature/551_565_crossref_dois
Browse files Browse the repository at this point in the history
Better handling of missing DOIs when outputting Crossref metadata (#551, #565)
  • Loading branch information
rhigman authored Sep 10, 2024
2 parents 5631f8d + 7b2ed50 commit eab0663
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 106 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Changed
- [551](https://github.com/thoth-pub/thoth/issues/551) - Only include chapters in Crossref metadata output if they have DOIs

### Fixed
- [565](https://github.com/thoth-pub/thoth/issues/565) - Don't generate Crossref metadata output if no DOIs (work or chapter) are present

## [[0.12.9]](https://github.com/thoth-pub/thoth/releases/tag/v0.12.9) - 2024-09-06
### Added
Expand Down
235 changes: 129 additions & 106 deletions thoth-export-server/src/xml/doideposit_crossref.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,14 +79,24 @@ impl XmlSpecification for DoiDepositCrossref {

impl XmlElementBlock<DoiDepositCrossref> for Work {
fn xml_element<W: Write>(&self, w: &mut EventWriter<W>) -> ThothResult<()> {
if self.doi.is_none()
&& !self
.relations
.iter()
.any(|r| r.relation_type == RelationType::HAS_CHILD && r.related_work.doi.is_some())
{
return Err(ThothError::IncompleteMetadataRecord(
DEPOSIT_ERROR.to_string(),
"No work or chapter DOIs to deposit".to_string(),
));
}
let work_type = match &self.work_type {
WorkType::MONOGRAPH => "monograph",
WorkType::EDITED_BOOK => "edited_book",
WorkType::TEXTBOOK => "reference",
WorkType::JOURNAL_ISSUE | WorkType::BOOK_SET | WorkType::BOOK_CHAPTER => "other",
WorkType::Other(_) => unreachable!(),
};
let work = self;
// As an alternative to `book_metadata` and `book_series_metadata` below,
// `book_set_metadata` can be used for works which are part of a set.
// Omitted at present but could be considered as a future enhancement.
Expand All @@ -106,22 +116,22 @@ impl XmlElementBlock<DoiDepositCrossref> for Work {
XmlElementBlock::<DoiDepositCrossref>::xml_element(series, w)?;
ordinal = Some(ord);
}
write_work_contributions(work, w)?;
write_work_title(work, w)?;
write_work_abstract(work, w)?;
write_work_contributions(self, w)?;
write_work_title(self, w)?;
write_work_abstract(self, w)?;

if ordinal.is_some() {
let ordinal_i64 = ordinal.unwrap_or(0);
write_work_volume(ordinal_i64, w)?;
}

write_work_edition(work, w)?;
write_work_publication_date(work, w)?;
write_work_publications(work, w)?;
write_publisher(work, w)?;
write_crossmark_funding_access(work, w)?;
write_doi_collection(work, w)?;
write_work_references(work, w)?;
write_work_edition(self, w)?;
write_work_publication_date(self, w)?;
write_work_publications(self, w)?;
write_publisher(self, w)?;
write_crossmark_funding_access(self, w)?;
write_doi_collection(self, w)?;
write_work_references(self, w)?;
Ok(())
})?;

Expand All @@ -132,7 +142,11 @@ impl XmlElementBlock<DoiDepositCrossref> for Work {
.iter()
.filter(|r| r.relation_type == RelationType::HAS_CHILD)
{
XmlElementBlock::<DoiDepositCrossref>::xml_element(chapter, w)?;
// If chapter has no DOI, nothing to output (`content_item` element
// representing chapter must contain `doi_data` element with `doi`)
if chapter.related_work.doi.is_some() {
XmlElementBlock::<DoiDepositCrossref>::xml_element(chapter, w)?;
}
}
Ok(())
})
Expand Down Expand Up @@ -612,91 +626,83 @@ fn write_chapter_doi_collection<W: Write>(
chapter: &WorkRelations,
w: &mut EventWriter<W>,
) -> ThothResult<()> {
if let Some(doi) = &chapter.related_work.doi {
if let Some(landing_page) = &chapter.related_work.landing_page {
write_element_block("doi_data", w, |w| {
write_element_block("doi", w, |w| {
w.write(XmlEvent::Characters(&doi.to_string()))
.map_err(|e| e.into())
})?;
write_element_block("resource", w, |w| {
w.write(XmlEvent::Characters(landing_page))
.map_err(|e| e.into())
})?;
if let Some(pdf_url) = chapter
.related_work
.publications
.iter()
.find(|p| {
p.publication_type.eq(&PublicationType::PDF) && !p.locations.is_empty()
})
.and_then(|p| p.locations.iter().find(|l| l.canonical))
.and_then(|l| l.full_text_url.as_ref())
{
// Used for CrossRef Similarity Check. URL must point directly to full-text PDF.
// Alternatively, a direct link to full-text HTML can be used (not implemented here).
write_full_element_block(
"collection",
Some(vec![("property", "crawler-based")]),
w,
|w| {
for crawler in ["iParadigms", "google", "msn", "yahoo", "scirus"] {
write_full_element_block(
"item",
Some(vec![("crawler", crawler)]),
w,
|w| {
write_full_element_block(
"resource",
Some(vec![("mime_type", "application/pdf")]),
w,
|w| {
w.write(XmlEvent::Characters(pdf_url))
.map_err(|e| e.into())
},
)
},
)?;
}
Ok(())
},
)?;
// Used for CrossRef Text and Data Mining. URL must point directly to full-text PDF.
// Alternatively, a direct link to full-text XML can be used (not implemented here).
write_full_element_block(
"collection",
Some(vec![("property", "text-mining")]),
w,
|w| {
write_element_block("item", w, |w| {
write_full_element_block(
"resource",
Some(vec![("mime_type", "application/pdf")]),
w,
|w| {
w.write(XmlEvent::Characters(pdf_url)).map_err(|e| e.into())
},
)
})
},
)?;
}
Ok(())
let doi = &chapter
.related_work
.doi
.as_ref()
.expect("Caller should only pass in chapters which have DOIs");
if let Some(landing_page) = &chapter.related_work.landing_page {
write_element_block("doi_data", w, |w| {
write_element_block("doi", w, |w| {
w.write(XmlEvent::Characters(&doi.to_string()))
.map_err(|e| e.into())
})?;
} else {
// `doi_data` element is mandatory for `content_item`, and must contain
// both `doi` element and `resource` (landing page) element
return Err(ThothError::IncompleteMetadataRecord(
DEPOSIT_ERROR.to_string(),
"Missing chapter Landing Page".to_string(),
));
}
write_element_block("resource", w, |w| {
w.write(XmlEvent::Characters(landing_page))
.map_err(|e| e.into())
})?;
if let Some(pdf_url) = chapter
.related_work
.publications
.iter()
.find(|p| p.publication_type.eq(&PublicationType::PDF) && !p.locations.is_empty())
.and_then(|p| p.locations.iter().find(|l| l.canonical))
.and_then(|l| l.full_text_url.as_ref())
{
// Used for CrossRef Similarity Check. URL must point directly to full-text PDF.
// Alternatively, a direct link to full-text HTML can be used (not implemented here).
write_full_element_block(
"collection",
Some(vec![("property", "crawler-based")]),
w,
|w| {
for crawler in ["iParadigms", "google", "msn", "yahoo", "scirus"] {
write_full_element_block(
"item",
Some(vec![("crawler", crawler)]),
w,
|w| {
write_full_element_block(
"resource",
Some(vec![("mime_type", "application/pdf")]),
w,
|w| {
w.write(XmlEvent::Characters(pdf_url))
.map_err(|e| e.into())
},
)
},
)?;
}
Ok(())
},
)?;
// Used for CrossRef Text and Data Mining. URL must point directly to full-text PDF.
// Alternatively, a direct link to full-text XML can be used (not implemented here).
write_full_element_block(
"collection",
Some(vec![("property", "text-mining")]),
w,
|w| {
write_element_block("item", w, |w| {
write_full_element_block(
"resource",
Some(vec![("mime_type", "application/pdf")]),
w,
|w| w.write(XmlEvent::Characters(pdf_url)).map_err(|e| e.into()),
)
})
},
)?;
}
Ok(())
})?;
} else {
// `doi_data` element is mandatory for `content_item`, and must contain
// both `doi` element and `resource` (landing page) element
return Err(ThothError::IncompleteMetadataRecord(
DEPOSIT_ERROR.to_string(),
"Missing chapter DOI".to_string(),
"Missing chapter Landing Page".to_string(),
));
}
Ok(())
Expand Down Expand Up @@ -1436,17 +1442,6 @@ mod tests {
output,
"Could not generate doideposit::crossref: Missing chapter Landing Page".to_string()
);

// Restore landing page but remove DOI. Result: error, as above
test_relations.related_work.edition = None;
test_relations.related_work.landing_page =
Some("https://www.book.com/chapter_one".to_string());
test_relations.related_work.doi = None;
let output = generate_test_output(false, &test_relations);
assert_eq!(
output,
"Could not generate doideposit::crossref: Missing chapter DOI".to_string()
);
}

#[test]
Expand Down Expand Up @@ -1974,6 +1969,9 @@ mod tests {
test_work.publications[0].locations.clear();
// Remove last (hardback) publication
test_work.publications.pop();
// Change sole relation to chapter with no DOI
test_work.relations[0].relation_type = RelationType::HAS_CHILD;
test_work.relations[0].related_work.doi = None;

let output = generate_test_output(true, &test_work);
// Work type changed
Expand Down Expand Up @@ -2022,6 +2020,18 @@ mod tests {
assert!(!output.contains(r#" <item crawler="yahoo">"#));
assert!(!output.contains(r#" <item crawler="scirus">"#));
assert!(!output.contains(r#" <collection property="text-mining">"#));
// Only chapters with no DOI supplied: no `content_item` elements emitted
assert!(!output.contains(r#" <content_item component_type="chapter">"#));
assert!(!output.contains(r#" <title>Part</title>"#));
assert!(!output.contains(r#" <subtitle>One</subtitle>"#));
assert!(!output.contains(r#" <component_number>1</component_number>"#));
assert!(!output.contains(r#" <month>02</month>"#));
assert!(!output.contains(r#" <day>28</day>"#));
assert!(!output.contains(r#" <year>2000</year>"#));
assert!(!output.contains(r#" <pages>"#));
assert!(!output.contains(r#" <first_page>10</first_page>"#));
assert!(!output.contains(r#" <last_page>20</last_page>"#));
assert!(!output.contains(r#" <resource>https://www.book.com/part_one</resource>"#));

// Change work type, remove landing page, remove XML ISBN,
// remove all but the omitted contributor
Expand Down Expand Up @@ -2065,18 +2075,31 @@ mod tests {
assert!(!output.contains(r#" <doi>10.00001/BOOK.0001</doi>"#));
assert!(!output.contains(r#" <resource>https://www.book.com</resource>"#));

// Change work type again, replace landing page but remove DOI
// Remove DOI (so neither work nor chapter DOIs are present). Result: error
test_work.doi = None;
let output = generate_test_output(false, &test_work);
assert_eq!(
output,
"Could not generate doideposit::crossref: No work or chapter DOIs to deposit"
.to_string()
);

// Change work type again, replace landing page, replace chapter DOI
test_work.work_type = WorkType::JOURNAL_ISSUE;
test_work.landing_page = Some("https://www.book.com".to_string());
test_work.doi = None;
test_work.relations[0].related_work.doi =
Some(Doi::from_str("https://doi.org/10.00001/PART.0001").unwrap());
let output = generate_test_output(true, &test_work);
// Work type changed
assert!(!output.contains(r#" <book book_type="reference">"#));
assert!(output.contains(r#" <book book_type="other">"#));
// No DOI: entire `doi_data` element omitted (even though landing page restored)
assert!(!output.contains(r#" <doi_data>"#));
// No work DOI: entire work-specific `doi_data` element omitted (even though landing page restored)
assert!(!output.contains(r#" <doi>10.00001/BOOK.0001</doi>"#));
assert!(!output.contains(r#" <resource>https://www.book.com</resource>"#));
// But chapter-specific `doi_data` element will be present (at same nesting level)
assert!(output.contains(r#" <doi_data>"#));
assert!(output.contains(r#" <doi>10.00001/PART.0001</doi>"#));
assert!(output.contains(r#" <resource>https://www.book.com/part_one</resource>"#));

// Remove publication date. Result: error
test_work.publication_date = None;
Expand Down

0 comments on commit eab0663

Please sign in to comment.