From 640b39e970c009285127f78135a056103437c614 Mon Sep 17 00:00:00 2001 From: rhigman <73792779+rhigman@users.noreply.github.com> Date: Mon, 9 Sep 2024 10:43:15 +0100 Subject: [PATCH 1/4] Omit chapters with no DOI from Crossref output --- .../src/xml/doideposit_crossref.rs | 40 ++++++++++--------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/thoth-export-server/src/xml/doideposit_crossref.rs b/thoth-export-server/src/xml/doideposit_crossref.rs index f7b675f3..53c52fce 100644 --- a/thoth-export-server/src/xml/doideposit_crossref.rs +++ b/thoth-export-server/src/xml/doideposit_crossref.rs @@ -132,7 +132,11 @@ impl XmlElementBlock for Work { .iter() .filter(|r| r.relation_type == RelationType::HAS_CHILD) { - XmlElementBlock::::xml_element(chapter, w)?; + // If chapter has no DOI, nothing to output (`content_item` element + // representing chapter must contain `doi_data` element with `doi`) + if chapter.related_work.doi.is_some() { + XmlElementBlock::::xml_element(chapter, w)?; + } } Ok(()) }) @@ -692,12 +696,8 @@ fn write_chapter_doi_collection( )); } } else { - // `doi_data` element is mandatory for `content_item`, and must contain - // both `doi` element and `resource` (landing page) element - return Err(ThothError::IncompleteMetadataRecord( - DEPOSIT_ERROR.to_string(), - "Missing chapter DOI".to_string(), - )); + // Caller should only pass in chapters which have DOIs + unreachable!() } Ok(()) } @@ -1436,17 +1436,6 @@ mod tests { output, "Could not generate doideposit::crossref: Missing chapter Landing Page".to_string() ); - - // Restore landing page but remove DOI. Result: error, as above - test_relations.related_work.edition = None; - test_relations.related_work.landing_page = - Some("https://www.book.com/chapter_one".to_string()); - test_relations.related_work.doi = None; - let output = generate_test_output(false, &test_relations); - assert_eq!( - output, - "Could not generate doideposit::crossref: Missing chapter DOI".to_string() - ); } #[test] @@ -1974,6 +1963,9 @@ mod tests { test_work.publications[0].locations.clear(); // Remove last (hardback) publication test_work.publications.pop(); + // Change sole relation to chapter with no DOI + test_work.relations[0].relation_type = RelationType::HAS_CHILD; + test_work.relations[0].related_work.doi = None; let output = generate_test_output(true, &test_work); // Work type changed @@ -2022,6 +2014,18 @@ mod tests { assert!(!output.contains(r#" "#)); assert!(!output.contains(r#" "#)); assert!(!output.contains(r#" "#)); + // Only chapters with no DOI supplied: no `content_item` elements emitted + assert!(!output.contains(r#" "#)); + assert!(!output.contains(r#" Part"#)); + assert!(!output.contains(r#" One"#)); + assert!(!output.contains(r#" 1"#)); + assert!(!output.contains(r#" 02"#)); + assert!(!output.contains(r#" 28"#)); + assert!(!output.contains(r#" 2000"#)); + assert!(!output.contains(r#" "#)); + assert!(!output.contains(r#" 10"#)); + assert!(!output.contains(r#" 20"#)); + assert!(!output.contains(r#" https://www.book.com/part_one"#)); // Change work type, remove landing page, remove XML ISBN, // remove all but the omitted contributor From a378dacac3b9b2d41ff341c4dac4698c75fcf05a Mon Sep 17 00:00:00 2001 From: rhigman <73792779+rhigman@users.noreply.github.com> Date: Mon, 9 Sep 2024 12:33:41 +0100 Subject: [PATCH 2/4] Raise error if no DOIs are found for either parent work or chapters --- .../src/xml/doideposit_crossref.rs | 36 +++++++++++++++---- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/thoth-export-server/src/xml/doideposit_crossref.rs b/thoth-export-server/src/xml/doideposit_crossref.rs index 53c52fce..1d519c43 100644 --- a/thoth-export-server/src/xml/doideposit_crossref.rs +++ b/thoth-export-server/src/xml/doideposit_crossref.rs @@ -79,14 +79,25 @@ impl XmlSpecification for DoiDepositCrossref { impl XmlElementBlock for Work { fn xml_element(&self, w: &mut EventWriter) -> ThothResult<()> { - let work_type = match &self.work_type { + let work = self; + if work.doi.is_none() + && !work + .relations + .iter() + .any(|r| r.relation_type == RelationType::HAS_CHILD && r.related_work.doi.is_some()) + { + return Err(ThothError::IncompleteMetadataRecord( + DEPOSIT_ERROR.to_string(), + "No work or chapter DOIs to deposit".to_string(), + )); + } + let work_type = match &work.work_type { WorkType::MONOGRAPH => "monograph", WorkType::EDITED_BOOK => "edited_book", WorkType::TEXTBOOK => "reference", WorkType::JOURNAL_ISSUE | WorkType::BOOK_SET | WorkType::BOOK_CHAPTER => "other", WorkType::Other(_) => unreachable!(), }; - let work = self; // As an alternative to `book_metadata` and `book_series_metadata` below, // `book_set_metadata` can be used for works which are part of a set. // Omitted at present but could be considered as a future enhancement. @@ -2069,18 +2080,31 @@ mod tests { assert!(!output.contains(r#" 10.00001/BOOK.0001"#)); assert!(!output.contains(r#" https://www.book.com"#)); - // Change work type again, replace landing page but remove DOI + // Remove DOI (so neither work nor chapter DOIs are present). Result: error + test_work.doi = None; + let output = generate_test_output(false, &test_work); + assert_eq!( + output, + "Could not generate doideposit::crossref: No work or chapter DOIs to deposit" + .to_string() + ); + + // Change work type again, replace landing page, replace chapter DOI test_work.work_type = WorkType::JOURNAL_ISSUE; test_work.landing_page = Some("https://www.book.com".to_string()); - test_work.doi = None; + test_work.relations[0].related_work.doi = + Some(Doi::from_str("https://doi.org/10.00001/PART.0001").unwrap()); let output = generate_test_output(true, &test_work); // Work type changed assert!(!output.contains(r#" "#)); assert!(output.contains(r#" "#)); - // No DOI: entire `doi_data` element omitted (even though landing page restored) - assert!(!output.contains(r#" "#)); + // No work DOI: entire work-specific `doi_data` element omitted (even though landing page restored) assert!(!output.contains(r#" 10.00001/BOOK.0001"#)); assert!(!output.contains(r#" https://www.book.com"#)); + // But chapter-specific `doi_data` element will be present (at same nesting level) + assert!(output.contains(r#" "#)); + assert!(output.contains(r#" 10.00001/PART.0001"#)); + assert!(output.contains(r#" https://www.book.com/part_one"#)); // Remove publication date. Result: error test_work.publication_date = None; From 57bb64b9f2f8c64210b19017975490fd9414a5c4 Mon Sep 17 00:00:00 2001 From: rhigman <73792779+rhigman@users.noreply.github.com> Date: Mon, 9 Sep 2024 12:36:50 +0100 Subject: [PATCH 3/4] Update changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ffff930..694cc16d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Changed + - [551](https://github.com/thoth-pub/thoth/issues/551) - Only include chapters in Crossref metadata output if they have DOIs + +### Fixed + - [565](https://github.com/thoth-pub/thoth/issues/565) - Don't generate Crossref metadata output if no DOIs (work or chapter) are present ## [[0.12.9]](https://github.com/thoth-pub/thoth/releases/tag/v0.12.9) - 2024-09-06 ### Added From 7b2ed5002920a265cdf79235d9b7731705557f2b Mon Sep 17 00:00:00 2001 From: rhigman <73792779+rhigman@users.noreply.github.com> Date: Mon, 9 Sep 2024 16:04:12 +0100 Subject: [PATCH 4/4] Review markups: remove unnecessary variable, streamline unreachable check --- .../src/xml/doideposit_crossref.rs | 183 +++++++++--------- 1 file changed, 89 insertions(+), 94 deletions(-) diff --git a/thoth-export-server/src/xml/doideposit_crossref.rs b/thoth-export-server/src/xml/doideposit_crossref.rs index 1d519c43..d90d5233 100644 --- a/thoth-export-server/src/xml/doideposit_crossref.rs +++ b/thoth-export-server/src/xml/doideposit_crossref.rs @@ -79,9 +79,8 @@ impl XmlSpecification for DoiDepositCrossref { impl XmlElementBlock for Work { fn xml_element(&self, w: &mut EventWriter) -> ThothResult<()> { - let work = self; - if work.doi.is_none() - && !work + if self.doi.is_none() + && !self .relations .iter() .any(|r| r.relation_type == RelationType::HAS_CHILD && r.related_work.doi.is_some()) @@ -91,7 +90,7 @@ impl XmlElementBlock for Work { "No work or chapter DOIs to deposit".to_string(), )); } - let work_type = match &work.work_type { + let work_type = match &self.work_type { WorkType::MONOGRAPH => "monograph", WorkType::EDITED_BOOK => "edited_book", WorkType::TEXTBOOK => "reference", @@ -117,22 +116,22 @@ impl XmlElementBlock for Work { XmlElementBlock::::xml_element(series, w)?; ordinal = Some(ord); } - write_work_contributions(work, w)?; - write_work_title(work, w)?; - write_work_abstract(work, w)?; + write_work_contributions(self, w)?; + write_work_title(self, w)?; + write_work_abstract(self, w)?; if ordinal.is_some() { let ordinal_i64 = ordinal.unwrap_or(0); write_work_volume(ordinal_i64, w)?; } - write_work_edition(work, w)?; - write_work_publication_date(work, w)?; - write_work_publications(work, w)?; - write_publisher(work, w)?; - write_crossmark_funding_access(work, w)?; - write_doi_collection(work, w)?; - write_work_references(work, w)?; + write_work_edition(self, w)?; + write_work_publication_date(self, w)?; + write_work_publications(self, w)?; + write_publisher(self, w)?; + write_crossmark_funding_access(self, w)?; + write_doi_collection(self, w)?; + write_work_references(self, w)?; Ok(()) })?; @@ -627,88 +626,84 @@ fn write_chapter_doi_collection( chapter: &WorkRelations, w: &mut EventWriter, ) -> ThothResult<()> { - if let Some(doi) = &chapter.related_work.doi { - if let Some(landing_page) = &chapter.related_work.landing_page { - write_element_block("doi_data", w, |w| { - write_element_block("doi", w, |w| { - w.write(XmlEvent::Characters(&doi.to_string())) - .map_err(|e| e.into()) - })?; - write_element_block("resource", w, |w| { - w.write(XmlEvent::Characters(landing_page)) - .map_err(|e| e.into()) - })?; - if let Some(pdf_url) = chapter - .related_work - .publications - .iter() - .find(|p| { - p.publication_type.eq(&PublicationType::PDF) && !p.locations.is_empty() - }) - .and_then(|p| p.locations.iter().find(|l| l.canonical)) - .and_then(|l| l.full_text_url.as_ref()) - { - // Used for CrossRef Similarity Check. URL must point directly to full-text PDF. - // Alternatively, a direct link to full-text HTML can be used (not implemented here). - write_full_element_block( - "collection", - Some(vec![("property", "crawler-based")]), - w, - |w| { - for crawler in ["iParadigms", "google", "msn", "yahoo", "scirus"] { - write_full_element_block( - "item", - Some(vec![("crawler", crawler)]), - w, - |w| { - write_full_element_block( - "resource", - Some(vec![("mime_type", "application/pdf")]), - w, - |w| { - w.write(XmlEvent::Characters(pdf_url)) - .map_err(|e| e.into()) - }, - ) - }, - )?; - } - Ok(()) - }, - )?; - // Used for CrossRef Text and Data Mining. URL must point directly to full-text PDF. - // Alternatively, a direct link to full-text XML can be used (not implemented here). - write_full_element_block( - "collection", - Some(vec![("property", "text-mining")]), - w, - |w| { - write_element_block("item", w, |w| { - write_full_element_block( - "resource", - Some(vec![("mime_type", "application/pdf")]), - w, - |w| { - w.write(XmlEvent::Characters(pdf_url)).map_err(|e| e.into()) - }, - ) - }) - }, - )?; - } - Ok(()) + let doi = &chapter + .related_work + .doi + .as_ref() + .expect("Caller should only pass in chapters which have DOIs"); + if let Some(landing_page) = &chapter.related_work.landing_page { + write_element_block("doi_data", w, |w| { + write_element_block("doi", w, |w| { + w.write(XmlEvent::Characters(&doi.to_string())) + .map_err(|e| e.into()) })?; - } else { - // `doi_data` element is mandatory for `content_item`, and must contain - // both `doi` element and `resource` (landing page) element - return Err(ThothError::IncompleteMetadataRecord( - DEPOSIT_ERROR.to_string(), - "Missing chapter Landing Page".to_string(), - )); - } + write_element_block("resource", w, |w| { + w.write(XmlEvent::Characters(landing_page)) + .map_err(|e| e.into()) + })?; + if let Some(pdf_url) = chapter + .related_work + .publications + .iter() + .find(|p| p.publication_type.eq(&PublicationType::PDF) && !p.locations.is_empty()) + .and_then(|p| p.locations.iter().find(|l| l.canonical)) + .and_then(|l| l.full_text_url.as_ref()) + { + // Used for CrossRef Similarity Check. URL must point directly to full-text PDF. + // Alternatively, a direct link to full-text HTML can be used (not implemented here). + write_full_element_block( + "collection", + Some(vec![("property", "crawler-based")]), + w, + |w| { + for crawler in ["iParadigms", "google", "msn", "yahoo", "scirus"] { + write_full_element_block( + "item", + Some(vec![("crawler", crawler)]), + w, + |w| { + write_full_element_block( + "resource", + Some(vec![("mime_type", "application/pdf")]), + w, + |w| { + w.write(XmlEvent::Characters(pdf_url)) + .map_err(|e| e.into()) + }, + ) + }, + )?; + } + Ok(()) + }, + )?; + // Used for CrossRef Text and Data Mining. URL must point directly to full-text PDF. + // Alternatively, a direct link to full-text XML can be used (not implemented here). + write_full_element_block( + "collection", + Some(vec![("property", "text-mining")]), + w, + |w| { + write_element_block("item", w, |w| { + write_full_element_block( + "resource", + Some(vec![("mime_type", "application/pdf")]), + w, + |w| w.write(XmlEvent::Characters(pdf_url)).map_err(|e| e.into()), + ) + }) + }, + )?; + } + Ok(()) + })?; } else { - // Caller should only pass in chapters which have DOIs - unreachable!() + // `doi_data` element is mandatory for `content_item`, and must contain + // both `doi` element and `resource` (landing page) element + return Err(ThothError::IncompleteMetadataRecord( + DEPOSIT_ERROR.to_string(), + "Missing chapter Landing Page".to_string(), + )); } Ok(()) }