From 0598b7fdb426d33ef87ddf8012d598e9c416dd1b Mon Sep 17 00:00:00 2001
From: rhigman <73792779+rhigman@users.noreply.github.com>
Date: Mon, 24 Jul 2023 15:07:35 +0100
Subject: [PATCH 1/2] Fix thoth_wrapper bug where DOI arg was assumed to be optional

---
 src/thoth_wrapper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/thoth_wrapper.py b/src/thoth_wrapper.py
index 84e5e30..1c890d0 100755
--- a/src/thoth_wrapper.py
+++ b/src/thoth_wrapper.py
@@ -37,7 +37,7 @@ def get_html_pub_url(thoth_data):
 def run():
     parser = argparse.ArgumentParser(description='Thoth wrapper')
     parser.add_argument('epub_path', help='Path to epub file')
-    parser.add_argument('-d', '--doi', help='Work DOI (registered in Thoth)')
+    parser.add_argument('-d', '--doi', help='Work DOI (registered in Thoth)', required=True)
     args = parser.parse_args()
     
     doi_url = urllib.parse.urljoin('https://doi.org/', args.doi)

From 4ad0ca9fff6079a9cf893d989b4504ec793c2d0d Mon Sep 17 00:00:00 2001
From: rhigman <73792779+rhigman@users.noreply.github.com>
Date: Tue, 25 Jul 2023 12:03:34 +0100
Subject: [PATCH 2/2] Write Landing Page and Full Text URLs to Thoth automatically on chapter creation

---
 README.md                |  8 +++--
 src/epublius/epublius.py |  6 ++++++
 src/epublius/metadata.py | 17 ++++++++++
 src/epublius/thoth.py    | 73 ++++++++++++++++++++++++++++++++++++++++
 src/main.py              | 14 ++++++++
 src/thoth_wrapper.py     |  7 ++--
 6 files changed, 120 insertions(+), 5 deletions(-)
 create mode 100644 src/epublius/thoth.py

diff --git a/README.md b/README.md
index a245562..662f26a 100644
--- a/README.md
+++ b/README.md
@@ -20,8 +20,9 @@ The entry point is `.src/main.py`, which can be called with the following argume
 | `-u` | URL path of this book                    |
 | `-k` | Copyright file                           |
 | `-a` | Donation link                            |
-| `-m` | MathJax support |
-| `-p` | Privacy policy URL |
+| `-m` | MathJax support                          |
+| `-p` | Privacy policy URL                       |
+| `-w` | Write chapter URLs to Thoth (True/False) |
 
 ## Thoth Wrapper (Optional)
 
@@ -38,12 +39,15 @@ docker run --rm \
     -v /path/to/output:/ebook_automation/output \
     -e MATHJAX=False \
     -e PRIVACYPOLICY_URL=https://example.org \
+    -e THOTH_EMAIL=email@example.com \
+    -e THOTH_PWD=password \
     openbookpublishers/epublius \
     thoth_wrapper.py /ebook_automation/epub_file.epub \
     --doi 10.11647/obp.0275
 ```
 
 The environment variable MATHJAX enables or disable MathJax support
+The environment variables THOTH_EMAIL and THOTH_PWD allow use of the `--write-urls` option by providing Thoth credentials
 
 ## Contributing
 
diff --git a/src/epublius/epublius.py b/src/epublius/epublius.py
index 59bbf08..6e9b03d 100644
--- a/src/epublius/epublius.py
+++ b/src/epublius/epublius.py
@@ -73,6 +73,12 @@ def parse_args(self, argv=None):
         parser.add_argument('-p', '--privacy-policy',
                             help = 'Privacy policy URL')
 
+        # Parse explicitly: a bare string such as 'False' would be truthy
+        parser.add_argument('-w', '--write-urls',
+                            help = 'Write HTML chapter URLs to Thoth',
+                            type = lambda v: v.lower() in ('true', '1', 'yes'),
+                            default = False)
+
         return parser.parse_args(argv)
 
     def unzip_epub(self, prefix):
diff --git a/src/epublius/metadata.py b/src/epublius/metadata.py
index b4263d2..e3dcbfb 100644
--- a/src/epublius/metadata.py
+++ b/src/epublius/metadata.py
@@ -126,6 +126,23 @@ def get_chapter_title(self):
 
         return html.escape(ch_title)
 
+    def get_chapter_doi(self):
+        '''
+        Retrieve chapter DOI based on the text of <p class="doi">
+        (this contains both copyright statement and DOI link)
+        '''
+        # Not all chapters will have DOIs
+        doi = None
+
+        doi_node = self.soup.find('p', class_='doi')
+
+        if (doi_node is not None):
+            doi_link = doi_node.a
+            if (doi_link is not None) and (doi_link.string is not None):
+                doi = doi_link.string
+
+        return doi
+
     def get_css(self):
         '''
         Return a str with the CSS information of a file
diff --git a/src/epublius/thoth.py b/src/epublius/thoth.py
new file mode 100644
index 0000000..9970679
--- /dev/null
+++ b/src/epublius/thoth.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+
+from os import getenv
+from thothlibrary import ThothClient
+import urllib.parse
+
+
+class Thoth:
+    '''
+    Module to handle Thoth interactions
+    '''
+
+    def __init__(self):
+        self.client = ThothClient()
+        self.logged_in = self.login()
+
+    def login(self):
+        username = getenv('THOTH_EMAIL')
+        password = getenv('THOTH_PWD')
+        if username is None:
+            print('[WARNING] No Thoth username provided '
+                  '(THOTH_EMAIL environment variable not set)')
+            return False
+        if password is None:
+            print('[WARNING] No Thoth password provided '
+                  '(THOTH_PWD environment variable not set)')
+            return False
+        try:
+            self.client.login(username, password)
+            return True
+        except Exception as e:
+            print('[WARNING] Thoth login failed: {}'.format(e))
+            return False
+
+    def write_urls(self, metadata, book_doi):
+        '''
+        Write chapter Landing Page and Full Text URLs
+        to Thoth in standard OBP format
+        '''
+        chapter_doi_full = metadata.get_chapter_doi()
+
+        # Skip chapters that don't have a DOI
+        if chapter_doi_full is not None:
+            work_id = self.client.work_by_doi(chapter_doi_full).workId
+            chapter_doi = chapter_doi_full.split('doi.org/')[-1].lower()
+            landing_page_root = (
+                'https://www.openbookpublishers.com/books/'
+                '{book_doi}/chapters/{chapter_doi}')
+
+            publication = {"workId": work_id,
+                           "publicationType": "HTML",
+                           "isbn": None,
+                           "widthMm": None,
+                           "widthIn": None,
+                           "heightMm": None,
+                           "heightIn": None,
+                           "depthMm": None,
+                           "depthIn": None,
+                           "weightG": None,
+                           "weightOz": None}
+            publication_id = self.client.create_publication(publication)
+
+            location = {"publicationId": publication_id,
+                        "landingPage": landing_page_root.format(
+                            book_doi=book_doi, chapter_doi=chapter_doi),
+                        "fullTextUrl": urllib.parse.unquote_plus(
+                            metadata.get_page_url()),
+                        "locationPlatform": "OTHER",
+                        "canonical": "true"}
+            self.client.create_location(location)
+
+            print('{}: URLs written to Thoth'.format(
+                metadata.contents[metadata.index]))
diff --git a/src/main.py b/src/main.py
index 4097667..b97b814 100755
--- a/src/main.py
+++ b/src/main.py
@@ -5,6 +5,8 @@
 from epublius.epublius import Epublius
 from epublius.metadata import Metadata
 from epublius.output import Output
+from epublius.thoth import Thoth
+from thothlibrary import ThothError
 
 
 def main():
@@ -15,6 +17,11 @@ def main():
     # Create instances
     epublius = Epublius(work_dir)
     output = Output(os.path.abspath('assets/template.xhtml'))
+    thoth = Thoth()
+
+    # Warn if user requested to write URLs to Thoth but Thoth login failed
+    if epublius.argv.write_urls and not thoth.logged_in:
+        print('[WARNING] Thoth login failed; URLs will not be written')
 
     # Get book contents
     contents = epublius.get_contents()
@@ -57,6 +64,13 @@ def main():
 
         file_path = os.path.join(output_directory, section)
         output.write_file(processed_content, file_path)
+        if epublius.argv.write_urls and thoth.logged_in:
+            # Write chapter URL metadata to Thoth
+            try:
+                thoth.write_urls(metadata, epublius.argv.doi)
+            except (KeyError, ThothError) as e:
+                # Continue on error, but display warning
+                print('[WARNING] Error writing URLs to Thoth for {}: {}'.format(section, e))
 
     # Duplicate TOC file to output_directory/main.html
     epublius.duplicate_contents(TOC_filepath.get('TOC_filepath'))
diff --git a/src/thoth_wrapper.py b/src/thoth_wrapper.py
index 1c890d0..eb1d9ff 100755
--- a/src/thoth_wrapper.py
+++ b/src/thoth_wrapper.py
@@ -12,7 +12,7 @@ def query_thoth(doi_url):
     thoth = ThothClient()
     return thoth.query('workByDoi', {'doi': f'"{doi_url}"'})
 
-    
+
 def get_title(thoth_data):
     return thoth_data["fullTitle"]
 
@@ -39,7 +39,7 @@ def run():
     parser.add_argument('epub_path', help='Path to epub file')
     parser.add_argument('-d', '--doi', help='Work DOI (registered in Thoth)', required=True)
     args = parser.parse_args()
-    
+
     doi_url = urllib.parse.urljoin('https://doi.org/', args.doi)
 
     thoth_data = query_thoth(doi_url)
@@ -57,7 +57,8 @@ def run():
             "-t", os.path.join(epublius_dir, ""),
             "-d", args.doi,
             "-m", MATHJAX,
-            "-p", os.getenv('PRIVACYPOLICY_URL', '#')]
+            "-p", os.getenv('PRIVACYPOLICY_URL', '#'),
+            "-w", 'True']
 
     os.execvp(sys.executable, [exe] + args)
 