diff --git a/Changelog b/Changelog index df81b1c..9d88fb3 100644 --- a/Changelog +++ b/Changelog @@ -4,17 +4,28 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [0.1.1] - 2020-05-11 ### Added - Treat nested AMD-type (non-logical) divs in logical struct map (i.e. newspaper case) +- Make full text file group selectable by user +- Allow for file entries (in addition to URLs) in METS +- Add special treatment for URNs and VD IDs +- Add poor man's namespace versioning handling ### Changed - Make extraction of subtitles conditional on their presence +- Use "licence" for all types of licences (even unknown ones) + +### Fixed +- https://github.com/slub/mets-mods2tei/issues/28 +- https://github.com/slub/mets-mods2tei/issues/37 +- https://github.com/slub/mets-mods2tei/issues/39 +- https://github.com/slub/mets-mods2tei/issues/41 ## [0.1.0] - 2019-12-04 ### Added -- Correctly Place structures which are not on top of a page +- Correctly place structures which are not on top of a page - Set `corresp` and `facs` attributes of `pb` elements - Store links to `DEFAULT` images in METS - Tests for new functionality diff --git a/mets_mods2tei/api/alto.py b/mets_mods2tei/api/alto.py index 6cf3d4a..e1a2cec 100644 --- a/mets_mods2tei/api/alto.py +++ b/mets_mods2tei/api/alto.py @@ -4,15 +4,18 @@ import os import logging +import re import Levenshtein ns = { 'xlink' : "http://www.w3.org/1999/xlink", - 'alto': "http://www.loc.gov/standards/alto/ns-v2#", + 'alto': "http://www.loc.gov/standards/alto/ns-v4#", } XLINK = "{%s}" % ns['xlink'] ALTO = "{%s}" % ns['alto'] +norm_alto_ns_re = re.compile(rb'alto/ns-v.#') + class Alto: def __init__(self): @@ -47,7 +50,8 @@ def read(cls, source): if hasattr(source, 'read'): return cls.fromfile(source) if os.path.exists(source): - return cls.fromfile(source) + with open(source, 'rb') as f: + return cls.fromfile(f) @classmethod def fromfile(cls, path): @@ -65,7 +69,7 @@ def _fromfile(self, path): :param str path: Path to a ALTO document. """ parser = etree.XMLParser(remove_blank_text=True) - self.tree = etree.parse(path, parser) + self.tree = etree.XML(norm_alto_ns_re.sub(b"alto/ns-v4#", path.read()), parser) self.path = path def get_text_blocks(self): @@ -88,14 +92,7 @@ def get_text_in_line(self, line): Returns the ALTO-encoded text . :param Element line: The line to extract the text from. """ - line_text = "" - for element in line.xpath("./alto:String|./alto:SP", namespaces=ns): - if element.tag == "%sString" % ALTO: - line_text += element.get("CONTENT") - elif element.tag == "%sSP" % ALTO: - line_text += " " - #line_text += "\n" - return line_text + return " ".join(element.get("CONTENT") for element in line.xpath("./alto:String", namespaces=ns)) def __compute_fuzzy_distance(self, text1, text2): """ diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index 7b1de81..4ac0531 100644 --- a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -26,11 +26,10 @@ def __init__(self): The constructor. """ self.map = {} - filep = open(os.path.realpath(resource_filename(Requirement.parse("mets_mods2tei"), 'mets_mods2tei/data/iso15924-utf8-20180827.txt'))) - reader = csv.DictReader(filter(lambda row: row[0]!='#', filep), delimiter=';', quoting=csv.QUOTE_NONE, fieldnames=['code','index','name_eng', 'name_fr', 'alias', 'Age', 'Date']) - for row in reader: - self.map[row['code']] = row['name_eng'] - filep.close() + with open(os.path.realpath(resource_filename(Requirement.parse("mets_mods2tei"), 'mets_mods2tei/data/iso15924-utf8-20180827.txt'))) as filep: + reader = csv.DictReader(filter(lambda row: row[0]!='#', filep), delimiter=';', quoting=csv.QUOTE_NONE, fieldnames=['code','index','name_eng', 'name_fr', 'alias', 'Age', 'Date']) + for row in reader: + self.map[row['code']] = row['name_eng'] def get(self, code): """ @@ -55,6 +54,7 @@ def __init__(self): self.img_map = {} self.alto_map = {} self.struct_links = {} + self.fulltext_group_name = 'FULLTEXT' self.title = None self.sub_titles = None @@ -87,21 +87,21 @@ def read(cls, source): :param source: METS (file) source. """ if hasattr(source, 'read'): - return cls.fromfile(source) + return cls.from_file(source) if os.path.exists(source): - return cls.fromfile(source) + return cls.from_file(source) @classmethod - def fromfile(cls, path): + def from_file(cls, path): """ Reads in METS from a given file source. :param str path: Path to a METS document. """ i = cls() - i.__fromfile(path) + i.fromfile(path) return i - def __fromfile(self, path): + def fromfile(self, path): """ Reads in METS from a given file source. :param str path: Path to a METS document. @@ -271,7 +271,7 @@ def __spur(self): # fulltext fulltext_map = {} - fulltext_group = self.tree.xpath("//mets:fileGrp[@USE='FULLTEXT']", namespaces=ns) + fulltext_group = self.tree.xpath("//mets:fileGrp[@USE='%s']" % self.fulltext_group_name, namespaces=ns) if fulltext_group: fulltext_map = {} for entry in fulltext_group[0].xpath("./mets:file", namespaces=ns): @@ -300,6 +300,18 @@ def __spur(self): self.struct_links[sm_link.get("%sfrom" % XLINK)] = [] self.struct_links[sm_link.get("%sfrom" % XLINK)].append(sm_link.get("%sto" % XLINK)) + @property + def fulltext_group_name(self): + """ + Return the currently configured full-text-related + file group use attribute. + """ + return self.__fulltext_group_name + + @fulltext_group_name.setter + def fulltext_group_name(self, fulltext_use): + self.__fulltext_group_name = fulltext_use + def get_main_title(self): """ Return the main title of the work. diff --git a/mets_mods2tei/api/tei.py b/mets_mods2tei/api/tei.py index 4aa9944..4dbe4cf 100644 --- a/mets_mods2tei/api/tei.py +++ b/mets_mods2tei/api/tei.py @@ -6,7 +6,9 @@ import logging import copy +from contextlib import closing from urllib.request import urlopen +from urllib.parse import urlparse from pkg_resources import resource_filename, Requirement from .alto import Alto @@ -602,8 +604,22 @@ def __add_ocr_to_node(self, node, mets): alto_link = mets.get_alto(struct_link) # only collect ocr from a file once! if not alto_link in self.alto_map: - f = urlopen(alto_link) - alto = Alto.read(f) + try: + sections = urlparse(alto_link) + except: + continue + + # use urlopen for both paths and URLs + if not sections.scheme: + mod_link = 'file:' + alto_link + else: + mod_link = alto_link + self.logger.debug(mod_link) + + with closing(urlopen(mod_link)) as f: + alto = Alto.read(f) + + # save original link! self.alto_map[alto_link] = alto pb = etree.SubElement(node, "%spb" % TEI) diff --git a/mets_mods2tei/scripts/mets_mods2tei.py b/mets_mods2tei/scripts/mets_mods2tei.py index 1a2e199..28a3bdc 100644 --- a/mets_mods2tei/scripts/mets_mods2tei.py +++ b/mets_mods2tei/scripts/mets_mods2tei.py @@ -12,8 +12,9 @@ @click.command() @click.argument('mets', required=True) @click.option('-o', '--ocr', is_flag=True, default=False, help="Serialize OCR into resulting TEI") +@click.option('-T', '--text-group', default="FULLTEXT", help="File group which contains the full text") @click.option('-l', '--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARN', 'ERROR', 'OFF']), default='WARN') -def cli(mets, ocr, log_level): +def cli(mets, ocr, text_group, log_level): """ METS: File containing or URL pointing to the METS/MODS XML to be converted """ # @@ -29,7 +30,9 @@ def cli(mets, ocr, log_level): # # read in METS - mets = Mets.read(f) + mets = Mets() + mets.fulltext_group_name = text_group + mets.fromfile(f) # # create TEI (from skeleton) diff --git a/setup.py b/setup.py index b9bd5d9..26dd4a7 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ setup( name='mets-mods2tei', - version='0.1.0', + version='0.1.1', description='Convert digital documents in METS/MODS format to TEI', long_description=open('README.md').read(), long_description_content_type="text/markdown", diff --git a/tests/test_alto.py b/tests/test_alto.py index b4dfdb2..a2b0597 100644 --- a/tests/test_alto.py +++ b/tests/test_alto.py @@ -38,32 +38,32 @@ def test_reading_local_file(datadir): ''' Test reading a local alto file ''' - f = open(datadir.join('test_alto.xml')) - alto = Alto.read(f) + with open(datadir.join('test_alto.xml'), 'rb') as f: + alto = Alto.read(f) assert(alto.tree is not None) def test_loading_local_file(datadir): ''' Test loading a local alto file ''' - f = open(datadir.join('test_alto.xml')) - alto = Alto.fromfile(f) + with open(datadir.join('test_alto.xml'), 'rb') as f: + alto = Alto.read(f) assert(alto.tree is not None) def test_text_block_extraction(datadir): ''' Test the extraction of text blocks ''' - f = open(datadir.join('test_alto.xml')) - alto = Alto.fromfile(f) + with open(datadir.join('test_alto.xml'), 'rb') as f: + alto = Alto.read(f) assert(len(list(alto.get_text_blocks())) == 1) def test_text_line_extraction(datadir): ''' Test the extraction of text lines ''' - f = open(datadir.join('test_alto.xml')) - alto = Alto.fromfile(f) + with open(datadir.join('test_alto.xml'), 'rb') as f: + alto = Alto.read(f) text_block = list(alto.get_text_blocks())[0] assert(len(list(alto.get_lines_in_text_block(text_block))) == 26) @@ -71,8 +71,8 @@ def test_text_line_text_extraction(datadir): ''' Test the extraction of text from text lines ''' - f = open(datadir.join('test_alto.xml')) - alto = Alto.fromfile(f) + with open(datadir.join('test_alto.xml'), 'rb') as f: + alto = Alto.read(f) text_block = list(alto.get_text_blocks())[0] text_line = list(alto.get_lines_in_text_block(text_block))[0] assert(alto.get_text_in_line(text_line) == "Vorbericht.") diff --git a/tests/test_mets.py b/tests/test_mets.py index 710bef1..bf71438 100644 --- a/tests/test_mets.py +++ b/tests/test_mets.py @@ -47,9 +47,32 @@ def test_loading_local_file(datadir): Test loading a local mets file ''' f = open(datadir.join('test_mets.xml')) - mets = Mets.fromfile(f) + mets = Mets.from_file(f) assert(mets.mets is not None) +def test_intermediate_file_loading(datadir): + ''' + Test loading a local mets file + ''' + f = open(datadir.join('test_mets.xml')) + mets = Mets() + mets.fromfile(f) + assert(mets.mets is not None) + +def test_fulltext_group_name(subtests, datadir): + ''' + Test getting and setting the full text group name + ''' + f = open(datadir.join('test_mets.xml')) + mets = Mets.read(f) + + with subtests.test("Check getter"): + assert(mets.fulltext_group_name == "FULLTEXT") + + with subtests.test("Check setter"): + mets.fulltext_group_name = "TEXT" + assert(mets.fulltext_group_name == "TEXT") + def test_mappings(subtests, datadir): ''' Test the correct interpretation of the structural linking