Skip to content

Commit

Permalink
Include texts from all struct types
Browse files Browse the repository at this point in the history
In addition to `title_page`, `chapter` and `section`, all other
elements from the DFG viewer struct set now contribute to the text
extraction.

Fixes #43
  • Loading branch information
wrznr committed Jun 24, 2020
1 parent fb4db51 commit 2303929
Show file tree
Hide file tree
Showing 4 changed files with 26,146 additions and 28 deletions.
9 changes: 9 additions & 0 deletions Changelog
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,15 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Changed
- Add tests for TEI API
- Evaluate texts from all struct types
- Add `front`, `body` and `back` by default

### Fixed
- https://github.com/slub/mets-mods2tei/issues/43

## [0.1.1] - 2020-05-11
### Added
- Treat nested AMD-type (non-logical) divs in logical struct map (i.e.
Expand Down
44 changes: 16 additions & 28 deletions mets_mods2tei/api/tei.py
Original file line number Diff line number Diff line change
Expand Up @@ -684,38 +684,26 @@ def add_div_structure(self, div):
# div structure has to be added to text
text = self.tree.xpath('//tei:text', namespaces=ns)[0]

# relevant divs
struct_divs = list(filter(lambda x: x.get_ADMID() is None, div.get_div()))
amd_divs = list(filter(lambda x: x.get_ADMID() is not None, div.get_div()))

# do not add front node to unstructured volumes
if struct_divs:
front = etree.SubElement(text, "%sfront" % TEI)

# body must be present
# descend to the deepest AMD
while div.get_ADMID() is None:
div = div.get_div()[0]
start_div = div.get_div()[0]
while start_div.get_div() and start_div.get_div()[0].get_ADMID() is not None:
div = start_div
start_div = start_div.get_div()[0]
front = etree.SubElement(text, "%sfront" % TEI)
body = etree.SubElement(text, "%sbody" % TEI)
back = etree.SubElement(text, "%sback" % TEI)

# do not add back node to unstructured volumes
if struct_divs:
back = etree.SubElement(text, "%sback" % TEI)
else:
# default div for unstructured volumes
body = etree.SubElement(body, "%sdiv" % TEI)
# newspaper case: descend to the deepest div!
if amd_divs:
div = amd_divs[0]
while div.get_div():
div = div.get_div()[0]
body.set("id", div.get_ID())

for sub_div in struct_divs:
entry_point = front

for sub_div in div.get_div():
if sub_div.get_TYPE() == "title_page":
self.__add_div(front, sub_div, 1, "titlePage")
elif sub_div.get_TYPE() == "chapter" or sub_div.get_TYPE() == "section":
self.__add_div(body, sub_div, 1)
self.__add_div(entry_point, sub_div, 1, "titlePage")
else:
#FIXME
pass
entry_point = body
self.__add_div(entry_point, sub_div, 1)

def __add_div(self, insert_node, div, n, tag="div"):
"""
Expand Down
36 changes: 36 additions & 0 deletions tests/test_tei.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,32 @@
# -*- coding: utf-8 -*-

import os
import pytest
import warnings

# the import of dir_util introduces a deprecation warning
# we can't do much about it
with warnings.catch_warnings():
warnings.simplefilter("ignore")
from distutils import dir_util

from mets_mods2tei import Tei
from mets_mods2tei import Mets

@pytest.fixture
def datadir(tmpdir, request):
    '''
    Provide the per-test temporary directory, pre-populated with test data.

    The data directory is looked up next to the requesting test module and
    carries the module's file name without its extension. If such a
    directory exists, its contents are copied into ``tmpdir`` so that tests
    can modify them freely; otherwise ``tmpdir`` is returned untouched.
    '''
    # strip the extension from the module path to get the data directory
    module_data_dir = os.path.splitext(request.module.__file__)[0]

    # nothing to copy for modules without a companion data directory
    if not os.path.isdir(module_data_dir):
        return tmpdir

    dir_util.copy_tree(module_data_dir, str(tmpdir))
    return tmpdir

def test_constructor():
'''
Expand All @@ -9,6 +35,16 @@ def test_constructor():
tei = Tei()
assert(tei.tree is not None)

def test_reading_local_file(datadir):
    '''
    Test reading from a local mets file

    The METS file is opened through a context manager so that the file
    handle is closed even if parsing or conversion raises.
    '''
    tei = Tei()
    with open(datadir.join('test_mets.xml')) as f:
        mets = Mets.read(f)
        # conversion stays inside the block in case the METS object still
        # reads lazily from the open handle (not visible from this file)
        tei.fill_from_mets(mets, ocr=False)
    assert tei.tree is not None

def test_string_dumping():
tei = Tei()
assert(tei.tostring().startswith(b"<"))
Expand Down
Loading

0 comments on commit 2303929

Please sign in to comment.