diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0117b23..17078fd 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -11,7 +11,8 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ['3.7', '3.8', '3.9', '3.10']
+ # 3.7 is excluded: it fails because ocrd discontinued support for Python 3.7
+ python-version: ['3.8', '3.9', '3.10']
steps:
- uses: actions/checkout@v3
diff --git a/tests/test_workspace.py b/tests/test_workspace.py
index 72bbac0..1b23ec3 100644
--- a/tests/test_workspace.py
+++ b/tests/test_workspace.py
@@ -1,8 +1,9 @@
from pathlib import Path
from os import chdir
from difflib import unified_diff
-from unittest import TestCase, skip, main
-from tempfile import NamedTemporaryFile
+from pytest import fixture
+from ocrd_utils import pushd_popd
+from ocrd import Resolver
from ocrd_models.ocrd_page import parseEtree
from ocrd_models.constants import NAMESPACES as NS
from lxml import etree as ET
@@ -11,60 +12,47 @@
THIS_DIR = Path(__file__).resolve().parent
+@fixture
+def workspace_path(tmpdir):
+ workspace = str(THIS_DIR / "workspace" / "mets.xml")
+ workspace = Resolver().workspace_from_url(workspace, dst_dir=tmpdir, download=True)
+ with pushd_popd(tmpdir):
+ yield tmpdir
-class TestConvertTextract(TestCase):
- def setUp(self):
- workspace = THIS_DIR / "workspace"
- chdir(str(workspace))
+def test_api(workspace_path, tmpdir):
+ test_path_dict = [
+ {
+ "aws": Path("textract_responses") / f"{filename.name.split('.', 1)[0]}.json",
+ "img": Path("images") / filename.name,
+ "xml": Path("reference_page_xml") / f"{filename.name.split('.', 1)[0]}.xml",
+ }
+ for filename in Path("images").iterdir()
+ ]
+ for path in test_path_dict:
+ _, target_tree, _, _ = parseEtree(path["xml"], silence=True)
+ convert_file(str(path["aws"]), str(path["img"]), str(tmpdir/path["xml"]))
+ _, result_tree, _, _ = parseEtree(tmpdir/path["xml"], silence=True)
+ # remove elements bearing dates (Created, LastChange, Creator/Version)
+ for meta in target_tree.xpath(
+ "/page:PcGts/page:Metadata/*",
+ namespaces=NS,
+ ) + result_tree.xpath(
+ "/page:PcGts/page:Metadata/*",
+ namespaces=NS,
+ ):
+ meta.getparent().remove(meta)
+ # remove the image path (imageFilename attribute) from the Page element
- self.test_path_dict = [
- {
- "aws": Path("textract_responses")
- / f"{filename.name.split('.', 1)[0]}.json",
- "img": Path("images") / filename.name,
- "xml": Path("reference_page_xml")
- / f"{filename.name.split('.', 1)[0]}.xml",
- }
- for filename in (workspace / "images").iterdir()
- ]
- print(self.test_path_dict)
-
- def test_api(self):
- for path in self.test_path_dict:
- print(path)
- _, target_tree, _, _ = parseEtree(path["xml"], silence=True)
- with NamedTemporaryFile() as out:
- convert_file(str(path["aws"]), str(path["img"]), out.name)
- _, result_tree, _, _ = parseEtree(out.name, silence=True)
- # remove elements bearing dates (Created, LastChange, Creator/Version)
- for meta in target_tree.xpath(
- "/pc:PcGts/pc:Metadata/*",
- namespaces={
- "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
- },
- ) + result_tree.xpath(
- "/pc:PcGts/pc:Metadata/*",
- namespaces={
- "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
- },
- ):
- meta.getparent().remove(meta)
- # remove img path from Page element
-
- res_img_path_elem = result_tree.find(
- "pc:Page",
- namespaces={
- "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
- },
- )
- del res_img_path_elem.attrib["imageFilename"]
- tar_img_path_elem = target_tree.find(
- "pc:Page",
- namespaces={
- "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
- },
- )
- del tar_img_path_elem.attrib["imageFilename"]
- target_xml = ET.tostring(target_tree, pretty_print=True, encoding='UTF-8').decode('utf-8')
- result_xml = ET.tostring(result_tree, pretty_print=True, encoding='UTF-8').decode('utf-8')
- assert target_xml == result_xml
+ res_img_path_elem = result_tree.find(
+ "page:Page",
+ namespaces=NS,
+ )
+ del res_img_path_elem.attrib["imageFilename"]
+ tar_img_path_elem = target_tree.find(
+ "page:Page",
+ namespaces=NS,
+ )
+ del tar_img_path_elem.attrib["imageFilename"]
+ target_xml = ET.tostring(target_tree, pretty_print=True, encoding='UTF-8').decode('utf-8')
+ result_xml = ET.tostring(result_tree, pretty_print=True, encoding='UTF-8').decode('utf-8')
+ assert result_xml == target_xml, path
diff --git a/tests/workspace/images/sn1991-01-03_0001.jpg b/tests/workspace/images/sn1991-01-03_0001.jpg
new file mode 100644
index 0000000..60a379d
Binary files /dev/null and b/tests/workspace/images/sn1991-01-03_0001.jpg differ
diff --git a/tests/workspace/images/sn1991-02-09_pr_0002.jpg b/tests/workspace/images/sn1991-02-09_pr_0002.jpg
new file mode 100644
index 0000000..b4434da
Binary files /dev/null and b/tests/workspace/images/sn1991-02-09_pr_0002.jpg differ
diff --git a/tests/workspace/mets.xml b/tests/workspace/mets.xml
index 742821a..4f33ee3 100644
--- a/tests/workspace/mets.xml
+++ b/tests/workspace/mets.xml
@@ -35,6 +35,12 @@
+
+
+
+
+
+
@@ -49,12 +55,44 @@
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -62,28 +100,42 @@
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
diff --git a/tests/workspace/reference_page_xml/18xx-Missio-EMU-0042.xml b/tests/workspace/reference_page_xml/18xx-Missio-EMU-0042.xml
index 771cf43..2a1fe4c 100644
--- a/tests/workspace/reference_page_xml/18xx-Missio-EMU-0042.xml
+++ b/tests/workspace/reference_page_xml/18xx-Missio-EMU-0042.xml
@@ -1,2141 +1,2140 @@
-
-
- OCR-D/core 2.63.3
- 2024-04-24T11:37:59.880024
- 2024-04-24T11:37:59.880024
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+ OCR-D/core 2.66.0
+ 2024-08-21T15:00:12.655363
+ 2024-08-21T15:00:12.655363
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/tests/workspace/reference_page_xml/Ansiedlung_Korotschin_UZS_Sign_22a_0018.xml b/tests/workspace/reference_page_xml/Ansiedlung_Korotschin_UZS_Sign_22a_0018.xml
index 4cb1083..1a4a99c 100644
--- a/tests/workspace/reference_page_xml/Ansiedlung_Korotschin_UZS_Sign_22a_0018.xml
+++ b/tests/workspace/reference_page_xml/Ansiedlung_Korotschin_UZS_Sign_22a_0018.xml
@@ -1,2641 +1,2640 @@
-
-
- OCR-D/core 2.63.3
- 2024-04-24T11:38:02.445055
- 2024-04-24T11:38:02.445055
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+ OCR-D/core 2.66.0
+ 2024-08-21T14:59:55.171335
+ 2024-08-21T14:59:55.171335
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git "a/tests/workspace/reference_page_xml/Ansiedlung_WD_Wielun_Lentsch\303\274tz_0053.xml" "b/tests/workspace/reference_page_xml/Ansiedlung_WD_Wielun_Lentsch\303\274tz_0053.xml"
index d2cb367..afa774b 100644
--- "a/tests/workspace/reference_page_xml/Ansiedlung_WD_Wielun_Lentsch\303\274tz_0053.xml"
+++ "b/tests/workspace/reference_page_xml/Ansiedlung_WD_Wielun_Lentsch\303\274tz_0053.xml"
@@ -1,1010 +1,1009 @@
-
-
- OCR-D/core 2.63.3
- 2024-04-24T11:38:05.496157
- 2024-04-24T11:38:05.496157
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+ OCR-D/core 2.66.0
+ 2024-08-21T14:59:22.152990
+ 2024-08-21T14:59:22.152990
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/tests/workspace/reference_page_xml/Lodz_UZS_25_0056.xml b/tests/workspace/reference_page_xml/Lodz_UZS_25_0056.xml
index 6e97a2c..ddf26a3 100644
--- a/tests/workspace/reference_page_xml/Lodz_UZS_25_0056.xml
+++ b/tests/workspace/reference_page_xml/Lodz_UZS_25_0056.xml
@@ -1,2820 +1,2819 @@
-
-
- OCR-D/core 2.63.3
- 2024-04-24T11:38:08.019765
- 2024-04-24T11:38:08.019765
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+ OCR-D/core 2.66.0
+ 2024-08-21T14:58:01.253637
+ 2024-08-21T14:58:01.253637
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/tests/workspace/reference_page_xml/nd1969-01-21_3.xml b/tests/workspace/reference_page_xml/nd1969-01-21_3.xml
index 1c7cc19..feea0cb 100644
--- a/tests/workspace/reference_page_xml/nd1969-01-21_3.xml
+++ b/tests/workspace/reference_page_xml/nd1969-01-21_3.xml
@@ -1,15 +1,24 @@
- OCR-D/core 2.64.1
- 2024-04-26T11:08:53.328131
- 2024-04-26T11:08:53.328131
+ OCR-D/core 2.66.0
+ 2024-08-21T14:54:31.440586
+ 2024-08-21T14:54:31.440586
-
-
-