Skip to content

Commit

Permalink
Merge pull request #23 from bertsky/toplevel-reading-order
Browse files Browse the repository at this point in the history
Top-level reading order
  • Loading branch information
bertsky authored Nov 29, 2024
2 parents 55fe416 + 4eb96ab commit d7d21c0
Show file tree
Hide file tree
Showing 16 changed files with 91,095 additions and 11,666 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.7', '3.8', '3.9', '3.10']
# 3.7 is left out: it fails because ocrd discontinued support for it
python-version: ['3.8', '3.9', '3.10']

steps:
- uses: actions/checkout@v3
Expand Down
102 changes: 45 additions & 57 deletions tests/test_workspace.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from pathlib import Path
from os import chdir
from difflib import unified_diff
from unittest import TestCase, skip, main
from tempfile import NamedTemporaryFile
from pytest import fixture
from ocrd_utils import pushd_popd
from ocrd import Resolver
from ocrd_models.ocrd_page import parseEtree
from ocrd_models.constants import NAMESPACES as NS
from lxml import etree as ET
Expand All @@ -11,60 +12,47 @@

THIS_DIR = Path(__file__).resolve().parent

@fixture
def workspace_path(tmpdir):
    """Provide the test workspace inside ``tmpdir``.

    Asks the OCR-D ``Resolver`` to build a workspace from the ``mets.xml``
    next to this test module, with ``dst_dir=tmpdir`` and ``download=True``,
    then yields ``tmpdir`` from inside a ``pushd_popd(tmpdir)`` context.
    """
    mets_url = str(THIS_DIR / "workspace" / "mets.xml")
    resolver = Resolver()
    # The workspace object itself is not used by the tests; it is only kept
    # referenced for the lifetime of the fixture.
    _workspace = resolver.workspace_from_url(mets_url, dst_dir=tmpdir, download=True)
    with pushd_popd(tmpdir):
        yield tmpdir

class TestConvertTextract(TestCase):
def setUp(self):
workspace = THIS_DIR / "workspace"
chdir(str(workspace))
def test_api(workspace_path, tmpdir):
# Convert every Textract response that has a matching image and compare the
# produced PAGE-XML against the stored reference PAGE-XML.
# All paths below are relative, so they are resolved against the current
# working directory at the time the test runs.
# One entry per file found in images/: the Textract JSON ("aws"), the image
# ("img") and the reference PAGE-XML ("xml"). The JSON/XML names reuse the
# image file name up to its first dot.
test_path_dict = [
{
"aws": Path("textract_responses") / f"{filename.name.split('.', 1)[0]}.json",
"img": Path("images") / filename.name,
"xml": Path("reference_page_xml") / f"{filename.name.split('.', 1)[0]}.xml",
}
for filename in Path("images").iterdir()
]
for path in test_path_dict:
# parse the reference first, then convert and parse the fresh result
_, target_tree, _, _ = parseEtree(path["xml"], silence=True)
# NOTE(review): the output goes to the same relative path under tmpdir; if
# the workspace_path fixture has made tmpdir the current directory, this is
# the reference file that was just parsed above - confirm this is intended.
convert_file(str(path["aws"]), str(path["img"]), str(tmpdir/path["xml"]))
_, result_tree, _, _ = parseEtree(tmpdir/path["xml"], silence=True)
# remove elements bearing dates (Created, LastChange, Creator/Version)
# (every child of Metadata is dropped, in both the reference and the result)
for meta in target_tree.xpath(
"/page:PcGts/page:Metadata/*",
namespaces=NS,
) + result_tree.xpath(
"/page:PcGts/page:Metadata/*",
namespaces=NS,
):
meta.getparent().remove(meta)
# remove img path from Page element

self.test_path_dict = [
{
"aws": Path("textract_responses")
/ f"{filename.name.split('.', 1)[0]}.json",
"img": Path("images") / filename.name,
"xml": Path("reference_page_xml")
/ f"{filename.name.split('.', 1)[0]}.xml",
}
for filename in (workspace / "images").iterdir()
]
print(self.test_path_dict)

def test_api(self):
for path in self.test_path_dict:
print(path)
_, target_tree, _, _ = parseEtree(path["xml"], silence=True)
with NamedTemporaryFile() as out:
convert_file(str(path["aws"]), str(path["img"]), out.name)
_, result_tree, _, _ = parseEtree(out.name, silence=True)
# remove elements bearing dates (Created, LastChange, Creator/Version)
for meta in target_tree.xpath(
"/pc:PcGts/pc:Metadata/*",
namespaces={
"pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
},
) + result_tree.xpath(
"/pc:PcGts/pc:Metadata/*",
namespaces={
"pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
},
):
meta.getparent().remove(meta)
# remove img path from Page element

res_img_path_elem = result_tree.find(
"pc:Page",
namespaces={
"pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
},
)
del res_img_path_elem.attrib["imageFilename"]
tar_img_path_elem = target_tree.find(
"pc:Page",
namespaces={
"pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
},
)
del tar_img_path_elem.attrib["imageFilename"]
target_xml = ET.tostring(target_tree, pretty_print=True, encoding='UTF-8').decode('utf-8')
result_xml = ET.tostring(result_tree, pretty_print=True, encoding='UTF-8').decode('utf-8')
assert target_xml == result_xml
# Drop the imageFilename attribute of the Page element in the result tree so
# that it does not take part in the comparison.
res_img_path_elem = result_tree.find(
"page:Page",
namespaces=NS,
)
del res_img_path_elem.attrib["imageFilename"]
# Same for the reference tree.
tar_img_path_elem = target_tree.find(
"page:Page",
namespaces=NS,
)
del tar_img_path_elem.attrib["imageFilename"]
# Serialize both trees with identical settings and compare them as text.
target_xml = ET.tostring(target_tree, pretty_print=True, encoding='UTF-8').decode('utf-8')
result_xml = ET.tostring(result_tree, pretty_print=True, encoding='UTF-8').decode('utf-8')
# The path entry is passed as the assertion message to identify the failing file.
assert result_xml == target_xml, path
Binary file added tests/workspace/images/sn1991-01-03_0001.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/workspace/images/sn1991-02-09_pr_0002.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
60 changes: 56 additions & 4 deletions tests/workspace/mets.xml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@
<mets:file ID="OCR-D-IMG_nowa_doba" MIMETYPE="image/jpeg">
<mets:FLocat xlink:href="images/nowa_doba.jpg" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="OCR-D-IMG_sn1991-02-09_pr_0002" MIMETYPE="image/jpeg">
<mets:FLocat xlink:href="images/sn1991-02-09_pr_0002.jpg" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="OCR-D-IMG_sn1991-01-03_0001" MIMETYPE="image/jpeg">
<mets:FLocat xlink:href="images/sn1991-01-03_0001.jpg" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
</mets:fileGrp>
<mets:fileGrp USE="OCR-D-SEG-PAGE">
<mets:file ID="OCR-D-SEG-PAGE_f18xx-Missio-EMU-0042" MIMETYPE="application/vnd.prima.page+xml">
Expand All @@ -49,41 +55,87 @@
<mets:file ID="OCR-D-SEG-PAGE_Lodz_UZS_25_0056" MIMETYPE="application/vnd.prima.page+xml">
<mets:FLocat xlink:href="reference_page_xml/Lodz_UZS_25_0056.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="OCR-D-SEG-PAGE_nd1969-01-21_03" MIMETYPE="application/vnd.prima.page+xml">
<mets:FLocat xlink:href="reference_page_xml/nd1969-01-21_03.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
<mets:file ID="OCR-D-SEG-PAGE_nd1969-01-21_3" MIMETYPE="application/vnd.prima.page+xml">
<mets:FLocat xlink:href="reference_page_xml/nd1969-01-21_3.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="OCR-D-SEG-PAGE_nowa_doba" MIMETYPE="application/vnd.prima.page+xml">
<mets:FLocat xlink:href="reference_page_xml/nowa_doba.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="OCR-D-SEG-PAGE_sn1991-02-09_pr_0002" MIMETYPE="application/vnd.prima.page+xml">
<mets:FLocat xlink:href="reference_page_xml/sn1991-02-09_pr_0002.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="OCR-D-SEG-PAGE_sn1991-01-03_0001" MIMETYPE="application/vnd.prima.page+xml">
<mets:FLocat xlink:href="reference_page_xml/sn1991-01-03_0001.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
</mets:fileGrp>
<mets:fileGrp USE="AWS">
<mets:file ID="AWS_18xx-Missio-EMU-0042" MIMETYPE="application/json">
<mets:FLocat xlink:href="textract_responses/18xx-Missio-EMU-0042.json" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="AWS_Ansiedlung_Korotschin_UZS_Sign_22a_0018" MIMETYPE="application/json">
<mets:FLocat xlink:href="textract_responses/Ansiedlung_Korotschin_UZS_Sign_22a_0018.json" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="AWS_Ansiedlung_WD_Wielun_Lentschütz_0053" MIMETYPE="application/json">
<mets:FLocat xlink:href="textract_responses/Ansiedlung_WD_Wielun_Lentschütz_0053.json" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="AWS_Lodz_UZS_25_0056" MIMETYPE="application/json">
<mets:FLocat xlink:href="textract_responses/Lodz_UZS_25_0056.json" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="AWS_nd1969-01-21_3" MIMETYPE="application/json">
<mets:FLocat xlink:href="textract_responses/nd1969-01-21_3.json" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="AWS_nowa_doba" MIMETYPE="application/json">
<mets:FLocat xlink:href="textract_responses/nowa_doba.json" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="AWS_sn1991-02-09_pr_0002" MIMETYPE="application/json">
<mets:FLocat xlink:href="textract_responses/sn1991-02-09_pr_0002.json" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="AWS_sn1991-01-03_0001" MIMETYPE="application/json">
<mets:FLocat xlink:href="textract_responses/sn1991-01-03_0001.json" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
</mets:fileGrp>
</mets:fileSec>
<mets:structMap TYPE="PHYSICAL">
<mets:div TYPE="physSequence">
<mets:div TYPE="page" ID="f18xx-Missio-EMU-0042">
<mets:fptr FILEID="OCR-D-IMG_f18xx-Missio-EMU-0042"/>
<mets:fptr FILEID="OCR-D-SEG-PAGE_f18xx-Missio-EMU-0042"/>
<mets:fptr FILEID="AWS_18xx-Missio-EMU-0042"/>
</mets:div>
<mets:div TYPE="page" ID="Ansiedlung_Korotschin_UZS_Sign_22a_0018">
<mets:fptr FILEID="OCR-D-IMG_Ansiedlung_Korotschin_UZS_Sign_22a_0018"/>
<mets:fptr FILEID="OCR-D-SEG-PAGE_Ansiedlung_Korotschin_UZS_Sign_22a_0018"/>
<mets:fptr FILEID="AWS_Ansiedlung_Korotschin_UZS_Sign_22a_0018"/>
</mets:div>
<mets:div TYPE="page" ID="Ansiedlung_WD_Wielun_Lentschütz_0053">
<mets:fptr FILEID="OCR-D-IMG_Ansiedlung_WD_Wielun_Lentschütz_0053"/>
<mets:fptr FILEID="OCR-D-SEG-PAGE_Ansiedlung_WD_Wielun_Lentschütz_0053"/>
<mets:fptr FILEID="AWS_Ansiedlung_WD_Wielun_Lentschütz_0053"/>
</mets:div>
<mets:div TYPE="page" ID="Lodz_UZS_25_0056">
<mets:fptr FILEID="OCR-D-IMG_Lodz_UZS_25_0056"/>
<mets:fptr FILEID="OCR-D-SEG-PAGE_Lodz_UZS_25_0056"/>
<mets:fptr FILEID="AWS_Lodz_UZS_25_0056"/>
</mets:div>
<mets:div TYPE="page" ID="nd1969-01-21_3">
<mets:fptr FILEID="OCR-D-IMG_nd1969-01-21_3"/>
<mets:fptr FILEID="OCR-D-SEG-PAGE_nd1969-01-21_3"/>
<mets:fptr FILEID="AWS_nd1969-01-21_3"/>
</mets:div>
<mets:div TYPE="page" ID="nowa_doba">
<mets:fptr FILEID="OCR-D-IMG_nowa_doba"/>
<mets:fptr FILEID="OCR-D-SEG-PAGE_nowa_doba"/>
<mets:fptr FILEID="AWS_nowa_doba"/>
</mets:div>
<mets:div TYPE="page" ID="sn1991-02-09_pr_0002">
<mets:fptr FILEID="OCR-D-IMG_sn1991-02-09_pr_0002"/>
<mets:fptr FILEID="OCR-D-SEG-PAGE_sn1991-02-09_pr_0002"/>
<mets:fptr FILEID="AWS_sn1991-02-09_pr_0002"/>
</mets:div>
<mets:div TYPE="page" ID="nd1969-01-21_03">
<mets:fptr FILEID="OCR-D-SEG-PAGE_nd1969-01-21_03"/>
<mets:div TYPE="page" ID="sn1991-01-03_0001">
<mets:fptr FILEID="OCR-D-IMG_sn1991-01-03_0001"/>
<mets:fptr FILEID="OCR-D-SEG-PAGE_sn1991-01-03_0001"/>
<mets:fptr FILEID="AWS_sn1991-01-03_0001"/>
</mets:div>
</mets:div>
</mets:structMap>
Expand Down
Loading

0 comments on commit d7d21c0

Please sign in to comment.