Skip to content

Commit

Permalink
fix table cell creation, refactor reading order generation, remove 'preserve-reading-order' option
Browse files Browse the repository at this point in the history
  • Loading branch information
rue-a committed Apr 24, 2024
1 parent 6062904 commit a267e02
Show file tree
Hide file tree
Showing 10 changed files with 31,878 additions and 4,129 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ build
dist
*~
.vscode
ocrd.log
ocrd.log
tests/workspace/OCR-D-IMG
tests/workspace/OCR-D-SEG-PAGE
14 changes: 13 additions & 1 deletion tests/workspace/mets.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="info:lc/xmlns/premis-v2 http://www.loc.gov/standards/premis/v2/premis-v2-0.xsd http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-6.xsd http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/mets.xsd http://www.loc.gov/mix/v10 http://www.loc.gov/standards/mix/mix10/mix10.xsd">
<mets:metsHdr CREATEDATE="2024-04-12T13:10:28.344894">
<mets:metsHdr CREATEDATE="2024-04-24T11:11:46.027506">
<mets:agent TYPE="OTHER" OTHERTYPE="SOFTWARE" ROLE="CREATOR">
<mets:name>ocrd/core v2.63.3</mets:name>
</mets:agent>
Expand Down Expand Up @@ -29,6 +29,9 @@
<mets:file ID="OCR-D-IMG_Lodz_UZS_25_0056" MIMETYPE="image/tiff">
<mets:FLocat xlink:href="images/Lodz_UZS_25_0056.tif" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="OCR-D-IMG_nd1969-01-21_3" MIMETYPE="image/jpeg">
<mets:FLocat xlink:href="images/nd1969-01-21_3.jpg" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="OCR-D-IMG_nowa_doba" MIMETYPE="image/jpeg">
<mets:FLocat xlink:href="images/nowa_doba.jpg" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
Expand All @@ -46,6 +49,9 @@
<mets:file ID="OCR-D-SEG-PAGE_Lodz_UZS_25_0056" MIMETYPE="application/vnd.prima.page+xml">
<mets:FLocat xlink:href="reference_page_xml/Lodz_UZS_25_0056.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="OCR-D-SEG-PAGE_nd1969-01-21_03" MIMETYPE="application/vnd.prima.page+xml">
<mets:FLocat xlink:href="reference_page_xml/nd1969-01-21_03.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="OCR-D-SEG-PAGE_nowa_doba" MIMETYPE="application/vnd.prima.page+xml">
<mets:FLocat xlink:href="reference_page_xml/nowa_doba.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
Expand All @@ -69,10 +75,16 @@
<mets:fptr FILEID="OCR-D-IMG_Lodz_UZS_25_0056"/>
<mets:fptr FILEID="OCR-D-SEG-PAGE_Lodz_UZS_25_0056"/>
</mets:div>
<mets:div TYPE="page" ID="nd1969-01-21_3">
<mets:fptr FILEID="OCR-D-IMG_nd1969-01-21_3"/>
</mets:div>
<mets:div TYPE="page" ID="nowa_doba">
<mets:fptr FILEID="OCR-D-IMG_nowa_doba"/>
<mets:fptr FILEID="OCR-D-SEG-PAGE_nowa_doba"/>
</mets:div>
<mets:div TYPE="page" ID="nd1969-01-21_03">
<mets:fptr FILEID="OCR-D-SEG-PAGE_nd1969-01-21_03"/>
</mets:div>
</mets:div>
</mets:structMap>
</mets:mets>
808 changes: 404 additions & 404 deletions tests/workspace/reference_page_xml/18xx-Missio-EMU-0042.xml

Large diffs are not rendered by default.

1,076 changes: 538 additions & 538 deletions tests/workspace/reference_page_xml/Ansiedlung_Korotschin_UZS_Sign_22a_0018.xml

Large diffs are not rendered by default.

Large diffs are not rendered by default.

1,352 changes: 704 additions & 648 deletions tests/workspace/reference_page_xml/Lodz_UZS_25_0056.xml

Large diffs are not rendered by default.

27,543 changes: 27,543 additions & 0 deletions tests/workspace/reference_page_xml/nd1969-01-21_03.xml

Large diffs are not rendered by default.

4,409 changes: 2,205 additions & 2,204 deletions tests/workspace/reference_page_xml/nowa_doba.xml

Large diffs are not rendered by default.

24 changes: 14 additions & 10 deletions textract2page/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,31 @@

from .convert_aws import convert_file

CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])


@click.command(context_settings=CONTEXT_SETTINGS)
@click.option('-O', '--output-file', default='-', help='Output filename (or "-" for standard output)',
type=click.Path(dir_okay=False, writable=True, exists=False, allow_dash=True))
@click.option('--preserve-textract-reading-order', default=True, help='Preserve reading order of lines as indicated by Textract (default = True)')
@click.argument('aws-json-file', type=click.Path(dir_okay=False, exists=True))
@click.argument('image-file', type=click.Path(dir_okay=False, exists=True))
def cli(output_file, preserve_textract_reading_order, aws_json_file, image_file):
@click.option(
"-O",
"--output-file",
default="-",
help='Output filename (or "-" for standard output)',
type=click.Path(dir_okay=False, writable=True, exists=False, allow_dash=True),
)
@click.argument("aws-json-file", type=click.Path(dir_okay=False, exists=True))
@click.argument("image-file", type=click.Path(dir_okay=False, exists=True))
def cli(output_file, aws_json_file, image_file):
"""Convert an AWS Textract JSON file to a PAGE XML file.
Also requires the original input image of AWS OCR to get absolute image coordinates.
The output file will reference the image file under `Page/@imageFilename`
with its full path. (So you may want to use a relative path.)
"""
if output_file == '-':
if output_file == "-":
output_file = None
convert_file(aws_json_file, image_file, output_file, preserve_textract_reading_order)
convert_file(aws_json_file, image_file, output_file)


if __name__ == '__main__':
if __name__ == "__main__":
cli() # pylint: disable=no-value-for-parameter
Loading

0 comments on commit a267e02

Please sign in to comment.