From 9b30ee49b9da99f831bbd8f215f6ebe4cba96ead Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Aug 2020 22:21:23 +0200 Subject: [PATCH 01/23] segment-region: only ignore if candidate outside Border --- ocrd_tesserocr/segment_region.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py index df5e1ec..7f7d83b 100644 --- a/ocrd_tesserocr/segment_region.py +++ b/ocrd_tesserocr/segment_region.py @@ -219,9 +219,15 @@ def _process_page(self, it, page, page_image, page_coords, page_id): else: polygon = polygon_from_x0y0x1y1(bbox) polygon = coordinates_for_segment(polygon, page_image, page_coords) - polygon = polygon_for_parent(polygon, page) + polygon2 = polygon_for_parent(polygon, page) + if polygon2 is not None: + polygon = polygon2 points = points_from_polygon(polygon) coords = CoordsType(points=points) + if polygon2 is None: + LOG.info('Ignoring extant region: %s', points) + it.Next(RIL.BLOCK) + continue # if xywh['w'] < 30 or xywh['h'] < 30: # LOG.info('Ignoring too small region: %s', points) # it.Next(RIL.BLOCK) @@ -324,8 +330,10 @@ def polygon_for_parent(polygon, parent): return polygon interp = childp.intersection(parentp) if interp.is_empty: - # FIXME: we need a better strategy against this - raise Exception("intersection of would-be segment with parent is empty") + # this happens if Tesseract "finds" something + # outside of the valid Border of a deskewed/cropped page + # (empty corners created by masking); will be ignored + return None if interp.type == 'MultiPolygon': interp = interp.convex_hull return interp.exterior.coords[:-1] # keep open From 8c183d1ab580230b183501f94e0e3591821801ef Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Aug 2020 22:30:01 +0200 Subject: [PATCH 02/23] make install depend on deps again --- Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 3054453..7261bb7 100644 --- a/Makefile +++ b/Makefile @@ -73,7 +73,7 @@ deps-test: $(PIP) install -r requirements_test.txt # Install -install: +install: deps $(PIP) install -U pip $(PIP) install . @@ -82,12 +82,12 @@ docker: docker build -t $(DOCKER_TAG) . # Run unit tests -test: test/assets +test: test/assets deps-test # declare -p HTTP_PROXY $(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS) # Run unit tests and determine test coverage -coverage: +coverage: deps-test coverage erase make test PYTHON="coverage run" coverage report From 30f570e40df8a7ae3a222a7dd6e74ca474014ef1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Aug 2020 22:41:37 +0200 Subject: [PATCH 03/23] update test-cli to work with current assets --- Makefile | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 7261bb7..2143b8e 100644 --- a/Makefile +++ b/Makefile @@ -25,14 +25,14 @@ help: @echo " from Alexander Pozdnyakov which provides 4.1.0." @echo " See https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr" @echo " for details.)" - @echo " deps Install python deps via pip" - @echo " deps-test Install testing python deps via pip" - @echo " install Install" + @echo " deps Install Python deps for install via pip" + @echo " deps-test Install Python deps for test via pip" @echo " docker Build docker image" - @echo " test Run test" + @echo " install Install this package" + @echo " test Run regression test" @echo " test-cli Test the command line tools" - @echo " repo/assets Clone OCR-D/assets to ./repo/assets" @echo " test/assets Setup test assets" + @echo " repo/assets Clone OCR-D/assets to ./repo/assets" @echo " assets-clean Remove symlinks in test/assets" @echo "" @echo " Variables" @@ -96,12 +96,12 @@ coverage: deps-test # Test the command line tools test-cli: test/assets $(PIP) install -e . - rm -rfv test-workspace - cp -rv test/assets/kant_aufklaerung_1784 test-workspace - export LC_ALL=C.UTF-8; cd test-workspace/data && \ - ocrd-tesserocr-segment-region -l DEBUG -m mets.xml -I OCR-D-IMG -O OCR-D-SEG-BLOCK ; \ - ocrd-tesserocr-segment-line -l DEBUG -m mets.xml -I OCR-D-SEG-BLOCK -O OCR-D-SEG-LINE ; \ - ocrd-tesserocr-recognize -l DEBUG -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-TESS-OCR + rm -rfv test/workspace + cp -rv test/assets/kant_aufklaerung_1784 test/workspace + cd test/workspace/data && \ + ocrd-tesserocr-segment-region -l DEBUG -I OCR-D-IMG -O OCR-D-SEG-REGION ; \ + ocrd-tesserocr-segment-line -l DEBUG -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE ; \ + ocrd-tesserocr-recognize -l DEBUG -I OCR-D-SEG-LINE -O OCR-D-TESS-OCR .PHONY: test test-cli install deps deps-ubuntu deps-test help @@ -110,6 +110,8 @@ test-cli: test/assets # # Clone OCR-D/assets to ./repo/assets +# FIXME does not work if already checked out +# FIXME should be a proper (VCed) submodule repo/assets: mkdir -p $(dir $@) git clone https://github.com/OCR-D/assets "$@" From 4818e72078c654d2f2f733491b6f43caa266f5d9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Aug 2020 22:45:11 +0200 Subject: [PATCH 04/23] CircleCI: use ocrd/core as base, update/simplify steps --- .circleci/config.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 73a7cfd..9a5ecd9 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,11 +6,12 @@ jobs: build-python36: docker: - - image: ubuntu:18.04 + - image: ocrd/core steps: - run: apt-get update && apt-get install -y --no-install-recommends make git curl - checkout - - run: make deps-ubuntu deps-test deps install repo/assets + - run: make deps-ubuntu + - run: make install - run: make test-cli - run: make coverage - codecov/upload From 9e6ad4109cadcef3b0600f97d3da8ebde2d5ab35 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 25 Aug 2020 11:17:29 +0200 Subject: [PATCH 05/23] reformat makefile to make consistent with generated help --- Makefile | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 2143b8e..2fd5894 100644 --- a/Makefile +++ b/Makefile @@ -29,7 +29,8 @@ help: @echo " deps-test Install Python deps for test via pip" @echo " docker Build docker image" @echo " install Install this package" - @echo " test Run regression test" + @echo " test Run unit tests" + @echo " coverage Run unit tests and determine test coverage" @echo " test-cli Test the command line tools" @echo " test/assets Setup test assets" @echo " repo/assets Clone OCR-D/assets to ./repo/assets" @@ -44,7 +45,7 @@ help: # Dependencies for deployment in an ubuntu/debian linux # (lib*-dev merely for building tesserocr with pip) -# (tesseract-ocr: Ubuntu 18.04 now ships 4.0.0 +# (tesseract-ocr: Ubuntu 18.04 now ships 4.0.0, # which is unsupported. Add the tesseract-ocr PPA # from Alexander Pozdnyakov which provides 4.1.0. # See https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr @@ -62,25 +63,25 @@ deps-ubuntu: tesseract-ocr-eng \ tesseract-ocr -# Install python deps via pip +# Install Python deps for install via pip deps: $(PIP) install -U pip $(PIP) install -r requirements.txt -# Install testing python deps via pip +# Install Python deps for test via pip deps-test: $(PIP) install -U pip $(PIP) install -r requirements_test.txt -# Install -install: deps - $(PIP) install -U pip - $(PIP) install . - # Build docker image docker: docker build -t $(DOCKER_TAG) . +# Install this package +install: deps + $(PIP) install -U pip + $(PIP) install . + # Run unit tests test: test/assets deps-test # declare -p HTTP_PROXY @@ -109,6 +110,12 @@ test-cli: test/assets # Assets # +# Setup test assets (copy repo/assets) +# FIXME remove/update if already present +test/assets: repo/assets + mkdir -p $@ + cp -r -t $@ repo/assets/data/* + # Clone OCR-D/assets to ./repo/assets # FIXME does not work if already checked out # FIXME should be a proper (VCed) submodule @@ -117,11 +124,6 @@ repo/assets: git clone https://github.com/OCR-D/assets "$@" -# Setup test assets -test/assets: repo/assets - mkdir -p $@ - cp -r -t $@ repo/assets/data/* - .PHONY: assets-clean # Remove symlinks in test/assets assets-clean: From beacbc3223626b5651ffe6f73ed2959884ba889f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 4 Sep 2020 12:44:48 +0200 Subject: [PATCH 06/23] use suffix for AlternativeImage file ID --- ocrd_tesserocr/deskew.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_tesserocr/deskew.py b/ocrd_tesserocr/deskew.py index e47d97c..d87acc2 100644 --- a/ocrd_tesserocr/deskew.py +++ b/ocrd_tesserocr/deskew.py @@ -287,7 +287,7 @@ def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_i # segment.add_Baseline(BaselineType(points=points)) # update METS (add the image file): file_path = self.workspace.save_image_file(image, - file_id, + file_id + '.IMG-DESKEW', page_id=page_id, file_grp=self.output_file_grp) # update PAGE (reference the image file): From 5761661fc51fdab79103591b8ee041e69fe7ac3a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 4 Sep 2020 21:43:49 +0200 Subject: [PATCH 07/23] :package: 0.9.2 --- CHANGELOG.md | 7 +++++++ ocrd_tesserocr/ocrd-tool.json | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4bf8052..4afffc4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,13 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [0.9.2] - 2020-09-04 + +Fixed: + + * segment-region: just ignore region outside of page frame, #145 + * deskew: add suffix to AlternativeImage file ID, #148 + ## [0.9.1] - 2020-08-16 Fixed: diff --git a/ocrd_tesserocr/ocrd-tool.json b/ocrd_tesserocr/ocrd-tool.json index 9c1cc9d..f6ab780 100644 --- a/ocrd_tesserocr/ocrd-tool.json +++ b/ocrd_tesserocr/ocrd-tool.json @@ -1,5 +1,5 @@ { - "version": "0.9.1", + "version": "0.9.2", "git_url": "https://github.com/OCR-D/ocrd_tesserocr", "dockerhub": "ocrd/tesserocr", "tools": { From 5789cb6b53c0aa06aff3ad45fb7201e957ec221c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 14 Sep 2020 01:01:13 +0200 Subject: [PATCH 08/23] =?UTF-8?q?segment-region:=20more=20robust=20interse?= =?UTF-8?q?ction=20with=20parent=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - cover zero area intersections (treating them like empty) - cover heterogeneous intersections (removing zero area shapes) --- ocrd_tesserocr/segment_region.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py index 7f7d83b..7552eaf 100644 --- a/ocrd_tesserocr/segment_region.py +++ b/ocrd_tesserocr/segment_region.py @@ -2,6 +2,7 @@ import os.path from shapely.geometry import Polygon +from shapely.ops import unary_union from tesserocr import ( PyTessBaseAPI, PSM, RIL, PT @@ -329,11 +330,16 @@ def polygon_for_parent(polygon, parent): if childp.within(parentp): return polygon interp = childp.intersection(parentp) - if interp.is_empty: + if interp.is_empty or interp.area == 0.0: # this happens if Tesseract "finds" something # outside of the valid Border of a deskewed/cropped page # (empty corners created by masking); will be ignored return None + if interp.type == 'GeometryCollection': + # heterogeneous result: filter zero-area shapes (LineString, Point) + interp = unary_union([geom for geom in interp.geoms if geom.area > 0]) if interp.type == 'MultiPolygon': + # homogeneous result: construct convex hull to connect + # FIXME: construct concave hull / alpha shape interp = interp.convex_hull return interp.exterior.coords[:-1] # keep open From 17244bfb105802e7149055536b14c24f5a49cdb4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 14 Sep 2020 09:39:46 +0200 Subject: [PATCH 09/23] =?UTF-8?q?segment-region:=20even=20more=20robust=20?= =?UTF-8?q?intersection=20with=20parent=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - cover invalid paths on input side (from rounding; via repeated simplification) - cover invalid path on output side (from rounding; via repeated simplification) --- ocrd_tesserocr/segment_region.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py index 7552eaf..c6f2701 100644 --- a/ocrd_tesserocr/segment_region.py +++ b/ocrd_tesserocr/segment_region.py @@ -1,7 +1,8 @@ from __future__ import absolute_import import os.path -from shapely.geometry import Polygon +import numpy as np +from shapely.geometry import Polygon, asPolygon from shapely.ops import unary_union from tesserocr import ( PyTessBaseAPI, @@ -327,8 +328,20 @@ def polygon_for_parent(polygon, parent): [parent.get_imageWidth(),0]]) else: parentp = Polygon(polygon_from_points(parent.get_Coords().points)) + # check if clipping is necessary if childp.within(parentp): return polygon + # ensure input coords have valid paths (without self-intersection) + # (this can happen when shapes valid in floating point are rounded) + for tolerance in range(1, int(childp.area)): + if childp.is_valid: + break + childp = childp.simplify(tolerance) + for tolerance in range(1, int(parentp.area)): + if parentp.is_valid: + break + parentp = parentp.simplify(tolerance) + # clip to parent interp = childp.intersection(parentp) if interp.is_empty or interp.area == 0.0: # this happens if Tesseract "finds" something @@ -342,4 +355,12 @@ def polygon_for_parent(polygon, parent): # homogeneous result: construct convex hull to connect # FIXME: construct concave hull / alpha shape interp = interp.convex_hull + if interp.minimum_clearance < 1.0: + # follow-up calculations will necessarily be integer; + # so anticipate rounding here and then ensure validity + interp = asPolygon(np.round(interp.exterior.coords)) + for tolerance in range(1, int(interp.area)): + if interp.is_valid: + break + interp = interp.simplify(tolerance) return interp.exterior.coords[:-1] # keep open From 59d6b3f8b59b03749d616ade35a54ee58a462513 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 14 Sep 2020 09:55:43 +0200 Subject: [PATCH 10/23] update shapely --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 0434d1f..d8a4b6d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ ocrd >= 2.4.4 click tesserocr >= 2.5.1 +shapely >= 1.7.1 From c7bc33b5027bbe40de5869845ebb98a0601a17fb Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 14 Sep 2020 10:00:00 +0200 Subject: [PATCH 11/23] =?UTF-8?q?segment-line:=20more=20robust=20intersect?= =?UTF-8?q?ion=20with=20parent=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (re-use function from segment-region) --- ocrd_tesserocr/segment_line.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py index 7f71a9e..d193282 100644 --- a/ocrd_tesserocr/segment_line.py +++ b/ocrd_tesserocr/segment_line.py @@ -2,7 +2,6 @@ import itertools import os.path -from shapely.geometry import Polygon, LinearRing from tesserocr import PyTessBaseAPI, RIL, PSM from ocrd import Processor @@ -26,6 +25,7 @@ ) from .config import TESSDATA_PREFIX, OCRD_TOOL +from .segment_region import polygon_for_parent TOOL = 'ocrd-tesserocr-segment-line' LOG = getLogger('processor.TesserocrSegmentLine') @@ -108,30 +108,19 @@ def process(self): region_image, region_coords = self.workspace.image_from_segment( region, page_image, page_coords) region_polygon = coordinates_of_segment(region, region_image, region_coords) - region_poly = Polygon(region_polygon) tessapi.SetImage(region_image) for line_no, component in enumerate(tessapi.GetComponentImages(RIL.TEXTLINE, True, raw_image=True)): line_id = '%s_line%04d' % (region.id, line_no) line_polygon = polygon_from_xywh(component[1]) - line_poly = Polygon(line_polygon) - if not line_poly.within(region_poly): - # this could happen due to rotation - interline = line_poly.intersection(region_poly) - if interline.is_empty: - continue # ignore this line - if hasattr(interline, 'geoms'): - # is (heterogeneous) GeometryCollection - area = 0 - for geom in interline.geoms: - if geom.area > area: - area = geom.area - interline = geom - if not area: - continue - line_poly = interline.convex_hull - line_polygon = line_poly.exterior.coords line_polygon = coordinates_for_segment(line_polygon, region_image, region_coords) + line_polygon2 = polygon_for_parent(line_polygon, region) + if line_polygon2 is not None: + line_polygon = line_polygon2 line_points = points_from_polygon(line_polygon) + if line_polygon2 is None: + # could happen due to rotation + LOG.info('Ignoring extant line: %s', line_points) + continue region.add_TextLine(TextLineType( id=line_id, Coords=CoordsType(line_points))) From 6bbe873d7eb21f68cc649d98731f9209093d18be Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 14 Sep 2020 11:11:36 +0200 Subject: [PATCH 12/23] =?UTF-8?q?segment-region:=20yet=20more=20robust=20i?= =?UTF-8?q?ntersection=20with=20parent=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - cover invalid paths which cannot be repaired through simplification directly (find a new starting point in the point sequence) --- ocrd_tesserocr/segment_region.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py index c6f2701..612ea64 100644 --- a/ocrd_tesserocr/segment_region.py +++ b/ocrd_tesserocr/segment_region.py @@ -333,14 +333,8 @@ def polygon_for_parent(polygon, parent): return polygon # ensure input coords have valid paths (without self-intersection) # (this can happen when shapes valid in floating point are rounded) - for tolerance in range(1, int(childp.area)): - if childp.is_valid: - break - childp = childp.simplify(tolerance) - for tolerance in range(1, int(parentp.area)): - if parentp.is_valid: - break - parentp = parentp.simplify(tolerance) + childp = make_valid(childp) + parentp = make_valid(parentp) # clip to parent interp = childp.intersection(parentp) if interp.is_empty or interp.area == 0.0: @@ -359,8 +353,19 @@ def polygon_for_parent(polygon, parent): # follow-up calculations will necessarily be integer; # so anticipate rounding here and then ensure validity interp = asPolygon(np.round(interp.exterior.coords)) - for tolerance in range(1, int(interp.area)): - if interp.is_valid: - break - interp = interp.simplify(tolerance) + interp = make_valid(interp) return interp.exterior.coords[:-1] # keep open + +def make_valid(polygon): + for split in range(1, len(polygon.exterior.coords)-1): + if polygon.is_valid or polygon.simplify(polygon.area).is_valid: + break + # simplification may not be possible (at all) due to ordering + # in that case, try another starting point + polygon = Polygon(polygon.exterior.coords[-split:]+polygon.exterior.coords[:-split]) + for tolerance in range(1, int(polygon.area)): + if polygon.is_valid: + break + # simplification may require a larger tolerance + polygon = polygon.simplify(tolerance) + return polygon From c97f7c17f4ea1c7c1bc460e042f7e0809977bfbc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 14 Sep 2020 17:07:05 +0200 Subject: [PATCH 13/23] all: delegate to `add_metadata` in core --- ocrd_tesserocr/binarize.py | 16 +--------------- ocrd_tesserocr/crop.py | 16 +--------------- ocrd_tesserocr/deskew.py | 16 +--------------- ocrd_tesserocr/recognize.py | 15 +-------------- ocrd_tesserocr/segment_line.py | 16 +--------------- ocrd_tesserocr/segment_region.py | 16 +--------------- ocrd_tesserocr/segment_table.py | 16 +--------------- ocrd_tesserocr/segment_word.py | 15 +-------------- 8 files changed, 8 insertions(+), 118 deletions(-) diff --git a/ocrd_tesserocr/binarize.py b/ocrd_tesserocr/binarize.py index 3701cb0..597981b 100644 --- a/ocrd_tesserocr/binarize.py +++ b/ocrd_tesserocr/binarize.py @@ -15,8 +15,6 @@ ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - MetadataItemType, - LabelsType, LabelType, AlternativeImageType, TextRegionType, to_xml @@ -62,21 +60,9 @@ def process(self): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + self.add_metadata(pcgts) page = pcgts.get_Page() - # add metadata about this operation and its runtime parameters: - metadata = pcgts.get_Metadata() # ensured by from_file() - metadata.add_MetadataItem( - MetadataItemType(type_="processingStep", - name=self.ocrd_tool['steps'][0], - value=TOOL, - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) - page_image, page_xywh, _ = self.workspace.image_from_page( page, page_id) LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id) diff --git a/ocrd_tesserocr/crop.py b/ocrd_tesserocr/crop.py index 77d5493..bd28354 100644 --- a/ocrd_tesserocr/crop.py +++ b/ocrd_tesserocr/crop.py @@ -18,8 +18,6 @@ ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - MetadataItemType, - LabelsType, LabelType, CoordsType, AlternativeImageType, to_xml ) @@ -70,21 +68,9 @@ def process(self): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + self.add_metadata(pcgts) page = pcgts.get_Page() - # add metadata about this operation and its runtime parameters: - metadata = pcgts.get_Metadata() # ensured by from_file() - metadata.add_MetadataItem( - MetadataItemType(type_="processingStep", - name=self.ocrd_tool['steps'][0], - value=TOOL, - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) - # warn of existing Border: border = page.get_Border() if border: diff --git a/ocrd_tesserocr/deskew.py b/ocrd_tesserocr/deskew.py index d87acc2..9dd22f0 100644 --- a/ocrd_tesserocr/deskew.py +++ b/ocrd_tesserocr/deskew.py @@ -21,8 +21,6 @@ ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - MetadataItemType, - LabelsType, LabelType, AlternativeImageType, TextRegionType, PageType, to_xml @@ -76,21 +74,9 @@ def process(self): LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) pcgts.set_pcGtsId(file_id) + self.add_metadata(pcgts) page = pcgts.get_Page() - # add metadata about this operation and its runtime parameters: - metadata = pcgts.get_Metadata() # ensured by from_file() - metadata.add_MetadataItem( - MetadataItemType(type_="processingStep", - name=self.ocrd_tool['steps'][0], - value=TOOL, - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) - page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id, # image must not have been rotated already, diff --git a/ocrd_tesserocr/recognize.py b/ocrd_tesserocr/recognize.py index b9dd4da..2c1345e 100644 --- a/ocrd_tesserocr/recognize.py +++ b/ocrd_tesserocr/recognize.py @@ -19,8 +19,6 @@ from ocrd_models.ocrd_page import ( CoordsType, GlyphType, WordType, - LabelType, LabelsType, - MetadataItemType, TextEquivType, TextStyleType, to_xml) from ocrd_models.ocrd_page_generateds import ( @@ -157,20 +155,9 @@ def process(self): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + self.add_metadata(pcgts) page = pcgts.get_Page() - # add metadata about this operation and its runtime parameters: - metadata = pcgts.get_Metadata() # ensured by from_file() - metadata.add_MetadataItem( - MetadataItemType(type_="processingStep", - name=self.ocrd_tool['steps'][0], - value=TOOL, - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) if self.parameter['dpi'] > 0: diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py index d193282..0b49ba1 100644 --- a/ocrd_tesserocr/segment_line.py +++ b/ocrd_tesserocr/segment_line.py @@ -18,8 +18,6 @@ from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( CoordsType, - LabelType, LabelsType, - MetadataItemType, TextLineType, to_xml ) @@ -64,21 +62,9 @@ def process(self): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + self.add_metadata(pcgts) page = pcgts.get_Page() - # add metadata about this operation and its runtime parameters: - metadata = pcgts.get_Metadata() # ensured by from_file() - metadata.add_MetadataItem( - MetadataItemType(type_="processingStep", - name=self.ocrd_tool['steps'][0], - value=TOOL, - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) - page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id) if self.parameter['dpi'] > 0: diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py index 612ea64..95157c2 100644 --- a/ocrd_tesserocr/segment_region.py +++ b/ocrd_tesserocr/segment_region.py @@ -22,8 +22,6 @@ ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - MetadataItemType, - LabelsType, LabelType, CoordsType, PageType, OrderedGroupType, @@ -95,21 +93,9 @@ def process(self): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + self.add_metadata(pcgts) page = pcgts.get_Page() - # add metadata about this operation and its runtime parameters: - metadata = pcgts.get_Metadata() # ensured by from_file() - metadata.add_MetadataItem( - MetadataItemType(type_="processingStep", - name=self.ocrd_tool['steps'][0], - value=TOOL, - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) - # delete or warn of existing regions: if (page.get_AdvertRegion() or page.get_ChartRegion() or diff --git a/ocrd_tesserocr/segment_table.py b/ocrd_tesserocr/segment_table.py index fc023f7..42cb325 100644 --- a/ocrd_tesserocr/segment_table.py +++ b/ocrd_tesserocr/segment_table.py @@ -19,8 +19,6 @@ ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - MetadataItemType, - LabelsType, LabelType, CoordsType, TextRegionType, to_xml) @@ -79,21 +77,9 @@ def process(self): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + self.add_metadata(pcgts) page = pcgts.get_Page() - # add metadata about this operation and its runtime parameters: - metadata = pcgts.get_Metadata() # ensured by from_file() - metadata.add_MetadataItem( - MetadataItemType(type_="processingStep", - name=self.ocrd_tool['steps'][0], - value=TOOL, - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) - page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id) if self.parameter['dpi'] > 0: diff --git a/ocrd_tesserocr/segment_word.py b/ocrd_tesserocr/segment_word.py index 2d335ce..8c786ac 100644 --- a/ocrd_tesserocr/segment_word.py +++ b/ocrd_tesserocr/segment_word.py @@ -16,8 +16,6 @@ from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( CoordsType, - LabelType, LabelsType, - MetadataItemType, WordType, to_xml, ) @@ -60,20 +58,9 @@ def process(self): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + self.add_metadata(pcgts) page = pcgts.get_Page() - # add metadata about this operation and its runtime parameters: - metadata = pcgts.get_Metadata() # ensured by from_file() - metadata.add_MetadataItem( - MetadataItemType(type_="processingStep", - name=self.ocrd_tool['steps'][0], - value=TOOL, - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id) if self.parameter['dpi'] > 0: From 40008e405ad78d62bd4060267254f38be6eef7f4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 14 Sep 2020 17:23:03 +0200 Subject: [PATCH 14/23] =?UTF-8?q?segment-word=20/=20recognize:=20more=20ro?= =?UTF-8?q?bust=20intersection=20with=20parent=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (re-use function from segment-region) --- .pylintrc | 2 ++ ocrd_tesserocr/binarize.py | 1 - ocrd_tesserocr/crop.py | 2 +- ocrd_tesserocr/deskew.py | 2 +- ocrd_tesserocr/recognize.py | 21 ++++++++++++++++++--- ocrd_tesserocr/segment_line.py | 3 +-- ocrd_tesserocr/segment_table.py | 5 +---- ocrd_tesserocr/segment_word.py | 12 ++++++++++-- 8 files changed, 34 insertions(+), 14 deletions(-) diff --git a/.pylintrc b/.pylintrc index 710b8b2..dfcd216 100644 --- a/.pylintrc +++ b/.pylintrc @@ -6,6 +6,7 @@ ignored-modules=cv2,tesserocr disable = ungrouped-imports, bad-continuation, + trailing-whitespace, missing-docstring, no-self-use, superfluous-parens, @@ -15,6 +16,7 @@ disable = too-many-branches, too-many-statements, too-many-locals, + too-many-nested-blocks, too-few-public-methods, wrong-import-order, duplicate-code diff --git a/ocrd_tesserocr/binarize.py b/ocrd_tesserocr/binarize.py index 597981b..fcca6cd 100644 --- a/ocrd_tesserocr/binarize.py +++ b/ocrd_tesserocr/binarize.py @@ -8,7 +8,6 @@ from ocrd_utils import ( getLogger, - concat_padded, assert_file_grp_cardinality, make_file_id, MIMETYPE_PAGE diff --git a/ocrd_tesserocr/crop.py b/ocrd_tesserocr/crop.py index bd28354..3f65868 100644 --- a/ocrd_tesserocr/crop.py +++ b/ocrd_tesserocr/crop.py @@ -3,7 +3,7 @@ import tesserocr from ocrd_utils import ( - getLogger, concat_padded, + getLogger, crop_image, coordinates_for_segment, coordinates_of_segment, diff --git a/ocrd_tesserocr/deskew.py b/ocrd_tesserocr/deskew.py index 9dd22f0..4448ddd 100644 --- a/ocrd_tesserocr/deskew.py +++ b/ocrd_tesserocr/deskew.py @@ -12,7 +12,7 @@ ) from ocrd_utils import ( - getLogger, concat_padded, + getLogger, make_file_id, assert_file_grp_cardinality, rotate_image, transpose_image, diff --git a/ocrd_tesserocr/recognize.py b/ocrd_tesserocr/recognize.py index 2c1345e..0618c31 100644 --- a/ocrd_tesserocr/recognize.py +++ b/ocrd_tesserocr/recognize.py @@ -35,6 +35,7 @@ from ocrd import Processor from .config import TESSDATA_PREFIX, OCRD_TOOL +from .segment_region import polygon_for_parent TOOL = 'ocrd-tesserocr-recognize' LOG = getLogger('processor.TesserocrRecognize') @@ -284,9 +285,16 @@ def _process_words_in_line(self, result_it, line, line_xywh): # convert to absolute coordinates: polygon = coordinates_for_segment(polygon_from_x0y0x1y1(bbox), None, line_xywh) - self.parameter['padding'] + polygon2 = polygon_for_parent(polygon, line) + if polygon2 is not None: + polygon = polygon2 points = points_from_polygon(polygon) word = WordType(id=word_id, Coords=CoordsType(points)) - line.add_Word(word) + if polygon2 is None: + # could happen due to rotation + LOG.info('Ignoring extant word: %s', points) + else: + line.add_Word(word) # todo: determine if font attributes available for word level will work with LSTM models word_attributes = result_it.WordFontAttributes() if word_attributes: @@ -409,9 +417,16 @@ def _process_glyphs_in_word(self, result_it, word, word_xywh): # convert to absolute coordinates: polygon = coordinates_for_segment(polygon_from_x0y0x1y1(bbox), None, word_xywh) - self.parameter['padding'] + polygon2 = polygon_for_parent(polygon, word) + if polygon2 is not None: + polygon = polygon2 points = points_from_polygon(polygon) glyph = GlyphType(id=glyph_id, Coords=CoordsType(points)) - word.add_Glyph(glyph) + if polygon2 is None: + # could happen due to rotation + LOG.info('Ignoring extant glyph: %s', points) + else: + word.add_Glyph(glyph) choice_it = result_it.GetChoiceIterator() for (choice_no, choice) in enumerate(choice_it): alternative_text = choice.GetUTF8Text() @@ -521,7 +536,7 @@ def page_update_higher_textequiv_levels(level, pcgts): reading_order[subregion.id].index) region_unicode = page_element_unicode0(subregions[0]) for subregion, next_subregion in zip(subregions, subregions[1:]): - if not (subregion.id, next_subregion.id) in joins: + if (subregion.id, next_subregion.id) not in joins: region_unicode += '\n' # or '\f'? region_unicode += page_element_unicode0(next_subregion) region_conf = sum(page_element_conf0(subregion) for subregion in subregions) diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py index 0b49ba1..b1815dd 100644 --- a/ocrd_tesserocr/segment_line.py +++ b/ocrd_tesserocr/segment_line.py @@ -6,7 +6,7 @@ from ocrd import Processor from ocrd_utils import ( - getLogger, concat_padded, + getLogger, make_file_id, assert_file_grp_cardinality, polygon_from_xywh, @@ -93,7 +93,6 @@ def process(self): LOG.debug("Detecting lines in region '%s'", region.id) region_image, region_coords = self.workspace.image_from_segment( region, page_image, page_coords) - region_polygon = coordinates_of_segment(region, region_image, region_coords) tessapi.SetImage(region_image) for line_no, component in enumerate(tessapi.GetComponentImages(RIL.TEXTLINE, True, raw_image=True)): line_id = '%s_line%04d' % (region.id, line_no) diff --git a/ocrd_tesserocr/segment_table.py b/ocrd_tesserocr/segment_table.py index 42cb325..0cec39f 100644 --- a/ocrd_tesserocr/segment_table.py +++ b/ocrd_tesserocr/segment_table.py @@ -8,7 +8,6 @@ from ocrd_utils import ( getLogger, - concat_padded, make_file_id, assert_file_grp_cardinality, coordinates_for_segment, @@ -23,15 +22,13 @@ TextRegionType, to_xml) from ocrd_models.ocrd_page_generateds import ( - TableRegionType, TextTypeSimpleType, RegionRefType, RegionRefIndexedType, OrderedGroupType, OrderedGroupIndexedType, UnorderedGroupType, - UnorderedGroupIndexedType, - ReadingOrderType + UnorderedGroupIndexedType ) from ocrd import Processor diff --git a/ocrd_tesserocr/segment_word.py b/ocrd_tesserocr/segment_word.py index 8c786ac..6834ec6 100644 --- a/ocrd_tesserocr/segment_word.py +++ b/ocrd_tesserocr/segment_word.py @@ -5,7 +5,7 @@ from ocrd import Processor from ocrd_utils import ( - getLogger, concat_padded, + getLogger, make_file_id, assert_file_grp_cardinality, polygon_from_xywh, @@ -20,7 +20,8 @@ to_xml, ) -from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL +from .config import TESSDATA_PREFIX, OCRD_TOOL +from .segment_region import polygon_for_parent TOOL = 'ocrd-tesserocr-segment-word' LOG = getLogger('processor.TesserocrSegmentWord') @@ -95,7 +96,14 @@ def process(self): word_id = '%s_word%04d' % (line.id, word_no) word_polygon = polygon_from_xywh(component[1]) word_polygon = coordinates_for_segment(word_polygon, line_image, line_coords) + word_polygon2 = polygon_for_parent(word_polygon, line) + if word_polygon2 is not None: + word_polygon = word_polygon2 word_points = points_from_polygon(word_polygon) + if word_polygon2 is None: + # could happen due to rotation + LOG.info('Ignoring extant word: %s', word_points) + continue line.add_Word(WordType( id=word_id, Coords=CoordsType(word_points))) From b32ae0ca6f8dcd2d4ace33123d17a5d763ec91d9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 15 Sep 2020 09:36:41 +0200 Subject: [PATCH 15/23] :package: 0.9.3 --- CHANGELOG.md | 7 +++++++ ocrd_tesserocr/ocrd-tool.json | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4afffc4..83f3a3f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,13 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [0.9.3] - 2020-09-15 + +Fixed: + + * segmentation: ensure new elements fit into their parent coords + * segmentation: ensure valid coords + ## [0.9.2] - 2020-09-04 Fixed: diff --git a/ocrd_tesserocr/ocrd-tool.json b/ocrd_tesserocr/ocrd-tool.json index f6ab780..6ccd4a2 100644 --- a/ocrd_tesserocr/ocrd-tool.json +++ b/ocrd_tesserocr/ocrd-tool.json @@ -1,5 +1,5 @@ { - "version": "0.9.2", + "version": "0.9.3", "git_url": "https://github.com/OCR-D/ocrd_tesserocr", "dockerhub": "ocrd/tesserocr", "tools": { From 066b1add6706c8a6a47baeb138ba46c98914a3a1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 20 Sep 2020 22:27:23 +0200 Subject: [PATCH 16/23] recognize: fix bg tuple vs scalar (depending on mode) --- ocrd_tesserocr/recognize.py | 46 +++++++++++++++---------------------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/ocrd_tesserocr/recognize.py b/ocrd_tesserocr/recognize.py index 0618c31..899851e 100644 --- a/ocrd_tesserocr/recognize.py +++ b/ocrd_tesserocr/recognize.py @@ -202,13 +202,7 @@ def _process_regions(self, tessapi, regions, page_image, page_xywh): region, page_image, page_xywh) if self.parameter['textequiv_level'] == 'region': if self.parameter['padding']: - bg = tuple(ImageStat.Stat(region_image).median) - pad = self.parameter['padding'] - padded = Image.new(region_image.mode, - (region_image.width + 2 * pad, - region_image.height + 2 * pad), bg) - padded.paste(region_image, (pad, pad)) - tessapi.SetImage(padded) + tessapi.SetImage(pad_image(region_image, self.parameter['padding'])) else: tessapi.SetImage(region_image) tessapi.SetPageSegMode(PSM.SINGLE_BLOCK) @@ -237,13 +231,7 @@ def _process_lines(self, tessapi, textlines, region_image, region_xywh): line, region_image, region_xywh) # todo: Tesseract works better if the line images have a 5px margin everywhere if self.parameter['padding']: - bg = tuple(ImageStat.Stat(line_image).median) - pad = self.parameter['padding'] - padded = Image.new(line_image.mode, - (line_image.width + 2 * pad, - line_image.height + 2 * pad), bg) - padded.paste(line_image, (pad, pad)) - tessapi.SetImage(padded) + tessapi.SetImage(pad_image(line_image, self.parameter['padding'])) else: tessapi.SetImage(line_image) if self.parameter['raw_lines']: @@ -331,13 +319,7 @@ def _process_existing_words(self, tessapi, words, line_image, line_xywh): word_image, word_xywh = self.workspace.image_from_segment( word, line_image, line_xywh) if self.parameter['padding']: - bg = tuple(ImageStat.Stat(word_image).median) - pad = self.parameter['padding'] - padded = Image.new(word_image.mode, - (word_image.width + 2 * pad, - word_image.height + 2 * pad), bg) - padded.paste(word_image, (pad, pad)) - tessapi.SetImage(padded) + tessapi.SetImage(pad_image(word_image, self.parameter['padding'])) else: tessapi.SetImage(word_image) tessapi.SetPageSegMode(PSM.SINGLE_WORD) @@ -368,13 +350,7 @@ def _process_existing_glyphs(self, tessapi, glyphs, word_image, word_xywh): glyph_image, _ = self.workspace.image_from_segment( glyph, word_image, word_xywh) if self.parameter['padding']: - bg = tuple(ImageStat.Stat(glyph_image).median) - pad = self.parameter['padding'] - padded = Image.new(glyph_image.mode, - (glyph_image.width + 2 * pad, - glyph_image.height + 2 * pad), bg) - padded.paste(glyph_image, (pad, pad)) - tessapi.SetImage(padded) + tessapi.SetImage(pad_image(glyph_image, self.parameter['padding'])) else: tessapi.SetImage(glyph_image) tessapi.SetPageSegMode(PSM.SINGLE_CHAR) @@ -590,3 +566,17 @@ def page_update_higher_textequiv_levels(level, pcgts): region_conf /= len(lines) region.set_TextEquiv( # replace old, if any [TextEquivType(Unicode=region_unicode, conf=region_conf)]) + +def pad_image(image, padding): + stat = ImageStat.Stat(image) + # workaround for Pillow#4925 + if len(stat.bands) > 1: + background = tuple(stat.median) + else: + background = stat.median[0] + padded = Image.new(image.mode, + (image.width + 2 * padding, + image.height + 2 * padding), + background) + padded.paste(image, (padding, padding)) + return padded From 4c65fd694cebad0cf46acaa5f7eed667cccd5ff4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Tue, 22 Sep 2020 18:41:16 +0200 Subject: [PATCH 17/23] Update CHANGELOG.md --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 83f3a3f..b59e5cc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Fixed: + + * recognize: be robust to different input image modes, Pillow#4925 + ## [0.9.3] - 2020-09-15 Fixed: From 34add3f4e93761f1cc16bf4b2cfef7ff0d865e7a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 24 Sep 2020 10:49:39 +0200 Subject: [PATCH 18/23] getLogger per method --- ocrd_tesserocr/binarize.py | 3 ++- ocrd_tesserocr/crop.py | 2 +- ocrd_tesserocr/deskew.py | 3 ++- ocrd_tesserocr/recognize.py | 8 +++++++- ocrd_tesserocr/segment_line.py | 2 +- ocrd_tesserocr/segment_region.py | 3 ++- ocrd_tesserocr/segment_table.py | 3 ++- ocrd_tesserocr/segment_word.py | 2 +- 8 files changed, 18 insertions(+), 8 deletions(-) diff --git a/ocrd_tesserocr/binarize.py b/ocrd_tesserocr/binarize.py index fcca6cd..e284e80 100644 --- a/ocrd_tesserocr/binarize.py +++ b/ocrd_tesserocr/binarize.py @@ -23,7 +23,6 @@ from .config import TESSDATA_PREFIX, OCRD_TOOL TOOL = 'ocrd-tesserocr-binarize' -LOG = getLogger('processor.TesserocrBinarize') class TesserocrBinarize(Processor): @@ -48,6 +47,7 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ + LOG = getLogger('processor.TesserocrBinarize') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) @@ -102,6 +102,7 @@ def process(self): content=to_xml(pcgts)) def _process_segment(self, tessapi, ril, segment, image, xywh, where, page_id, file_id): + LOG = getLogger('processor.TesserocrBinarize') tessapi.SetImage(image) image_bin = None layout = tessapi.AnalyseLayout() diff --git a/ocrd_tesserocr/crop.py b/ocrd_tesserocr/crop.py index 3f65868..0eadf3d 100644 --- a/ocrd_tesserocr/crop.py +++ b/ocrd_tesserocr/crop.py @@ -28,7 +28,6 @@ from .segment_region import polygon_for_parent TOOL = 'ocrd-tesserocr-crop' -LOG = getLogger('processor.TesserocrCrop') class TesserocrCrop(Processor): @@ -54,6 +53,7 @@ def process(self): Produce new output files by serialising the resulting hierarchy. """ + LOG = getLogger('processor.TesserocrCrop') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_tesserocr/deskew.py b/ocrd_tesserocr/deskew.py index 4448ddd..de6cc8e 100644 --- a/ocrd_tesserocr/deskew.py +++ b/ocrd_tesserocr/deskew.py @@ -30,7 +30,6 @@ from .config import TESSDATA_PREFIX, OCRD_TOOL TOOL = 'ocrd-tesserocr-deskew' -LOG = getLogger('processor.TesserocrDeskew') FALLBACK_FILEGRP_IMG = 'OCR-D-IMG-DESKEW' class TesserocrDeskew(Processor): @@ -58,6 +57,7 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ + LOG = getLogger('processor.TesserocrDeskew') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) oplevel = self.parameter['operation_level'] @@ -127,6 +127,7 @@ def process(self): content=to_xml(pcgts)) def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_id): + LOG = getLogger('processor.TesserocrDeskew') features = xywh['features'] # features already applied to image angle0 = xywh['angle'] # deskewing (w.r.t. top image) already applied to image angle = 0. # additional angle to be applied at current level diff --git a/ocrd_tesserocr/recognize.py b/ocrd_tesserocr/recognize.py index 899851e..71dbf09 100644 --- a/ocrd_tesserocr/recognize.py +++ b/ocrd_tesserocr/recognize.py @@ -38,7 +38,6 @@ from .segment_region import polygon_for_parent TOOL = 'ocrd-tesserocr-recognize' -LOG = getLogger('processor.TesserocrRecognize') CHOICE_THRESHOLD_NUM = 6 # maximum number of choices to query and annotate CHOICE_THRESHOLD_CONF = 0.2 # maximum score drop from best choice to query and annotate @@ -80,6 +79,7 @@ def process(self): Produce new output files by serialising the resulting hierarchy. """ + LOG = getLogger('processor.TesserocrRecognize') LOG.debug("TESSDATA: %s, installed Tesseract models: %s", *get_languages()) assert_file_grp_cardinality(self.input_file_grp, 1) @@ -197,6 +197,7 @@ def process(self): content=to_xml(pcgts)) def _process_regions(self, tessapi, regions, page_image, page_xywh): + LOG = getLogger('processor.TesserocrRecognize') for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh) @@ -224,6 +225,7 @@ def _process_regions(self, tessapi, regions, page_image, page_xywh): self._process_lines(tessapi, textlines, region_image, region_xywh) def _process_lines(self, tessapi, textlines, region_image, region_xywh): + LOG = getLogger('processor.TesserocrRecognize') for line in textlines: if self.parameter['overwrite_words']: line.set_Word([]) @@ -261,6 +263,7 @@ def _process_lines(self, tessapi, textlines, region_image, region_xywh): self._process_words_in_line(tessapi.GetIterator(), line, line_xywh) def _process_words_in_line(self, result_it, line, line_xywh): + LOG = getLogger('processor.TesserocrRecognize') if not result_it or result_it.Empty(RIL.WORD): LOG.warning("No text in line '%s'", line.id) return @@ -315,6 +318,7 @@ def _process_words_in_line(self, result_it, line, line_xywh): result_it.Next(RIL.WORD) def _process_existing_words(self, tessapi, words, line_image, line_xywh): + LOG = getLogger('processor.TesserocrRecognize') for word in words: word_image, word_xywh = self.workspace.image_from_segment( word, line_image, line_xywh) @@ -346,6 +350,7 @@ def _process_existing_words(self, tessapi, words, line_image, line_xywh): self._process_glyphs_in_word(tessapi.GetIterator(), word, word_xywh) def _process_existing_glyphs(self, tessapi, glyphs, word_image, word_xywh): + LOG = getLogger('processor.TesserocrRecognize') for glyph in glyphs: glyph_image, _ = self.workspace.image_from_segment( glyph, word_image, word_xywh) @@ -378,6 +383,7 @@ def _process_existing_glyphs(self, tessapi, glyphs, word_image, word_xywh): glyph.add_TextEquiv(TextEquivType(index=choice_no, Unicode=alternative_text, conf=alternative_conf)) def _process_glyphs_in_word(self, result_it, word, word_xywh): + LOG = getLogger('processor.TesserocrRecognize') if not result_it or result_it.Empty(RIL.SYMBOL): LOG.debug("No glyph in word '%s'", word.id) return diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py index b1815dd..1e41527 100644 --- a/ocrd_tesserocr/segment_line.py +++ b/ocrd_tesserocr/segment_line.py @@ -26,7 +26,6 @@ from .segment_region import polygon_for_parent TOOL = 'ocrd-tesserocr-segment-line' -LOG = getLogger('processor.TesserocrSegmentLine') class TesserocrSegmentLine(Processor): @@ -49,6 +48,7 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ + LOG = getLogger('processor.TesserocrSegmentLine') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py index 95157c2..f6a96be 100644 --- a/ocrd_tesserocr/segment_region.py +++ b/ocrd_tesserocr/segment_region.py @@ -43,7 +43,6 @@ from .config import TESSDATA_PREFIX, OCRD_TOOL TOOL = 'ocrd-tesserocr-segment-region' -LOG = getLogger('processor.TesserocrSegmentRegion') class TesserocrSegmentRegion(Processor): @@ -71,6 +70,7 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ + LOG = getLogger('processor.TesserocrSegmentRegion') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) @@ -171,6 +171,7 @@ def process(self): content=to_xml(pcgts)) def _process_page(self, it, page, page_image, page_coords, page_id): + LOG = getLogger('processor.TesserocrSegmentRegion') # equivalent to GetComponentImages with raw_image=True, # (which would also give raw coordinates), # except we are also interested in the iterator's BlockType() here, diff --git a/ocrd_tesserocr/segment_table.py b/ocrd_tesserocr/segment_table.py index 0cec39f..98928c3 100644 --- a/ocrd_tesserocr/segment_table.py +++ b/ocrd_tesserocr/segment_table.py @@ -36,7 +36,6 @@ from .recognize import page_get_reading_order TOOL = 'ocrd-tesserocr-segment-table' -LOG = getLogger('processor.TesserocrSegmentTable') class TesserocrSegmentTable(Processor): @@ -60,6 +59,7 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ + LOG = getLogger('processor.TesserocrSegmentTable') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) @@ -175,6 +175,7 @@ def process(self): content=to_xml(pcgts)) def _process_region(self, it, region, rogroup, region_image, region_coords): + LOG = getLogger('processor.TesserocrSegmentTable') # equivalent to GetComponentImages with raw_image=True, # (which would also give raw coordinates), # except we are also interested in the iterator's BlockType() here, diff --git a/ocrd_tesserocr/segment_word.py b/ocrd_tesserocr/segment_word.py index 6834ec6..249f025 100644 --- a/ocrd_tesserocr/segment_word.py +++ b/ocrd_tesserocr/segment_word.py @@ -24,7 +24,6 @@ from .segment_region import polygon_for_parent TOOL = 'ocrd-tesserocr-segment-word' -LOG = getLogger('processor.TesserocrSegmentWord') class TesserocrSegmentWord(Processor): @@ -46,6 +45,7 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ + LOG = getLogger('processor.TesserocrSegmentWord') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) From 5ac233fc6c95448695568c9b8ff08e1898e0b783 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 24 Sep 2020 11:02:38 +0200 Subject: [PATCH 19/23] Update ocrd_tesserocr/deskew.py Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_tesserocr/deskew.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ocrd_tesserocr/deskew.py b/ocrd_tesserocr/deskew.py index de6cc8e..c6afc13 100644 --- a/ocrd_tesserocr/deskew.py +++ b/ocrd_tesserocr/deskew.py @@ -30,7 +30,6 @@ from .config import TESSDATA_PREFIX, OCRD_TOOL TOOL = 'ocrd-tesserocr-deskew' -FALLBACK_FILEGRP_IMG = 'OCR-D-IMG-DESKEW' class TesserocrDeskew(Processor): From 5a02e131e34265635a0f08cee6e729b6ab397b4f Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 24 Sep 2020 15:40:28 +0200 Subject: [PATCH 20/23] :package: v0.9.4 --- CHANGELOG.md | 6 ++++++ ocrd_tesserocr/ocrd-tool.json | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b59e5cc..a5ca54c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,9 +5,12 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [0.9.4] - 2020-09-24 + Fixed: * recognize: be robust to different input image modes, Pillow#4925 + * logging according to https://github.com/OCR-D/core/pull/599 ## [0.9.3] - 2020-09-15 @@ -222,6 +225,9 @@ Changed: * Recognition with proper support for textequiv_level, drop `page` level +[0.9.4]: v0.9.4...v0.9.3 +[0.9.3]: v0.9.3...v0.9.2 +[0.9.2]: v0.9.2...v0.9.1 [0.9.1]: v0.9.1...v0.9.0 [0.9.0]: v0.9.0...v0.8.5 [0.8.5]: v0.8.5...v0.8.4 diff --git a/ocrd_tesserocr/ocrd-tool.json b/ocrd_tesserocr/ocrd-tool.json index 6ccd4a2..82dbca2 100644 --- a/ocrd_tesserocr/ocrd-tool.json +++ b/ocrd_tesserocr/ocrd-tool.json @@ -1,5 +1,5 @@ { - "version": "0.9.3", + "version": "0.9.4", "git_url": "https://github.com/OCR-D/ocrd_tesserocr", "dockerhub": "ocrd/tesserocr", "tools": { From 27fe2be167040fdcbc9659d1daf263aab5e0def4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 24 Sep 2020 22:02:45 +0200 Subject: [PATCH 21/23] Update CHANGELOG.md --- CHANGELOG.md | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a5ca54c..264c5d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -225,28 +225,28 @@ Changed: * Recognition with proper support for textequiv_level, drop `page` level -[0.9.4]: v0.9.4...v0.9.3 -[0.9.3]: v0.9.3...v0.9.2 -[0.9.2]: v0.9.2...v0.9.1 -[0.9.1]: v0.9.1...v0.9.0 -[0.9.0]: v0.9.0...v0.8.5 -[0.8.5]: v0.8.5...v0.8.4 -[0.8.4]: v0.8.4...v0.8.3 -[0.8.3]: v0.8.3...v0.8.2 -[0.8.2]: v0.8.2...v0.8.1 -[0.8.1]: v0.8.1...v0.8.0 -[0.8.0]: v0.8.0...v0.7.0 -[0.7.0]: v0.7.0...v0.6.0 -[0.6.0]: v0.6.0...v0.5.1 -[0.5.1]: v0.5.1...v0.5.0 -[0.5.0]: v0.5.0...v0.4.1 -[0.4.1]: v0.4.1...v0.4.0 -[0.4.0]: v0.4.0...v0.3.0 -[0.3.0]: v0.3.0...v0.2.2 -[0.2.2]: v0.2.2...v0.2.1 -[0.2.1]: v0.2.1...v0.2.0 -[0.2.0]: v0.2.0...v0.1.2 -[0.1.3]: v0.1.3...v0.1.2 -[0.1.2]: v0.1.2...v0.1.1 -[0.1.1]: v0.1.1...v0.1.0 +[0.9.4]: ../../compare/v0.9.3...v0.9.4 +[0.9.3]: ../../compare/v0.9.2...v0.9.3 +[0.9.2]: ../../compare/v0.9.1...v0.9.2 +[0.9.1]: ../../compare/v0.9.0...v0.9.1 +[0.9.0]: ../../compare/v0.8.5...v0.9.0 +[0.8.5]: ../../compare/v0.8.4...v0.8.5 +[0.8.4]: ../../compare/v0.8.3...v0.8.4 +[0.8.3]: ../../compare/v0.8.2...v0.8.3 +[0.8.2]: ../../compare/v0.8.1...v0.8.2 +[0.8.1]: ../../compare/v0.8.0...v0.8.1 +[0.8.0]: ../../compare/v0.7.0...v0.8.0 +[0.7.0]: ../../compare/v0.6.0...v0.7.0 +[0.6.0]: ../../compare/v0.5.1...v0.6.0 +[0.5.1]: ../../compare/v0.5.0...v0.5.1 +[0.5.0]: ../../compare/v0.4.1...v0.5.0 +[0.4.1]: ../../compare/v0.4.0...v0.4.1 +[0.4.0]: ../../compare/v0.3.0...v0.4.0 +[0.3.0]: ../../compare/v0.2.2...v0.3.0 +[0.2.2]: ../../compare/v0.2.1...v0.2.2 +[0.2.1]: ../../compare/v0.2.0...v0.2.1 +[0.2.0]: ../../compare/v0.1.2...v0.2.0 +[0.1.3]: ../../compare/v0.1.2...v0.1.3 +[0.1.2]: ../../compare/v0.1.1...v0.1.2 +[0.1.1]: ../../compare/v0.1.0...v0.1.1 [0.1.0]: ../../compare/HEAD...v0.1.0 From 8360e0e28e229ddc035a1ff25f13f5f9c2273fba Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 2 Oct 2020 01:22:42 +0200 Subject: [PATCH 22/23] :package: 0.9.5 --- ocrd_tesserocr/ocrd-tool.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_tesserocr/ocrd-tool.json b/ocrd_tesserocr/ocrd-tool.json index 6ccd4a2..85b1253 100644 --- a/ocrd_tesserocr/ocrd-tool.json +++ b/ocrd_tesserocr/ocrd-tool.json @@ -1,5 +1,5 @@ { - "version": "0.9.3", + "version": "0.9.5", "git_url": "https://github.com/OCR-D/ocrd_tesserocr", "dockerhub": "ocrd/tesserocr", "tools": { From 24b7ced7379afab821020c49f857cc79bc949fbc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 2 Oct 2020 01:29:49 +0200 Subject: [PATCH 23/23] Update CHANGELOG.md --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 264c5d5..4ad0a21 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [0.9.5] - 2020-10-02 + +Fixed: + + * logging according to https://github.com/OCR-D/core/pull/599 (again) + ## [0.9.4] - 2020-09-24 Fixed: