diff --git a/.circleci/config.yml b/.circleci/config.yml index 73a7cfd..9a5ecd9 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,11 +6,12 @@ jobs: build-python36: docker: - - image: ubuntu:18.04 + - image: ocrd/core steps: - run: apt-get update && apt-get install -y --no-install-recommends make git curl - checkout - - run: make deps-ubuntu deps-test deps install repo/assets + - run: make deps-ubuntu + - run: make install - run: make test-cli - run: make coverage - codecov/upload diff --git a/.pylintrc b/.pylintrc index 710b8b2..dfcd216 100644 --- a/.pylintrc +++ b/.pylintrc @@ -6,6 +6,7 @@ ignored-modules=cv2,tesserocr disable = ungrouped-imports, bad-continuation, + trailing-whitespace, missing-docstring, no-self-use, superfluous-parens, @@ -15,6 +16,7 @@ disable = too-many-branches, too-many-statements, too-many-locals, + too-many-nested-blocks, too-few-public-methods, wrong-import-order, duplicate-code diff --git a/CHANGELOG.md b/CHANGELOG.md index 4bf8052..4ad0a21 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,33 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +## [0.9.5] - 2020-10-02 + +Fixed: + + * logging according to https://github.com/OCR-D/core/pull/599 (again) + +## [0.9.4] - 2020-09-24 + +Fixed: + + * recognize: be robust to different input image modes, Pillow#4925 + * logging according to https://github.com/OCR-D/core/pull/599 + +## [0.9.3] - 2020-09-15 + +Fixed: + + * segmentation: ensure new elements fit into their parent coords + * segmentation: ensure valid coords + +## [0.9.2] - 2020-09-04 + +Fixed: + + * segment-region: just ignore region outside of page frame, #145 + * deskew: add suffix to AlternativeImage file ID, #148 + ## [0.9.1] - 2020-08-16 Fixed: @@ -204,25 +231,29 @@ Changed: * Recognition with proper support for textequiv_level, drop `page` level -[0.9.1]: v0.9.1...v0.9.0 -[0.9.0]: v0.9.0...v0.8.5 -[0.8.5]: v0.8.5...v0.8.4 -[0.8.4]: v0.8.4...v0.8.3 -[0.8.3]: v0.8.3...v0.8.2 -[0.8.2]: v0.8.2...v0.8.1 -[0.8.1]: v0.8.1...v0.8.0 -[0.8.0]: v0.8.0...v0.7.0 -[0.7.0]: v0.7.0...v0.6.0 -[0.6.0]: v0.6.0...v0.5.1 -[0.5.1]: v0.5.1...v0.5.0 -[0.5.0]: v0.5.0...v0.4.1 -[0.4.1]: v0.4.1...v0.4.0 -[0.4.0]: v0.4.0...v0.3.0 -[0.3.0]: v0.3.0...v0.2.2 -[0.2.2]: v0.2.2...v0.2.1 -[0.2.1]: v0.2.1...v0.2.0 -[0.2.0]: v0.2.0...v0.1.2 -[0.1.3]: v0.1.3...v0.1.2 -[0.1.2]: v0.1.2...v0.1.1 -[0.1.1]: v0.1.1...v0.1.0 +[0.9.5]: ../../compare/v0.9.4...v0.9.5 +[0.9.4]: ../../compare/v0.9.3...v0.9.4 +[0.9.3]: ../../compare/v0.9.2...v0.9.3 +[0.9.2]: ../../compare/v0.9.1...v0.9.2 +[0.9.1]: ../../compare/v0.9.0...v0.9.1 +[0.9.0]: ../../compare/v0.8.5...v0.9.0 +[0.8.5]: ../../compare/v0.8.4...v0.8.5 +[0.8.4]: ../../compare/v0.8.3...v0.8.4 +[0.8.3]: ../../compare/v0.8.2...v0.8.3 +[0.8.2]: ../../compare/v0.8.1...v0.8.2 +[0.8.1]: ../../compare/v0.8.0...v0.8.1 +[0.8.0]: ../../compare/v0.7.0...v0.8.0 +[0.7.0]: ../../compare/v0.6.0...v0.7.0 +[0.6.0]: ../../compare/v0.5.1...v0.6.0 +[0.5.1]: ../../compare/v0.5.0...v0.5.1 +[0.5.0]: ../../compare/v0.4.1...v0.5.0 +[0.4.1]: ../../compare/v0.4.0...v0.4.1 +[0.4.0]: ../../compare/v0.3.0...v0.4.0 +[0.3.0]: 
../../compare/v0.2.2...v0.3.0 +[0.2.2]: ../../compare/v0.2.1...v0.2.2 +[0.2.1]: ../../compare/v0.2.0...v0.2.1 +[0.2.0]: ../../compare/v0.1.2...v0.2.0 +[0.1.3]: ../../compare/v0.1.2...v0.1.3 +[0.1.2]: ../../compare/v0.1.1...v0.1.2 +[0.1.1]: ../../compare/v0.1.0...v0.1.1 [0.1.0]: ../../compare/HEAD...v0.1.0 diff --git a/Makefile b/Makefile index 3054453..2fd5894 100644 --- a/Makefile +++ b/Makefile @@ -25,14 +25,15 @@ help: @echo " from Alexander Pozdnyakov which provides 4.1.0." @echo " See https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr" @echo " for details.)" - @echo " deps Install python deps via pip" - @echo " deps-test Install testing python deps via pip" - @echo " install Install" + @echo " deps Install Python deps for install via pip" + @echo " deps-test Install Python deps for test via pip" @echo " docker Build docker image" - @echo " test Run test" + @echo " install Install this package" + @echo " test Run unit tests" + @echo " coverage Run unit tests and determine test coverage" @echo " test-cli Test the command line tools" - @echo " repo/assets Clone OCR-D/assets to ./repo/assets" @echo " test/assets Setup test assets" + @echo " repo/assets Clone OCR-D/assets to ./repo/assets" @echo " assets-clean Remove symlinks in test/assets" @echo "" @echo " Variables" @@ -44,7 +45,7 @@ help: # Dependencies for deployment in an ubuntu/debian linux # (lib*-dev merely for building tesserocr with pip) -# (tesseract-ocr: Ubuntu 18.04 now ships 4.0.0 +# (tesseract-ocr: Ubuntu 18.04 now ships 4.0.0, # which is unsupported. Add the tesseract-ocr PPA # from Alexander Pozdnyakov which provides 4.1.0. 
# See https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr @@ -62,32 +63,32 @@ deps-ubuntu: tesseract-ocr-eng \ tesseract-ocr -# Install python deps via pip +# Install Python deps for install via pip deps: $(PIP) install -U pip $(PIP) install -r requirements.txt -# Install testing python deps via pip +# Install Python deps for test via pip deps-test: $(PIP) install -U pip $(PIP) install -r requirements_test.txt -# Install -install: - $(PIP) install -U pip - $(PIP) install . - # Build docker image docker: docker build -t $(DOCKER_TAG) . +# Install this package +install: deps + $(PIP) install -U pip + $(PIP) install . + # Run unit tests -test: test/assets +test: test/assets deps-test # declare -p HTTP_PROXY $(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS) # Run unit tests and determine test coverage -coverage: +coverage: deps-test coverage erase make test PYTHON="coverage run" coverage report @@ -96,12 +97,12 @@ coverage: # Test the command line tools test-cli: test/assets $(PIP) install -e . 
- rm -rfv test-workspace - cp -rv test/assets/kant_aufklaerung_1784 test-workspace - export LC_ALL=C.UTF-8; cd test-workspace/data && \ - ocrd-tesserocr-segment-region -l DEBUG -m mets.xml -I OCR-D-IMG -O OCR-D-SEG-BLOCK ; \ - ocrd-tesserocr-segment-line -l DEBUG -m mets.xml -I OCR-D-SEG-BLOCK -O OCR-D-SEG-LINE ; \ - ocrd-tesserocr-recognize -l DEBUG -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-TESS-OCR + rm -rfv test/workspace + cp -rv test/assets/kant_aufklaerung_1784 test/workspace + cd test/workspace/data && \ + ocrd-tesserocr-segment-region -l DEBUG -I OCR-D-IMG -O OCR-D-SEG-REGION ; \ + ocrd-tesserocr-segment-line -l DEBUG -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE ; \ + ocrd-tesserocr-recognize -l DEBUG -I OCR-D-SEG-LINE -O OCR-D-TESS-OCR .PHONY: test test-cli install deps deps-ubuntu deps-test help @@ -109,17 +110,20 @@ test-cli: test/assets # Assets # +# Setup test assets (copy repo/assets) +# FIXME remove/update if already present +test/assets: repo/assets + mkdir -p $@ + cp -r -t $@ repo/assets/data/* + # Clone OCR-D/assets to ./repo/assets +# FIXME does not work if already checked out +# FIXME should be a proper (VCed) submodule repo/assets: mkdir -p $(dir $@) git clone https://github.com/OCR-D/assets "$@" -# Setup test assets -test/assets: repo/assets - mkdir -p $@ - cp -r -t $@ repo/assets/data/* - .PHONY: assets-clean # Remove symlinks in test/assets assets-clean: diff --git a/ocrd_tesserocr/binarize.py b/ocrd_tesserocr/binarize.py index 3701cb0..e284e80 100644 --- a/ocrd_tesserocr/binarize.py +++ b/ocrd_tesserocr/binarize.py @@ -8,15 +8,12 @@ from ocrd_utils import ( getLogger, - concat_padded, assert_file_grp_cardinality, make_file_id, MIMETYPE_PAGE ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - MetadataItemType, - LabelsType, LabelType, AlternativeImageType, TextRegionType, to_xml @@ -26,7 +23,6 @@ from .config import TESSDATA_PREFIX, OCRD_TOOL TOOL = 'ocrd-tesserocr-binarize' -LOG = 
getLogger('processor.TesserocrBinarize') class TesserocrBinarize(Processor): @@ -51,6 +47,7 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ + LOG = getLogger('processor.TesserocrBinarize') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) @@ -62,21 +59,9 @@ def process(self): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + self.add_metadata(pcgts) page = pcgts.get_Page() - # add metadata about this operation and its runtime parameters: - metadata = pcgts.get_Metadata() # ensured by from_file() - metadata.add_MetadataItem( - MetadataItemType(type_="processingStep", - name=self.ocrd_tool['steps'][0], - value=TOOL, - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) - page_image, page_xywh, _ = self.workspace.image_from_page( page, page_id) LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id) @@ -117,6 +102,7 @@ def process(self): content=to_xml(pcgts)) def _process_segment(self, tessapi, ril, segment, image, xywh, where, page_id, file_id): + LOG = getLogger('processor.TesserocrBinarize') tessapi.SetImage(image) image_bin = None layout = tessapi.AnalyseLayout() diff --git a/ocrd_tesserocr/crop.py b/ocrd_tesserocr/crop.py index ac41809..4ce4910 100644 --- a/ocrd_tesserocr/crop.py +++ b/ocrd_tesserocr/crop.py @@ -3,7 +3,7 @@ import tesserocr from ocrd_utils import ( - getLogger, concat_padded, + getLogger, crop_image, coordinates_for_segment, coordinates_of_segment, @@ -18,8 +18,6 @@ ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - MetadataItemType, - LabelsType, LabelType, CoordsType, AlternativeImageType, to_xml ) @@ -30,7 +28,6 @@ from .segment_region import polygon_for_parent TOOL = 
'ocrd-tesserocr-crop' -LOG = getLogger('processor.TesserocrCrop') class TesserocrCrop(Processor): @@ -56,6 +53,7 @@ def process(self): Produce new output files by serialising the resulting hierarchy. """ + LOG = getLogger('processor.TesserocrCrop') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) @@ -70,21 +68,9 @@ def process(self): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + self.add_metadata(pcgts) page = pcgts.get_Page() - # add metadata about this operation and its runtime parameters: - metadata = pcgts.get_Metadata() # ensured by from_file() - metadata.add_MetadataItem( - MetadataItemType(type_="processingStep", - name=self.ocrd_tool['steps'][0], - value=TOOL, - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) - # warn of existing Border: border = page.get_Border() if border: diff --git a/ocrd_tesserocr/deskew.py b/ocrd_tesserocr/deskew.py index e47d97c..c6afc13 100644 --- a/ocrd_tesserocr/deskew.py +++ b/ocrd_tesserocr/deskew.py @@ -12,7 +12,7 @@ ) from ocrd_utils import ( - getLogger, concat_padded, + getLogger, make_file_id, assert_file_grp_cardinality, rotate_image, transpose_image, @@ -21,8 +21,6 @@ ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - MetadataItemType, - LabelsType, LabelType, AlternativeImageType, TextRegionType, PageType, to_xml @@ -32,8 +30,6 @@ from .config import TESSDATA_PREFIX, OCRD_TOOL TOOL = 'ocrd-tesserocr-deskew' -LOG = getLogger('processor.TesserocrDeskew') -FALLBACK_FILEGRP_IMG = 'OCR-D-IMG-DESKEW' class TesserocrDeskew(Processor): @@ -60,6 +56,7 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" + LOG = getLogger('processor.TesserocrDeskew') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) oplevel = self.parameter['operation_level'] @@ -76,21 +73,9 @@ def process(self): LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) pcgts.set_pcGtsId(file_id) + self.add_metadata(pcgts) page = pcgts.get_Page() - # add metadata about this operation and its runtime parameters: - metadata = pcgts.get_Metadata() # ensured by from_file() - metadata.add_MetadataItem( - MetadataItemType(type_="processingStep", - name=self.ocrd_tool['steps'][0], - value=TOOL, - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) - page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id, # image must not have been rotated already, @@ -141,6 +126,7 @@ def process(self): content=to_xml(pcgts)) def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_id): + LOG = getLogger('processor.TesserocrDeskew') features = xywh['features'] # features already applied to image angle0 = xywh['angle'] # deskewing (w.r.t. top image) already applied to image angle = 0. 
# additional angle to be applied at current level @@ -287,7 +273,7 @@ def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_i # segment.add_Baseline(BaselineType(points=points)) # update METS (add the image file): file_path = self.workspace.save_image_file(image, - file_id, + file_id + '.IMG-DESKEW', page_id=page_id, file_grp=self.output_file_grp) # update PAGE (reference the image file): diff --git a/ocrd_tesserocr/ocrd-tool.json b/ocrd_tesserocr/ocrd-tool.json index 9c1cc9d..85b1253 100644 --- a/ocrd_tesserocr/ocrd-tool.json +++ b/ocrd_tesserocr/ocrd-tool.json @@ -1,5 +1,5 @@ { - "version": "0.9.1", + "version": "0.9.5", "git_url": "https://github.com/OCR-D/ocrd_tesserocr", "dockerhub": "ocrd/tesserocr", "tools": { diff --git a/ocrd_tesserocr/recognize.py b/ocrd_tesserocr/recognize.py index b9dd4da..71dbf09 100644 --- a/ocrd_tesserocr/recognize.py +++ b/ocrd_tesserocr/recognize.py @@ -19,8 +19,6 @@ from ocrd_models.ocrd_page import ( CoordsType, GlyphType, WordType, - LabelType, LabelsType, - MetadataItemType, TextEquivType, TextStyleType, to_xml) from ocrd_models.ocrd_page_generateds import ( @@ -37,9 +35,9 @@ from ocrd import Processor from .config import TESSDATA_PREFIX, OCRD_TOOL +from .segment_region import polygon_for_parent TOOL = 'ocrd-tesserocr-recognize' -LOG = getLogger('processor.TesserocrRecognize') CHOICE_THRESHOLD_NUM = 6 # maximum number of choices to query and annotate CHOICE_THRESHOLD_CONF = 0.2 # maximum score drop from best choice to query and annotate @@ -81,6 +79,7 @@ def process(self): Produce new output files by serialising the resulting hierarchy. 
""" + LOG = getLogger('processor.TesserocrRecognize') LOG.debug("TESSDATA: %s, installed Tesseract models: %s", *get_languages()) assert_file_grp_cardinality(self.input_file_grp, 1) @@ -157,20 +156,9 @@ def process(self): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + self.add_metadata(pcgts) page = pcgts.get_Page() - # add metadata about this operation and its runtime parameters: - metadata = pcgts.get_Metadata() # ensured by from_file() - metadata.add_MetadataItem( - MetadataItemType(type_="processingStep", - name=self.ocrd_tool['steps'][0], - value=TOOL, - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) if self.parameter['dpi'] > 0: @@ -209,18 +197,13 @@ def process(self): content=to_xml(pcgts)) def _process_regions(self, tessapi, regions, page_image, page_xywh): + LOG = getLogger('processor.TesserocrRecognize') for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh) if self.parameter['textequiv_level'] == 'region': if self.parameter['padding']: - bg = tuple(ImageStat.Stat(region_image).median) - pad = self.parameter['padding'] - padded = Image.new(region_image.mode, - (region_image.width + 2 * pad, - region_image.height + 2 * pad), bg) - padded.paste(region_image, (pad, pad)) - tessapi.SetImage(padded) + tessapi.SetImage(pad_image(region_image, self.parameter['padding'])) else: tessapi.SetImage(region_image) tessapi.SetPageSegMode(PSM.SINGLE_BLOCK) @@ -242,6 +225,7 @@ def _process_regions(self, tessapi, regions, page_image, page_xywh): self._process_lines(tessapi, textlines, region_image, region_xywh) def _process_lines(self, tessapi, textlines, region_image, region_xywh): + LOG = 
getLogger('processor.TesserocrRecognize') for line in textlines: if self.parameter['overwrite_words']: line.set_Word([]) @@ -249,13 +233,7 @@ def _process_lines(self, tessapi, textlines, region_image, region_xywh): line, region_image, region_xywh) # todo: Tesseract works better if the line images have a 5px margin everywhere if self.parameter['padding']: - bg = tuple(ImageStat.Stat(line_image).median) - pad = self.parameter['padding'] - padded = Image.new(line_image.mode, - (line_image.width + 2 * pad, - line_image.height + 2 * pad), bg) - padded.paste(line_image, (pad, pad)) - tessapi.SetImage(padded) + tessapi.SetImage(pad_image(line_image, self.parameter['padding'])) else: tessapi.SetImage(line_image) if self.parameter['raw_lines']: @@ -285,6 +263,7 @@ def _process_lines(self, tessapi, textlines, region_image, region_xywh): self._process_words_in_line(tessapi.GetIterator(), line, line_xywh) def _process_words_in_line(self, result_it, line, line_xywh): + LOG = getLogger('processor.TesserocrRecognize') if not result_it or result_it.Empty(RIL.WORD): LOG.warning("No text in line '%s'", line.id) return @@ -297,9 +276,16 @@ def _process_words_in_line(self, result_it, line, line_xywh): # convert to absolute coordinates: polygon = coordinates_for_segment(polygon_from_x0y0x1y1(bbox), None, line_xywh) - self.parameter['padding'] + polygon2 = polygon_for_parent(polygon, line) + if polygon2 is not None: + polygon = polygon2 points = points_from_polygon(polygon) word = WordType(id=word_id, Coords=CoordsType(points)) - line.add_Word(word) + if polygon2 is None: + # could happen due to rotation + LOG.info('Ignoring extant word: %s', points) + else: + line.add_Word(word) # todo: determine if font attributes available for word level will work with LSTM models word_attributes = result_it.WordFontAttributes() if word_attributes: @@ -332,17 +318,12 @@ def _process_words_in_line(self, result_it, line, line_xywh): result_it.Next(RIL.WORD) def _process_existing_words(self, tessapi, 
words, line_image, line_xywh): + LOG = getLogger('processor.TesserocrRecognize') for word in words: word_image, word_xywh = self.workspace.image_from_segment( word, line_image, line_xywh) if self.parameter['padding']: - bg = tuple(ImageStat.Stat(word_image).median) - pad = self.parameter['padding'] - padded = Image.new(word_image.mode, - (word_image.width + 2 * pad, - word_image.height + 2 * pad), bg) - padded.paste(word_image, (pad, pad)) - tessapi.SetImage(padded) + tessapi.SetImage(pad_image(word_image, self.parameter['padding'])) else: tessapi.SetImage(word_image) tessapi.SetPageSegMode(PSM.SINGLE_WORD) @@ -369,17 +350,12 @@ def _process_existing_words(self, tessapi, words, line_image, line_xywh): self._process_glyphs_in_word(tessapi.GetIterator(), word, word_xywh) def _process_existing_glyphs(self, tessapi, glyphs, word_image, word_xywh): + LOG = getLogger('processor.TesserocrRecognize') for glyph in glyphs: glyph_image, _ = self.workspace.image_from_segment( glyph, word_image, word_xywh) if self.parameter['padding']: - bg = tuple(ImageStat.Stat(glyph_image).median) - pad = self.parameter['padding'] - padded = Image.new(glyph_image.mode, - (glyph_image.width + 2 * pad, - glyph_image.height + 2 * pad), bg) - padded.paste(glyph_image, (pad, pad)) - tessapi.SetImage(padded) + tessapi.SetImage(pad_image(glyph_image, self.parameter['padding'])) else: tessapi.SetImage(glyph_image) tessapi.SetPageSegMode(PSM.SINGLE_CHAR) @@ -407,6 +383,7 @@ def _process_existing_glyphs(self, tessapi, glyphs, word_image, word_xywh): glyph.add_TextEquiv(TextEquivType(index=choice_no, Unicode=alternative_text, conf=alternative_conf)) def _process_glyphs_in_word(self, result_it, word, word_xywh): + LOG = getLogger('processor.TesserocrRecognize') if not result_it or result_it.Empty(RIL.SYMBOL): LOG.debug("No glyph in word '%s'", word.id) return @@ -422,9 +399,16 @@ def _process_glyphs_in_word(self, result_it, word, word_xywh): # convert to absolute coordinates: polygon = 
coordinates_for_segment(polygon_from_x0y0x1y1(bbox), None, word_xywh) - self.parameter['padding'] + polygon2 = polygon_for_parent(polygon, word) + if polygon2 is not None: + polygon = polygon2 points = points_from_polygon(polygon) glyph = GlyphType(id=glyph_id, Coords=CoordsType(points)) - word.add_Glyph(glyph) + if polygon2 is None: + # could happen due to rotation + LOG.info('Ignoring extant glyph: %s', points) + else: + word.add_Glyph(glyph) choice_it = result_it.GetChoiceIterator() for (choice_no, choice) in enumerate(choice_it): alternative_text = choice.GetUTF8Text() @@ -534,7 +518,7 @@ def page_update_higher_textequiv_levels(level, pcgts): reading_order[subregion.id].index) region_unicode = page_element_unicode0(subregions[0]) for subregion, next_subregion in zip(subregions, subregions[1:]): - if not (subregion.id, next_subregion.id) in joins: + if (subregion.id, next_subregion.id) not in joins: region_unicode += '\n' # or '\f'? region_unicode += page_element_unicode0(next_subregion) region_conf = sum(page_element_conf0(subregion) for subregion in subregions) @@ -588,3 +572,17 @@ def page_update_higher_textequiv_levels(level, pcgts): region_conf /= len(lines) region.set_TextEquiv( # replace old, if any [TextEquivType(Unicode=region_unicode, conf=region_conf)]) + +def pad_image(image, padding): + stat = ImageStat.Stat(image) + # workaround for Pillow#4925 + if len(stat.bands) > 1: + background = tuple(stat.median) + else: + background = stat.median[0] + padded = Image.new(image.mode, + (image.width + 2 * padding, + image.height + 2 * padding), + background) + padded.paste(image, (padding, padding)) + return padded diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py index 2c2dc4c..4f86ebe 100644 --- a/ocrd_tesserocr/segment_line.py +++ b/ocrd_tesserocr/segment_line.py @@ -2,12 +2,11 @@ import itertools import os.path -from shapely.geometry import Polygon, LinearRing from tesserocr import PyTessBaseAPI, RIL, PSM from ocrd import 
Processor from ocrd_utils import ( - getLogger, concat_padded, + getLogger, make_file_id, assert_file_grp_cardinality, polygon_from_xywh, @@ -19,16 +18,14 @@ from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( CoordsType, - LabelType, LabelsType, - MetadataItemType, TextLineType, to_xml ) from .config import TESSDATA_PREFIX, OCRD_TOOL +from .segment_region import polygon_for_parent TOOL = 'ocrd-tesserocr-segment-line' -LOG = getLogger('processor.TesserocrSegmentLine') class TesserocrSegmentLine(Processor): @@ -51,6 +48,7 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ + LOG = getLogger('processor.TesserocrSegmentLine') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) @@ -64,21 +62,9 @@ def process(self): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + self.add_metadata(pcgts) page = pcgts.get_Page() - # add metadata about this operation and its runtime parameters: - metadata = pcgts.get_Metadata() # ensured by from_file() - metadata.add_MetadataItem( - MetadataItemType(type_="processingStep", - name=self.ocrd_tool['steps'][0], - value=TOOL, - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) - page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, # for some reason, external binarization @@ -112,6 +98,7 @@ def process(self): LOG.warning('keeping existing TextLines in region "%s"', region.id) LOG.debug("Detecting lines in region '%s'", region.id) region_image, region_coords = self.workspace.image_from_segment( +<<<<<<< HEAD region, page_image, page_coords, # for some reason, external binarization # degrades Tesseract segmentation quality @@ -121,29 +108,22 @@ def 
process(self): feature_filter='binarized') region_polygon = coordinates_of_segment(region, region_image, region_coords) region_poly = Polygon(region_polygon) +======= + region, page_image, page_coords) +>>>>>>> master tessapi.SetImage(region_image) for line_no, component in enumerate(tessapi.GetComponentImages(RIL.TEXTLINE, True, raw_image=True)): line_id = '%s_line%04d' % (region.id, line_no) line_polygon = polygon_from_xywh(component[1]) - line_poly = Polygon(line_polygon) - if not line_poly.within(region_poly): - # this could happen due to rotation - interline = line_poly.intersection(region_poly) - if interline.is_empty: - continue # ignore this line - if hasattr(interline, 'geoms'): - # is (heterogeneous) GeometryCollection - area = 0 - for geom in interline.geoms: - if geom.area > area: - area = geom.area - interline = geom - if not area: - continue - line_poly = interline.convex_hull - line_polygon = line_poly.exterior.coords line_polygon = coordinates_for_segment(line_polygon, region_image, region_coords) + line_polygon2 = polygon_for_parent(line_polygon, region) + if line_polygon2 is not None: + line_polygon = line_polygon2 line_points = points_from_polygon(line_polygon) + if line_polygon2 is None: + # could happen due to rotation + LOG.info('Ignoring extant line: %s', line_points) + continue region.add_TextLine(TextLineType( id=line_id, Coords=CoordsType(line_points))) diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py index 0f9f943..300e3da 100644 --- a/ocrd_tesserocr/segment_region.py +++ b/ocrd_tesserocr/segment_region.py @@ -1,7 +1,9 @@ from __future__ import absolute_import import os.path -from shapely.geometry import Polygon +import numpy as np +from shapely.geometry import Polygon, asPolygon +from shapely.ops import unary_union from tesserocr import ( PyTessBaseAPI, PSM, RIL, PT @@ -20,8 +22,6 @@ ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - MetadataItemType, - LabelsType, 
LabelType, CoordsType, PageType, OrderedGroupType, @@ -43,7 +43,6 @@ from .config import TESSDATA_PREFIX, OCRD_TOOL TOOL = 'ocrd-tesserocr-segment-region' -LOG = getLogger('processor.TesserocrSegmentRegion') class TesserocrSegmentRegion(Processor): @@ -71,6 +70,7 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ + LOG = getLogger('processor.TesserocrSegmentRegion') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) @@ -93,21 +93,9 @@ def process(self): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + self.add_metadata(pcgts) page = pcgts.get_Page() - # add metadata about this operation and its runtime parameters: - metadata = pcgts.get_Metadata() # ensured by from_file() - metadata.add_MetadataItem( - MetadataItemType(type_="processingStep", - name=self.ocrd_tool['steps'][0], - value=TOOL, - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) - # delete or warn of existing regions: if (page.get_AdvertRegion() or page.get_ChartRegion() or @@ -189,6 +177,7 @@ def process(self): content=to_xml(pcgts)) def _process_page(self, it, page, page_image, page_coords, page_id): + LOG = getLogger('processor.TesserocrSegmentRegion') # equivalent to GetComponentImages with raw_image=True, # (which would also give raw coordinates), # except we are also interested in the iterator's BlockType() here, @@ -225,9 +214,15 @@ def _process_page(self, it, page, page_image, page_coords, page_id): else: polygon = polygon_from_x0y0x1y1(bbox) polygon = coordinates_for_segment(polygon, page_image, page_coords) - polygon = polygon_for_parent(polygon, page) + polygon2 = polygon_for_parent(polygon, page) + if polygon2 is not None: + polygon = polygon2 points = 
points_from_polygon(polygon) coords = CoordsType(points=points) + if polygon2 is None: + LOG.info('Ignoring extant region: %s', points) + it.Next(RIL.BLOCK) + continue # if xywh['w'] < 30 or xywh['h'] < 30: # LOG.info('Ignoring too small region: %s', points) # it.Next(RIL.BLOCK) @@ -326,12 +321,44 @@ def polygon_for_parent(polygon, parent): [parent.get_imageWidth(),0]]) else: parentp = Polygon(polygon_from_points(parent.get_Coords().points)) + # check if clipping is necessary if childp.within(parentp): return polygon + # ensure input coords have valid paths (without self-intersection) + # (this can happen when shapes valid in floating point are rounded) + childp = make_valid(childp) + parentp = make_valid(parentp) + # clip to parent interp = childp.intersection(parentp) - if interp.is_empty: - # FIXME: we need a better strategy against this - raise Exception("intersection of would-be segment with parent is empty") + if interp.is_empty or interp.area == 0.0: + # this happens if Tesseract "finds" something + # outside of the valid Border of a deskewed/cropped page + # (empty corners created by masking); will be ignored + return None + if interp.type == 'GeometryCollection': + # heterogeneous result: filter zero-area shapes (LineString, Point) + interp = unary_union([geom for geom in interp.geoms if geom.area > 0]) if interp.type == 'MultiPolygon': + # homogeneous result: construct convex hull to connect + # FIXME: construct concave hull / alpha shape interp = interp.convex_hull + if interp.minimum_clearance < 1.0: + # follow-up calculations will necessarily be integer; + # so anticipate rounding here and then ensure validity + interp = asPolygon(np.round(interp.exterior.coords)) + interp = make_valid(interp) return interp.exterior.coords[:-1] # keep open + +def make_valid(polygon): + for split in range(1, len(polygon.exterior.coords)-1): + if polygon.is_valid or polygon.simplify(polygon.area).is_valid: + break + # simplification may not be possible (at all) due to 
ordering + # in that case, try another starting point + polygon = Polygon(polygon.exterior.coords[-split:]+polygon.exterior.coords[:-split]) + for tolerance in range(1, int(polygon.area)): + if polygon.is_valid: + break + # simplification may require a larger tolerance + polygon = polygon.simplify(tolerance) + return polygon diff --git a/ocrd_tesserocr/segment_table.py b/ocrd_tesserocr/segment_table.py index 1f14005..75ce4c8 100644 --- a/ocrd_tesserocr/segment_table.py +++ b/ocrd_tesserocr/segment_table.py @@ -8,7 +8,6 @@ from ocrd_utils import ( getLogger, - concat_padded, make_file_id, assert_file_grp_cardinality, coordinates_for_segment, @@ -19,21 +18,17 @@ ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - MetadataItemType, - LabelsType, LabelType, CoordsType, TextRegionType, to_xml) from ocrd_models.ocrd_page_generateds import ( - TableRegionType, TextTypeSimpleType, RegionRefType, RegionRefIndexedType, OrderedGroupType, OrderedGroupIndexedType, UnorderedGroupType, - UnorderedGroupIndexedType, - ReadingOrderType + UnorderedGroupIndexedType ) from ocrd import Processor @@ -41,7 +36,6 @@ from .recognize import page_get_reading_order TOOL = 'ocrd-tesserocr-segment-table' -LOG = getLogger('processor.TesserocrSegmentTable') class TesserocrSegmentTable(Processor): @@ -65,6 +59,7 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" + LOG = getLogger('processor.TesserocrSegmentTable') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) @@ -79,21 +74,9 @@ def process(self): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + self.add_metadata(pcgts) page = pcgts.get_Page() - # add metadata about this operation and its runtime parameters: - metadata = pcgts.get_Metadata() # ensured by from_file() - metadata.add_MetadataItem( - MetadataItemType(type_="processingStep", - name=self.ocrd_tool['steps'][0], - value=TOOL, - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) - page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, # for some reason, external binarization @@ -211,6 +194,7 @@ def process(self): content=to_xml(pcgts)) def _process_region(self, it, region, rogroup, region_image, region_coords): + LOG = getLogger('processor.TesserocrSegmentTable') # equivalent to GetComponentImages with raw_image=True, # (which would also give raw coordinates), # except we are also interested in the iterator's BlockType() here, diff --git a/ocrd_tesserocr/segment_word.py b/ocrd_tesserocr/segment_word.py index 2d335ce..249f025 100644 --- a/ocrd_tesserocr/segment_word.py +++ b/ocrd_tesserocr/segment_word.py @@ -5,7 +5,7 @@ from ocrd import Processor from ocrd_utils import ( - getLogger, concat_padded, + getLogger, make_file_id, assert_file_grp_cardinality, polygon_from_xywh, @@ -16,16 +16,14 @@ from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( CoordsType, - LabelType, LabelsType, - MetadataItemType, WordType, to_xml, ) -from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL +from .config import TESSDATA_PREFIX, OCRD_TOOL +from .segment_region import 
polygon_for_parent TOOL = 'ocrd-tesserocr-segment-word' -LOG = getLogger('processor.TesserocrSegmentWord') class TesserocrSegmentWord(Processor): @@ -47,6 +45,7 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ + LOG = getLogger('processor.TesserocrSegmentWord') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) @@ -60,20 +59,9 @@ def process(self): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + self.add_metadata(pcgts) page = pcgts.get_Page() - # add metadata about this operation and its runtime parameters: - metadata = pcgts.get_Metadata() # ensured by from_file() - metadata.add_MetadataItem( - MetadataItemType(type_="processingStep", - name=self.ocrd_tool['steps'][0], - value=TOOL, - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id) if self.parameter['dpi'] > 0: @@ -108,7 +96,14 @@ def process(self): word_id = '%s_word%04d' % (line.id, word_no) word_polygon = polygon_from_xywh(component[1]) word_polygon = coordinates_for_segment(word_polygon, line_image, line_coords) + word_polygon2 = polygon_for_parent(word_polygon, line) + if word_polygon2 is not None: + word_polygon = word_polygon2 word_points = points_from_polygon(word_polygon) + if word_polygon2 is None: + # could happen due to rotation + LOG.info('Ignoring extant word: %s', word_points) + continue line.add_Word(WordType( id=word_id, Coords=CoordsType(word_points))) diff --git a/requirements.txt b/requirements.txt index 0434d1f..d8a4b6d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ ocrd >= 2.4.4 click tesserocr >= 2.5.1 +shapely >= 1.7.1