Skip to content

Commit

Permalink
Merge branch 'master' into pr-144
Browse files Browse the repository at this point in the history
  • Loading branch information
bertsky committed Oct 1, 2020
2 parents ecfc989 + 24b7ced commit d231edb
Show file tree
Hide file tree
Showing 14 changed files with 222 additions and 242 deletions.
5 changes: 3 additions & 2 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@ jobs:

build-python36:
docker:
- image: ubuntu:18.04
- image: ocrd/core
steps:
- run: apt-get update && apt-get install -y --no-install-recommends make git curl
- checkout
- run: make deps-ubuntu deps-test deps install repo/assets
- run: make deps-ubuntu
- run: make install
- run: make test-cli
- run: make coverage
- codecov/upload
Expand Down
2 changes: 2 additions & 0 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ ignored-modules=cv2,tesserocr
disable =
ungrouped-imports,
bad-continuation,
trailing-whitespace,
missing-docstring,
no-self-use,
superfluous-parens,
Expand All @@ -15,6 +16,7 @@ disable =
too-many-branches,
too-many-statements,
too-many-locals,
too-many-nested-blocks,
too-few-public-methods,
wrong-import-order,
duplicate-code
Expand Down
72 changes: 51 additions & 21 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,33 @@ Versioned according to [Semantic Versioning](http://semver.org/).

## Unreleased

## [0.9.5] - 2020-10-02

Fixed:

* logging according to https://github.com/OCR-D/core/pull/599 (again)

## [0.9.4] - 2020-09-24

Fixed:

* recognize: be robust to different input image modes, Pillow#4925
* logging according to https://github.com/OCR-D/core/pull/599

## [0.9.3] - 2020-09-15

Fixed:

* segmentation: ensure new elements fit into their parent coords
* segmentation: ensure valid coords

## [0.9.2] - 2020-09-04

Fixed:

* segment-region: just ignore region outside of page frame, #145
* deskew: add suffix to AlternativeImage file ID, #148

## [0.9.1] - 2020-08-16

Fixed:
Expand Down Expand Up @@ -204,25 +231,28 @@ Changed:
* Recognition with proper support for textequiv_level, drop `page` level

<!-- link-labels -->
[0.9.1]: v0.9.1...v0.9.0
[0.9.0]: v0.9.0...v0.8.5
[0.8.5]: v0.8.5...v0.8.4
[0.8.4]: v0.8.4...v0.8.3
[0.8.3]: v0.8.3...v0.8.2
[0.8.2]: v0.8.2...v0.8.1
[0.8.1]: v0.8.1...v0.8.0
[0.8.0]: v0.8.0...v0.7.0
[0.7.0]: v0.7.0...v0.6.0
[0.6.0]: v0.6.0...v0.5.1
[0.5.1]: v0.5.1...v0.5.0
[0.5.0]: v0.5.0...v0.4.1
[0.4.1]: v0.4.1...v0.4.0
[0.4.0]: v0.4.0...v0.3.0
[0.3.0]: v0.3.0...v0.2.2
[0.2.2]: v0.2.2...v0.2.1
[0.2.1]: v0.2.1...v0.2.0
[0.2.0]: v0.2.0...v0.1.2
[0.1.3]: v0.1.3...v0.1.2
[0.1.2]: v0.1.2...v0.1.1
[0.1.1]: v0.1.1...v0.1.0
[0.9.5]: ../../compare/v0.9.4...v0.9.5
[0.9.4]: ../../compare/v0.9.3...v0.9.4
[0.9.3]: ../../compare/v0.9.2...v0.9.3
[0.9.2]: ../../compare/v0.9.1...v0.9.2
[0.9.1]: ../../compare/v0.9.0...v0.9.1
[0.9.0]: ../../compare/v0.8.5...v0.9.0
[0.8.5]: ../../compare/v0.8.4...v0.8.5
[0.8.4]: ../../compare/v0.8.3...v0.8.4
[0.8.3]: ../../compare/v0.8.2...v0.8.3
[0.8.2]: ../../compare/v0.8.1...v0.8.2
[0.8.1]: ../../compare/v0.8.0...v0.8.1
[0.8.0]: ../../compare/v0.7.0...v0.8.0
[0.7.0]: ../../compare/v0.6.0...v0.7.0
[0.6.0]: ../../compare/v0.5.1...v0.6.0
[0.5.1]: ../../compare/v0.5.0...v0.5.1
[0.5.0]: ../../compare/v0.4.1...v0.5.0
[0.4.1]: ../../compare/v0.4.0...v0.4.1
[0.4.0]: ../../compare/v0.3.0...v0.4.0
[0.3.0]: ../../compare/v0.2.2...v0.3.0
[0.2.2]: ../../compare/v0.2.1...v0.2.2
[0.2.1]: ../../compare/v0.2.0...v0.2.1
[0.2.0]: ../../compare/v0.1.2...v0.2.0
[0.1.3]: ../../compare/v0.1.2...v0.1.3
[0.1.2]: ../../compare/v0.1.1...v0.1.2
[0.1.1]: ../../compare/v0.1.0...v0.1.1
[0.1.0]: ../../compare/HEAD...v0.1.0
56 changes: 30 additions & 26 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,15 @@ help:
@echo " from Alexander Pozdnyakov which provides 4.1.0."
@echo " See https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr"
@echo " for details.)"
@echo " deps Install python deps via pip"
@echo " deps-test Install testing python deps via pip"
@echo " install Install"
@echo " deps Install Python deps for install via pip"
@echo " deps-test Install Python deps for test via pip"
@echo " docker Build docker image"
@echo " test Run test"
@echo " install Install this package"
@echo " test Run unit tests"
@echo " coverage Run unit tests and determine test coverage"
@echo " test-cli Test the command line tools"
@echo " repo/assets Clone OCR-D/assets to ./repo/assets"
@echo " test/assets Setup test assets"
@echo " repo/assets Clone OCR-D/assets to ./repo/assets"
@echo " assets-clean Remove symlinks in test/assets"
@echo ""
@echo " Variables"
Expand All @@ -44,7 +45,7 @@ help:

# Dependencies for deployment in an ubuntu/debian linux
# (lib*-dev merely for building tesserocr with pip)
# (tesseract-ocr: Ubuntu 18.04 now ships 4.0.0
# (tesseract-ocr: Ubuntu 18.04 now ships 4.0.0,
# which is unsupported. Add the tesseract-ocr PPA
# from Alexander Pozdnyakov which provides 4.1.0.
# See https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr
Expand All @@ -62,32 +63,32 @@ deps-ubuntu:
tesseract-ocr-eng \
tesseract-ocr

# Install python deps via pip
# Install Python deps for install via pip
deps:
$(PIP) install -U pip
$(PIP) install -r requirements.txt

# Install testing python deps via pip
# Install Python deps for test via pip
deps-test:
$(PIP) install -U pip
$(PIP) install -r requirements_test.txt

# Install
install:
$(PIP) install -U pip
$(PIP) install .

# Build docker image
docker:
docker build -t $(DOCKER_TAG) .

# Install this package
install: deps
$(PIP) install -U pip
$(PIP) install .

# Run unit tests
test: test/assets
test: test/assets deps-test
# declare -p HTTP_PROXY
$(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS)

# Run unit tests and determine test coverage
coverage:
coverage: deps-test
coverage erase
make test PYTHON="coverage run"
coverage report
Expand All @@ -96,30 +97,33 @@ coverage:
# Test the command line tools
test-cli: test/assets
$(PIP) install -e .
rm -rfv test-workspace
cp -rv test/assets/kant_aufklaerung_1784 test-workspace
export LC_ALL=C.UTF-8; cd test-workspace/data && \
ocrd-tesserocr-segment-region -l DEBUG -m mets.xml -I OCR-D-IMG -O OCR-D-SEG-BLOCK ; \
ocrd-tesserocr-segment-line -l DEBUG -m mets.xml -I OCR-D-SEG-BLOCK -O OCR-D-SEG-LINE ; \
ocrd-tesserocr-recognize -l DEBUG -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-TESS-OCR
rm -rfv test/workspace
cp -rv test/assets/kant_aufklaerung_1784 test/workspace
cd test/workspace/data && \
ocrd-tesserocr-segment-region -l DEBUG -I OCR-D-IMG -O OCR-D-SEG-REGION ; \
ocrd-tesserocr-segment-line -l DEBUG -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE ; \
ocrd-tesserocr-recognize -l DEBUG -I OCR-D-SEG-LINE -O OCR-D-TESS-OCR

.PHONY: test test-cli install deps deps-ubuntu deps-test help

#
# Assets
#

# Setup test assets (copy repo/assets)
# FIXME remove/update if already present
test/assets: repo/assets
mkdir -p $@
cp -r -t $@ repo/assets/data/*

# Clone OCR-D/assets to ./repo/assets
# FIXME does not work if already checked out
# FIXME should be a proper (VCed) submodule
repo/assets:
mkdir -p $(dir $@)
git clone https://github.com/OCR-D/assets "$@"


# Setup test assets
test/assets: repo/assets
mkdir -p $@
cp -r -t $@ repo/assets/data/*

.PHONY: assets-clean
# Remove symlinks in test/assets
assets-clean:
Expand Down
20 changes: 3 additions & 17 deletions ocrd_tesserocr/binarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,12 @@

from ocrd_utils import (
getLogger,
concat_padded,
assert_file_grp_cardinality,
make_file_id,
MIMETYPE_PAGE
)
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
MetadataItemType,
LabelsType, LabelType,
AlternativeImageType,
TextRegionType,
to_xml
Expand All @@ -26,7 +23,6 @@
from .config import TESSDATA_PREFIX, OCRD_TOOL

TOOL = 'ocrd-tesserocr-binarize'
LOG = getLogger('processor.TesserocrBinarize')

class TesserocrBinarize(Processor):

Expand All @@ -51,6 +47,7 @@ def process(self):
Produce a new output file by serialising the resulting hierarchy.
"""
LOG = getLogger('processor.TesserocrBinarize')
assert_file_grp_cardinality(self.input_file_grp, 1)
assert_file_grp_cardinality(self.output_file_grp, 1)

Expand All @@ -62,21 +59,9 @@ def process(self):
page_id = input_file.pageId or input_file.ID
LOG.info("INPUT FILE %i / %s", n, page_id)
pcgts = page_from_file(self.workspace.download_file(input_file))
self.add_metadata(pcgts)
page = pcgts.get_Page()

# add metadata about this operation and its runtime parameters:
metadata = pcgts.get_Metadata() # ensured by from_file()
metadata.add_MetadataItem(
MetadataItemType(type_="processingStep",
name=self.ocrd_tool['steps'][0],
value=TOOL,
Labels=[LabelsType(
externalModel="ocrd-tool",
externalId="parameters",
Label=[LabelType(type_=name,
value=self.parameter[name])
for name in self.parameter.keys()])]))

page_image, page_xywh, _ = self.workspace.image_from_page(
page, page_id)
LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id)
Expand Down Expand Up @@ -117,6 +102,7 @@ def process(self):
content=to_xml(pcgts))

def _process_segment(self, tessapi, ril, segment, image, xywh, where, page_id, file_id):
LOG = getLogger('processor.TesserocrBinarize')
tessapi.SetImage(image)
image_bin = None
layout = tessapi.AnalyseLayout()
Expand Down
20 changes: 3 additions & 17 deletions ocrd_tesserocr/crop.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import tesserocr
from ocrd_utils import (
getLogger, concat_padded,
getLogger,
crop_image,
coordinates_for_segment,
coordinates_of_segment,
Expand All @@ -18,8 +18,6 @@
)
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
MetadataItemType,
LabelsType, LabelType,
CoordsType, AlternativeImageType,
to_xml
)
Expand All @@ -30,7 +28,6 @@
from .segment_region import polygon_for_parent

TOOL = 'ocrd-tesserocr-crop'
LOG = getLogger('processor.TesserocrCrop')

class TesserocrCrop(Processor):

Expand All @@ -56,6 +53,7 @@ def process(self):
Produce new output files by serialising the resulting hierarchy.
"""
LOG = getLogger('processor.TesserocrCrop')
assert_file_grp_cardinality(self.input_file_grp, 1)
assert_file_grp_cardinality(self.output_file_grp, 1)

Expand All @@ -70,21 +68,9 @@ def process(self):
page_id = input_file.pageId or input_file.ID
LOG.info("INPUT FILE %i / %s", n, page_id)
pcgts = page_from_file(self.workspace.download_file(input_file))
self.add_metadata(pcgts)
page = pcgts.get_Page()

# add metadata about this operation and its runtime parameters:
metadata = pcgts.get_Metadata() # ensured by from_file()
metadata.add_MetadataItem(
MetadataItemType(type_="processingStep",
name=self.ocrd_tool['steps'][0],
value=TOOL,
Labels=[LabelsType(
externalModel="ocrd-tool",
externalId="parameters",
Label=[LabelType(type_=name,
value=self.parameter[name])
for name in self.parameter.keys()])]))

# warn of existing Border:
border = page.get_Border()
if border:
Expand Down
Loading

0 comments on commit d231edb

Please sign in to comment.