Skip to content

Commit

Permalink
allow multiple output formats in one run (#1)
Browse files Browse the repository at this point in the history
* allow multiple output

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* tidy

* fix init

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* break up test to avoid race condition

* Update tests/pytesseract_test.py

Co-authored-by: qued <[email protected]>

* fix flakey pdf comparison

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: qued <[email protected]>
  • Loading branch information
3 people authored Sep 3, 2023
1 parent b35f061 commit cb83840
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 7 deletions.
4 changes: 4 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,10 @@ Library usage:
# Get ALTO XML output
xml = pytesseract.image_to_alto_xml('test.png')
# Get multiple types of output with one call to save compute time
# Currently supports any mix of the following: txt, pdf, hocr, box, tsv
text, boxes = pytesseract.run_and_get_multiple_output('test.png', extensions=['txt', 'box'])
Support for OpenCV image/NumPy array objects

.. code-block:: python
Expand Down
1 change: 1 addition & 0 deletions pytesseract/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .pytesseract import image_to_pdf_or_hocr
from .pytesseract import image_to_string
from .pytesseract import Output
from .pytesseract import run_and_get_multiple_output
from .pytesseract import run_and_get_output
from .pytesseract import TesseractError
from .pytesseract import TesseractNotFoundError
Expand Down
68 changes: 61 additions & 7 deletions pytesseract/pytesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
from pkgutil import find_loader
from tempfile import NamedTemporaryFile
from time import sleep
from typing import List
from typing import Optional

from packaging.version import InvalidVersion
from packaging.version import parse
Expand Down Expand Up @@ -65,6 +67,13 @@
'Script confidence': ('script_conf', float),
}

# Extra Tesseract configuration that run_and_get_multiple_output adds for each
# requested output extension. Extensions missing from this mapping add nothing.
# NOTE: the name is misspelled ("EXTENTION"); it is kept as-is because
# run_and_get_multiple_output looks the constant up under this exact name.
EXTENTION_TO_CONFIG = {
'box': 'tessedit_create_boxfile=1 batch.nochop makebox',
'xml': 'tessedit_create_alto=1',
'hocr': 'tessedit_create_hocr=1',
'tsv': 'tessedit_create_tsv=1',
}

TESSERACT_MIN_VERSION = Version('3.05')
TESSERACT_ALTO_VERSION = Version('4.1.0')

Expand Down Expand Up @@ -252,8 +261,9 @@ def run_tesseract(
if config:
cmd_args += shlex.split(config, posix=not_windows)

if extension and extension not in {'box', 'osd', 'tsv', 'xml'}:
cmd_args.append(extension)
for _extension in extension.split():
if _extension not in {'box', 'osd', 'tsv', 'xml'}:
cmd_args.append(_extension)
LOGGER.debug('%r', cmd_args)

try:
Expand All @@ -269,6 +279,51 @@ def run_tesseract(
raise TesseractError(proc.returncode, get_errors(error_string))


def _read_output(filename: str, return_bytes: bool = False):
    """Return the contents of ``filename`` as raw bytes or decoded text.

    The file is always opened in binary mode. When ``return_bytes`` is true
    the bytes are returned untouched; otherwise they are decoded with
    ``DEFAULT_ENCODING``.
    """
    with open(filename, 'rb') as output_file:
        raw = output_file.read()
    return raw if return_bytes else raw.decode(DEFAULT_ENCODING)


def _build_multiple_output_config(extensions: List[str]) -> str:
    """Return the Tesseract config string needed to emit ``extensions``."""
    variables = []
    config_files = []
    for extension in extensions:
        for token in EXTENTION_TO_CONFIG.get(extension, '').split():
            # ``VAR=VALUE`` tokens are parameters; anything else names a
            # Tesseract config file (e.g. ``makebox``).
            if '=' in token:
                variables.append(token)
            else:
                config_files.append(token)

    # Tesseract's ``-c`` option accepts a single VAR=VALUE, so every variable
    # gets its own flag. Options are emitted before config file names because
    # config files are positional arguments on the Tesseract command line.
    return ' '.join([f'-c {variable}' for variable in variables] + config_files)


def run_and_get_multiple_output(
    image,
    extensions: List[str],
    lang: Optional[str] = None,
    nice: int = 0,
    timeout: int = 0,
    return_bytes: bool = False,
):
    """Run Tesseract once and return one output per requested extension.

    Args:
        image: Input handed to ``save`` to obtain the file Tesseract reads.
        extensions: Output extensions to produce in a single run.
        lang: Forwarded to ``run_tesseract`` as ``lang``.
        nice: Forwarded to ``run_tesseract`` as ``nice``.
        timeout: Forwarded to ``run_tesseract`` as ``timeout``.
        return_bytes: When true, every output is returned as bytes instead of
            decoded text.

    Returns:
        A list with one entry per item of ``extensions``, in the same order.
        ``pdf`` and ``hocr`` outputs are always returned as bytes, regardless
        of ``return_bytes``.
    """
    config = _build_multiple_output_config(extensions)

    with save(image) as (temp_name, input_filename):
        kwargs = {
            'input_filename': input_filename,
            'output_filename_base': temp_name,
            # run_tesseract splits this string back into single extensions.
            'extension': ' '.join(extensions),
            'lang': lang,
            'config': config,
            'nice': nice,
            'timeout': timeout,
        }

        run_tesseract(**kwargs)

        # Read the outputs while still inside the ``save`` context, which
        # provided the temporary file names used here.
        return [
            _read_output(
                f"{kwargs['output_filename_base']}{extsep}{extension}",
                extension in {'pdf', 'hocr'} or return_bytes,
            )
            for extension in extensions
        ]


def run_and_get_output(
image,
extension='',
Expand All @@ -290,11 +345,10 @@ def run_and_get_output(
}

run_tesseract(**kwargs)
filename = f"{kwargs['output_filename_base']}{extsep}{extension}"
with open(filename, 'rb') as output_file:
if return_bytes:
return output_file.read()
return output_file.read().decode(DEFAULT_ENCODING)
return _read_output(
f"{kwargs['output_filename_base']}{extsep}{extension}",
return_bytes,
)


def file_to_dict(tsv, cell_delimiter, str_col_idx):
Expand Down
40 changes: 40 additions & 0 deletions tests/pytesseract_test.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from functools import partial
from glob import iglob
from multiprocessing import Pool
from os import getcwd
Expand All @@ -20,6 +21,7 @@
from pytesseract import image_to_pdf_or_hocr
from pytesseract import image_to_string
from pytesseract import Output
from pytesseract import run_and_get_multiple_output
from pytesseract import TesseractNotFoundError
from pytesseract import TSVNotSupported
from pytesseract.pytesseract import file_to_dict
Expand Down Expand Up @@ -73,6 +75,17 @@ def test_file_small():
return path.join(DATA_DIR, 'test-small.jpg')


@pytest.fixture(scope='session')
def function_mapping():
    # Single-output reference function for each extension. The pdf and hocr
    # entries share image_to_pdf_or_hocr and differ only in the bound
    # ``extension`` keyword.
    to_pdf = partial(image_to_pdf_or_hocr, extension='pdf')
    to_hocr = partial(image_to_pdf_or_hocr, extension='hocr')
    return dict(
        pdf=to_pdf,
        txt=image_to_string,
        box=image_to_boxes,
        hocr=to_hocr,
        tsv=image_to_data,
    )


@pytest.mark.parametrize(
'test_file',
[
Expand Down Expand Up @@ -227,6 +240,33 @@ def test_image_to_pdf_or_hocr(test_file, extension):
assert result.endswith('</html>')


@pytest.mark.parametrize(
    'extensions',
    [
        ['tsv', 'pdf', 'txt', 'box', 'hocr'],
        # Extensions that do not add any config params. This case is kept as
        # its own parameter set rather than merged with the one above because
        # merging might lead to a race condition in which results from
        # different parameters get mixed in the test below.
        ['pdf', 'txt'],
    ],
)
def test_run_and_get_multiple_output(test_file, function_mapping, extensions):
    # Each output of the combined run must match what the dedicated
    # single-output function returns for the same file.
    outputs = run_and_get_multiple_output(test_file, extensions=extensions)
    for actual, extension in zip(outputs, extensions):
        expected = function_mapping[extension](test_file)
        if extension == 'pdf':
            # The pdf creation time could differ between the two runs, so
            # only the first 1000 items of each result are compared.
            actual, expected = actual[:1000], expected[:1000]
        assert actual == expected


@pytest.mark.skipif(
TESSERACT_VERSION[:2] < (4, 1),
reason='requires tesseract >= 4.1',
Expand Down

0 comments on commit cb83840

Please sign in to comment.