Skip to content

Commit

Permalink
src/__init__.py tests/test_tesseract.py: fix tesseract on rebased.
Browse files (browse the repository at this point in the history)
Also extended test_tesseract() to assert that page.get_textpage_ocr() succeeds
if TESSDATA_PREFIX is set in the environment.
  • Loading branch information
julian-smith-artifex-com committed Sep 8, 2023
1 parent a3e3335 commit 63af0ed
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 15 deletions.
4 changes: 2 additions & 2 deletions src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9587,14 +9587,14 @@ def pdfocr_save(self, filename, compress=1, language=None, tessdata=None):
'''
if not TESSDATA_PREFIX and not tessdata:
raise RuntimeError('No OCR support: TESSDATA_PREFIX not set')
opts = mupdf.PdfocrOptions()
opts = mupdf.FzPdfocrOptions()
opts.compress = compress;
if language:
opts.language_set2( language)
if tessdata:
opts.datadir_set2( tessdata)
pix = self.this
if filename:
if isinstance(filename, str):
mupdf.fz_save_pixmap_as_pdfocr( pix, filename, 0, opts)
else:
out = JM_new_output_fileptr( filename)
Expand Down
1 change: 1 addition & 0 deletions src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import math
import os
import typing
import weakref

from . import fitz
try:
Expand Down
37 changes: 24 additions & 13 deletions tests/test_tesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,32 @@

def test_tesseract():
'''
This checks that MuPDF has been built with tesseract support. We don't
(yet) attempt to supply a valid `tessdata` directory.
This checks that MuPDF has been built with tesseract support.
By default we don't supply a valid `tessdata` directory, and just assert
that attempting to use Tesseract raises the expected error (which checks
that MuPDF is built with Tesseract support).
But if TESSDATA_PREFIX is set in the environment, we assert that
FzPage.get_textpage_ocr() succeeds.
'''
if hasattr(fitz, 'mupdf'):
print(f'Not running test_tesseract() on rebased because tesseract not yet supported.')
return
path = os.path.abspath( f'{__file__}/../resources/2.pdf')
doc = fitz.open( path)
page = doc[5]
e_expected = 'OCR initialisation failed'
try:
tp = page.get_textpage_ocr(full=True, tessdata='/foo/bar')
except Exception as e:
ee = str(e)
print(f'Received expected exception: {e}')
assert ee == e_expected, f'Unexpected exception: {ee!r}'
e_expected = (
'OCR initialisation failed',
'code=2: OCR initialisation failed',
)
tessdata_prefix = os.environ.get('TESSDATA_PREFIX')
if tessdata_prefix:
tp = page.get_textpage_ocr(full=True)
print(f'test_tesseract(): page.get_textpage_ocr() succeeded')
else:
assert 0, f'Expected exception {e_expected!r}'
try:
tp = page.get_textpage_ocr(full=True, tessdata='/foo/bar')
except Exception as e:
ee = str(e)
print(f'Received expected exception: {e}')
assert ee in e_expected, f'Unexpected exception: {ee!r}'
else:
assert 0, f'Expected exception {e_expected!r}'

0 comments on commit 63af0ed

Please sign in to comment.