src/__init__.py tests/test_tesseract.py: fix tesseract on rebased.

Also extended test_tesseract() to assert that page.get_textpage_ocr() succeeds if TESSDATA_PREFIX is set in the environment.
ArtifexSoftware · Sep 8, 2023 · 63af0ed
1 parent a3e3335
commit 63af0ed
Show file tree

Hide file tree

Showing 3 changed files with 27 additions and 15 deletions.
diff --git a/src/__init__.py b/src/__init__.py
@@ -9587,14 +9587,14 @@ def pdfocr_save(self, filename, compress=1, language=None, tessdata=None):
         '''
         if not TESSDATA_PREFIX and not tessdata:
             raise RuntimeError('No OCR support: TESSDATA_PREFIX not set')
-        opts = mupdf.PdfocrOptions()
+        opts = mupdf.FzPdfocrOptions()
         opts.compress = compress;
         if language:
             opts.language_set2( language)
         if tessdata:
             opts.datadir_set2( tessdata)
         pix = self.this
-        if filename:
+        if isinstance(filename, str):
             mupdf.fz_save_pixmap_as_pdfocr( pix, filename, 0, opts)
         else:
             out = JM_new_output_fileptr( filename)

diff --git a/src/utils.py b/src/utils.py
@@ -10,6 +10,7 @@
 import math
 import os
 import typing
+import weakref
 
 from . import fitz
 try:

diff --git a/tests/test_tesseract.py b/tests/test_tesseract.py
@@ -3,21 +3,32 @@
 
 def test_tesseract():
     '''
-    This checks that MuPDF has been built with tesseract support. We don't
-    (yet) attempt to supply a valid `tessdata` directory.
+    This checks that MuPDF has been built with tesseract support.
+
+    By default we don't supply a valid `tessdata` directory, and just assert
+    that attempting to use Tesseract raises the expected error (which checks
+    that MuPDF is built with Tesseract support).
+
+    But if TESSDATA_PREFIX is set in the environment, we assert that
+    page.get_textpage_ocr() succeeds.
     '''
-    if hasattr(fitz, 'mupdf'):
-        print(f'Not running test_tesseract() on rebased because tesseract not yet supported.')
-        return
     path = os.path.abspath( f'{__file__}/../resources/2.pdf')
     doc = fitz.open( path)
     page = doc[5]
-    e_expected = 'OCR initialisation failed'
-    try:
-        tp = page.get_textpage_ocr(full=True, tessdata='/foo/bar')
-    except Exception as e:
-        ee = str(e)
-        print(f'Received expected exception: {e}')
-        assert ee == e_expected, f'Unexpected exception: {ee!r}'
+    e_expected = (
+            'OCR initialisation failed',
+            'code=2: OCR initialisation failed',
+            )
+    tessdata_prefix = os.environ.get('TESSDATA_PREFIX')
+    if tessdata_prefix:
+        tp = page.get_textpage_ocr(full=True)
+        print(f'test_tesseract(): page.get_textpage_ocr() succeeded')
     else:
-        assert 0, f'Expected exception {e_expected!r}'
+        try:
+            tp = page.get_textpage_ocr(full=True, tessdata='/foo/bar')
+        except Exception as e:
+            ee = str(e)
+            print(f'Received expected exception: {e}')
+            assert ee in e_expected, f'Unexpected exception: {ee!r}'
+        else:
+            assert 0, f'Expected exception {e_expected!r}'