Skip to content

Commit

Permalink
Tesseract: Stop throwing an exception for images with no text
Browse files Browse the repository at this point in the history
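The Tesseract wrapper library we use throws an exception when the
command doesn't produce any output (i.e. an output file of size
0), which is what happens for an image that contains no text. We
can't just catch that exception class, because the library also uses
it for other conditions, so this checks the exception message: the
no-output case now returns an empty result with a warning, and any
other instance of the exception is re-thrown unchanged.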

Bug: T373161
  • Loading branch information
samwilson authored Aug 24, 2024
1 parent 7706aae commit d1cbe55
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 1 deletion.
1 change: 1 addition & 0 deletions i18n/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
"tesseract-psm-12": "Sparse text with OSD.",
"tesseract-psm-13": "Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.",
"tesseract-param-error": "The '$1' option with a value of $2 is not supported by Tesseract. Maximum value: $3",
"tesseract-no-text-error": "The Tesseract engine did not return any text for this image.",
"tesseract-internal-error": "The tesseract engine returned an internal error.",
"transkribus-language-code": "Language Model",
"transkribus-unauthorized-error": "Error Code '$1' :: The request is not authorized",
Expand Down
1 change: 1 addition & 0 deletions i18n/qqq.json
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
"tesseract-psm-12": "Form option for Tesseract page segmentation mode.",
"tesseract-psm-13": "Form option for Tesseract page segmentation mode.",
"tesseract-param-error": "Error message displayed when invalid values for Tesseract options are submitted.\n\nParameters:\n* $1 – the form label for the option. Currently, the only possible value here is {{msg-wm|wikimedia-ocr-tesseract-psm-label}}.\n* $2 – The value that was given.\n* $3 – the maximum value for the option (this will be an integer).",
"tesseract-no-text-error": "Error message displayed when Tesseract returned no text (e.g. for an image that has no text in it).",
"tesseract-internal-error": "Generic error message displayed when the tesseract command fails.",
"transkribus-language-code": "Form label for the 'Language Model' field for the Transkribus engine",
"transkribus-unauthorized-error": "Error message displayed when Transkribus access token has expired or login credentials are invalid.",
Expand Down
13 changes: 12 additions & 1 deletion src/Engine/TesseractEngine.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
use Krinkle\Intuition\Intuition;
use Symfony\Contracts\HttpClient\HttpClientInterface;
use thiagoalessio\TesseractOCR\TesseractOCR;
use thiagoalessio\TesseractOCR\UnsuccessfulCommandException;

class TesseractEngine extends EngineBase {

Expand Down Expand Up @@ -69,7 +70,17 @@ public function getResult(
// so we have to load this one manually. We only process one image at a time, so don't benefit from
// multiple threads. See https://github.com/tesseract-ocr/tesseract/issues/898 for some more info.
putenv( 'OMP_THREAD_LIMIT=1' );
$text = $this->ocr->run();
try {
$text = $this->ocr->run();
} catch ( UnsuccessfulCommandException $e ) {
// An UnsuccessfulCommandException is thrown when there's no output, but that's not an
// actual error so we check for it here and just show a warning. The same exception class
// is also used for other things, hence the message check here.
if ( strpos( $e->getMessage(), 'The command did not produce any output' ) !== false ) {
return new EngineResult( '', [ $this->intuition->msg( 'tesseract-no-text-error' ) ] );
}
throw $e;
}

$warnings = $invalidLangs ? [ $this->getInvalidLangsWarning( $invalidLangs ) ] : [];
return new EngineResult( $text, $warnings );
Expand Down

0 comments on commit d1cbe55

Please sign in to comment.