diff --git a/i18n/en.json b/i18n/en.json index 08ee8a2c..c19e12b0 100644 --- a/i18n/en.json +++ b/i18n/en.json @@ -32,6 +32,7 @@ "report-issue": "Report an issue", "langs-placeholder": "Leave blank for automatic language detection.", "langs-param-error": "The following {{PLURAL:$1|language is|languages are}} not supported by the OCR engine: $2", + "normalize-ocr-text": "Normalize the text from OCR", "tesseract-options": "Tesseract options", "tesseract-psm-label": "Page segmentation method", "tesseract-psm-help": "Try \"Sparse text\" for better multi-column support.", diff --git a/i18n/qqq.json b/i18n/qqq.json index 9f4dd451..d4d999d1 100644 --- a/i18n/qqq.json +++ b/i18n/qqq.json @@ -39,6 +39,7 @@ "report-issue": "Link text in the footer for the issue-reporting link.", "langs-placeholder": "Placeholder text for the language input field.", "langs-param-error": "Error message displayed when invalid language(s) are submitted.\n\nParameters:\n* $1 – number of invalid languages\n* $2 - the list of invalid languages\n\nOCR is a common abbreviation in English for \"Optical Characters Recognition\".", + "normalize-ocr-text": "Normalize the text from OCR (replaces long s and some other historic characters)", "tesseract-options": "Heading for Tesseract-specific options.", "tesseract-psm-label": "Form label for the Tesseract page segmentation mode.", "tesseract-psm-help": "Help text for the Tesseract page segmentation mode option. 
'Sparse text' refers to options, see messages:\n* {{msg-wm|Wikimedia-ocr-tesseract-psm-11}} and\n* {{msg-wm|Wikimedia-ocr-tesseract-psm-12}}.", diff --git a/src/Controller/OcrController.php b/src/Controller/OcrController.php index 99c3ca88..fe1da5a5 100644 --- a/src/Controller/OcrController.php +++ b/src/Controller/OcrController.php @@ -60,6 +60,7 @@ class OcrController extends AbstractController { 'image' => '', 'engine' => self::DEFAULT_ENGINE, 'langs' => [], + 'normalize' => false, 'psm' => TesseractEngine::DEFAULT_PSM, 'crop' => [], 'line_id' => TranskribusEngine::DEFAULT_LINEID, @@ -112,6 +113,7 @@ private function setup(): void { } static::$params['langs'] = $this->getLangs( $this->request ); static::$params['image_hosts'] = $this->engine->getImageHosts(); + static::$params['normalize'] = $this->request->query->getBoolean( 'normalize' ); $crop = $this->request->query->get( 'crop' ); if ( !is_array( $crop ) ) { $crop = []; @@ -228,6 +230,12 @@ public function homeAction(): Response { * @OA\JsonContent(type="array", @OA\Items(type="string")) * ) * @OA\Parameter( + * name="normalize", + * in="query", + * description="Normalize OCR text.", + * @OA\Schema(type="boolean") + * ) + * @OA\Parameter( * name="psm", * in="query", * description="The Page Segmentation Mode for Tesseract.", @@ -365,6 +373,9 @@ private function getResult( string $invalidLangsMode ): EngineResult { if ( !$result instanceof EngineResult ) { throw new Exception( 'Incorrect (possibly cached) result: ' . 
var_export( $result, true ) ); } + if ( static::$params['normalize'] ) { + $result->normalize(); + } return $result; } } diff --git a/src/Engine/EngineResult.php b/src/Engine/EngineResult.php index 04745393..d2577905 100644 --- a/src/Engine/EngineResult.php +++ b/src/Engine/EngineResult.php @@ -35,4 +35,19 @@ public function getText(): string { public function getWarnings(): array { return $this->warnings; } + + /** + * Normalize the result text in place by replacing some historic characters with modern equivalents. + */ + public function normalize(): void { + $this->text = strtr( $this->text, [ + 'ſ' => 's', + 'ꝛ' => 'r', + 'ℳ' => 'M', + 'aͤ' => 'ä', + 'oͤ' => 'ö', + 'uͤ' => 'ü', + '⸗' => '-', + ] ); + } } diff --git a/templates/output.html.twig b/templates/output.html.twig index a6cda1a6..cf336ed3 100644 --- a/templates/output.html.twig +++ b/templates/output.html.twig @@ -56,6 +56,10 @@ {% include '_transkribus_help.html.twig' with {engine: engine} %} +