diff --git a/i18n/en.json b/i18n/en.json index 999359b..be4a93e 100644 --- a/i18n/en.json +++ b/i18n/en.json @@ -36,6 +36,7 @@ "kraken-segmentation-model-label": "Kraken segmentation model", "langs-placeholder": "Leave blank for automatic language detection.", "langs-param-error": "The following {{PLURAL:$1|language is|languages are}} not supported by the OCR engine: $2", + "normalize-ocr-text": "Normalize the text from OCR", "tesseract-options": "Tesseract options", "tesseract-psm-label": "Page segmentation method", "tesseract-psm-help": "Try \"Sparse text\" for better multi-column support.", diff --git a/i18n/qqq.json b/i18n/qqq.json index 58bed87..fe2fe06 100644 --- a/i18n/qqq.json +++ b/i18n/qqq.json @@ -40,6 +40,7 @@ "report-issue": "Link text in the footer for the issue-reporting link.", "langs-placeholder": "Placeholder text for the language input field.", "langs-param-error": "Error message displayed when invalid language(s) are submitted.\n\nParameters:\n* $1 – number of invalid languages\n* $2 - the list of invalid languages\n\nOCR is a common abbreviation in English for \"Optical Characters Recognition\".", + "normalize-ocr-text": "Label for the option to normalize the text from OCR (normalizing replaces long s and some other historic characters).", "kraken-options": "Heading for kraken-specific options.", "kraken-segmentation-model-help": "Help text for the selection of a kraken segmentation model.", "kraken-segmentation-model-label": "Form label for the selection of a kraken segmentstion model.", diff --git a/src/Controller/OcrController.php b/src/Controller/OcrController.php index db9b107..990c7f8 100644 --- a/src/Controller/OcrController.php +++ b/src/Controller/OcrController.php @@ -61,6 +61,7 @@ class OcrController extends AbstractController { 'image' => '', 'engine' => self::DEFAULT_ENGINE, 'langs' => [], + 'normalize' => false, 'psm' => TesseractEngine::DEFAULT_PSM, 'crop' => [], 'line_id' => TranskribusEngine::DEFAULT_LINEID, @@ -114,6 +115,7 @@ private function setup(): void { }
static::$params['langs'] = $this->getLangs( $this->request ); static::$params['image_hosts'] = $this->engine->getImageHosts(); + static::$params['normalize'] = $this->request->query->getBoolean( 'normalize' ); $crop = $this->request->query->get( 'crop' ); if ( !is_array( $crop ) ) { $crop = []; @@ -240,6 +242,12 @@ public function homeAction(): Response { * @OA\Schema(type="array", @OA\Items(type="string")) * ) * @OA\Parameter( + * name="normalize", + * in="query", + * description="Normalize OCR text.", + * @OA\Schema(type="boolean") + * ) + * @OA\Parameter( * name="segmentation_model", * in="query", * description="The segmentation model for kraken.", @@ -424,6 +432,9 @@ private function getResult( string $invalidLangsMode ): EngineResult { if ( !$result instanceof EngineResult ) { throw new Exception( 'Incorrect (possibly cached) result: ' . var_export( $result, true ) ); } + if ( static::$params['normalize'] ) { + $result->normalize(); + } return $result; } } diff --git a/src/Engine/EngineResult.php b/src/Engine/EngineResult.php index 0474539..d257790 100644 --- a/src/Engine/EngineResult.php +++ b/src/Engine/EngineResult.php @@ -35,4 +35,19 @@ public function getText(): string { public function getWarnings(): array { return $this->warnings; } + + /** + * Normalize the result text in place by replacing some historic characters (e.g. long s) with modern equivalents. + */ + public function normalize(): void { + $this->text = strtr( $this->text, [ + 'ſ' => 's', + 'ꝛ' => 'r', + 'ℳ' => 'M', + 'aͤ' => 'ä', + 'oͤ' => 'ö', + 'uͤ' => 'ü', + '⸗' => '-', + ] ); + } } diff --git a/templates/output.html.twig b/templates/output.html.twig index 2894abc..62f6e0a 100644 --- a/templates/output.html.twig +++ b/templates/output.html.twig @@ -62,6 +62,10 @@ {% include '_transkribus_help.html.twig' with {engine: engine} %} +