Add optional parameters for Tesseract OCR (#1154)

robocorp · Dec 6, 2024 · 283bb92 · 283bb92
1 parent 445c1fa
commit 283bb92
Show file tree

Hide file tree

Showing 4 changed files with 38 additions and 8 deletions.
diff --git a/packages/core/src/RPA/core/locators/containers.py b/packages/core/src/RPA/core/locators/containers.py
@@ -124,6 +124,8 @@ class OcrLocator(Locator):
     confidence: Optional[float] = None
     """3-character ISO 639-2 language code. Passed to pytesseract lang parameter."""
     language: Optional[str] = None
+    """Tesseract-specific parameters (like psm or oem). Passed to pytesseract config parameter."""
+    configuration: Optional[str] = None
 
     def __post_init__(self):
         self.text = str(self.text)

diff --git a/packages/main/src/RPA/Desktop/keywords/finder.py b/packages/main/src/RPA/Desktop/keywords/finder.py
@@ -199,12 +199,14 @@ def _find_ocr(self, base: Geometry, locator: OcrLocator) -> List[Region]:
 
         confidence = locator.confidence or self.confidence
         language = locator.language
+        configuration = locator.configuration
         self.logger.info(
-            "Searching for text '%s' (region: %s, confidence: %.1f, language: %s)",
+            "Searching for text '%s' (region: %s, confidence: %.1f, language: %s, configuration: %s)",
             locator.text,
             region or "display",
             confidence,
             language or "Not set",
+            configuration or "Not set",
         )
 
         def finder(image: Image.Image) -> List[Region]:
@@ -214,6 +216,7 @@ def finder(image: Image.Image) -> List[Region]:
                 confidence=confidence,
                 region=region,
                 language=language,
+                configuration=configuration,
             )
 
             return [match["region"] for match in matches]

diff --git a/packages/main/src/RPA/Desktop/keywords/text.py b/packages/main/src/RPA/Desktop/keywords/text.py
@@ -21,13 +21,19 @@ class TextKeywords(LibraryContext):
     """Keywords for reading screen information and content."""
 
     @keyword
-    def read_text(self, locator: Optional[str] = None, invert: bool = False):
+    def read_text(self, locator: Optional[str] = None, invert: bool = False, language: str = None, configuration: str = None):
         """Read text using OCR from the screen, or an area of the
         screen defined by the given locator.
 
         :param locator: Location of element to read text from
         :param invert:  Invert image colors, useful for reading white text
                         on dark background
+        :param language: 3-character ISO 639-2 language code of the text.
+        This is passed directly to the pytesseract lib in the lang parameter.
+         See https://tesseract-ocr.github.io/tessdoc/Command-Line-Usage.html#using-one-language
+        :param configuration: Tesseract-specific parameters such as Page Segmentation Mode (psm) or OCR Engine Mode (oem).
+        This is passed directly to the pytesseract lib in the config parameter.
+         See https://tesseract-ocr.github.io/tessdoc/Command-Line-Usage.html
 
         Usage examples:
 
@@ -51,19 +57,27 @@ def read_text(self, locator: Optional[str] = None, invert: bool = False):
             if not isinstance(element, Region):
                 raise ValueError("Locator must resolve to a region")
 
-            self.logger.info("Reading text from element: %s", element)
+            area = "element: %s" % element
             image = screen.grab(element)
         else:
-            self.logger.info("Reading text from screen")
+            area = "screen"
             image = screen.grab()
 
         screen.log_image(image)
 
         if invert:
             image = ImageOps.invert(image)
 
+        self.logger.info(
+            "Reading text from %s (invert: %s, language: %s, configuration: %s)",
+            area,
+            invert or "Not set",
+            language or "Not set",
+            configuration or "Not set",
+        )
+
         start_time = time.time()
-        text = ocr.read(image)
+        text = ocr.read(image, language, configuration)
         self.logger.info("Read text in %.2f seconds", time.time() - start_time)
 
         return text
diff --git a/packages/recognition/src/RPA/recognition/ocr.py b/packages/recognition/src/RPA/recognition/ocr.py
@@ -23,15 +23,25 @@
 DEFAULT_CONFIDENCE = 80.0
 
 
-def read(image: Union[Image.Image, Path]):
+def read(
+    image: Union[Image.Image, Path],
+    language: Optional[str] = None,
+    configuration: Optional[str] = None
+):
     """Scan image for text and return it as one string.
 
     :param image: Path to image or Image object
+    :param language: 3-character ISO 639-2 language code of the text.
+    This is passed directly to the pytesseract lib in the lang parameter.
+     See https://tesseract-ocr.github.io/tessdoc/Command-Line-Usage.html#using-one-language
+    :param configuration: Tesseract-specific parameters such as Page Segmentation Mode (psm) or OCR Engine Mode (oem).
+    This is passed directly to the pytesseract lib in the config parameter.
+     See https://tesseract-ocr.github.io/tessdoc/Command-Line-Usage.html
     """
     image = to_image(image)
 
     try:
-        return pytesseract.image_to_string(image).strip()
+        return pytesseract.image_to_string(image, lang=language, config=configuration).strip()
     except TesseractNotFoundError as err:
         raise EnvironmentError(INSTALL_PROMPT) from err
 
@@ -42,6 +52,7 @@ def find(
     confidence: float = DEFAULT_CONFIDENCE,
     region: Optional[Region] = None,
     language: Optional[str] = None,
+    configuration: Optional[str] = None
 ):
     """Scan image for text and return a list of regions
     that contain it (or something close to it).
@@ -67,7 +78,7 @@ def find(
 
     try:
         data = pytesseract.image_to_data(
-            image, lang=language, output_type=pytesseract.Output.DICT
+            image, lang=language, config=configuration, output_type=pytesseract.Output.DICT
         )
     except TesseractNotFoundError as err:
         raise EnvironmentError(INSTALL_PROMPT) from err