Dual language support for eng and pol. Future multi lang implementations planned
ArturOle · Nov 11, 2024 · d2cded5 · d2cded5
1 parent 4f8c7ce
commit d2cded5
Show file tree

Hide file tree

Showing 12 changed files with 586 additions and 152 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,13 @@
+.pytest_cache/
+.vscode
+logs.log
+__pycache__/
+.python-version
+data/**/*.pdf
+Ragger.egg-info
+!data/pdf-ai-generated/*
+.coverage
+src/context_search/utils/purge_whitespaces.py
+.venv_cs
+ContextSearch.egg-info
+build/
diff --git a/.github/workflows/unit_tests.yaml → .github/workflows/tests.yaml b/.github/workflows/unit_tests.yaml → .github/workflows/tests.yaml
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -20,7 +20,7 @@ pytesseract = "^0.3.13"
 tqdm = "^4.66.5"
 numpy = ">=1.21,<2"
 fastembed = "0.4.1"
-fasttext-langdetect = "^1.0.5"
+fast-langdetect = "^0.2.2"
 
 [tool.poetry.dev-dependencies]
 pytest = "^8.3.2"

diff --git a/src/context_search/communicator/communicator.py b/src/context_search/communicator/communicator.py
@@ -4,12 +4,9 @@
 from .query_builder import QueryBuilder
 
 from abc import ABC, abstractmethod
-from enum import Enum
-
-logger = setup_logger("Communicator Logger", "logs.log")
 
 
-class SupportedDatabses(Enum):
+logger = setup_logger("Communicator Logger", "logs.log")
 
 
 class DatabaseNotSupportedError(BaseException):
@@ -27,7 +24,6 @@ def driver(self):
         pass
 
     @abstractmethod
-    @staticmethod
     def connection():
         pass
 

diff --git a/src/context_search/reader/lang_adapter.py b/src/context_search/reader/lang_adapter.py
@@ -0,0 +1,151 @@
+SUPPORTED_LENGUAGES = {
+    "pl": "pol",
+    "en": "eng",
+}
+
+
+class LangAdapt:
+    @staticmethod
+    def map(lang_code: str):
+        return SUPPORTED_LENGUAGES.get(lang_code, )
+
+
+'''
+lang_detect_support = """
+af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr
+ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa
+fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo
+io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt
+lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl
+nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco
+sd sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec
+vep vi vls vo wa war wuu xal xmf yi yo yue zh"""
+
+# ISO-639 set 3
+tesseract_support = """afr	Afrikaans	x	x	x	x	x	x
+amh	Amharic	 	x	x	x	x	x
+ara	Arabic	x	x	x	x	x	x
+asm	Assamese	 	x	x	x	x	x
+aze	Azerbaijani	 	x	x	x	x	x
+aze_cyrl	Azerbaijani - Cyrilic	x	x	x	x	x	x
+bel	Belarusian	x	x	x	x	x	x
+ben	Bengali	x	x	x	x	x	x
+bod	Tibetan	 	x	x	x	x	x
+bos	Bosnian	 	x	x	x	x	x
+bre	Breton	 	 	x	x	x	x
+bul	Bulgarian	x	x	x	x	x	x
+cat	Catalan; Valencian	x	x	x	x	x	x
+ceb	Cebuano	 	x	x	x	x	x
+ces	Czech	x	x	x	x	x	x
+chi_sim	Chinese - Simplified	x	x	x	x	x	x
+chi_tra	Chinese - Traditional	x	x	x	x	x	x
+chr	Cherokee	x	x	x	x	x	x
+cos	Corsican	 	 	 	x	x	x
+cym	Welsh	 	x	x	x	x	x
+dan	Danish	x	x	x	x	x	x
+dan_frak	Danish - Fraktur (contrib)	x	x	 	 	 	 
+deu	German	x	x	x	x	x	x
+deu_frak	German - Fraktur (contrib)	x	x	 	 	 	 
+deu_latf	German (Fraktur Latin)	 	 	x	x	x	x
+dzo	Dzongkha	 	x	x	x	x	x
+ell	Greek, Modern (1453-)	x	x	x	x	x	x
+eng	English	x	x	x	x	x	x
+enm	English, Middle (1100-1500)	x	x	x	x	x	x
+epo	Esperanto	x	x	x	x	x	x
+equ	Math / equation detection module	x	x	 	x	x	x
+est	Estonian	x	x	x	x	x	x
+eus	Basque	x	x	x	x	x	x
+fao	Faroese	 	 	 	x	x	x
+fas	Persian	 	x	x	x	x	x
+fil	Filipino (old - Tagalog)	 	 	 	x	x	x
+fin	Finnish	x	x	x	x	x	x
+fra	French	x	x	x	x	x	x
+frk	German - Fraktur (now deu_latf)	x	x	x	x	x	x
+frm	French, Middle (ca.1400-1600)	x	x	x	x	x	x
+fry	Western Frisian	 	 	 	x	x	x
+gla	Scottish Gaelic	 	 	 	x	x	x
+gle	Irish	 	x	x	x	x	x
+glg	Galician	x	x	x	x	x	x
+grc	Greek, Ancient (to 1453) (contrib)	x	x	x	x	x	x
+guj	Gujarati	 	x	x	x	x	x
+hat	Haitian; Haitian Creole	 	x	x	x	x	x
+heb	Hebrew	x	x	x	x	x	x
+hin	Hindi	x	x	x	x	x	x
+hrv	Croatian	x	x	x	x	x	x
+hun	Hungarian	x	x	x	x	x	x
+hye	Armenian	 	 	 	x	x	x
+iku	Inuktitut	 	x	x	x	x	x
+ind	Indonesian	x	x	x	x	x	x
+isl	Icelandic	x	x	x	x	x	x
+ita	Italian	x	x	x	x	x	x
+ita_old	Italian - Old	x	x	x	x	x	x
+jav	Javanese	 	x	x	x	x	x
+jpn	Japanese	x	x	x	x	x	x
+kan	Kannada	x	x	x	x	x	x
+kat	Georgian	 	x	x	x	x	x
+kat_old	Georgian - Old	 	x	x	x	x	x
+kaz	Kazakh	 	x	x	x	x	x
+khm	Central Khmer	 	x	x	x	x	x
+kir	Kirghiz; Kyrgyz	 	x	x	x	x	x
+kmr	Kurmanji (Kurdish - Latin Script)	 	 	x	x	x	x
+kor	Korean	x	x	x	x	x	x
+kor_vert	Korean (vertical)	 	 	x	x	x	x
+kur	Kurdish (Arabic Script)	 	x	 	 	 	 
+lao	Lao	 	x	x	x	x	x
+lat	Latin	 	x	x	x	x	x
+lav	Latvian	x	x	x	x	x	x
+lit	Lithuanian	x	x	x	x	x	x
+ltz	Luxembourgish	 	 	x	x	x	x
+mal	Malayalam	x	x	x	x	x	x
+mar	Marathi	 	x	x	x	x	x
+mkd	Macedonian	x	x	x	x	x	x
+mlt	Maltese	x	x	x	x	x	x
+mon	Mongolian	 	 	x	x	x	x
+mri	Maori	 	 	x	x	x	x
+msa	Malay	x	x	x	x	x	x
+mya	Burmese	 	x	x	x	x	x
+nep	Nepali	 	x	x	x	x	x
+nld	Dutch; Flemish	x	x	x	x	x	x
+nor	Norwegian	x	 	x	x	x	x
+oci	Occitan (post 1500)	 	x	x	x	x	x
+ori	Oriya	 	x	x	x	x	x
+osd	Orientation and script detection module	x	x	x	x	x	x
+pan	Panjabi; Punjabi	 	x	x	x	x	x
+pol	Polish	x	x	x	x	x	x
+por	Portuguese	x	x	x	x	x	x
+pus	Pushto; Pashto	 	x	x	x	x	x
+que	Quechua	 	 	x	x	x	x
+ron	Romanian; Moldavian; Moldovan	x	x	x	x	x	x
+rus	Russian	x	x	x	x	x	x
+san	Sanskrit	 	x	x	x	x	x
+sin	Sinhala; Sinhalese	 	x	x	x	x	x
+slk	Slovak	x	x	x	x	x	x
+slk_frak	Slovak - Fraktur (contrib)	x	x	 	 	 	 
+slv	Slovenian	x	x	x	x	x	x
+snd	Sindhi	 	 	x	x	x	x
+spa	Spanish; Castilian	x	x	x	x	x	x
+spa_old	Spanish; Castilian - Old	x	x	x	x	x	x
+sqi	Albanian	x	x	x	x	x	x
+srp	Serbian	x	x	x	x	x	x
+srp_latn	Serbian - Latin	 	x	x	x	x	x
+sun	Sundanese	 	 	x	x	x	x
+swa	Swahili	x	x	x	x	x	x
+swe	Swedish	x	x	x	x	x	x
+syr	Syriac	 	x	x	x	x	x
+tam	Tamil	x	x	x	x	x	x
+tat	Tatar	 	 	x	x	x	x
+tel	Telugu	x	x	x	x	x	x
+tgk	Tajik	 	x	x	x	x	x
+tgl	Tagalog (new - Filipino)	x	x	x	 	 	 
+tha	Thai	x	x	x	x	x	x
+tir	Tigrinya	 	x	x	x	x	x
+ton	Tonga	 	 	x	x	x	x
+tur	Turkish	x	x	x	x	x	x
+uig	Uighur; Uyghur	 	x	x	x	x	x
+ukr	Ukrainian	x	x	x	x	x	x
+urd	Urdu	 	x	x	x	x	x
+uzb	Uzbek	 	x	x	x	x	x
+uzb_cyrl	Uzbek - Cyrilic	 	x	x	x	x	x
+vie	Vietnamese	x	x	x	x	x	x
+yid	Yiddish	 	x	x	x	x	x
+yor	Yoruba	 	 	x	x	x	x"""'''
diff --git a/src/context_search/reader/reader.py b/src/context_search/reader/reader.py
@@ -5,17 +5,19 @@
 import pytesseract
 
 from abc import ABC, abstractmethod
-from ftlangdetect import detect
+from fast_langdetect import detect
 from pdf2image import convert_from_path
 from typing import List
 
 from ..data_classes import LiteratureDTO
 from ..utils import setup_logger, config_variables
 
 current_directory = os.path.dirname(__file__)
-
-
 logger = setup_logger('Reader Logger', 'logs.log', logging.INFO)
+SUPPORTED_LENGUAGES = {
+    "pl": "pol",
+    "en": "eng",
+}
 
 
 class ReadManager:
@@ -164,15 +166,22 @@ def read(self, data_path: str) -> List[str]:
 
         return paged_text
 
+    @staticmethod
+    def _detect_lang(string):
+        string = string.replace("\n", ' ')
+        lang = detect(string)["lang"]
+        return SUPPORTED_LENGUAGES.get(lang, "eng")
+
     def _read_file_ocr(self, file_path):
 
         pages = convert_from_path(file_path, 300)
 
-        # we are sacrificing one execution of tesseract to
+        # we sacrifice one execution of tesseract
         # to detect main lenguage of analyzed text
-        lang = detect(pytesseract.image_to_string(pages[0]))["lang"]
+        lang = self._detect_lang(pytesseract.image_to_string(pages[0]))
+
         paged_text = []
-        for i, page in enumerate(pages):
+        for page in pages:
             page_text = pytesseract.image_to_string(page, lang)
             paged_text.append(page_text)
 

diff --git a/test/unit_tests/data_manager_test/reader_test/pdf_reader_multilang.py b/test/unit_tests/data_manager_test/reader_test/pdf_reader_multilang.py
@@ -0,0 +1,46 @@
+import os
+import logging
+
+from context_search.reader import PDFReader
+
+cur_dir = os.path.dirname(__file__)
+logger = logging.getLogger(__name__)
+
+
+def test_lang_detect_pl(mocker):
+    pdf_reader = PDFReader()
+    with mocker.patch.object(
+        pdf_reader,
+        "_detect_lang",
+        wraps=pdf_reader._detect_lang
+    ) as detect_lang_mock:
+        pdf_reader.read(
+            rf'{cur_dir}/test_files/test_pl.pdf'
+        )
+        assert detect_lang_mock.return_value == "pol"
+
+
+def test_lang_detect_en(mocker):
+    pdf_reader = PDFReader()
+    with mocker.patch.object(
+        pdf_reader,
+        "_detect_lang",
+        wraps=pdf_reader._detect_lang
+    ) as detect_lang_mock:
+        pdf_reader.read(
+            rf'{cur_dir}/test_files/test_eng.pdf'
+        )
+        assert detect_lang_mock.return_value == "eng"
+
+
+def test_lang_detect_not_supported(mocker):
+    pdf_reader = PDFReader()
+    with mocker.patch.object(
+        pdf_reader,
+        "_detect_lang",
+        wraps=pdf_reader._detect_lang
+    ) as detect_lang_mock:
+        pdf_reader.read(
+            rf'{cur_dir}/test_files/test_kor.pdf'
+        )
+        assert detect_lang_mock.return_value == "eng"
diff --git a/test/unit_tests/data_manager_test/reader_test/test_files/test_eng.pdf b/test/unit_tests/data_manager_test/reader_test/test_files/test_eng.pdf
diff --git a/test/unit_tests/data_manager_test/reader_test/test_files/test_kor.pdf b/test/unit_tests/data_manager_test/reader_test/test_files/test_kor.pdf
diff --git a/test/unit_tests/data_manager_test/reader_test/test_files/test_pl.pdf b/test/unit_tests/data_manager_test/reader_test/test_files/test_pl.pdf
diff --git a/test/unit_tests/data_manager_test/reader_test/test_reader_manager.py b/test/unit_tests/data_manager_test/reader_test/test_reader_manager.py
@@ -27,9 +27,9 @@ def test_reader_manager_read_pdf(mocker):
     assert isinstance(result, list) and len(result) == 1
 
     result = reader_manager.read(f"{cur_dir}/test_files")
-    assert isinstance(result, list) and len(result) == 3
+    assert isinstance(result, list) and len(result) == 6
 
-    assert_n_calls(mock_pdf_reader, 3)
+    assert_n_calls(mock_pdf_reader, 6)
 
 
 def test_reader_manager_read_txt(mocker):
@@ -53,7 +53,7 @@ def test_reader_manager_read_txt(mocker):
     assert isinstance(result, list) and len(result) == 1
 
     result = reader_manager.read(f"{cur_dir}/test_files")
-    assert isinstance(result, list) and len(result) == 3
+    assert isinstance(result, list) and len(result) == 6
 
     assert_n_calls(mock_text_reader, 2)