Skip to content

Commit

Permalink
Dual language support for eng and pol. Future multi lang implementati…
Browse files Browse the repository at this point in the history
…ons planned
  • Loading branch information
ArturOle committed Nov 11, 2024
1 parent 4f8c7ce commit d2cded5
Show file tree
Hide file tree
Showing 12 changed files with 586 additions and 152 deletions.
13 changes: 13 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
.pytest_cache/
.vscode
logs.log
__pycache__/
.python-version
data/**/*.pdf
Ragger.egg-info
!data/pdf-ai-generated/*
.coverage
src/context_search/utils/purge_whitespaces.py
.venv_cs
ContextSearch.egg-info
build/
File renamed without changes.
493 changes: 356 additions & 137 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ pytesseract = "^0.3.13"
tqdm = "^4.66.5"
numpy = ">=1.21,<2"
fastembed = "0.4.1"
fasttext-langdetect = "^1.0.5"
fast-langdetect = "^0.2.2"

[tool.poetry.dev-dependencies]
pytest = "^8.3.2"
Expand Down
6 changes: 1 addition & 5 deletions src/context_search/communicator/communicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,9 @@
from .query_builder import QueryBuilder

from abc import ABC, abstractmethod
from enum import Enum

logger = setup_logger("Communicator Logger", "logs.log")


class SupportedDatabses(Enum):
logger = setup_logger("Communicator Logger", "logs.log")


class DatabaseNotSupportedError(BaseException):
Expand All @@ -27,7 +24,6 @@ def driver(self):
pass

@abstractmethod
@staticmethod
def connection():
pass

Expand Down
151 changes: 151 additions & 0 deletions src/context_search/reader/lang_adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
from typing import Optional

# Maps the two-letter codes produced by the language detector to the
# three-letter codes accepted by Tesseract.
SUPPORTED_LENGUAGES = {
    "pl": "pol",
    "en": "eng",
}


class LangAdapt:
    """Translate language-detector codes into Tesseract language codes."""

    @staticmethod
    def map(lang_code: str, default: Optional[str] = None) -> Optional[str]:
        """Return the Tesseract code for ``lang_code``.

        Args:
            lang_code: Detector language code, e.g. ``"pl"`` or ``"en"``.
            default: Value returned when ``lang_code`` is not a key of
                ``SUPPORTED_LENGUAGES``. Defaults to ``None``, which keeps
                the previous behaviour for existing callers; pass ``"eng"``
                to fall back to English.

        Returns:
            The mapped Tesseract code, or ``default`` for unsupported codes.
        """
        return SUPPORTED_LENGUAGES.get(lang_code, default)


'''
lang_detect_support = """
af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr
ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa
fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo
io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt
lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl
nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco
sd sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec
vep vi vls vo wa war wuu xal xmf yi yo yue zh"""
# ISO-639 set 3
tesseract_support = """afr Afrikaans x x x x x x
amh Amharic x x x x x
ara Arabic x x x x x x
asm Assamese x x x x x
aze Azerbaijani x x x x x
aze_cyrl Azerbaijani - Cyrilic x x x x x x
bel Belarusian x x x x x x
ben Bengali x x x x x x
bod Tibetan x x x x x
bos Bosnian x x x x x
bre Breton x x x x
bul Bulgarian x x x x x x
cat Catalan; Valencian x x x x x x
ceb Cebuano x x x x x
ces Czech x x x x x x
chi_sim Chinese - Simplified x x x x x x
chi_tra Chinese - Traditional x x x x x x
chr Cherokee x x x x x x
cos Corsican x x x
cym Welsh x x x x x
dan Danish x x x x x x
dan_frak Danish - Fraktur (contrib) x x
deu German x x x x x x
deu_frak German - Fraktur (contrib) x x
deu_latf German (Fraktur Latin) x x x x
dzo Dzongkha x x x x x
ell Greek, Modern (1453-) x x x x x x
eng English x x x x x x
enm English, Middle (1100-1500) x x x x x x
epo Esperanto x x x x x x
equ Math / equation detection module x x x x x
est Estonian x x x x x x
eus Basque x x x x x x
fao Faroese x x x
fas Persian x x x x x
fil Filipino (old - Tagalog) x x x
fin Finnish x x x x x x
fra French x x x x x x
frk German - Fraktur (now deu_latf) x x x x x x
frm French, Middle (ca.1400-1600) x x x x x x
fry Western Frisian x x x
gla Scottish Gaelic x x x
gle Irish x x x x x
glg Galician x x x x x x
grc Greek, Ancient (to 1453) (contrib) x x x x x x
guj Gujarati x x x x x
hat Haitian; Haitian Creole x x x x x
heb Hebrew x x x x x x
hin Hindi x x x x x x
hrv Croatian x x x x x x
hun Hungarian x x x x x x
hye Armenian x x x
iku Inuktitut x x x x x
ind Indonesian x x x x x x
isl Icelandic x x x x x x
ita Italian x x x x x x
ita_old Italian - Old x x x x x x
jav Javanese x x x x x
jpn Japanese x x x x x x
kan Kannada x x x x x x
kat Georgian x x x x x
kat_old Georgian - Old x x x x x
kaz Kazakh x x x x x
khm Central Khmer x x x x x
kir Kirghiz; Kyrgyz x x x x x
kmr Kurmanji (Kurdish - Latin Script) x x x x
kor Korean x x x x x x
kor_vert Korean (vertical) x x x x
kur Kurdish (Arabic Script) x
lao Lao x x x x x
lat Latin x x x x x
lav Latvian x x x x x x
lit Lithuanian x x x x x x
ltz Luxembourgish x x x x
mal Malayalam x x x x x x
mar Marathi x x x x x
mkd Macedonian x x x x x x
mlt Maltese x x x x x x
mon Mongolian x x x x
mri Maori x x x x
msa Malay x x x x x x
mya Burmese x x x x x
nep Nepali x x x x x
nld Dutch; Flemish x x x x x x
nor Norwegian x x x x x
oci Occitan (post 1500) x x x x x
ori Oriya x x x x x
osd Orientation and script detection module x x x x x x
pan Panjabi; Punjabi x x x x x
pol Polish x x x x x x
por Portuguese x x x x x x
pus Pushto; Pashto x x x x x
que Quechua x x x x
ron Romanian; Moldavian; Moldovan x x x x x x
rus Russian x x x x x x
san Sanskrit x x x x x
sin Sinhala; Sinhalese x x x x x
slk Slovak x x x x x x
slk_frak Slovak - Fraktur (contrib) x x
slv Slovenian x x x x x x
snd Sindhi x x x x
spa Spanish; Castilian x x x x x x
spa_old Spanish; Castilian - Old x x x x x x
sqi Albanian x x x x x x
srp Serbian x x x x x x
srp_latn Serbian - Latin x x x x x
sun Sundanese x x x x
swa Swahili x x x x x x
swe Swedish x x x x x x
syr Syriac x x x x x
tam Tamil x x x x x x
tat Tatar x x x x
tel Telugu x x x x x x
tgk Tajik x x x x x
tgl Tagalog (new - Filipino) x x x
tha Thai x x x x x x
tir Tigrinya x x x x x
ton Tonga x x x x
tur Turkish x x x x x x
uig Uighur; Uyghur x x x x x
ukr Ukrainian x x x x x x
urd Urdu x x x x x
uzb Uzbek x x x x x
uzb_cyrl Uzbek - Cyrilic x x x x x
vie Vietnamese x x x x x x
yid Yiddish x x x x x
yor Yoruba x x x x"""'''
21 changes: 15 additions & 6 deletions src/context_search/reader/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,19 @@
import pytesseract

from abc import ABC, abstractmethod
from ftlangdetect import detect
from fast_langdetect import detect
from pdf2image import convert_from_path
from typing import List

from ..data_classes import LiteratureDTO
from ..utils import setup_logger, config_variables

current_directory = os.path.dirname(__file__)


logger = setup_logger('Reader Logger', 'logs.log', logging.INFO)
SUPPORTED_LENGUAGES = {
"pl": "pol",
"en": "eng",
}


class ReadManager:
Expand Down Expand Up @@ -164,15 +166,22 @@ def read(self, data_path: str) -> List[str]:

return paged_text

@staticmethod
def _detect_lang(string):
    """Detect the language of ``string`` and return its Tesseract code.

    The detected code is looked up in ``SUPPORTED_LENGUAGES``; any code
    that is not a key there yields ``"eng"``.
    """
    # Newlines are swapped for spaces before detection
    # (presumably the detector expects single-line input - TODO confirm).
    flattened = string.replace("\n", ' ')
    detected_code = detect(flattened)["lang"]
    if detected_code in SUPPORTED_LENGUAGES:
        return SUPPORTED_LENGUAGES[detected_code]
    return "eng"

def _read_file_ocr(self, file_path):

pages = convert_from_path(file_path, 300)

# we are sacrificing one execution of tesseract to
# we sacrifice one execution of tesseract
# to detect the main language of the analyzed text
lang = detect(pytesseract.image_to_string(pages[0]))["lang"]
lang = self._detect_lang(pytesseract.image_to_string(pages[0]))

paged_text = []
for i, page in enumerate(pages):
for page in pages:
page_text = pytesseract.image_to_string(page, lang)
paged_text.append(page_text)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import os
import logging

from context_search.reader import PDFReader

cur_dir = os.path.dirname(__file__)
logger = logging.getLogger(__name__)


def test_lang_detect_pl(mocker):
    """Reading a Polish PDF must make ``_detect_lang`` return ``"pol"``."""
    pdf_reader = PDFReader()
    # mocker.spy forwards calls to the real method and records the value it
    # returned in ``spy_return``. Mocks from pytest-mock are not meant to be
    # used as context managers, and a ``wraps=`` mock never stores the
    # wrapped result in ``return_value``.
    detect_lang_spy = mocker.spy(pdf_reader, "_detect_lang")

    pdf_reader.read(
        rf'{cur_dir}/test_files/test_pl.pdf'
    )

    detect_lang_spy.assert_called()
    assert detect_lang_spy.spy_return == "pol"


def test_lang_detect_en(mocker):
    """Reading an English PDF must make ``_detect_lang`` return ``"eng"``."""
    pdf_reader = PDFReader()
    # mocker.spy forwards calls to the real method and records the value it
    # returned in ``spy_return``. Mocks from pytest-mock are not meant to be
    # used as context managers, and a ``wraps=`` mock never stores the
    # wrapped result in ``return_value``.
    detect_lang_spy = mocker.spy(pdf_reader, "_detect_lang")

    pdf_reader.read(
        rf'{cur_dir}/test_files/test_eng.pdf'
    )

    detect_lang_spy.assert_called()
    assert detect_lang_spy.spy_return == "eng"


def test_lang_detect_not_supported(mocker):
    """An unsupported language (Korean PDF) must fall back to ``"eng"``."""
    pdf_reader = PDFReader()
    # mocker.spy forwards calls to the real method and records the value it
    # returned in ``spy_return``. Mocks from pytest-mock are not meant to be
    # used as context managers, and a ``wraps=`` mock never stores the
    # wrapped result in ``return_value``.
    detect_lang_spy = mocker.spy(pdf_reader, "_detect_lang")

    pdf_reader.read(
        rf'{cur_dir}/test_files/test_kor.pdf'
    )

    detect_lang_spy.assert_called()
    assert detect_lang_spy.spy_return == "eng"
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ def test_reader_manager_read_pdf(mocker):
assert isinstance(result, list) and len(result) == 1

result = reader_manager.read(f"{cur_dir}/test_files")
assert isinstance(result, list) and len(result) == 3
assert isinstance(result, list) and len(result) == 6

assert_n_calls(mock_pdf_reader, 3)
assert_n_calls(mock_pdf_reader, 6)


def test_reader_manager_read_txt(mocker):
Expand All @@ -53,7 +53,7 @@ def test_reader_manager_read_txt(mocker):
assert isinstance(result, list) and len(result) == 1

result = reader_manager.read(f"{cur_dir}/test_files")
assert isinstance(result, list) and len(result) == 3
assert isinstance(result, list) and len(result) == 6

assert_n_calls(mock_text_reader, 2)

Expand Down

0 comments on commit d2cded5

Please sign in to comment.