Skip to content

Commit

Permalink
Multi language support - PL (#27)
Browse files Browse the repository at this point in the history
* make file and poetry build

* Checking running poetry run pytest with directory

* Deploy version for arm server deployment

* Replacing pytorch and transformers with fastembed

* Correcting user experience of TextSplitter and adding tests

* Adding fast-language detection

* Dual language support for eng and pol. Future multi lang implementations planned
  • Loading branch information
ArturOle authored Nov 11, 2024
1 parent ee963cd commit 2e3c809
Show file tree
Hide file tree
Showing 13 changed files with 532 additions and 28 deletions.
13 changes: 13 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
.pytest_cache/
.vscode
logs.log
__pycache__/
.python-version
data/**/*.pdf
Ragger.egg-info
!data/pdf-ai-generated/*
.coverage
src/context_search/utils/purge_whitespaces.py
.venv_cs
ContextSearch.egg-info
build/
File renamed without changes.
230 changes: 223 additions & 7 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ pytesseract = "^0.3.13"
tqdm = "^4.66.5"
numpy = ">=1.21,<2"
fastembed = "0.4.1"
fast-langdetect = "^0.2.2"


[tool.poetry.dev-dependencies]
pytest = "^8.3.2"
Expand Down
36 changes: 35 additions & 1 deletion src/context_search/communicator/communicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,32 @@
from ..data_classes import LiteratureGraph
from .query_builder import QueryBuilder

from abc import ABC, abstractmethod


logger = setup_logger("Communicator Logger", "logs.log")


class Communicator:
class DatabaseNotSupportedError(Exception):
    """Raised when a requested database adapter is not supported.

    Derives from ``Exception`` (not ``BaseException``) so that ordinary
    ``except Exception`` handlers treat it like any other application error.

    Args:
        db: Name of the adapter that was requested. Optional, so the
            exception can also be raised without arguments.
    """

    def __init__(self, db=None) -> None:
        # Keep the offending name available to handlers.
        self.db = db
        if db is None:
            message = "Database is not supported"
        else:
            message = f"Database {db!r} is not supported"
        super().__init__(message)


class AbstractCommunciator(ABC):
    """Abstract base for database communicators.

    Concrete subclasses must provide both ``driver`` and ``connection``;
    the base class itself cannot be instantiated.
    """

    @abstractmethod
    def driver(self):
        """ The connection objects for databases """

    @abstractmethod
    def connection(self):
        """Connection hook to be defined by the concrete communicator."""
        # ``self`` was missing from the original signature, which made the
        # method impossible to call on an instance.


class Communicator(AbstractCommunciator):
"""Communicator class for interacting with the Neo4j database.
Attributes:
Expand Down Expand Up @@ -39,6 +60,7 @@ def driver(self, driver):
def driver(self):
if self._driver is not None:
self._driver.close()
logger.info("Driver closed")
del self._driver

@staticmethod
Expand Down Expand Up @@ -132,3 +154,15 @@ def __del__(self):
if self._driver is not None:
self._driver.close()
logger.info("Driver closed")


class DatabaseManager:
    """Resolve a database adapter name to its communicator class.

    Attributes:
        supported_db: Maps adapter names to communicator classes.
        database_adapter: Communicator class selected for this manager.
    """

    supported_db = {
        "neo4j": Communicator
    }

    def __init__(self, adapter: str):
        """Select the communicator class registered under ``adapter``.

        Args:
            adapter: Key into ``supported_db`` (e.g. ``"neo4j"``).

        Raises:
            DatabaseNotSupportedError: If ``adapter`` is not registered.
        """
        self.database_adapter = self.supported_db.get(adapter)

        if self.database_adapter is None:
            # Pass the requested name: the exception's constructor takes the
            # database as an argument, and raising it bare fails with
            # TypeError instead of the intended error.
            raise DatabaseNotSupportedError(adapter)
5 changes: 5 additions & 0 deletions src/context_search/data_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ class Tag(Embeddable):
description: Optional[str] = None


# Model pairing a page number with that page's text.
class Page(BaseModel):
    page_number: int  # number of the page (base index not specified here)
    text: str  # text content of the page


class Literature(BaseModel):
filename: str
filepath: str
Expand Down
151 changes: 151 additions & 0 deletions src/context_search/reader/lang_adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
# Maps two-letter language codes (keys) to their three-letter
# counterparts (values).
SUPPORTED_LENGUAGES = {
    "pl": "pol",
    "en": "eng",
}


class LangAdapt:
    """Translate language codes through ``SUPPORTED_LENGUAGES``."""

    @staticmethod
    def map(lang_code: str, default: "str | None" = None) -> "str | None":
        """Return the three-letter code registered for ``lang_code``.

        Args:
            lang_code: Two-letter language code such as ``"pl"`` or ``"en"``.
            default: Value returned when ``lang_code`` is not supported.
                Defaults to ``None``, which matches the previous behaviour.

        Returns:
            The mapped code, or ``default`` for an unsupported language.
        """
        return SUPPORTED_LENGUAGES.get(lang_code, default)


'''
lang_detect_support = """
af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr
ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa
fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo
io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt
lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl
nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco
sd sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec
vep vi vls vo wa war wuu xal xmf yi yo yue zh"""
# ISO-639 set 3
tesseract_support = """afr Afrikaans x x x x x x
amh Amharic x x x x x
ara Arabic x x x x x x
asm Assamese x x x x x
aze Azerbaijani x x x x x
aze_cyrl Azerbaijani - Cyrilic x x x x x x
bel Belarusian x x x x x x
ben Bengali x x x x x x
bod Tibetan x x x x x
bos Bosnian x x x x x
bre Breton x x x x
bul Bulgarian x x x x x x
cat Catalan; Valencian x x x x x x
ceb Cebuano x x x x x
ces Czech x x x x x x
chi_sim Chinese - Simplified x x x x x x
chi_tra Chinese - Traditional x x x x x x
chr Cherokee x x x x x x
cos Corsican x x x
cym Welsh x x x x x
dan Danish x x x x x x
dan_frak Danish - Fraktur (contrib) x x
deu German x x x x x x
deu_frak German - Fraktur (contrib) x x
deu_latf German (Fraktur Latin) x x x x
dzo Dzongkha x x x x x
ell Greek, Modern (1453-) x x x x x x
eng English x x x x x x
enm English, Middle (1100-1500) x x x x x x
epo Esperanto x x x x x x
equ Math / equation detection module x x x x x
est Estonian x x x x x x
eus Basque x x x x x x
fao Faroese x x x
fas Persian x x x x x
fil Filipino (old - Tagalog) x x x
fin Finnish x x x x x x
fra French x x x x x x
frk German - Fraktur (now deu_latf) x x x x x x
frm French, Middle (ca.1400-1600) x x x x x x
fry Western Frisian x x x
gla Scottish Gaelic x x x
gle Irish x x x x x
glg Galician x x x x x x
grc Greek, Ancient (to 1453) (contrib) x x x x x x
guj Gujarati x x x x x
hat Haitian; Haitian Creole x x x x x
heb Hebrew x x x x x x
hin Hindi x x x x x x
hrv Croatian x x x x x x
hun Hungarian x x x x x x
hye Armenian x x x
iku Inuktitut x x x x x
ind Indonesian x x x x x x
isl Icelandic x x x x x x
ita Italian x x x x x x
ita_old Italian - Old x x x x x x
jav Javanese x x x x x
jpn Japanese x x x x x x
kan Kannada x x x x x x
kat Georgian x x x x x
kat_old Georgian - Old x x x x x
kaz Kazakh x x x x x
khm Central Khmer x x x x x
kir Kirghiz; Kyrgyz x x x x x
kmr Kurmanji (Kurdish - Latin Script) x x x x
kor Korean x x x x x x
kor_vert Korean (vertical) x x x x
kur Kurdish (Arabic Script) x
lao Lao x x x x x
lat Latin x x x x x
lav Latvian x x x x x x
lit Lithuanian x x x x x x
ltz Luxembourgish x x x x
mal Malayalam x x x x x x
mar Marathi x x x x x
mkd Macedonian x x x x x x
mlt Maltese x x x x x x
mon Mongolian x x x x
mri Maori x x x x
msa Malay x x x x x x
mya Burmese x x x x x
nep Nepali x x x x x
nld Dutch; Flemish x x x x x x
nor Norwegian x x x x x
oci Occitan (post 1500) x x x x x
ori Oriya x x x x x
osd Orientation and script detection module x x x x x x
pan Panjabi; Punjabi x x x x x
pol Polish x x x x x x
por Portuguese x x x x x x
pus Pushto; Pashto x x x x x
que Quechua x x x x
ron Romanian; Moldavian; Moldovan x x x x x x
rus Russian x x x x x x
san Sanskrit x x x x x
sin Sinhala; Sinhalese x x x x x
slk Slovak x x x x x x
slk_frak Slovak - Fraktur (contrib) x x
slv Slovenian x x x x x x
snd Sindhi x x x x
spa Spanish; Castilian x x x x x x
spa_old Spanish; Castilian - Old x x x x x x
sqi Albanian x x x x x x
srp Serbian x x x x x x
srp_latn Serbian - Latin x x x x x
sun Sundanese x x x x
swa Swahili x x x x x x
swe Swedish x x x x x x
syr Syriac x x x x x
tam Tamil x x x x x x
tat Tatar x x x x
tel Telugu x x x x x x
tgk Tajik x x x x x
tgl Tagalog (new - Filipino) x x x
tha Thai x x x x x x
tir Tigrinya x x x x x
ton Tonga x x x x
tur Turkish x x x x x x
uig Uighur; Uyghur x x x x x
ukr Ukrainian x x x x x x
urd Urdu x x x x x
uzb Uzbek x x x x x
uzb_cyrl Uzbek - Cyrilic x x x x x
vie Vietnamese x x x x x x
yid Yiddish x x x x x
yor Yoruba x x x x"""'''
71 changes: 54 additions & 17 deletions src/context_search/reader/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,33 +5,47 @@
import pytesseract

from abc import ABC, abstractmethod
from fast_langdetect import detect
from pdf2image import convert_from_path
from typing import List

from ..data_classes import LiteratureDTO
from ..utils import setup_logger, config_variables

current_directory = os.path.dirname(__file__)


logger = setup_logger('Reader Logger', 'logs.log', logging.INFO)
# Maps the language codes returned by fast_langdetect's `detect` (keys) to
# the language codes passed on to pytesseract (values). `_detect_lang` falls
# back to "eng" for any detected language missing from this table.
SUPPORTED_LENGUAGES = {
    "pl": "pol",
    "en": "eng",
}


class ReadManager:
_pdf_reader = None
_text_reader = None
_readers = {}

@property
def pdf_reader(self):
    """Return the cached ``PDFReader``, creating it on first access.

    The instance is stored under the ``"pdf"`` key of ``_readers``. The
    stale lines that used the old ``_pdf_reader`` attribute are dropped,
    because they returned early and left this cache logic unreachable.
    """
    if self._readers.get("pdf") is None:
        self._readers["pdf"] = PDFReader()
    return self._readers["pdf"]

@property
def text_reader(self):
    """Return the cached ``TextReader``, creating it on first access.

    The instance is stored under the ``"txt"`` key of ``_readers``. The
    stale lines that used the old ``_text_reader`` attribute are dropped,
    because they returned early and left this cache logic unreachable.
    """
    if self._readers.get("txt") is None:
        self._readers["txt"] = TextReader()
    return self._readers["txt"]

@property
def docx_reader(self):
    """Return the cached ``DocxReader``, creating it on first access.

    The instance is stored under the ``"docx"`` key of ``_readers``.
    """
    cached = self._readers.get("docx", None)
    if cached is not None:
        return cached
    self._readers["docx"] = DocxReader()
    return self._readers["docx"]

@staticmethod
def _is_path_valid(data_path: str) -> bool:
Expand All @@ -41,6 +55,10 @@ def _is_path_valid(data_path: str) -> bool:
def _is_directory_or_file(data_path: str) -> bool:
return FileTypeRecon.is_directory_or_file(data_path)

# A depth-first search may be beneficial here for traversing directories.
# However, it may not be very safe: it should be disabled for the web
# service and secured for the desktop. Idea: after searching through the
# files, let the user exclude any files they do not want to include.
def read(self, data_path: str) -> List[LiteratureDTO]:
if self._is_directory_or_file(data_path):
return self._read_directory(data_path)
Expand All @@ -57,10 +75,11 @@ def _read_file(self, file_path: str) -> LiteratureDTO:
file_type = FileTypeRecon.recognize_type(file_path)
text = None

if file_type == 'pdf':
text = self.pdf_reader.read(file_path)
elif file_type == 'txt':
text = self.text_reader.read(file_path)
match file_type:
case 'pdf':
text = self.pdf_reader.read(file_path)
case 'txt':
text = self.text_reader.read(file_path)

return LiteratureDTO(
filename=os.path.basename(file_path),
Expand Down Expand Up @@ -120,11 +139,12 @@ def _setup_paths_from_config(self):
os.environ["TESSERACT_PATH"] = self.tesseract_path

if os.name == "nt":
# system specific path for windows
pytesseract.pytesseract.tesseract_cmd = os.path.join(
self.tesseract_path, "tesseract.exe"
)
else:
# system specsific path for linux
# system specific path for linux
pytesseract.pytesseract.tesseract_cmd = self.tesseract_path

def read(self, data_path: str) -> List[str]:
Expand All @@ -146,18 +166,35 @@ def read(self, data_path: str) -> List[str]:

return paged_text

@staticmethod
def _detect_lang(string):
    """Detect the language of ``string`` and return its mapped code.

    Newlines are replaced with spaces before detection. The detected
    language is looked up in ``SUPPORTED_LENGUAGES``; languages not listed
    there fall back to ``"eng"``.
    """
    single_line = " ".join(string.split("\n"))
    detected = detect(single_line)
    return SUPPORTED_LENGUAGES.get(detected["lang"], "eng")

def _read_file_ocr(self, file_path):
    """OCR every page of the file at ``file_path`` and return the page texts.

    The first page is OCR'd once without a language hint, only to detect
    the document's language. Every page is then OCR'd with that language.

    Args:
        file_path: Path handed to ``convert_from_path``.

    Returns:
        One OCR text string per page, in page order. Empty when the
        conversion yields no pages.
    """
    # 300 is the second positional argument of convert_from_path
    # (presumably the rendering DPI -- confirm against pdf2image).
    pages = convert_from_path(file_path, 300)
    if not pages:
        # Nothing to OCR; also avoids an IndexError on pages[0] below.
        return []

    # We sacrifice one execution of tesseract to detect the main
    # language of the analyzed text.
    lang = self._detect_lang(pytesseract.image_to_string(pages[0]))

    return [pytesseract.image_to_string(page, lang=lang) for page in pages]


class DocxReader(AbstractReader):
    """Placeholder reader for ``.docx`` files.

    To be implemented in future versions.
    """

    @staticmethod
    def read(data_path: str) -> List[str]:
        """Not implemented yet; always raises ``NotImplementedError``."""
        raise NotImplementedError()


class FileTypeRecon:
file_type_classes = {
'txt',
Expand Down
Loading

0 comments on commit 2e3c809

Please sign in to comment.