From 0930690f5025da9142b92c19765e8064988b0da0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Artur=20Oleksi=C5=84ski?= Date: Mon, 18 Nov 2024 15:21:48 +0100 Subject: [PATCH] Moving changes to deploy (#29) * Moving from ini to toml with config (#28) * Moving from ini file to toml * Correcting tests and adjusting code for toml-based shared DTO object * Version bump * logging in exception handling set to error level * Update README.md --- README.md | 4 +- config.ini | 5 - config.toml | 15 +++ docker/build_dev/dockerfile | 6 +- docker/integration_tests/dockerfile | 7 +- docker/unit_tests/dockerfile | 7 +- pyproject.toml | 2 +- src/context_search/__init__.py | 10 +- src/context_search/communicator/__init__.py | 4 +- .../communicator/communicator.py | 94 ++++++++++++++---- src/context_search/data_manager.py | 24 ++--- src/context_search/reader/reader.py | 19 ++-- src/context_search/utils/__init__.py | 8 +- src/context_search/utils/config_variables.py | 98 +++++++++---------- .../communicator_test/communicator_test.py | 12 +-- .../communicator_test/communicator_test.py | 6 +- 16 files changed, 186 insertions(+), 135 deletions(-) delete mode 100644 config.ini create mode 100644 config.toml diff --git a/README.md b/README.md index 1b12f9d..965116d 100644 --- a/README.md +++ b/README.md @@ -48,9 +48,9 @@ The project currently is not mature enough to be submitted to PYPI, that's why t ### Alternative build (Docker) -Use docker-compose files which are ready to use without any tinkering with the config file. +Use docker-compose files that are ready to use without any tinkering with the config file. -### Run seamntic search +### Run semantic search Refer to `examples/example_submit.py` on how to upload files to the database and to `examples/example_retrieve.py` on how to retrieve data. The extensive tutorial is "in progress". 
diff --git a/config.ini b/config.ini deleted file mode 100644 index f3eb5fa..0000000 --- a/config.ini +++ /dev/null @@ -1,5 +0,0 @@ -TESSERACT_PATH=/usr/bin/tesseract -POPPLER_PATH=/usr/bin/pdftotext -NEO4J_URI=neo4j://database:7687 -NEO4J_USER=neo4j -NEO4J_PASSWORD=StrongPsPsP5 \ No newline at end of file diff --git a/config.toml b/config.toml new file mode 100644 index 0000000..8fe7205 --- /dev/null +++ b/config.toml @@ -0,0 +1,15 @@ + +[settings] +db_engine = "neo4j" + +[OCR] +TESSERACT_PATH = "/usr/bin/tesseract" +POPPLER_PATH = "/usr/bin/pdftotext" + +[neo4j] +NEO4J_URI = "neo4j://database:7687" +NEO4J_USER = "neo4j" +NEO4J_PASSWORD = "StrongPsPsP5" + +[qdrant] +in_memory = true diff --git a/docker/build_dev/dockerfile b/docker/build_dev/dockerfile index 4ea566c..9750dec 100644 --- a/docker/build_dev/dockerfile +++ b/docker/build_dev/dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:22.04 +FROM ubuntu:24.04 ENV DEBIAN_FRONTEND=noninteractive ENV POETRY_HOME="/root/.poetry" @@ -7,9 +7,9 @@ ENV PATH="$POETRY_HOME/bin:$PATH" CMD ["bash"] RUN apt-get update -RUN apt-get install -y python3 python3-pip curl +RUN apt-get install -y python3 pipx curl RUN curl -sSL https://install.python-poetry.org | python3 - WORKDIR /ragger COPY . 
/ragger -RUN make install \ No newline at end of file +RUN poetry install \ No newline at end of file diff --git a/docker/integration_tests/dockerfile b/docker/integration_tests/dockerfile index 09c9bb8..b91d5c0 100644 --- a/docker/integration_tests/dockerfile +++ b/docker/integration_tests/dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:22.04 +FROM ubuntu:24.04 ENV DEBIAN_FRONTEND=noninteractive ENV POETRY_HOME="/root/.poetry" @@ -7,7 +7,7 @@ ENV PATH="$POETRY_HOME/bin:$PATH" CMD ["bash"] RUN apt-get update --fix-missing -RUN apt-get install -y python3 python3-pip git ca-certificates lsb-release ubuntu-keyring software-properties-common curl +RUN apt-get install -y python3 pipx git ca-certificates lsb-release ubuntu-keyring software-properties-common curl RUN curl -sSL https://install.python-poetry.org | python3 - RUN update-ca-certificates --fresh @@ -17,9 +17,8 @@ RUN apt install -y libtesseract-dev RUN apt install -y poppler-utils WORKDIR /ragger -RUN pip install --upgrade pip COPY . /ragger RUN touch logs.log -RUN make install +RUN poetry install RUN export GIT_PYTHON_GIT_EXECUTABLE=$(which git) diff --git a/docker/unit_tests/dockerfile b/docker/unit_tests/dockerfile index d88a6b5..cfa259b 100644 --- a/docker/unit_tests/dockerfile +++ b/docker/unit_tests/dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:22.04 +FROM ubuntu:24.04 ENV DEBIAN_FRONTEND=noninteractive ENV POETRY_HOME="/root/.poetry" @@ -7,7 +7,7 @@ ENV PATH="$POETRY_HOME/bin:$PATH" CMD ["bash"] RUN apt-get update -RUN apt-get install -y python3 python3-pip git curl +RUN apt-get install -y python3 pipx git curl RUN curl -sSL https://install.python-poetry.org | python3 - RUN apt install -y tesseract-ocr @@ -15,9 +15,8 @@ RUN apt install -y libtesseract-dev RUN apt install -y poppler-utils WORKDIR /ragger -RUN pip install --upgrade pip COPY . 
/ragger RUN touch logs.log -RUN make install +RUN poetry install RUN export GIT_PYTHON_GIT_EXECUTABLE=$(which git) diff --git a/pyproject.toml b/pyproject.toml index 6b75288..9a13b1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "ContextSearch" -version = "0.6.0" +version = "0.7.0" description = "User friendly system for semantic search." authors = [ "ArturOle" diff --git a/src/context_search/__init__.py b/src/context_search/__init__.py index 646c45e..567818b 100644 --- a/src/context_search/__init__.py +++ b/src/context_search/__init__.py @@ -2,15 +2,15 @@ from .data_manager import DataManager from .reader import ReadManager from .preprocessor import Preprocessor -from .communicator import Communicator -from .utils import setup_logger, config_variables +from .communicator import CommAdapterNeo +from .utils import setup_logger, EnvInterface __all__ = [ 'ContextSearch', 'DataManager', 'ReadManager', 'Preprocessor', - 'Communicator', + 'CommAdapterNeo', 'setup_logger', - 'config_variables' -] \ No newline at end of file + 'EnvInterface' +] diff --git a/src/context_search/communicator/__init__.py b/src/context_search/communicator/__init__.py index 2a5d00c..47359de 100644 --- a/src/context_search/communicator/__init__.py +++ b/src/context_search/communicator/__init__.py @@ -1,4 +1,4 @@ -from .communicator import Communicator +from .communicator import CommAdapterNeo -__all__ = ["Communicator"] +__all__ = ["CommAdapterNeo"] diff --git a/src/context_search/communicator/communicator.py b/src/context_search/communicator/communicator.py index cf254c3..a789bda 100644 --- a/src/context_search/communicator/communicator.py +++ b/src/context_search/communicator/communicator.py @@ -1,4 +1,6 @@ from neo4j import GraphDatabase +from abc import ABC, abstractmethod + from ..utils import setup_logger from ..data_classes import LiteratureGraph from .query_builder import QueryBuilder @@ -9,14 +11,8 @@ logger = setup_logger("Communicator Logger", 
"logs.log") -class DatabaseNotSupportedError(BaseException): - def __init__(self, db) -> None: - super().__init__( - "" - ) - -class AbstractCommunciator(ABC): +class AbstractCommAdapter(ABC): @abstractmethod def driver(self): @@ -28,7 +24,16 @@ def connection(): pass -class Communicator(AbstractCommunciator): +class DatabaseNotSupported(BaseException, AbstractCommAdapter): + def __init__(self, *args, **kwargs) -> None: + super(BaseException).__init__( + "The requested database is not supported.\n" + "Supported:\n" + " {Neo4j, Qdrant}" + ) + + +class CommAdapterNeo(AbstractCommAdapter): """Communicator class for interacting with the Neo4j database. Attributes: @@ -83,7 +88,7 @@ def add_literature_subgraph( session, literature_graph: LiteratureGraph ): - session.write_transaction( + session.execute_write( self._add_literature_subgraph, literature_graph ) @@ -93,7 +98,7 @@ def create_vector_indexes(self, session): """Creates vector indexes for chunks and tags. This function is separated from the add_literature_subgraph because the indexes cannot be created in the same transaction""" - session.write_transaction(self._index_ebeddables) + session.execute_write(self._index_ebeddables) def _add_literature_subgraph(self, tx, literature_graph: LiteratureGraph): """Builds the the nodes and relationships based on the given @@ -118,25 +123,25 @@ def _index_ebeddables(self, tx): @connection def get_literature(self, session, filename): - return session.read_transaction(QueryBuilder._get_literature, filename) + return session.execute_read(QueryBuilder._get_literature, filename) @connection def get_literature_chunks(self, session, filename): - return session.read_transaction( + return session.execute_read( QueryBuilder._get_literature_chunks, filename ) @connection def get_literature_tags(self, session, filename): - return session.read_transaction( + return session.execute_read( QueryBuilder._get_literature_tags, filename ) @connection def search_n_records(self, session, query, 
n): - return session.read_transaction( + return session.execute_read( QueryBuilder._search_n_records, query, n @@ -144,11 +149,11 @@ def search_n_records(self, session, query, n): @connection def get_all_literatures(self, session): - return session.read_transaction(QueryBuilder._get_all_literatures) + return session.execute_read(QueryBuilder._get_all_literatures) @connection def delete_literature(self, session, filename): - session.write_transaction(QueryBuilder._delete_literature, filename) + session.execute_write(QueryBuilder._delete_literature, filename) def __del__(self): if self._driver is not None: @@ -156,13 +161,60 @@ def __del__(self): logger.info("Driver closed") +class CommAdapterQdrant(AbstractCommAdapter): + """Communicator class for interacting with the Neo4j database. + + Attributes: + uri (str): The URI of the Neo4j database. + user (str): The username for the Neo4j database. + password (str): The password for the Neo4j database. + """ + + def __init__(self, uri, user, password): + self._uri = uri + self._user = user + self._password = password + self._driver = None + + @property + def driver(self): + if self._driver is None: + self._driver = GraphDatabase.driver( + self._uri, + auth=(self._user, self._password) + ) + return self._driver + + @driver.setter + def driver(self, driver): + self._driver = driver + + @driver.deleter + def driver(self): + if self._driver is not None: + self._driver.close() + logger.info("Driver closed") + del self._driver + + @staticmethod + def connection(func): + def wrapper(self, *args, **kwargs): + session = self.driver.session(database="neo4j") + result = func(self, session, *args, **kwargs) + session.close() + return result + + return wrapper + + class DatabaseManager: supported_db = { - "neo4j": Communicator + "neo4j": CommAdapterNeo, + "qdrant": CommAdapterQdrant } def __init__(self, adapter: str): - self.database_adapter = self.supported_db.get(adapter, None) - - if self.database_adapter is None: - raise 
DatabaseNotSupportedError() + self.database_adapter = self.supported_db.get( + adapter, + DatabaseNotSupported + ) diff --git a/src/context_search/data_manager.py b/src/context_search/data_manager.py index 463eb41..a53bf3d 100644 --- a/src/context_search/data_manager.py +++ b/src/context_search/data_manager.py @@ -2,9 +2,8 @@ from .reader import ReadManager from .preprocessor import Preprocessor -from .communicator import Communicator -from .utils import setup_logger, config_variables - +from .communicator import CommAdapterNeo +from .utils import setup_logger, EnvInterface logger = setup_logger('Data Manager Logger', 'logs.log') @@ -15,23 +14,24 @@ class DataManager: _communicator = None def __init__(self): + EnvInterface().set_env_variables_from_config() self.read_manager = ReadManager() self.preprocessor = Preprocessor() @property def communicator(self): if self._communicator is None: - neo4j_variables = config_variables.get_neo4j_variables() - self._communicator = Communicator( - uri=neo4j_variables[0], - user=neo4j_variables[1], - password=neo4j_variables[2] + neo4j_variables = EnvInterface.get_neo4j_vars() + self._communicator = CommAdapterNeo( + uri=neo4j_variables["NEO4J_URI"], + user=neo4j_variables["NEO4J_USER"], + password=neo4j_variables["NEO4J_PASSWORD"] ) - logger.info(f""" + logger.debug(f""" Communicator created with: - uri: {neo4j_variables[0]} - user: {neo4j_variables[1]} - password: {neo4j_variables[2]} + uri: {neo4j_variables["NEO4J_URI"]} + user: {neo4j_variables["NEO4J_USER"]} + password: {neo4j_variables["NEO4J_PASSWORD"]} """) return self._communicator diff --git a/src/context_search/reader/reader.py b/src/context_search/reader/reader.py index 22b207a..64c4c4c 100644 --- a/src/context_search/reader/reader.py +++ b/src/context_search/reader/reader.py @@ -10,7 +10,7 @@ from typing import List from ..data_classes import LiteratureDTO -from ..utils import setup_logger, config_variables +from ..utils import setup_logger, EnvInterface 
current_directory = os.path.dirname(__file__) logger = setup_logger('Reader Logger', 'logs.log', logging.INFO) @@ -125,18 +125,13 @@ def _setup_paths_from_config(self): directory as the script that is being run with paths to tesseract and poppler bin folder (NOT TO EXECUTABLES, BUT FOLDERS). """ - if not os.getenv("POPPLER_PATH") or not os.getenv("TESSERACT_PATH"): - self.tesseract_path, self.poppler_path = config_variables.get_OCR_variables() + self.tesseract_path = os.getenv("TESSERACT_PATH") + self.poppler_path = os.getenv("POPPLER_PATH") - if os.getenv("POPPLER_PATH"): - self.poppler_path = os.getenv("POPPLER_PATH") - else: - os.environ["POPPLER_PATH"] = self.poppler_path - - if os.getenv("TESSERACT_PATH"): - self.tesseract_path = os.getenv("TESSERACT_PATH") - else: - os.environ["TESSERACT_PATH"] = self.tesseract_path + if not self.tesseract_path or not self.poppler_path: + ocr_vars = EnvInterface.get_OCR_vars() + self.tesseract_path = ocr_vars.get("TESSERACT_PATH") + self.poppler_path = ocr_vars.get("POPPLER_PATH") if os.name == "nt": # system specific path for windows diff --git a/src/context_search/utils/__init__.py b/src/context_search/utils/__init__.py index e20d945..f72f3fc 100644 --- a/src/context_search/utils/__init__.py +++ b/src/context_search/utils/__init__.py @@ -1,13 +1,9 @@ from .config_variables import ( - get_OCR_variables, - get_neo4j_variables, - set_env_variables_from_config + EnvInterface ) from .logger_setup import setup_logger __all__ = [ - "get_OCR_variables", - "get_neo4j_variables", - "set_env_variables_from_config", + "EnvInterface", "setup_logger" ] diff --git a/src/context_search/utils/config_variables.py b/src/context_search/utils/config_variables.py index 25ad58a..91ec6d0 100644 --- a/src/context_search/utils/config_variables.py +++ b/src/context_search/utils/config_variables.py @@ -1,52 +1,52 @@ import os +import tomllib +import logging + + +class EnvInterface: + """ + Clas for managing shared values of config.toml + After 
initizlization, the variables are shared across all instances + on purpose so that it is accessible from different parts of + system if needed. + """ + _config = {} + + def __init__(self): + current_working_directory = os.getcwd() + path_to_config = os.path.join( + current_working_directory, + "config.toml" + ) + self.read_config(path_to_config) + + def read_config(self, path): + try: + with open(path, "rb") as conf_file: + EnvInterface._config = tomllib.load(conf_file) + except FileNotFoundError as err: + logging.error( + f"Could not find config file in the project root. {err}" + ) + + @staticmethod + def get_OCR_vars(): + return EnvInterface._config.get("OCR") + + @staticmethod + def get_neo4j_vars(): + return EnvInterface._config.get("neo4j") + + @staticmethod + def set_env_variables_from_config(): + """ + Static method for setting env variables + for necessary modules + """ + for ocr_kwarg in EnvInterface.get_OCR_vars().items(): + os.environ[ocr_kwarg[0]] = ocr_kwarg[1] + + for db_kwarg in EnvInterface.get_neo4j_vars().items(): + os.environ[db_kwarg[0]] = db_kwarg[1] - -def get_OCR_variables(): - tesseract_path = "" - poppler_path = "" - - current_working_directory = os.getcwd() - path_to_config = os.path.join(current_working_directory, "config.ini") - with open(path_to_config, "r") as file: - lines = file.readlines() - - for line in lines: - if line.startswith("TESSERACT_PATH"): - tesseract_path = line.split("=")[1].strip() - if line.startswith("POPPLER_PATH"): - poppler_path = line.split("=")[1].strip() - - return tesseract_path, poppler_path - - -def get_neo4j_variables(): - uri = "" - user = "" - password = "" - - current_working_directory = os.getcwd() - path_to_config = os.path.join(current_working_directory, "config.ini") - with open(path_to_config, "r") as file: - lines = file.readlines() - - for line in lines: - if line.startswith("NEO4J_URI"): - uri = line.split("=")[1].strip() - if line.startswith("NEO4J_USER"): - user = line.split("=")[1].strip() - 
if line.startswith("NEO4J_PASSWORD"): - password = line.split("=")[1].strip() - - return uri, user, password - - -def set_env_variables_from_config(): - tesseract_path, poppler_path = get_OCR_variables() - os.environ["TESSERACT_PATH"] = tesseract_path - os.environ["POPPLER_PATH"] = poppler_path - - uri, user, password = get_neo4j_variables() - os.environ["NEO4J_URI"] = uri - os.environ["NEO4J_USER"] = user - os.environ["NEO4J_PASSWORD"] = password diff --git a/test/integration_tests/data_manager_test/communicator_test/communicator_test.py b/test/integration_tests/data_manager_test/communicator_test/communicator_test.py index ef9a75c..96a5a9d 100644 --- a/test/integration_tests/data_manager_test/communicator_test/communicator_test.py +++ b/test/integration_tests/data_manager_test/communicator_test/communicator_test.py @@ -1,6 +1,6 @@ import pytest -from context_search.communicator import Communicator +from context_search.communicator import CommAdapterNeo from context_search.data_classes import Literature from context_search.communicator.query_builder import QueryBuilder @@ -14,7 +14,7 @@ def setup_test(test): def test_communicator_creation(test): test.setup_test() - test.communicator = Communicator(test.uri, test.user, test.password) + test.communicator = CommAdapterNeo(test.uri, test.user, test.password) assert test.communicator is not None test.communicator = None @@ -24,7 +24,7 @@ def setup_test(test): test.uri = "neo4j://database:7687" test.user = "neo4j" test.password = "StrongPsPsP5" - test.communicator = Communicator(test.uri, test.user, test.password) + test.communicator = CommAdapterNeo(test.uri, test.user, test.password) test.literature = Literature( filename="test", filepath="test" @@ -33,7 +33,7 @@ def setup_test(test): def test_add_literature(test): test.setup_test() test.session = test.communicator.driver.session(database="neo4j") - test.session.write_transaction( + test.session.execute_write( QueryBuilder._merge_literature, test.literature ) @@ -43,7 
+43,7 @@ def test_add_literature(test): def test_get_literature(test): test.setup_test() test.session = test.communicator.driver.session(database="neo4j") - test.session.write_transaction( + test.session.execute_write( QueryBuilder._merge_literature, test.literature ) @@ -55,7 +55,7 @@ def test_get_literature(test): def test_get_all_literatures(test): test.setup_test() test.session = test.communicator.driver.session(database="neo4j") - test.session.write_transaction( + test.session.execute_write( QueryBuilder._merge_literature, test.literature ) diff --git a/test/unit_tests/data_manager_test/communicator_test/communicator_test.py b/test/unit_tests/data_manager_test/communicator_test/communicator_test.py index d1f24a4..82a55ee 100644 --- a/test/unit_tests/data_manager_test/communicator_test/communicator_test.py +++ b/test/unit_tests/data_manager_test/communicator_test/communicator_test.py @@ -2,7 +2,7 @@ from unittest.mock import MagicMock, patch -from context_search.communicator import Communicator +from context_search.communicator import CommAdapterNeo class TestCommunicatorConnection: @@ -14,7 +14,7 @@ def setup_test(test): def test_communicator_creation(test): test.setup_test() - test.communicator = Communicator(test.uri, test.user, test.password) + test.communicator = CommAdapterNeo(test.uri, test.user, test.password) assert test.communicator is not None test.communicator = None @@ -30,7 +30,7 @@ def setup_test(test): test.uri = "neo4j://database:7687" test.user = "neo4j" test.password = "StrongPsPsP5" - test.communicator = Communicator(test.uri, test.user, test.password) + test.communicator = CommAdapterNeo(test.uri, test.user, test.password) def test_driver_property_invocation(test, mock_driver): test.setup_test()