Commit ee963cd

Submitting fastembed and new tests to the master (#25)
* make file and poetry build

* Adding curl to dockerfiles

* Deploy version for arm server deployment

* Correcting text splitter parameters

* Replacing pytorch and transformers with fastembed

* Additional tests and the setter method for text splitter

* Correcting user experience of TextSplitter and adding tests

* update poetry lock
ArturOle authored Nov 5, 2024
1 parent 0669432 commit ee963cd
Showing 14 changed files with 717 additions and 2,235 deletions.
2,711 changes: 558 additions & 2,153 deletions poetry.lock

Large diffs are not rendered by default.

15 changes: 7 additions & 8 deletions pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "ContextSearch"
-version = "0.5.0"
+version = "0.6.0"
 description = "User friendly system for semantic search."
 authors = [
     "ArturOle"
@@ -11,19 +11,15 @@ keywords = ["semantic search", "ocr", "rag", "document-embedding", "contextual-s
 packages = [{include = "context_search", from="src"}]

 [tool.poetry.dependencies]
-python = "^3.10"
+python = ">=3.10,<3.13"
 neo4j = "^5.25.0"
 pdf2image = "^1.17.0"
 pydantic = "^2.9.2"
 PyMuPDF = "^1.24.10"
 pytesseract = "^0.3.13"
-pytextrank = "^3.3.0"
-spacy = "^3.8.2"
 tqdm = "^4.66.5"
-transformers = "^4.45.1"
-numpy = "^2.0.2"
-torch = { version = "^2.3.1", source = "torch"}
-en_core_web_sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz"}
+numpy = ">=1.21,<2"
+fastembed = "0.4.1"

 [tool.poetry.dev-dependencies]
 pytest = "^8.3.2"
@@ -38,3 +34,6 @@ priority = "supplemental"
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
+
+[tool.pylint.format]
+max-module-lines = 99
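
Note: the tightened python (<3.13) and numpy (<2) bounds presumably track what fastembed 0.4.1 and its onnxruntime backend supported at the time; with torch, transformers, spacy, pytextrank, and the bundled en_core_web_sm model dropped, the shrunken poetry.lock above follows directly.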
56 changes: 35 additions & 21 deletions src/context_search/preprocessor/embedder.py
@@ -1,29 +1,41 @@
-import torch
-from transformers import AutoModel, AutoTokenizer
+
+from fastembed import TextEmbedding
 from typing import List

 from ..data_classes import Embeddable


-class Embedder:
-    def __init__(self, model_id="intfloat/e5-base-v2"):
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
-        self.model = AutoModel.from_pretrained(model_id).to(self.device)
-        self.model.eval()
+class AbstractEmbedder:
+    def embed(self, text):
+        raise NotImplementedError
+
+    def produce_embeddings(
+        self,
+        embeddable_objs: List[Embeddable]
+    ) -> List[Embeddable]:
+        raise NotImplementedError
+
+
+class ImageEmbedder(AbstractEmbedder):
+    def embed(self, text):
+        raise NotImplementedError
+
+    def produce_embeddings(
+        self,
+        embeddable_objs: List[Embeddable]
+    ) -> List[Embeddable]:
+        raise NotImplementedError
+
+
+class TextEmbedder:
+    def __init__(self, model_id="sentence-transformers/all-MiniLM-L6-v2"):
+        self.model = TextEmbedding(
+            model_name=model_id
+        )

     def embed(self, text):
         """Embeds the given text using the model."""
-        with torch.no_grad():
-            inputs = self.tokenizer(
-                text,
-                return_tensors="pt",
-                padding=True,
-                truncation=True
-            ).to(self.device)
-            outputs = self.model(**inputs)
-            squeezed_output = outputs.last_hidden_state.mean(dim=1).squeeze()
-            return squeezed_output.cpu().tolist()
+        return list(self.model.embed(text))

     def __call__(self, doc):
         doc._.embedding = self.embed(doc.text)
@@ -35,9 +47,11 @@ def produce_embeddings(
     ) -> List[Embeddable]:
         """Produces embeddings for the given list of Embeddable objects."""

+        embeddings = self.embed([
+            embeddable_obj.text for embeddable_obj in embeddable_objs
+        ])
+
         for embeddable_obj in embeddable_objs:
-            embeddable_obj.embeddings = self.embed(
-                embeddable_obj.text
-            )
+            embeddable_obj.embeddings = embeddings.pop(0)

         return embeddable_objs
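
Note: fastembed's TextEmbedding.embed accepts a single string or an iterable of strings and lazily yields one NumPy vector per document, which is why embed wraps the generator in list(...) and why produce_embeddings can batch every chunk text into a single call. A minimal sketch of the new flow (the sample sentences are illustrative, not from the test suite):

from fastembed import TextEmbedding

# Downloads the ONNX model on first use, then loads it.
model = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# One call embeds the whole batch; each element is a numpy.ndarray.
vectors = list(model.embed(["first passage", "second passage"]))
print(len(vectors), vectors[0].shape)  # 2 (384,) for all-MiniLM-L6-v2

One caveat on the produce_embeddings change: embeddings.pop(0) is O(n) per call on a Python list, so iterating with zip(embeddable_objs, embeddings) would pair chunks with vectors without the quadratic shuffling on large batches.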
13 changes: 6 additions & 7 deletions src/context_search/preprocessor/extractor.py
@@ -1,6 +1,6 @@

-import spacy
-import pytextrank # noqa: F401
+# import spacy
+# import pytextrank # noqa: F401

 from typing import List, Tuple

@@ -9,15 +9,14 @@
 class Extractor:
     def __init__(self):
-        self.nlp = spacy.load("en_core_web_sm")
-        self.nlp.add_pipe("textrank")
+        pass

     def extract_keywords(self, text_list: List[str]) -> list:
         ranked_phrases = []

-        doc = self.nlp(''.join(text_list))
-        for phrase in doc._.phrases:
-            ranked_phrases.append([phrase.text, phrase.rank])
+        # # doc = self.nlp(''.join(text_list))
+        # for phrase in doc._.phrases:
+        #     ranked_phrases.append([phrase.text, phrase.rank])

         return ranked_phrases
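
Note: with spacy and pytextrank commented out, extract_keywords now always returns an empty list; the Extractor survives as a stub, presumably so the Preprocessor wiring stays intact until a replacement keyword extractor is added.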
13 changes: 8 additions & 5 deletions src/context_search/preprocessor/preprocessor.py
@@ -4,18 +4,21 @@
     LiteratureGraph,
 )
 from .extractor import Extractor
-from .embedder import Embedder
+from .embedder import TextEmbedder
 from .text_splitter import TextSplitter


 class Preprocessor:
     def __init__(self):
-        self.embedder = Embedder()
+        self.embedder = TextEmbedder()
         self.extractor = Extractor()
         self.splitter = TextSplitter(
-            order="any",
-            separators=['\.', '\n\n', '\n', '\s'],
-            is_separator_regex=True
+            order="sequential",
+            separators=['\n\n', '\n', '\.', '\s'],
+            is_separator_regex=True,
+            chunk_size=1024,
+            chunk_overlap=128,
+            margin=128
         )

     def process(self, literatures: list[LiteratureDTO]):
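
Note: the splitter now runs with order="sequential", which presumably tries separators in their listed priority (paragraph break, line break, sentence end, whitespace) rather than matching any of them, and the chunk geometry is spelled out explicitly. A hedged sketch of standalone use; produce_chunks is the entry point exercised by the tests below, the sample text is illustrative, and the parameter roles in the comments are inferred from their names:

from context_search.preprocessor.text_splitter import TextSplitter

splitter = TextSplitter(
    order="sequential",                     # try separators in listed priority
    separators=['\n\n', '\n', '\.', '\s'],  # paragraph > line > sentence > word
    is_separator_regex=True,
    chunk_size=1024,                        # target chunk length
    chunk_overlap=128,                      # shared tail between neighbouring chunks
    margin=128,                             # slack around the nominal cut point
)
chunks = splitter.produce_chunks(["one long input text ..."])  # -> list of Chunk objects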
19 changes: 15 additions & 4 deletions src/context_search/preprocessor/text_splitter.py
@@ -1,4 +1,4 @@
-import inspect
+
 import re

 from abc import ABC, abstractmethod
@@ -51,15 +51,16 @@ def __init__(
         self,
         chunk_size: int = 1024,
         chunk_overlap: Union[int, float] = 256,
-        margin: int = 256,
+        margin: int = None,
         order: str = "any",
         separators: List[str] = ['\.', '\n\n', '\n', '\s'],
         is_separator_regex: bool = True,
     ):
         self.chunk_size = chunk_size
         self.overlap = chunk_overlap
+        margin = margin if margin is not None else self.overlap
         self.margin = margin
-        self.order = order
+        self._order = order
         self._is_separator_regex = is_separator_regex
         self.separators = separators

@@ -92,11 +93,21 @@ def __init__(
         self.search_func: callable = None
         self.setup_separators(separators)

+    @property
+    def order(self):
+        return self._order
+
+    @order.setter
+    def order(self, value):
+        """ Assures the separators are compiled for the new order strategy."""
+        self._order = value
+        self.setup_separators(self.separators)
+
     def setup_separators(self, separators):
         """ Prepares compiled patterns for efficient search of the separators
         and sets the search function based on the order of the separators.
         """
-        match self.order.lower():
+        match self._order.lower():
             case "any":
                 if not self._is_separator_regex:
                     separators = '|'.join([
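
Note: order is now a managed property, so reassigning it on a live splitter recompiles the separator patterns instead of leaving stale ones behind; this is the user-experience fix named in the commit message. A small illustrative sketch:

splitter = TextSplitter(order="any")
splitter.order = "sequential"  # setter re-runs setup_separators() for the new strategy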
@@ -3,7 +3,7 @@
 from context_search.data_classes import LiteratureDTO
 import numpy as np
 import random
-import pytest
+import pytest # noqa: F401

 random.seed(0)
 np.random.seed(0)
@@ -23,13 +23,6 @@ def test_preprocessing_steps():
     assert len(literature.chunks) == 1
     assert literature.chunks[0].text == "This is a test text"
     assert literature.chunks[0].page_number == 0
-    assert len(literature.tags) == 2
-    assert literature.tags[0].text == "a test text"
-    assert pytest.approx(literature.tags[0].embeddings[:2], 1e-3) ==\
-        [-0.23820725, -0.3175099]
-    assert literature.relation_weights[0].literature == "name"
-    assert literature.relation_weights[0].tag == "a test text"
-    assert literature.relation_weights[0].weight == 0.25


 def test_multiple_file_processing():
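
Note: the tag and relation-weight assertions disappear because keyword extraction is stubbed out in this commit (see the extractor.py diff above), so processed literature currently carries no tags to assert on.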
@@ -140,7 +140,7 @@ def test_literature_graph_creation_incorrect():
             filepath=r'f:\ile\pat.h'
         ),
         chunks=[Chunk(text='text')],
-        tags=[Chunk(text='text')], # should be a list of Tag
+        tags=[Chunk(text='text')],  # should be a list of Tag
         relation_weights=[RelationWeight(
             literature='literature',
             tag='tag',
@@ -1,4 +1,5 @@
-from context_search.preprocessor.embedder import Embedder
+from context_search.preprocessor.embedder import TextEmbedder
+from context_search.data_classes import Chunk
 import pytest
 import numpy as np
 import random
@@ -11,9 +12,30 @@


 def test_creating_embeddings_from_text():
-    embedder = Embedder()
+    embedder = TextEmbedder()
     embedding = embedder.embed(text_to_embed)

-    assert pytest.approx(embedding, 1e-3) == np.load(
+    assert pytest.approx(embedding[0], 1e-3) == np.load(
         "test/unit_tests/data_manager_test/preprocessor_test/test_embedding.npy"
     )
+
+
+def test_creating_embeddings_from_multiple_texts():
+    embedder = TextEmbedder()
+    chunk_1 = Chunk(text=text_to_embed)
+    chunk_2 = Chunk(text=text_to_embed)
+
+    embeddings = embedder.produce_embeddings([chunk_1, chunk_2])
+
+    assert len(embeddings) == 2
+    assert pytest.approx(embeddings[0].embeddings, 1e-3) == np.load(
+        "test/unit_tests/data_manager_test/preprocessor_test/test_embedding.npy"
+    )
+    assert pytest.approx(embeddings[1].embeddings, 1e-3) == np.load(
+        "test/unit_tests/data_manager_test/preprocessor_test/test_embedding.npy"
+    )
+
+
+if __name__ == "__main__":
+    test_creating_embeddings_from_text()
+    test_creating_embeddings_from_multiple_texts()
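Note: TextEmbedder.embed returns a list of vectors even for a single input string, since fastembed always yields per-document embeddings, hence the comparison against embedding[0] rather than embedding.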
This file was deleted.

@@ -9,3 +9,13 @@ def test_produce_chunks():

     assert len(chunks) == 1
     assert isinstance(chunks[0], Chunk)
+
+
+def test_produce_chunks_from_multiple_strings():
+    pipeline = Preprocessor()
+    texts = ['This is a test text', 'This is another test text']
+    chunks = pipeline.splitter.produce_chunks(texts)
+
+    assert len(chunks) == 2
+    assert isinstance(chunks[0], Chunk)
+    assert isinstance(chunks[1], Chunk)
Binary file not shown (presumably the regenerated test_embedding.npy reference used by the embedder tests).