Commit ee963cd

Submitting fastembed and new tests to the master (#25)
* make file and poetry build

* Adding curl to dockerfiles

* Deploy version for arm server deployment

* Correcting text splitter parameters

* Replacing pytorch and transformers with fastembed

* Additional tests and the setter method for text splitter

* Correcting user experience of TextSplitter and adding tests

* update poetry lock
ArturOle authored Nov 5, 2024
1 parent 0669432 commit ee963cd
Showing 14 changed files with 717 additions and 2,235 deletions.
2,711 changes: 558 additions & 2,153 deletions poetry.lock

Large diffs are not rendered by default.

15 changes: 7 additions & 8 deletions pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "ContextSearch"
-version = "0.5.0"
+version = "0.6.0"
 description = "User friendly system for semantic search."
 authors = [
     "ArturOle"
@@ -11,19 +11,15 @@ keywords = ["semantic search", "ocr", "rag", "document-embedding", "contextual-s
 packages = [{include = "context_search", from="src"}]

 [tool.poetry.dependencies]
-python = "^3.10"
+python = ">=3.10,<3.13"
 neo4j = "^5.25.0"
 pdf2image = "^1.17.0"
 pydantic = "^2.9.2"
 PyMuPDF = "^1.24.10"
 pytesseract = "^0.3.13"
-pytextrank = "^3.3.0"
-spacy = "^3.8.2"
 tqdm = "^4.66.5"
-transformers = "^4.45.1"
-numpy = "^2.0.2"
-torch = { version = "^2.3.1", source = "torch"}
-en_core_web_sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz"}
+numpy = ">=1.21,<2"
+fastembed = "0.4.1"

 [tool.poetry.dev-dependencies]
 pytest = "^8.3.2"
@@ -38,3 +34,6 @@ priority = "supplemental"
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
+
+[tool.pylint.format]
+max-module-lines = 99
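
Note: the tightened python (<3.13) and numpy (<2) bounds presumably track what fastembed 0.4.1 and its onnxruntime backend supported at the time; with torch, transformers, spacy, pytextrank, and the bundled en_core_web_sm model dropped, the shrunken poetry.lock above follows directly.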
56 changes: 35 additions & 21 deletions src/context_search/preprocessor/embedder.py
@@ -1,29 +1,41 @@
-import torch
-from transformers import AutoModel, AutoTokenizer
+
+from fastembed import TextEmbedding
 from typing import List

 from ..data_classes import Embeddable


-class Embedder:
-    def __init__(self, model_id="intfloat/e5-base-v2"):
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
-        self.model = AutoModel.from_pretrained(model_id).to(self.device)
-        self.model.eval()
+class AbstractEmbedder:
+    def embed(self, text):
+        raise NotImplementedError
+
+    def produce_embeddings(
+        self,
+        embeddable_objs: List[Embeddable]
+    ) -> List[Embeddable]:
+        raise NotImplementedError
+
+
+class ImageEmbedder(AbstractEmbedder):
+    def embed(self, text):
+        raise NotImplementedError
+
+    def produce_embeddings(
+        self,
+        embeddable_objs: List[Embeddable]
+    ) -> List[Embeddable]:
+        raise NotImplementedError
+
+
+class TextEmbedder:
+    def __init__(self, model_id="sentence-transformers/all-MiniLM-L6-v2"):
+        self.model = TextEmbedding(
+            model_name=model_id
+        )

     def embed(self, text):
         """Embeds the given text using the model."""
-        with torch.no_grad():
-            inputs = self.tokenizer(
-                text,
-                return_tensors="pt",
-                padding=True,
-                truncation=True
-            ).to(self.device)
-            outputs = self.model(**inputs)
-            squeezed_output = outputs.last_hidden_state.mean(dim=1).squeeze()
-            return squeezed_output.cpu().tolist()
+        return list(self.model.embed(text))

     def __call__(self, doc):
         doc._.embedding = self.embed(doc.text)
@@ -35,9 +47,11 @@ def produce_embeddings(
     ) -> List[Embeddable]:
         """Produces embeddings for the given list of Embeddable objects."""

+        embeddings = self.embed([
+            embeddable_obj.text for embeddable_obj in embeddable_objs
+        ])
+
         for embeddable_obj in embeddable_objs:
-            embeddable_obj.embeddings = self.embed(
-                embeddable_obj.text
-            )
+            embeddable_obj.embeddings = embeddings.pop(0)

         return embeddable_objs
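
Note: fastembed's TextEmbedding.embed accepts a single string or an iterable of strings and lazily yields one NumPy vector per document, which is why embed wraps the generator in list(...) and why produce_embeddings can batch every chunk text into a single call. A minimal sketch of the new flow (the sample sentences are illustrative, not from the test suite):

from fastembed import TextEmbedding

# Downloads the ONNX model on first use, then loads it.
model = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# One call embeds the whole batch; each element is a numpy.ndarray.
vectors = list(model.embed(["first passage", "second passage"]))
print(len(vectors), vectors[0].shape)  # 2 (384,) for all-MiniLM-L6-v2

One caveat on the produce_embeddings change: embeddings.pop(0) is O(n) per call on a Python list, so iterating with zip(embeddable_objs, embeddings) would pair chunks with vectors without the quadratic shuffling on large batches.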
13 changes: 6 additions & 7 deletions src/context_search/preprocessor/extractor.py
@@ -1,6 +1,6 @@

-import spacy
-import pytextrank # noqa: F401
+# import spacy
+# import pytextrank # noqa: F401

 from typing import List, Tuple

@@ -9,15 +9,14 @@
 class Extractor:
     def __init__(self):
-        self.nlp = spacy.load("en_core_web_sm")
-        self.nlp.add_pipe("textrank")
+        pass

     def extract_keywords(self, text_list: List[str]) -> list:
         ranked_phrases = []

-        doc = self.nlp(''.join(text_list))
-        for phrase in doc._.phrases:
-            ranked_phrases.append([phrase.text, phrase.rank])
+        # # doc = self.nlp(''.join(text_list))
+        # for phrase in doc._.phrases:
+        #     ranked_phrases.append([phrase.text, phrase.rank])

         return ranked_phrases
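
Note: with spacy and pytextrank commented out, extract_keywords now always returns an empty list; the Extractor survives as a stub, presumably so the Preprocessor wiring stays intact until a replacement keyword extractor is added.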
13 changes: 8 additions & 5 deletions src/context_search/preprocessor/preprocessor.py
@@ -4,18 +4,21 @@
     LiteratureGraph,
 )
 from .extractor import Extractor
-from .embedder import Embedder
+from .embedder import TextEmbedder
 from .text_splitter import TextSplitter


 class Preprocessor:
     def __init__(self):
-        self.embedder = Embedder()
+        self.embedder = TextEmbedder()
         self.extractor = Extractor()
         self.splitter = TextSplitter(
-            order="any",
-            separators=['\.', '\n\n', '\n', '\s'],
-            is_separator_regex=True
+            order="sequential",
+            separators=['\n\n', '\n', '\.', '\s'],
+            is_separator_regex=True,
+            chunk_size=1024,
+            chunk_overlap=128,
+            margin=128
         )

     def process(self, literatures: list[LiteratureDTO]):
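
Note: the splitter now runs with order="sequential", which presumably tries separators in their listed priority (paragraph break, line break, sentence end, whitespace) rather than matching any of them, and the chunk geometry is spelled out explicitly. A hedged sketch of standalone use; produce_chunks is the entry point exercised by the tests below, the sample text is illustrative, and the parameter roles in the comments are inferred from their names:

from context_search.preprocessor.text_splitter import TextSplitter

splitter = TextSplitter(
    order="sequential",                     # try separators in listed priority
    separators=['\n\n', '\n', '\.', '\s'],  # paragraph > line > sentence > word
    is_separator_regex=True,
    chunk_size=1024,                        # target chunk length
    chunk_overlap=128,                      # shared tail between neighbouring chunks
    margin=128,                             # slack around the nominal cut point
)
chunks = splitter.produce_chunks(["one long input text ..."])  # -> list of Chunk objects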
19 changes: 15 additions & 4 deletions src/context_search/preprocessor/text_splitter.py
@@ -1,4 +1,4 @@
-import inspect
+
 import re

 from abc import ABC, abstractmethod
@@ -51,15 +51,16 @@ def __init__(
         self,
         chunk_size: int = 1024,
         chunk_overlap: Union[int, float] = 256,
-        margin: int = 256,
+        margin: int = None,
         order: str = "any",
         separators: List[str] = ['\.', '\n\n', '\n', '\s'],
         is_separator_regex: bool = True,
     ):
         self.chunk_size = chunk_size
         self.overlap = chunk_overlap
+        margin = margin if margin is not None else self.overlap
         self.margin = margin
-        self.order = order
+        self._order = order
         self._is_separator_regex = is_separator_regex
         self.separators = separators

@@ -92,11 +93,21 @@ def __init__(
         self.search_func: callable = None
         self.setup_separators(separators)

+    @property
+    def order(self):
+        return self._order
+
+    @order.setter
+    def order(self, value):
+        """ Assures the separators are compiled for the new order strategy."""
+        self._order = value
+        self.setup_separators(self.separators)
+
     def setup_separators(self, separators):
         """ Prepares compiled patterns for efficient search of the separators
         and sets the search function based on the order of the separators.
         """
-        match self.order.lower():
+        match self._order.lower():
             case "any":
                 if not self._is_separator_regex:
                     separators = '|'.join([
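
Note: order is now a managed property, so reassigning it on a live splitter recompiles the separator patterns instead of leaving stale ones behind; this is the user-experience fix named in the commit message. A small illustrative sketch:

splitter = TextSplitter(order="any")
splitter.order = "sequential"  # setter re-runs setup_separators() for the new strategy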
@@ -3,7 +3,7 @@
 from context_search.data_classes import LiteratureDTO
 import numpy as np
 import random
-import pytest
+import pytest # noqa: F401

 random.seed(0)
 np.random.seed(0)
@@ -23,13 +23,6 @@ def test_preprocessing_steps():
     assert len(literature.chunks) == 1
     assert literature.chunks[0].text == "This is a test text"
     assert literature.chunks[0].page_number == 0
-    assert len(literature.tags) == 2
-    assert literature.tags[0].text == "a test text"
-    assert pytest.approx(literature.tags[0].embeddings[:2], 1e-3) ==\
-        [-0.23820725, -0.3175099]
-    assert literature.relation_weights[0].literature == "name"
-    assert literature.relation_weights[0].tag == "a test text"
-    assert literature.relation_weights[0].weight == 0.25


 def test_multiple_file_processing():
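
Note: the tag and relation-weight assertions disappear because keyword extraction is stubbed out in this commit (see the extractor.py diff above), so processed literature currently carries no tags to assert on.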
@@ -140,7 +140,7 @@ def test_literature_graph_creation_incorrect():
             filepath=r'f:\ile\pat.h'
         ),
         chunks=[Chunk(text='text')],
-        tags=[Chunk(text='text')], # should be a list of Tag
+        tags=[Chunk(text='text')],  # should be a list of Tag
         relation_weights=[RelationWeight(
             literature='literature',
             tag='tag',
@@ -1,4 +1,5 @@
-from context_search.preprocessor.embedder import Embedder
+from context_search.preprocessor.embedder import TextEmbedder
+from context_search.data_classes import Chunk
 import pytest
 import numpy as np
 import random
@@ -11,9 +12,30 @@


 def test_creating_embeddings_from_text():
-    embedder = Embedder()
+    embedder = TextEmbedder()
     embedding = embedder.embed(text_to_embed)

-    assert pytest.approx(embedding, 1e-3) == np.load(
+    assert pytest.approx(embedding[0], 1e-3) == np.load(
         "test/unit_tests/data_manager_test/preprocessor_test/test_embedding.npy"
     )
+
+
+def test_creating_embeddings_from_multiple_texts():
+    embedder = TextEmbedder()
+    chunk_1 = Chunk(text=text_to_embed)
+    chunk_2 = Chunk(text=text_to_embed)
+
+    embeddings = embedder.produce_embeddings([chunk_1, chunk_2])
+
+    assert len(embeddings) == 2
+    assert pytest.approx(embeddings[0].embeddings, 1e-3) == np.load(
+        "test/unit_tests/data_manager_test/preprocessor_test/test_embedding.npy"
+    )
+    assert pytest.approx(embeddings[1].embeddings, 1e-3) == np.load(
+        "test/unit_tests/data_manager_test/preprocessor_test/test_embedding.npy"
+    )
+
+
+if __name__ == "__main__":
+    test_creating_embeddings_from_text()
+    test_creating_embeddings_from_multiple_texts()
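Note: TextEmbedder.embed returns a list of vectors even for a single input string, since fastembed always yields per-document embeddings, hence the comparison against embedding[0] rather than embedding.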
This file was deleted.

@@ -9,3 +9,13 @@ def test_produce_chunks():

     assert len(chunks) == 1
     assert isinstance(chunks[0], Chunk)
+
+
+def test_produce_chunks_from_multiple_strings():
+    pipeline = Preprocessor()
+    texts = ['This is a test text', 'This is another test text']
+    chunks = pipeline.splitter.produce_chunks(texts)
+
+    assert len(chunks) == 2
+    assert isinstance(chunks[0], Chunk)
+    assert isinstance(chunks[1], Chunk)
Binary file not shown (presumably the regenerated test_embedding.npy reference used by the embedder tests).