diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3f50bd1..20e1dd2 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -35,6 +35,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip + python -m pip install ruff python -m pip install .[dev] - name: Check quality @@ -45,7 +46,7 @@ jobs: needs: check_code_quality env: - OPENAI_BASE_URL: https://ai-yyds.com/v1 + OPENAI_BASE_URL: ${{ secrets.OPENAI_BASE_URL }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} DEFAULT_EMBED_MODEL: text-embedding-ada-002 DEFAULT_CHAT_MODEL: gpt-3.5-turbo @@ -67,6 +68,7 @@ jobs: ports: - 6379:6379 + runs-on: ubuntu-latest steps: @@ -83,7 +85,10 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install .[dev] + python -m pip install -e . + python -m pip install -r requirements.txt + python -m pip install -r requirements-dev.txt + python -m pip install pytest wget https://github.com/milvus-io/milvus/releases/download/v2.4.4/milvus-standalone-docker-compose.yml -O docker-compose.yml sudo docker compose up -d @@ -91,13 +96,12 @@ jobs: run: | make test - - name: Configure sysctl limits + - name: Configure Elasticsearch run: | sudo swapoff -a sudo sysctl -w vm.swappiness=1 sudo sysctl -w fs.file-max=262144 sudo sysctl -w vm.max_map_count=262144 - - name: Runs Elasticsearch uses: elastic/elastic-github-actions/elasticsearch@master with: diff --git a/src/cardinal/splitter/text_splitter.py b/src/cardinal/splitter/text_splitter.py index c62c151..83532d2 100644 --- a/src/cardinal/splitter/text_splitter.py +++ b/src/cardinal/splitter/text_splitter.py @@ -15,10 +15,10 @@ class TextSplitter: https://github.com/langchain-ai/langchain/blob/v0.1.5/libs/langchain/langchain/text_splitter.py """ - def __init__(self, chunk_size: Optional[int] = None, chuck_overlap: Optional[int] = None) -> None: + def __init__(self, chunk_size: Optional[int] = None, chunk_overlap: Optional[int] = None) -> None: self._separators = ["\n\n", "\n", ". ", ", ", " ", ""] self._chunk_size = chunk_size if chunk_size is not None else settings.default_chunk_size - self._chunk_overlap = chuck_overlap if chuck_overlap is not None else settings.default_chunk_overlap + self._chunk_overlap = chunk_overlap if chunk_overlap is not None else settings.default_chunk_overlap assert self._chunk_overlap < self._chunk_size, "chunk overlap must be larger than chunk size" self._counter = TokenCounter() diff --git a/tests/.idea/.gitignore b/tests/.idea/.gitignore deleted file mode 100644 index 26d3352..0000000 --- a/tests/.idea/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml diff --git a/tests/.idea/inspectionProfiles/Project_Default.xml b/tests/.idea/inspectionProfiles/Project_Default.xml deleted file mode 100644 index 4f00b2d..0000000 --- a/tests/.idea/inspectionProfiles/Project_Default.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - \ No newline at end of file diff --git a/tests/.idea/inspectionProfiles/profiles_settings.xml b/tests/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2d..0000000 --- a/tests/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/tests/.idea/misc.xml b/tests/.idea/misc.xml deleted file mode 100644 index 7e83473..0000000 --- a/tests/.idea/misc.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file diff --git a/tests/.idea/modules.xml b/tests/.idea/modules.xml deleted file mode 100644 index dac5cbb..0000000 --- a/tests/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/tests/.idea/tests.iml b/tests/.idea/tests.iml deleted file mode 100644 index 68a3566..0000000 --- a/tests/.idea/tests.iml +++ /dev/null @@ -1,11 +0,0 @@ - - - - - - - - - - \ No newline at end of file diff --git a/tests/.idea/vcs.xml b/tests/.idea/vcs.xml deleted file mode 100644 index 6c0b863..0000000 --- a/tests/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/tests/collector/test_base_collector.py b/tests/collector/test_base_collector.py new file mode 100644 index 0000000..6a7cad0 --- /dev/null +++ b/tests/collector/test_base_collector.py @@ -0,0 +1,20 @@ +from pydantic import BaseModel +from typing import List +from cardinal.collector import BaseCollector +from cardinal.common import BaseMessage, AssistantMessage, HumanMessage + +class History(BaseModel): + messages: List[BaseMessage] + +def test_base_collector(): + collector = BaseCollector[History](storage_name="test", drop_old=True) + messages = [HumanMessage(content="hi"), AssistantMessage(content="hi there")] + history1 = History(messages=messages) + collector.collect(history1) + messages = [HumanMessage(content="foo"), AssistantMessage(content="foo too")] + history2 = History(messages=messages) + collector.collect(history2) + results = collector.dump() + assert(results[0] == history1) + assert(results[1] == history2) + collector._storage.destroy() diff --git a/tests/collector/test_msg_collector.py b/tests/collector/test_msg_collector.py deleted file mode 100644 index 49fab45..0000000 --- a/tests/collector/test_msg_collector.py +++ /dev/null @@ -1,14 +0,0 @@ -from cardinal.collector import MsgCollector -from cardinal.common import HumanMessage, AssistantMessage - -def test_msg_collector(): - collector = MsgCollector(storage_name="test", drop_old=True) - messages = [HumanMessage(content="hi"), AssistantMessage(content="hi there")] - collector.collect(messages) - messages = [HumanMessage(content="foo"), AssistantMessage(content="foo too")] - collector.collect(messages) - results = collector.dump() - assert(results[0][0].content=='hi') - assert(results[0][1].content=='hi there') - assert(results[1][0].content=='foo') - assert(results[1][1].content=='foo too') diff --git a/tests/model/test_embed_openai.py b/tests/model/test_embed_openai.py index a1866fb..10c6f5d 100644 --- a/tests/model/test_embed_openai.py +++ b/tests/model/test_embed_openai.py @@ -2,7 +2,6 @@ import pytest -@pytest.mark.skip(reason="no permission") def test_embed_openai(): embed_openai = EmbedOpenAI() assert(embed_openai.batch_embed(["This is a test"]) is not None) diff --git a/tests/retriever/test_dense_retriever.py b/tests/retriever/test_dense_retriever.py index 004a1da..0968828 100644 --- a/tests/retriever/test_dense_retriever.py +++ b/tests/retriever/test_dense_retriever.py @@ -1,7 +1,6 @@ from pydantic import BaseModel from cardinal.vectorstore import AutoVectorStore from cardinal.retriever import DenseRetriever -import pytest class Animal(BaseModel): name: str @@ -10,9 +9,9 @@ class Animal(BaseModel): data = [Animal(name=text) for text in texts] -@pytest.mark.skip(reason="no permission") def test_dense_retriever(): - vectorstore = AutoVectorStore[Animal].create(name="test", texts=texts, data=data, drop_old=True) + vectorStore = AutoVectorStore[Animal].create(name="test", texts=texts, data=data, drop_old=True) retriever = DenseRetriever[Animal](vectorstore_name="test", verbose=True) - assert(retriever.retrieve(query="dog", top_k=1)[0] == data[1]) + assert(retriever.retrieve(query="dog", top_k=1) == [data[1]]) + vectorStore.destroy() \ No newline at end of file diff --git a/tests/retriever/test_hybird_retriever.py b/tests/retriever/test_hybird_retriever.py index 66b3d4a..751b9a7 100644 --- a/tests/retriever/test_hybird_retriever.py +++ b/tests/retriever/test_hybird_retriever.py @@ -1,7 +1,6 @@ from pydantic import BaseModel from cardinal.vectorstore import AutoVectorStore from cardinal.retriever import HybridRetriever -import pytest class Animal(BaseModel): @@ -12,11 +11,12 @@ class Animal(BaseModel): data = [Animal(name=name, color=color) for name, color in animals] -@pytest.mark.skip(reason="no permission") def test_hybird_retriever(): names = [animal.name for animal in data] colors = [animal.color for animal in data] - AutoVectorStore[Animal].create(name="test1", texts=names, data=data, drop_old=True) - AutoVectorStore[Animal].create(name="test2", texts=colors, data=data, drop_old=True) + store1 = AutoVectorStore[Animal].create(name="test1", texts=names, data=data, drop_old=True) + store2 = AutoVectorStore[Animal].create(name="test2", texts=colors, data=data, drop_old=True) retriever = HybridRetriever[Animal](vectorstore_names=["test1", "test2"], verbose=True) - print(retriever.retrieve(query="a pink dog", top_k=2)) + assert(retriever.retrieve(query="a pink dog", top_k=2) == [data[2], data[3]]) + store1.destroy() + store2.destroy() \ No newline at end of file diff --git a/tests/retriever/test_sparse_retriever.py b/tests/retriever/test_sparse_retriever.py index 3a0c373..a9f79a3 100644 --- a/tests/retriever/test_sparse_retriever.py +++ b/tests/retriever/test_sparse_retriever.py @@ -20,6 +20,8 @@ class Document(BaseModel): def test_sparse_retriever(): storage = AutoStorage[Document](name="test") storage.insert(keys=["doc1", "doc2"], values=[doc1, doc2]) + if ENV_STORAGE == 'es': + storage._storage.database.indices.refresh() retriever = SparseRetriever(storage_name="test", verbose=True) - assert(retriever.retrieve is not None) - \ No newline at end of file + assert(retriever.retrieve(query="alice", top_k=1) == [doc1]) + storage.destroy() \ No newline at end of file diff --git a/tests/splitter/test_text_splitter.py b/tests/splitter/test_text_splitter.py index e8c1128..3e04f06 100644 --- a/tests/splitter/test_text_splitter.py +++ b/tests/splitter/test_text_splitter.py @@ -2,7 +2,7 @@ def test_text_splitter(): - splitter = CJKTextSplitter(chunk_size=30, chuck_overlap=10) + splitter = CJKTextSplitter(chunk_size=30, chunk_overlap=10) text = ( "The document presents FastEdit, a repository aimed at efficiently injecting " "fresh and customized knowledge into large language models using a single command. " diff --git a/tests/storage/test_storage.py b/tests/storage/test_storage.py index e36de7d..44aefa7 100644 --- a/tests/storage/test_storage.py +++ b/tests/storage/test_storage.py @@ -12,13 +12,15 @@ class Document(BaseModel): def test_storage(): storage = AutoStorage[Document](name="test") - + assert(not storage.exists()) # False storage.insert(keys=["doc1", "doc2"], values=[doc1, doc2]) - assert(storage.query("doc1")==doc1) - storage.clear() - assert(storage.query("doc1")==None) + assert(storage.exists()) # True + assert(storage.query("doc1") == doc1) # content='I am alice.' title='test' + storage.delete("doc1") + assert(storage.query("doc1") is None) # None storage.unique_reset() storage.unique_incr() storage.unique_incr() - assert(storage.unique_get()==2) + assert(storage.unique_get() == 2) # 2 + storage.destroy() \ No newline at end of file diff --git a/tests/vectorstore/test_vector_store.py b/tests/vectorstore/test_vector_store.py index e60f843..8255b62 100644 --- a/tests/vectorstore/test_vector_store.py +++ b/tests/vectorstore/test_vector_store.py @@ -1,6 +1,7 @@ from cardinal.vectorstore import AutoVectorStore, AutoCondition from pydantic import BaseModel from enum import IntEnum +import os import pytest @@ -24,10 +25,18 @@ class Animal(BaseModel): data = [Animal(name=text) for text in texts] -@pytest.mark.skip(reason="no permission") def test_vector_store(): - data = [Animal(name=text) for text in texts] - vecstore = AutoVectorStore[Animal].create(name="test", texts=texts, data=data, drop_old=True) - vecstore.delete(AutoCondition(key="name", value="dog", op=Operator.Eq)) - print(vecstore.search(query="dog", top_k=2)) - \ No newline at end of file + vectorStore = AutoVectorStore[Animal](name="test") + ENV_VECTORSTORE = os.getenv('VECTORSTORE') + assert(not vectorStore.exists()) # False + vectorStore.insert(texts=texts, data=data) + if ENV_VECTORSTORE == 'milvus': + vectorStore._vectorstore.store.flush() + vectorStore.delete(AutoCondition(key="name", value="dog", op=Operator.Eq)) + if ENV_VECTORSTORE == 'milvus': + vectorStore._vectorstore.store.flush() + assert(vectorStore.search(query="dog", top_k=2)[0][0] == data[2]) + assert(vectorStore.search(query="dog", top_k=2)[1][0] == data[1]) + # [(Animal(name='puppy'), 0.8510237336158752), (Animal(name='llama'), 1.1970627307891846)] + assert(vectorStore.exists()) # True + vectorStore.destroy()