Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add column2Vec test workflow #34

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 51 additions & 5 deletions .github/workflows/py_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,10 @@ jobs:
run: |
black --check $(git ls-files '*.py')

python-tests:
similarity-tests:
env:
TEST_FILES: test/test_types.py test/test_metadata.py test/test_comparator.py test/test_column2VecCache.py
name: Run Python Tests
TEST_FILES: test/test_types.py test/test_metadata.py test/test_comparator.py
name: Run Similarity Tests
runs-on: ubuntu-latest
steps:
- name: Checkout repository
Expand All @@ -69,7 +69,7 @@ jobs:
pip install coverage pytest

- name: Run tests
run: coverage run --source='similarity,column2Vec' -m pytest $TEST_FILES
run: coverage run --source='similarity' -m pytest $TEST_FILES

- name: Show coverage
run: coverage report -m --omit=".*.ipynb"
Expand All @@ -90,6 +90,52 @@ jobs:
- uses: actions/upload-artifact@v4
if: github.event_name == 'pull_request'
with:
name: coverage
name: Similarity coverage
path: coverage.xml
retention-days: 1

# Runs the column2Vec test files in parallel (pytest-xdist) and, on pull
# requests, publishes a coverage report for the column2Vec package.
column2vec-tests:
  env:
    TEST_FILES: test/test_column2VecCache.py test/test_column2Vec.py
  name: Run Column2Vec Tests
  runs-on: ubuntu-latest
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.11'
        cache: 'pip'

    - name: Install dependencies
      run: |
        pip install -r requirements.txt
        pip install coverage pytest pytest-xdist pytest-cov

    - name: Run tests
      # With `-n auto` the tests execute in xdist worker processes. A plain
      # `coverage run -m pytest` only measures the controller process, so the
      # workers' coverage would be lost. pytest-cov collects data from every
      # worker and combines it into .coverage, which the following
      # `coverage report` / `coverage xml` steps read.
      # `--cov-report=` (empty) suppresses pytest-cov's own terminal report,
      # because the next step prints it.
      run: pytest --cov=column2Vec --cov-report= -n auto $TEST_FILES

    - name: Show coverage
      run: coverage report -m --omit=".*.ipynb"

    - name: Create coverage file
      if: github.event_name == 'pull_request'
      run: coverage xml

    - name: Get Cover
      if: github.event_name == 'pull_request'
      # NOTE(review): the version tag in the ref below appears mangled in this
      # view (looks like e-mail obfuscation) - confirm the pinned tag.
      uses: orgoro/[email protected]
      with:
        coverageFile: coverage.xml
        token: ${{ secrets.GITHUB_TOKEN }}
        thresholdAll: 0.7
        thresholdNew: 0.9

    # Keep the XML report as a short-lived artifact for pull requests only.
    - uses: actions/upload-artifact@v4
      if: github.event_name == 'pull_request'
      with:
        name: Column2Vec coverage
        path: coverage.xml
        retention-days: 1
7 changes: 6 additions & 1 deletion constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,12 @@ class TrainedModel:
"""

configure()
__model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
__model = SentenceTransformer(
"paraphrase-multilingual-mpnet-base-v2",
tokenizer_kwargs={
"clean_up_tokenization_spaces": True,
},
)

def set_module(self, model: SentenceTransformer):
"""
Expand Down
20 changes: 10 additions & 10 deletions test/test_column2Vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# MODEL = 'all-mpnet-base-v2' # bert-base-nli-mean-tokens
MODEL = 'bert-base-nli-mean-tokens' #
THIS_DIR = os.path.dirname(os.path.abspath(__file__))

TRANSFORMER = SentenceTransformer(MODEL)

def vectors_are_same(vec1, vec2):
for i, j in zip(vec1, vec2):
Expand All @@ -33,7 +33,7 @@ def get_vectors(function, data):
count = 1
for key in data:
# print("Processing column: " + key + " " + str(round((count / len(data)) * 100, 2)) + "%")
result[key] = function(data[key], SentenceTransformer(MODEL), key)
result[key] = function(data[key], TRANSFORMER, key)
count += 1
end = time.time()
print(f"ELAPSED TIME :{end - start}")
Expand All @@ -60,7 +60,7 @@ def get_data(files):
class TestSimilarityOfVectors(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.model = SentenceTransformer(MODEL)
cls.model = TRANSFORMER
file_m2 = os.path.join(THIS_DIR, os.pardir, 'data/netflix_titles.csv')
# make an array of all the files
files = [file_m2]
Expand All @@ -76,7 +76,7 @@ def setUpClass(cls):
stop += 1

def test_column2vec_as_sentence(self):
model = SentenceTransformer(MODEL)
model = TRANSFORMER
self.assertTrue(
vectors_are_same(column2vec_as_sentence(self.first, model, "a"),
column2vec_as_sentence(self.first, self.model, "b")))
Expand All @@ -87,7 +87,7 @@ def test_column2vec_as_sentence(self):
column2vec_as_sentence(self.third, self.model, "f")))

def test_column2vec_as_sentence_clean(self):
model = SentenceTransformer(MODEL)
model = TRANSFORMER
self.assertTrue(vectors_are_same(column2vec_as_sentence_clean(self.first, model, "g"),
column2vec_as_sentence_clean(self.first, self.model, "h")))
self.assertTrue(vectors_are_same(column2vec_as_sentence_clean(self.second, model, "i"),
Expand All @@ -96,7 +96,7 @@ def test_column2vec_as_sentence_clean(self):
column2vec_as_sentence_clean(self.third, self.model, "l")))

def test_column2vec_as_sentence_clean_uniq(self):
model = SentenceTransformer(MODEL)
model = TRANSFORMER
self.assertTrue(vectors_are_same(column2vec_as_sentence_clean_uniq(self.first, model, "m"),
column2vec_as_sentence_clean_uniq(self.first, self.model, "n")))
self.assertTrue(vectors_are_same(column2vec_as_sentence_clean_uniq(self.second, model, "o"),
Expand All @@ -105,14 +105,14 @@ def test_column2vec_as_sentence_clean_uniq(self):
column2vec_as_sentence_clean_uniq(self.third, self.model, "r")))

def test_column2vec_avg(self):
model = SentenceTransformer(MODEL)
model = TRANSFORMER
self.assertTrue(vectors_are_same(column2vec_avg(self.first, model, "v"),
column2vec_avg(self.first, self.model, "s")))
# self.assertTrue(vectors_are_same(column2vec_avg(self.second, model), column2vec_avg(self.second, self.model)))
# self.assertTrue(vectors_are_same(column2vec_avg(self.third, model), column2vec_avg(self.third, self.model)))

def test_column2vec_weighted_avg(self):
model = SentenceTransformer(MODEL)
model = TRANSFORMER
self.assertTrue(vectors_are_same(column2vec_weighted_avg(self.first, model, "u"),
column2vec_weighted_avg(self.first, self.model, "w")))
# self.assertTrue(vectors_are_same(column2vec_weighted_avg(self.second, model),
Expand All @@ -121,12 +121,12 @@ def test_column2vec_weighted_avg(self):
# column2vec_weighted_avg(self.third, self.model)))

def test_column2vec_sum(self):
model = SentenceTransformer(MODEL)
model = TRANSFORMER
self.assertTrue(vectors_are_same(column2vec_sum(self.first, model, "x"),
column2vec_sum(self.first, self.model, "y")))

def test_column2vec_weighted_sum(self):
model = SentenceTransformer(MODEL)
model = TRANSFORMER
self.assertTrue(vectors_are_same(column2vec_weighted_sum(self.first, model, "z"),
column2vec_weighted_sum(self.first, self.model, "ab")))

Expand Down
Loading