From 7d410a5b5bdecc60592209f64bfaf6444f5e6c95 Mon Sep 17 00:00:00 2001 From: OlivieFranklova Date: Wed, 18 Sep 2024 16:43:58 +0200 Subject: [PATCH 1/2] Add column2Vec test workflow --- .github/workflows/py_test.yml | 56 +++++++++++++++++++++++++++++++---- constants.py | 5 +++- 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/.github/workflows/py_test.yml b/.github/workflows/py_test.yml index 15187d9..e75310c 100644 --- a/.github/workflows/py_test.yml +++ b/.github/workflows/py_test.yml @@ -48,10 +48,10 @@ jobs: run: | black --check $(git ls-files '*.py') - python-tests: + similarity-tests: env: - TEST_FILES: test/test_types.py test/test_metadata.py test/test_comparator.py test/test_column2VecCache.py - name: Run Python Tests + TEST_FILES: test/test_types.py test/test_metadata.py test/test_comparator.py + name: Run Similarity Tests runs-on: ubuntu-latest steps: - name: Checkout repository @@ -69,7 +69,7 @@ jobs: pip install coverage pytest - name: Run tests - run: coverage run --source='similarity,column2Vec' -m pytest $TEST_FILES + run: coverage run --source='similarity' -m pytest $TEST_FILES - name: Show coverage run: coverage report -m --omit=".*.ipynb" @@ -90,6 +90,52 @@ jobs: - uses: actions/upload-artifact@v4 if: github.event_name == 'pull_request' with: - name: coverage + name: Similarity coverage + path: coverage.xml + retention-days: 1 + + column2vec-tests: + env: + TEST_FILES: test/test_column2VecCache.py test/test_column2Vec.py + name: Run Column2Vec Tests + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install dependencies + run: | + pip install -r requirements.txt + pip install coverage pytest pytest-xdist + + - name: Run tests + run: coverage run --source='column2Vec' -m pytest -n auto $TEST_FILES + + - name: Show coverage + run: coverage report -m --omit=".*.ipynb" + + - name: Create coverage file + if: github.event_name == 'pull_request' + run: coverage xml + + - name: Get Cover + if: github.event_name == 'pull_request' + uses: orgoro/coverage@v3.1 + with: + coverageFile: coverage.xml + token: ${{ secrets.GITHUB_TOKEN }} + thresholdAll: 0.7 + thresholdNew: 0.9 + + - uses: actions/upload-artifact@v4 + if: github.event_name == 'pull_request' + with: + name: Column2Vec coverage path: coverage.xml retention-days: 1 diff --git a/constants.py b/constants.py index 1530cff..0727942 100644 --- a/constants.py +++ b/constants.py @@ -51,7 +51,10 @@ class TrainedModel: """ configure() - __model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2") + __model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2", + tokenizer_kwargs={ + "clean_up_tokenization_spaces": True, + }) def set_module(self, model: SentenceTransformer): """ From ac79a77103aa4e4ccc19f355089d815eedbdf653 Mon Sep 17 00:00:00 2001 From: OlivieFranklova Date: Wed, 18 Sep 2024 21:55:35 +0200 Subject: [PATCH 2/2] Format files with black --- constants.py | 10 ++++++---- test/test_column2Vec.py | 20 ++++++++++---------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/constants.py b/constants.py index 0727942..10dc328 100644 --- a/constants.py +++ b/constants.py @@ -51,10 +51,12 @@ class TrainedModel: """ configure() - __model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2", - tokenizer_kwargs={ - "clean_up_tokenization_spaces": True, - }) + __model = SentenceTransformer( + "paraphrase-multilingual-mpnet-base-v2", + tokenizer_kwargs={ + "clean_up_tokenization_spaces": True, + }, + ) def set_module(self, model: SentenceTransformer): """ diff --git a/test/test_column2Vec.py b/test/test_column2Vec.py index 7e09015..76c37d9 100644 --- a/test/test_column2Vec.py +++ b/test/test_column2Vec.py @@ -18,7 +18,7 @@ # MODEL = 'all-mpnet-base-v2' # bert-base-nli-mean-tokens MODEL = 'bert-base-nli-mean-tokens' # THIS_DIR = os.path.dirname(os.path.abspath(__file__)) - +TRANSFORMER = SentenceTransformer(MODEL) def vectors_are_same(vec1, vec2): for i, j in zip(vec1, vec2): @@ -33,7 +33,7 @@ def get_vectors(function, data): count = 1 for key in data: # print("Processing column: " + key + " " + str(round((count / len(data)) * 100, 2)) + "%") - result[key] = function(data[key], SentenceTransformer(MODEL), key) + result[key] = function(data[key], TRANSFORMER, key) count += 1 end = time.time() print(f"ELAPSED TIME :{end - start}") @@ -60,7 +60,7 @@ def get_data(files): class TestSimilarityOfVectors(unittest.TestCase): @classmethod def setUpClass(cls): - cls.model = SentenceTransformer(MODEL) + cls.model = TRANSFORMER file_m2 = os.path.join(THIS_DIR, os.pardir, 'data/netflix_titles.csv') # make an array of all the files files = [file_m2] @@ -76,7 +76,7 @@ def setUpClass(cls): stop += 1 def test_column2vec_as_sentence(self): - model = SentenceTransformer(MODEL) + model = TRANSFORMER self.assertTrue( vectors_are_same(column2vec_as_sentence(self.first, model, "a"), column2vec_as_sentence(self.first, self.model, "b"))) @@ -87,7 +87,7 @@ def test_column2vec_as_sentence(self): column2vec_as_sentence(self.third, self.model, "f"))) def test_column2vec_as_sentence_clean(self): - model = SentenceTransformer(MODEL) + model = TRANSFORMER self.assertTrue(vectors_are_same(column2vec_as_sentence_clean(self.first, model, "g"), column2vec_as_sentence_clean(self.first, self.model, "h"))) self.assertTrue(vectors_are_same(column2vec_as_sentence_clean(self.second, model, "i"), @@ -96,7 +96,7 @@ def test_column2vec_as_sentence_clean(self): column2vec_as_sentence_clean(self.third, self.model, "l"))) def test_column2vec_as_sentence_clean_uniq(self): - model = SentenceTransformer(MODEL) + model = TRANSFORMER self.assertTrue(vectors_are_same(column2vec_as_sentence_clean_uniq(self.first, model, "m"), column2vec_as_sentence_clean_uniq(self.first, self.model, "n"))) self.assertTrue(vectors_are_same(column2vec_as_sentence_clean_uniq(self.second, model, "o"), @@ -105,14 +105,14 @@ def test_column2vec_as_sentence_clean_uniq(self): column2vec_as_sentence_clean_uniq(self.third, self.model, "r"))) def test_column2vec_avg(self): - model = SentenceTransformer(MODEL) + model = TRANSFORMER self.assertTrue(vectors_are_same(column2vec_avg(self.first, model, "v"), column2vec_avg(self.first, self.model, "s"))) # self.assertTrue(vectors_are_same(column2vec_avg(self.second, model), column2vec_avg(self.second, self.model))) # self.assertTrue(vectors_are_same(column2vec_avg(self.third, model), column2vec_avg(self.third, self.model))) def test_column2vec_weighted_avg(self): - model = SentenceTransformer(MODEL) + model = TRANSFORMER self.assertTrue(vectors_are_same(column2vec_weighted_avg(self.first, model, "u"), column2vec_weighted_avg(self.first, self.model, "w"))) # self.assertTrue(vectors_are_same(column2vec_weighted_avg(self.second, model), @@ -121,12 +121,12 @@ def test_column2vec_weighted_avg(self): # column2vec_weighted_avg(self.third, self.model))) def test_column2vec_sum(self): - model = SentenceTransformer(MODEL) + model = TRANSFORMER self.assertTrue(vectors_are_same(column2vec_sum(self.first, model, "x"), column2vec_sum(self.first, self.model, "y"))) def test_column2vec_weighted_sum(self): - model = SentenceTransformer(MODEL) + model = TRANSFORMER self.assertTrue(vectors_are_same(column2vec_weighted_sum(self.first, model, "z"), column2vec_weighted_sum(self.first, self.model, "ab")))