-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' into doc/fill-readme-description-and-contributing
- Loading branch information
Showing
15 changed files
with
1,662 additions
and
1,275 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
name: "Column2Vec tests" | ||
|
||
on: | ||
schedule: | ||
- cron: '0 10 * * 6' | ||
workflow_dispatch: | ||
|
||
|
||
jobs: | ||
python-tests: | ||
name: Run Tests for Column2Vec | ||
runs-on: ubuntu-latest | ||
steps: | ||
- name: Checkout repository | ||
uses: actions/checkout@v4 | ||
|
||
- name: Set up Python | ||
uses: actions/setup-python@v5 | ||
with: | ||
python-version: '3.11' | ||
cache: 'pip' | ||
|
||
- name: Install dependencies | ||
run: | | ||
pip install pytest | ||
pip install -r requirements.txt | ||
- name: Run tests | ||
run: | | ||
pytest test/test_column2Vec.py | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
name: "Static analysis & tests" | ||
|
||
on: | ||
push: | ||
branches: [ "master" ] | ||
pull_request: | ||
|
||
jobs: | ||
analysis: | ||
runs-on: ubuntu-latest | ||
name: Pylint Analysis | ||
steps: | ||
- name: Checkout repository | ||
uses: actions/checkout@v4 | ||
|
||
- name: Set up Python | ||
uses: actions/setup-python@v5 | ||
with: | ||
python-version: '3.11' | ||
cache: 'pip' | ||
|
||
- name: Install dependencies | ||
run: | | ||
pip install -r requirements.txt | ||
pip install pylint | ||
- name: Analysing the code with pylint | ||
run: | | ||
pylint \ | ||
--fail-under=6.0 \ | ||
--ignore-patterns=test_.*?py \ | ||
--max-line-length=180 \ | ||
$(git ls-files '*.py') | ||
python-tests: | ||
env: | ||
TEST_FILES: test/test_types.py test/test_metadata.py test/test_comparator.py | ||
name: Run Python Tests | ||
runs-on: ubuntu-latest | ||
steps: | ||
- name: Checkout repository | ||
uses: actions/checkout@v4 | ||
|
||
- name: Set up Python | ||
uses: actions/setup-python@v5 | ||
with: | ||
python-version: '3.11' | ||
cache: 'pip' | ||
|
||
- name: Install dependencies | ||
run: | | ||
pip install -r requirements.txt | ||
pip install coverage pytest | ||
- name: Run tests | ||
run: coverage run --source='similarity,column2Vec' -m pytest $TEST_FILES | ||
|
||
- name: Show coverage | ||
run: coverage report -m | ||
|
||
- name: Create coverage file | ||
run: coverage xml | ||
|
||
- name: Get Cover | ||
uses: orgoro/coverage@v3.1 | ||
with: | ||
coverageFile: coverage.xml | ||
token: ${{ secrets.GITHUB_TOKEN }} | ||
thresholdAll: 0.7 | ||
thresholdNew: 0.9 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
""" | ||
Functions useful for column2Vec. | ||
""" | ||
from typing import Any | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from sklearn.cluster import KMeans | ||
|
||
from similarity.Comparator import cosine_sim | ||
from similarity.DataFrameMetadataCreator import DataFrameMetadataCreator | ||
from similarity.Types import NONNUMERICAL | ||
|
||
|
||
def get_data(files: list[str]) -> dict[str, Any]:
    """
    Load every csv file named in ``files`` and collect its nonnumerical columns.

    Metadata is created for each table with DataFrameMetadataCreator and only
    the columns it reports as NONNUMERICAL are kept. The key of each entry is
    the column name followed by the 1-based position of its file in ``files``;
    the value is the column itself.
    :param files: list of names of csv files
    :return: dictionary of the selected columns from all tables
    """
    columns = {}
    for position, file_name in enumerate(files, start=1):
        table = pd.read_csv(file_name)

        # Build the metadata step by step; each call returns the creator to use next.
        creator = DataFrameMetadataCreator(table)
        creator = creator.compute_advanced_structural_types()
        creator = creator.compute_column_kind()
        metadata = creator.get_metadata()

        suffix = str(position)
        for column_name in metadata.get_column_names_by_type(NONNUMERICAL):
            print(f" {file_name} : {column_name}")
            columns[column_name + suffix] = table[column_name]
    return columns
|
||
|
||
def get_clusters(vectors_to_cluster: dict[str, Any], n_clusters: int) -> list[list[str]]:
    """
    Creates clusters by KMeans for given vectors.

    The vectors are passed as a dictionary (the function relies on its
    ``keys()`` and ``values()`` methods), so a pandas DataFrame is not a
    valid argument here.
    :param vectors_to_cluster: dictionary mapping column name to its embedding
    :param n_clusters: number of clusters we want
    :return: List, for each cluster number it contains list of column names
             (in the iteration order of ``vectors_to_cluster``)
    """
    # random_state is fixed so repeated runs produce the same clustering.
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(np.array(list(vectors_to_cluster.values())))

    # One independent list per cluster; ``[[]] * n`` would alias a single list.
    clusters: list[list[str]] = [[] for _ in range(n_clusters)]
    # labels_ is aligned with the fitted vectors, i.e. with the dictionary order,
    # so a single pass is enough to distribute the names.
    for label, name in zip(kmeans.labels_, vectors_to_cluster.keys()):
        clusters[label].append(name)

    return clusters
|
||
|
||
def compute_distances(vectors: dict):
    """
    Compute the distance between every ordered pair of vectors.

    The distance of two vectors is ``1 - cosine_sim(first, second)``. Every
    key is paired with every key, including itself.
    :param vectors: dictionary of embedding vectors
    :return: nested dictionary, ``result[key1][key2]`` is the distance
             between the vectors stored under ``key1`` and ``key2``
    """
    return {
        row_key: {
            col_key: 1 - cosine_sim(row_vec, col_vec)
            for col_key, col_vec in vectors.items()
        }
        for row_key, row_vec in vectors.items()
    }
14 changes: 14 additions & 0 deletions
14
column2Vec/generated/Clusters_Average_column2vec_clusters.html
Large diffs are not rendered by default.
Oops, something went wrong.
14 changes: 14 additions & 0 deletions
14
column2Vec/generated/Clusters_Clean_uniq_sentence_column2vec_vectors.html
Large diffs are not rendered by default.
Oops, something went wrong.
14 changes: 14 additions & 0 deletions
14
column2Vec/generated/Clusters_Cleaned_sentence_column2vec_vectors.html
Large diffs are not rendered by default.
Oops, something went wrong.
Oops, something went wrong.