
Commit

Merge branch 'master' into doc/fill-readme-description-and-contributing
Olivie Franklova (CZ) authored and committed Apr 30, 2024
2 parents c82d65e + 40acd54 commit 7fdc58b
Showing 15 changed files with 1,662 additions and 1,275 deletions.
25 changes: 1 addition & 24 deletions .github/workflows/codeql.yml
@@ -82,30 +82,7 @@ jobs:
        uses: github/codeql-action/analyze@v3
        with:
          category: "/language:${{matrix.language}}"
-  python-tests:
-    name: Run Python Tests
-    runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v2
-
-      - name: Set up Python
-        uses: actions/setup-python@v2
-        with:
-          python-version: '3.x'
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install pytest
-          pip install -r requirements.txt
-      - name: Run tests
-        run: |
-          pytest tests/types_test.py
-          pytest tests/metadata_test.py
-          pytest tests/comparator_test.py
-          pytest tests/column2Vec_test.py
31 changes: 31 additions & 0 deletions .github/workflows/column2Vec_test.yml
@@ -0,0 +1,31 @@
name: "Column2Vec tests"

on:
schedule:
- cron: '0 10 * * 6'
workflow_dispatch:


jobs:
python-tests:
name: Run Tests for Column2Vec
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: 'pip'

- name: Install dependencies
run: |
pip install pytest
pip install -r requirements.txt
- name: Run tests
run: |
pytest test/test_column2Vec.py
70 changes: 70 additions & 0 deletions .github/workflows/py_test.yml
@@ -0,0 +1,70 @@
name: "Static analysis & tests"

on:
push:
branches: [ "master" ]
pull_request:

jobs:
analysis:
runs-on: ubuntu-latest
name: Pylint Analysis
steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: 'pip'

- name: Install dependencies
run: |
pip install -r requirements.txt
pip install pylint
- name: Analysing the code with pylint
run: |
pylint \
--fail-under=6.0 \
--ignore-patterns=test_.*?py \
--max-line-length=180 \
$(git ls-files '*.py')
python-tests:
env:
TEST_FILES: test/test_types.py test/test_metadata.py test/test_comparator.py
name: Run Python Tests
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: 'pip'

- name: Install dependencies
run: |
pip install -r requirements.txt
pip install coverage pytest
- name: Run tests
run: coverage run --source='similarity,column2Vec' -m pytest $TEST_FILES

- name: Show coverage
run: coverage report -m

- name: Create coverage file
run: coverage xml

- name: Get Cover
        uses: orgoro/coverage@v3  # the rendered page obscured the pinned version; v3 is an assumption
        with:
          coverageFile: coverage.xml
          token: ${{ secrets.GITHUB_TOKEN }}
          thresholdAll: 0.7
          thresholdNew: 0.9
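
To debug these checks locally, the same pipeline can be reproduced in a few lines of Python. A minimal sketch (not part of the commit), assuming the coverage and pytest packages are installed and the repository root is the working directory:

# Hedged local equivalent of the "Run tests" / "Show coverage" /
# "Create coverage file" steps above, via the coverage and pytest APIs.
import coverage
import pytest

TEST_FILES = [
    "test/test_types.py",
    "test/test_metadata.py",
    "test/test_comparator.py",
]

cov = coverage.Coverage(source=["similarity", "column2Vec"])
cov.start()
exit_code = pytest.main(TEST_FILES)     # same files as the TEST_FILES env var
cov.stop()
cov.save()
cov.report(show_missing=True)           # mirrors `coverage report -m`
cov.xml_report(outfile="coverage.xml")  # mirrors `coverage xml`
print("pytest exit code:", exit_code)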
2 changes: 1 addition & 1 deletion categorical.ipynb
@@ -21,7 +21,7 @@
"source": [
"import pandas as pd\n",
"\n",
"import functions as f\n",
"import similarity.functions as f\n",
"import time\n",
"from comparing import ComparatorForDatasets\n",
"from comparing import CategoricalSimilarity\n",
57 changes: 49 additions & 8 deletions column2Vec/Column2Vec.py
@@ -1,3 +1,6 @@
"""
This file contains column2Vec implementations.
"""
import re

import numpy as np
@@ -9,7 +12,8 @@ def column2vec_as_sentence(column: pd.Series, model: SentenceTransformer):
"""
Convert a column to a vector
Make one string from all the items in the column. Convert string to a vector by sentence transformer.
Make one string from all the items in the column.
Convert string to a vector by sentence transformer.
"""
sentence = [str(column.tolist()).replace("\'", "").replace("]", "").replace("[", "")]
return model.encode(sentence)[0]
@@ -19,48 +23,85 @@ def column2vec_as_sentence_clean(column: pd.Series, model: SentenceTransformer):
"""
Convert a column to a vector
Make one string from all the items in the column, clean the column that it will contain only a-z and 0-9.
Make one string from all the items in the column, clean the column that
it will contain only a-z and 0-9.
Convert string to a vector by sentence transformer.
"""
column_as_str = str(column.tolist()).lower()
sentence = [re.sub("[^(0-9 |a-z)]", " ", column_as_str)]
return model.encode(sentence)[0]


def column2vec_as_sentence_clean_uniq(column: pd.Series, model: SentenceTransformer):
    """
    Convert a column to a vector
-    Make one string from all the items in the column, clean the column that it will contain only a-z and 0-9, it will contains only uniq values.
+    Make one string from all the items in the column,
+    clean the column so that it will contain only a-z and
+    0-9, keeping only unique values.
    Convert string to a vector by sentence transformer.
    """
    uniq_column = column.unique()
    column_as_str = str(uniq_column.tolist()).lower()
    sentence = [re.sub("[^(0-9 |a-z)]", " ", column_as_str)]
    return model.encode(sentence)[0]


def column2vec_avg(column: pd.Series, model: SentenceTransformer):
    """
    Convert a column to a vector
    Convert each item in the column to a vector and return the average of all the vectors
    """
    uniq_column = column.unique()
-    column_clean = pd.Series(uniq_column).apply(lambda x: re.sub("[^(0-9 |a-z)]", " ", str(x).lower())).values
+    column_clean = pd.Series(uniq_column).apply(lambda x: re.sub("[^(0-9 |a-z)]",
+                                                                 " ", str(x).lower())).values
    encoded_columns = model.encode(column_clean)
-    to_ret = np.mean(encoded_columns, axis=0) # counts arithmetic mean (average)
+    to_ret = np.mean(encoded_columns, axis=0)  # counts arithmetic mean (average)
    return to_ret


def column2vec_weighted_avg(column: pd.Series, model: SentenceTransformer):
    """
    TODO: test that this does what it should
    Convert a column to a vector
    Convert each item in the column to a vector and return the weighted average of all the vectors
    """
    uniq_column = column.value_counts(normalize=True)
    weights = uniq_column.values
-    column_clean = pd.Series(uniq_column.keys()).apply(lambda x: re.sub("[^(0-9 |a-z)]", " ", str(x).lower())).values
+    column_clean = pd.Series(uniq_column.keys()).apply(lambda x: re.sub("[^(0-9 |a-z)]",
+                                                                        " ", str(x).lower())).values
    encoded_columns = model.encode(column_clean)
    to_ret = np.average(encoded_columns, axis=0, weights=weights)  # counts weighted average
    return to_ret


def column2vec_sum(column: pd.Series, model: SentenceTransformer):
    """
    Convert a column to a vector
    Convert each item in the column to a vector and return the sum of all the vectors
    """
    uniq_column = column.unique()
    column_clean = pd.Series(uniq_column).apply(lambda x: re.sub("[^(0-9 |a-z)]",
                                                                 " ", str(x).lower())).values
    encoded_columns = model.encode(column_clean)
-    to_ret = np.average(encoded_columns, axis=0, weights=weights)  # counts weighted average
+    to_ret = sum(encoded_columns)  # sum of values
    return to_ret


def column2vec_weighted_sum(column: pd.Series, model: SentenceTransformer):
    """
    Convert a column to a vector
    Convert each item in the column to a vector and return the weighted sum of all the vectors
    """
    uniq_column = column.value_counts(normalize=True)
    weights = uniq_column.values
    column_clean = pd.Series(uniq_column.keys()).apply(lambda x: re.sub("[^(0-9 |a-z)]",
                                                                        " ", str(x).lower())).values
    encoded_columns = model.encode(column_clean)
    to_ret = 0
    for number, weight in zip(encoded_columns, weights):
        to_ret += number * weight
    return to_ret
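
As a usage illustration (not part of the commit), the strategies above can be compared on a single column; the model name below is an assumed choice, not one mandated by the repository:

# Hedged usage sketch for the column2Vec strategies defined above.
import pandas as pd
from sentence_transformers import SentenceTransformer

from column2Vec.Column2Vec import (
    column2vec_as_sentence,
    column2vec_avg,
    column2vec_weighted_avg,
)

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model choice
column = pd.Series(["Prague", "Brno", "Ostrava", "Prague", "Plzen"])

vec_sentence = column2vec_as_sentence(column, model)   # whole column as one string
vec_avg = column2vec_avg(column, model)                # mean of per-item vectors
vec_weighted = column2vec_weighted_avg(column, model)  # frequency-weighted mean

print(vec_sentence.shape, vec_avg.shape, vec_weighted.shape)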
74 changes: 74 additions & 0 deletions column2Vec/functions.py
@@ -0,0 +1,74 @@
"""
Functions usefull for column2Vec.
"""
from typing import Any

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

from similarity.Comparator import cosine_sim
from similarity.DataFrameMetadataCreator import DataFrameMetadataCreator
from similarity.Types import NONNUMERICAL


def get_data(files: list[str]) -> dict[str, Any]:
    """
    Reads all CSV files whose names are given in files and creates metadata for them.
    Saves only non-numerical columns into a dictionary; the key is the column
    name (suffixed with a table index), the value is the column itself.
    :param files: list of CSV file names
    :return: dictionary of all non-numerical columns across the tables
    """
    result = {}
    index = 0
    for i in files:
        index += 1
        data = pd.read_csv(i)
        metadata_creator = (DataFrameMetadataCreator(data).
                            compute_advanced_structural_types().
                            compute_column_kind())
        metadata1 = metadata_creator.get_metadata()
        column_names = metadata1.get_column_names_by_type(NONNUMERICAL)
        for name in column_names:
            print(f" {i} : {name}")
            result[name + str(index)] = data[name]
    return result


def get_clusters(vectors_to_cluster: dict, n_clusters: int) -> list[list[str]]:
    """
    Creates clusters by KMeans for given vectors.
    :param vectors_to_cluster: dictionary of embeddings keyed by column name
    :param n_clusters: number of clusters we want
    :return: list of clusters; each cluster is a list of column names
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    list_of_vectors = np.array(list(vectors_to_cluster.values()))
    kmeans.fit(list_of_vectors)

    clusters = [[] for _ in range(n_clusters)]  # independent lists, not aliases of one list
    for i in range(n_clusters):
        names = []
        for cluster, name in zip(kmeans.labels_, vectors_to_cluster.keys()):
            if cluster == i:
                names.append(name)
        clusters[i] = names

    return clusters


def compute_distances(vectors: dict):
    """
    Compute distance for each pair of vectors.
    :param vectors: dictionary of embedding vectors keyed by column name
    :return: nested dictionary with pairwise distances (1 - cosine similarity)
    """
    res = {}
    for key1, vec1 in vectors.items():
        res[key1] = {}
        for key2, vec2 in vectors.items():
            res[key1][key2] = 1 - cosine_sim(vec1, vec2)
    return res
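
Chained together, these helpers support an end-to-end comparison; a hedged sketch follows (the CSV paths are hypothetical, and the model name is an assumed choice):

# Read non-numerical columns, embed them, cluster the embeddings,
# and compute pairwise cosine distances.
from sentence_transformers import SentenceTransformer

from column2Vec.Column2Vec import column2vec_avg
from column2Vec.functions import compute_distances, get_clusters, get_data

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model choice
columns = get_data(["data/table_a.csv", "data/table_b.csv"])  # hypothetical paths

vectors = {name: column2vec_avg(col, model) for name, col in columns.items()}
clusters = get_clusters(vectors, n_clusters=3)   # column names grouped by KMeans
distances = compute_distances(vectors)           # 1 - cosine similarity per pair

print(clusters)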
14 changes: 14 additions & 0 deletions column2Vec/generated/Clusters_Average_column2vec_clusters.html

Large diffs are not rendered by default.