Skip to content

Commit

Permalink
Update main.py and fix Comparator
Browse files Browse the repository at this point in the history
  • Loading branch information
Olivie Franklova (CZ) authored and Olivie Franklova (CZ) committed May 10, 2024
1 parent 95541d5 commit 271d8a0
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 8 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,13 @@ stored in DatasetDescription.md in belonging folder ([**data**](data/DatasetDesc
Both folders contain a DataShow.md file with metadata information for each dataset ([**data**](data/DataShow.md), [**data_validation**](data_validation/DatasetDescription.md)).

## How to run
You can compare two or more tables by running `main.py`.
The result is the distance between the tables.
```bash
python main.py # for the fixed files
python main.py data/imdb_top_1000.csv data/netflix_titles.csv # for specific files
```

### generate DataShow

## How to run tests
Expand Down
10 changes: 10 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,15 @@
ColumnNamesEmbeddingsComparator
from similarity.DataFrameMetadataCreator import DataFrameMetadataCreator

if not sys.warnoptions:
import warnings

def supress_warning():
warnings.filterwarnings('ignore',
message='*return a timezone-aware datetime.'
' In a future version, this will raise an exception*')
# todo fix


def create_metadata(data):
return (DataFrameMetadataCreator(data).
Expand All @@ -24,6 +33,7 @@ def compare_datasets(path1, path2):
.add_comparator_type(ColumnNamesEmbeddingsComparator()))
return compartor.compare(metadata1, metadata2)


if __name__ == '__main__':
files = sys.argv[1:]
print(files)
Expand Down
21 changes: 13 additions & 8 deletions similarity/Comparator.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,20 +275,25 @@ def compute_result(self, distance_table, distance_function, settings, weight):
return tmp

def compute_embeddings_distance(self, embeddings1, embeddings2) -> float:
    """
    Compute a symmetric distance between two collections of embeddings.

    Every pair (embed1, embed2) is scored as ``1 - cosine_sim(embed1, embed2)``.
    For each embedding of ``embeddings1`` the smallest distance to any
    embedding of ``embeddings2`` is taken (row minima), and likewise for each
    embedding of ``embeddings2`` (column minima). The result is the larger of
    the two mean minima.

    ``cosine_sim`` and ``mean`` are defined outside this block.

    :param embeddings1: embeddings of the first table
    :param embeddings2: embeddings of the second table; iterated once per
        element of ``embeddings1``, so it must be re-iterable
    :return: max(mean of column minima, mean of row minima)
    """
    # Distance matrix as a list of rows: one row per embedding in embeddings1.
    distances = [
        [1 - cosine_sim(embed1, embed2) for embed2 in embeddings2]
        for embed1 in embeddings1
    ]
    row_mins = [min(row) for row in distances]
    # zip(*distances) walks the matrix column by column, so no DataFrame
    # is needed just to iterate columns.
    column_mins = [min(column) for column in zip(*distances)]
    # TODO: explain in the text why the maximum of the two means is used
    return max([mean(column_mins), mean(row_mins)])

def __are_columns_null(self, column1, column2, message) -> tuple[bool, pd.DataFrame]:
"""
Expand Down

0 comments on commit 271d8a0

Please sign in to comment.