Update main.py and fix Comparator

AbsaOSS · May 10, 2024 · 271d8a0 · 271d8a0
1 parent 95541d5
commit 271d8a0
Show file tree

Hide file tree

Showing 3 changed files with 30 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -150,6 +150,13 @@ stored in DatasetDescription.md in belonging folder ([**data**](data/DatasetDesc
 Both folders contain a DataShow.md file with metadata information for each dataset ([**data**](data/DataShow.md), [**data_validation**](data_validation/DatasetDescription.md)).
 
 ## How to run
+You can compare two or more tables by running main.py.
+The result is the distance between the tables.
+```bash
+ python main.py # for the fixed (default) set of files
+ python main.py data/imdb_top_1000.csv data/netflix_titles.csv # for specific files
+```
+
 ### generate DataShow
 
 ## How to run tests

diff --git a/main.py b/main.py
@@ -6,6 +6,15 @@
     ColumnNamesEmbeddingsComparator
 from similarity.DataFrameMetadataCreator import DataFrameMetadataCreator
 
+if not sys.warnoptions:
+    import warnings
+
+def supress_warning():
+    warnings.filterwarnings('ignore',
+                            message='*return a timezone-aware datetime.'
+                                    '  In a future version, this will raise an exception*')
+    # todo fix
+
 
 def create_metadata(data):
     return (DataFrameMetadataCreator(data).
@@ -24,6 +33,7 @@ def compare_datasets(path1, path2):
                  .add_comparator_type(ColumnNamesEmbeddingsComparator()))
     return compartor.compare(metadata1, metadata2)
 
+
 if __name__ == '__main__':
     files = sys.argv[1:]
     print(files)

diff --git a/similarity/Comparator.py b/similarity/Comparator.py
@@ -275,20 +275,25 @@ def compute_result(self, distance_table, distance_function, settings, weight):
         return tmp
 
     def compute_embeddings_distance(self, embeddings1, embeddings2) -> float:
-        res = pd.DataFrame()
+        res = []
         row_mins = []
         id1 = 0
-        id2 = 0
         for embed1 in embeddings1:
+            # for embed2 in embeddings2:
+            #     res.loc[id1, id2] = 1 - cosine_sim(embed1, embed2)
+            #     id2 += 1
+            results = []
             for embed2 in embeddings2:
-                res.loc[id1, id2] = 1 - cosine_sim(embed1, embed2)
-                id1 += 1
-                id2 += 1
-            row_mins.append(min(res[id1]))
+                result = 1 - cosine_sim(embed1, embed2)
+                results.append(result)
+            res.append(results)  #
+            row_mins.append(min(results))
+            id1 += 1
         column_mins = []
-        for _, column in res.items():
+        for_iter = pd.DataFrame(data=res)
+        for _, column in for_iter.items():
             column_mins.append(min(column))
-        return max([mean(row_mins), mean(column_mins)])  # todo vysvetlit v textu
+        return max([mean(column_mins), mean(row_mins)])  # todo vysvetlit v textu
 
     def __are_columns_null(self, column1, column2, message) -> tuple[bool, pd.DataFrame]:
         """