Add README to similarity table

AbsaOSS · May 17, 2024 · 56e8553 · 56e8553
1 parent c96e609
commit 56e8553
Show file tree

Hide file tree

Showing 12 changed files with 15,865 additions and 813 deletions.
diff --git a/README.md b/README.md
@@ -131,7 +131,7 @@ Explaining kinds:
 - found out which data are duplicated 
 - finding similar or different data
 ## Structure
-- **Source code** is in folder [similarity](similarity).
+- **Source code** is in folder [similarity](similarity). More about the similarity folder structure can be found in its [README.md](similarity/README.md).
 - **Source code for column2Vec** is in folder [column2Vec](column2Vec).
 - **Tests** are in folder [test](test)
 - **Data** are stored in folders [**data**](data) and [**data_validation**](data_validation).

diff --git a/similarity/Comparator.py b/similarity/Comparator.py
@@ -168,7 +168,6 @@ def compare(self, metadata1: DataFrameMetadata, metadata2: DataFrameMetadata, di
 class ColumnEmbeddingComparator(ComparatorType):
     def compare(self, metadata1: DataFrameMetadata, metadata2: DataFrameMetadata, distance_function: DistanceFunction,
                 settings: set[Settings]) -> pd.DataFrame:
-        ## todo originally it was used threshold here
         result = pd.DataFrame()
         name_distance = pd.DataFrame()
         for id1, (column1, embedding1) in enumerate(metadata1.column_embeddings.items()):

diff --git a/similarity/DataFrameMetadata.py b/similarity/DataFrameMetadata.py
@@ -131,6 +131,12 @@ def get_column_kind(self, name):
             if name in columns:
                 return column_kind
 
+    def get_column_names_by_kind(self, *kinds):
+        """Return the column names stored under each of the given kinds, in argument order."""
+        return [
+            name for kind in kinds for name in self.column_kind[kind]
+        ]
+
     def get_column_names_by_type(self, *types):
         if NONNUMERICAL in types:
             types = list(types)

diff --git a/similarity/DataFrameMetadataCreator.py b/similarity/DataFrameMetadataCreator.py
@@ -223,12 +223,13 @@ def create_column_embeddings(self, types=None) -> 'DataFrameMetadataCreator':
         :return: self DataFrameMetadataCreator
         """
         if types is None:
-            types = [NONNUMERICAL, UNDEFINED, WORD, ALL, MULTIPLE_VALUES, PHRASE, ARTICLE, ALPHANUMERIC, ALPHABETIC ] ## todo
+            types = [NONNUMERICAL, UNDEFINED, WORD, ALL, MULTIPLE_VALUES, PHRASE, ARTICLE, ALPHANUMERIC, ALPHABETIC] ## todo
         sentences = []
         names = []
         for i in types:
             for column in self.metadata.type_column[i]:
-                self.metadata.column_embeddings[column] = column2vec_as_sentence(self.dataframe[column], self.__get_model()) ## todo is it the same ?
+                self.metadata.column_embeddings[column] = column2vec_as_sentence(self.dataframe[column],
+                                                                                 self.__get_model(), column)
         #         sentences.append(str(self.dataframe[column].tolist())
         #                          .replace("\'", "")
         #                          .replace("]", "")
@@ -247,6 +248,13 @@ def get_column_by_type(self, *types):
         """
         return self.dataframe[self.metadata.get_column_names_by_type(types)]
 
+    def get_column_by_kind(self, *kinds):
+        """
+        :param kinds: kinds of columns to select (looked up in the metadata kind index)
+        :return: dataframe with only the columns of the given kinds
+        """
+        return self.dataframe[self.metadata.get_column_names_by_kind(*kinds)]
+
     def get_numerical_columns(self):
         """
         :return: dataframe with only numerical columns

diff --git a/similarity/README.md b/similarity/README.md
@@ -2,23 +2,59 @@
 The file Types.py includes a Type class and functions
 that can be used to determine the type of each column in a dataset.
 
-### File test.ipynb
-The file test.ipynb contains usage example of MetadataCreator class and 
-Comparator class. It also shows heatmaps and accuracy scores.
+
 
 ### File DataFrameMetadata.py
 Contains Metadata Class and CategoricalMetadata class
 
 ### File DataFrameMetadataCreator.py
 Contains MetadataCreator to create Metadata
 
-### File Comparator.py
-File contains Comparator class ComparatorType class and DistanceFunction
+# Structure
+## folder [comparing_all_tables](comparing_all_tables) 
+This folder contains two files categorical.ipynb and comparing.py.
+File comparing.py contains the first version of Comparator.
+Comparator compares all the tables together,
+so it is constructed by nested loops.
+It is very complicated and not well designed, so
+we do not recommend using it. 
+
+File categorical.ipynb shows usage of comparing.py
+
+## file Comparator.py
+File contains the Comparator class, the ComparatorType classes and DistanceFunction.
+Comparator is part of the pipeline that is shown below.
+![img_2.png](../images/pipeline1.png)
+You can see the implementation of Comparator in the picture below.
+The user sets several specific comparators for the Comparator
+(for example, bool, string and category).
+Each comparator will be executed.
+A specific comparator (for example, the bool one) creates a number of distance tables for all bool columns. 
+Then these tables are merged by computing the average of each cell.
+That creates one distance table for bool. This table is passed to the
+distance function together with a weight and a table ratio, and the function computes a single number.
+All these numbers are passed to the Euclidean distance, which gives the overall distance.
+![img.png](images/img.png)
+
+This comparator is used in main.py, test.ipynb and test_comparator.py
 
+### File test.ipynb
+The file test.ipynb contains usage example of MetadataCreator class and 
+Comparator class. It also shows heatmaps and accuracy scores.
 
 
+## file ComparatorByColumn.py
+![img_3.png](../images/pipeline2.png)
 
 
 ### todo 
-
+- categorical.ipynb,
+- Datasets_Desription.ipynb,
+- functions.ipynb,
+- playground.ipynb
+
+---
+- functions.py
+- comparing.py
+- ComparatorByColumn.py