Add cashing to column2Vec

AbsaOSS · May 14, 2024 · c9f5b17 · c9f5b17
1 parent b153a1c
commit c9f5b17
Show file tree

Hide file tree

Showing 7 changed files with 165 additions and 151 deletions.
diff --git a/column2Vec/Column2Vec.py b/column2Vec/Column2Vec.py
@@ -43,7 +43,9 @@ def save(self, key: str, function: str, embedding: list):
         :param function: Function name
         :param embedding: to save
         """
-        self.__cache.loc[function, key] = embedding
+        print(f"|{int(function)}| : |{int(key)}|") # todo solve this
+        self.__cache.at[function, key] = embedding
+        # self.__cache.loc[function, key] = embedding
 
     def save_persistently(self):
         """

diff --git a/column2Vec/README.md b/column2Vec/README.md
@@ -1,21 +1,26 @@
 # What is column2Vec
-Is word2Vec type tool for creating embeddings vectors for string columns
+Column2Vec is a word2Vec-type tool for creating embedding vectors for string columns
 in tables.
+We have implemented seven different approaches. 
 
 ## Structure
 
-folder [**generated**](generated) contains all generated files. Mostly html files representing
+The folder [**generated**](generated) contains all generated files,
+mostly HTML files representing
 2D clusters, created by clustering vectors. 
 
 file [**Column2Vec.py**](Column2Vec.py) contains 7 different implementations of column2Vec.
+
+## Implementation description
 - **column2vec_as_sentence** creates one string from column, and then it transforms it to vector
 - **column2vec_as_sentence_clean** creates one string from the column. The string contains only numbers and the letters a-z. Then it transforms the clean string into a vector.
 - **column2vec_as_sentence_clean_uniq** creates one string from the unique values in the column. The string contains only numbers and the letters a-z. Then it transforms the clean string into a vector.
-- **column2vec_avg** transforms every element in column into vector and then it makes average of them.
-- **column2vec_weighted_avg** transforms every element in column into vector and then it makes weighted average of them (based on occurrence).
+- **column2vec_avg** transforms every element in column into vector, and then it makes average of them.
+- **column2vec_weighted_avg** transforms every element in column into vector, and then it makes weighted average of them (based on occurrence).
 - **column2vec_sum** transforms every unique element in the column into a vector and then sums them.
 - **column2vec_weighted_sum** transforms every element in the column into a vector and then sums them.
 
+> Inspired by [Michael J. Mior, Alexander G. Ororbia](https://arxiv.org/pdf/1903.08621)
 ---
 # Data and cluster description
 #### Used tables
@@ -78,12 +83,12 @@ rating .
 duration .
 date_added .
 ```
-## How Did I cluster by copilot 
+## Making clusters by Microsoft Copilot
 - I wrote: `I will send you few rows of diferent tables could you please clustered columns of these tables ?`
 - I wrote: `I will send you all tables in cvs format i will say done when i will be done`
-- Then I send 15 rows of each table to copilot and I worote done. 
+- Then I sent 15 rows of each table to Copilot.
 - I wrote all names of columns in the list above.
-- I wrote `Could you please guess the clusters`, this does not worke and copilot response was `As an AI, I can provide a high-level approach to clustering the data based on the columns you’ve provided. However, I’m unable to perform the actual clustering operation or guess the clusters without running a specific clustering algorithm on the data. Here’s a general approach:`
+- I wrote `Could you please guess the clusters`, this did not work, and Copilot's response was `As an AI, I can provide a high-level approach to clustering the data based on the columns you’ve provided. However, I’m unable to perform the actual clustering operation or guess the clusters without running a specific clustering algorithm on the data. Here’s a general approach:`
 - I wrote `Could you show similar groups of columns` and I got the response below. (Ad Clustering by Microsoft Copilot)
 - I wrote `Could you split it to more groups ?` and I got the response below. (Ad Granular Clustering by Microsoft Copilot)
 ### Clustering by Microsoft Copilot

diff --git a/column2Vec/functions.py b/column2Vec/functions.py
@@ -1,23 +1,29 @@
 """
 Functions usefull for column2Vec.
 """
+import time
 from typing import Any
+from collections.abc import Callable
 
 import numpy as np
 import pandas as pd
+import plotly.express as px
+from sentence_transformers import SentenceTransformer
 from sklearn.cluster import KMeans
+from sklearn.manifold import TSNE
 
+from constants import trained_model
 from similarity.Comparator import cosine_sim
 from similarity.DataFrameMetadataCreator import DataFrameMetadataCreator
 from similarity.Types import NONNUMERICAL
 
 
-def get_data(files: list[str]) -> dict[str, Any]:
+def get_nonnumerical_data(files: list[str]) -> dict[str, Any]:
     """
     Reads all csv files (which name is in files). Creates metadata for them.
-    Save only nonnumerical columns into dictionary. Key is name of column.
+    Save only nonnumerical columns into dictionary. Key is a name of column.
     Value is column.
-    :param files: list names of csv files
+    :param files: List names of csv files
     :return: dictionary of all tables.
     """
     result = {}
@@ -36,13 +42,36 @@ def get_data(files: list[str]) -> dict[str, Any]:
     return result
 
 
+def get_vectors(function: Callable[[pd.Series, SentenceTransformer, str], list],
+                data: dict[str, Any]) -> dict[str, Any]:
+    """
+    Creates an embedding vector for every column in data by calling one of
+     the column2Vec implementations with the model from trained_model.
+    It also prints the progress percentage and the elapsed time in seconds.
+    :param function: One of the column2Vec implementations
+    :param data: Result of get_nonnumerical_data,
+                a dictionary of all columns in all tables.
+    :return: Dictionary of embeddings, each column has its own embedding.
+    """
+    start = time.time()
+    result = {}
+    count = 1
+    for key in data:
+        print("Processing column: " + key + " " + str(round((count / len(data)) * 100, 2)) + "%")
+        result[key] = function(data[key], trained_model.get_module(), key)
+        count += 1
+    end = time.time()
+    print(f"ELAPSED TIME :{end - start}")
+    return result
+
+
 def get_clusters(vectors_to_cluster: pd.DataFrame, n_clusters: int) -> list[list[str]]:
     """
     Creates clusters by KMeans for given vectors.
 
-    :param vectors_to_cluster: embeddings for columns
-    :param n_clusters: number of clusters we want
-    :return: List, for each cluster number it contains list of column names
+    :param vectors_to_cluster: Embeddings for all column
+    :param n_clusters: numbers of clusters we want
+    :return: List, for each cluster number it contains a list of column names
     """
     kmeans = KMeans(n_clusters=n_clusters, random_state=0)  # Change n_clusters as needed
     list_of_vectors = np.array(list(vectors_to_cluster.values()))
@@ -59,11 +88,38 @@ def get_clusters(vectors_to_cluster: pd.DataFrame, n_clusters: int) -> list[list
     return clusters
 
 
+def plot_clusters(vectors_to_plot: pd.DataFrame, title: str):
+    """
+    Clusters the vectors into 12 groups by KMeans and reduces the vectors
+     to 2D by TSNE (t-distributed Stochastic Neighbor Embedding).
+     It shows the scatter plot and saves it as "<title with underscores>.html".
+    :param vectors_to_plot: Mapping of column name to embedding (read via .keys()/.values())
+    :param title: Title of the plot; also used to build the output file name
+    """
+    n_clusters = 12
+    kmeans = KMeans(n_clusters=n_clusters, random_state=0)  # Change n_clusters as needed
+    list_of_vectors = np.array(list(vectors_to_plot.values()))
+    kmeans.fit(list_of_vectors)
+
+    tsne = TSNE(n_components=2, random_state=0)
+    reduced_vectors = tsne.fit_transform(list_of_vectors)
+
+    df = pd.DataFrame(reduced_vectors, columns=['x', 'y'])
+    df['names'] = vectors_to_plot.keys()
+    # The cluster labels are returned in kmeans.labels_
+    df['cluster'] = kmeans.labels_
+
+    fig = px.scatter(df, x='x', y='y', color='cluster', hover_data=['names'])
+    fig.update_layout(title=title)
+    fig.write_html(title.replace(" ", "_") + ".html")
+    fig.show()
+
+
 def compute_distances(vectors: dict):
     """
     Compute distance for each pair of vectors.
 
-    :param vectors: dictionary of embedding vectors
+    :param vectors: Dictionary of embedding vectors
     :return: matrix with distances
     """
     res = {}