
Commit

Merge branch 'master' into doc/fill-readme-description-and-contributing
Olivie Franklova (CZ) authored and committed Apr 30, 2024
2 parents c82d65e + 40acd54 commit 7fdc58b
Showing 15 changed files with 1,662 additions and 1,275 deletions.
25 changes: 1 addition & 24 deletions .github/workflows/codeql.yml
@@ -82,30 +82,7 @@ jobs:
        uses: github/codeql-action/analyze@v3
        with:
          category: "/language:${{matrix.language}}"
-  python-tests:
-    name: Run Python Tests
-    runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v2
-
-      - name: Set up Python
-        uses: actions/setup-python@v2
-        with:
-          python-version: '3.x'
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install pytest
-          pip install -r requirements.txt
-      - name: Run tests
-        run: |
-          pytest tests/types_test.py
-          pytest tests/metadata_test.py
-          pytest tests/comparator_test.py
-          pytest tests/column2Vec_test.py
31 changes: 31 additions & 0 deletions .github/workflows/column2Vec_test.yml
@@ -0,0 +1,31 @@
name: "Column2Vec tests"

on:
schedule:
- cron: '0 10 * * 6'
workflow_dispatch:


jobs:
python-tests:
name: Run Tests for Column2Vec
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: 'pip'

- name: Install dependencies
run: |
pip install pytest
pip install -r requirements.txt
- name: Run tests
run: |
pytest test/test_column2Vec.py
70 changes: 70 additions & 0 deletions .github/workflows/py_test.yml
@@ -0,0 +1,70 @@
name: "Static analysis & tests"

on:
push:
branches: [ "master" ]
pull_request:

jobs:
analysis:
runs-on: ubuntu-latest
name: Pylint Analysis
steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: 'pip'

- name: Install dependencies
run: |
pip install -r requirements.txt
pip install pylint
- name: Analysing the code with pylint
run: |
pylint \
--fail-under=6.0 \
--ignore-patterns=test_.*?py \
--max-line-length=180 \
$(git ls-files '*.py')
python-tests:
env:
TEST_FILES: test/test_types.py test/test_metadata.py test/test_comparator.py
name: Run Python Tests
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: 'pip'

- name: Install dependencies
run: |
pip install -r requirements.txt
pip install coverage pytest
- name: Run tests
run: coverage run --source='similarity,column2Vec' -m pytest $TEST_FILES

- name: Show coverage
run: coverage report -m

- name: Create coverage file
run: coverage xml

- name: Get Cover
        uses: orgoro/coverage@v3  # the rendered page obscured the pinned version; v3 is an assumption
        with:
          coverageFile: coverage.xml
          token: ${{ secrets.GITHUB_TOKEN }}
          thresholdAll: 0.7
          thresholdNew: 0.9
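
To debug these checks locally, the same pipeline can be reproduced in a few lines of Python. A minimal sketch (not part of the commit), assuming the coverage and pytest packages are installed and the repository root is the working directory:

# Hedged local equivalent of the "Run tests" / "Show coverage" /
# "Create coverage file" steps above, via the coverage and pytest APIs.
import coverage
import pytest

TEST_FILES = [
    "test/test_types.py",
    "test/test_metadata.py",
    "test/test_comparator.py",
]

cov = coverage.Coverage(source=["similarity", "column2Vec"])
cov.start()
exit_code = pytest.main(TEST_FILES)     # same files as the TEST_FILES env var
cov.stop()
cov.save()
cov.report(show_missing=True)           # mirrors `coverage report -m`
cov.xml_report(outfile="coverage.xml")  # mirrors `coverage xml`
print("pytest exit code:", exit_code)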
2 changes: 1 addition & 1 deletion categorical.ipynb
@@ -21,7 +21,7 @@
"source": [
"import pandas as pd\n",
"\n",
"import functions as f\n",
"import similarity.functions as f\n",
"import time\n",
"from comparing import ComparatorForDatasets\n",
"from comparing import CategoricalSimilarity\n",
57 changes: 49 additions & 8 deletions column2Vec/Column2Vec.py
@@ -1,3 +1,6 @@
"""
This file contains column2Vec implementations.
"""
import re

import numpy as np
@@ -9,7 +12,8 @@ def column2vec_as_sentence(column: pd.Series, model: SentenceTransformer):
"""
Convert a column to a vector
Make one string from all the items in the column. Convert string to a vector by sentence transformer.
Make one string from all the items in the column.
Convert string to a vector by sentence transformer.
"""
sentence = [str(column.tolist()).replace("\'", "").replace("]", "").replace("[", "")]
return model.encode(sentence)[0]
@@ -19,48 +23,85 @@ def column2vec_as_sentence_clean(column: pd.Series, model: SentenceTransformer):
"""
Convert a column to a vector
Make one string from all the items in the column, clean the column that it will contain only a-z and 0-9.
Make one string from all the items in the column, clean the column that
it will contain only a-z and 0-9.
Convert string to a vector by sentence transformer.
"""
column_as_str = str(column.tolist()).lower()
sentence = [re.sub("[^(0-9 |a-z)]", " ", column_as_str)]
return model.encode(sentence)[0]


def column2vec_as_sentence_clean_uniq(column: pd.Series, model: SentenceTransformer):
    """
    Convert a column to a vector
-    Make one string from all the items in the column, clean the column that it will contain only a-z and 0-9, it will contains only uniq values.
+    Make one string from all the items in the column,
+    clean the column so that it will contain only a-z and
+    0-9, keeping only unique values.
    Convert string to a vector by sentence transformer.
    """
    uniq_column = column.unique()
    column_as_str = str(uniq_column.tolist()).lower()
    sentence = [re.sub("[^(0-9 |a-z)]", " ", column_as_str)]
    return model.encode(sentence)[0]


def column2vec_avg(column: pd.Series, model: SentenceTransformer):
    """
    Convert a column to a vector
    Convert each item in the column to a vector and return the average of all the vectors
    """
    uniq_column = column.unique()
-    column_clean = pd.Series(uniq_column).apply(lambda x: re.sub("[^(0-9 |a-z)]", " ", str(x).lower())).values
+    column_clean = pd.Series(uniq_column).apply(lambda x: re.sub("[^(0-9 |a-z)]",
+                                                                 " ", str(x).lower())).values
    encoded_columns = model.encode(column_clean)
-    to_ret = np.mean(encoded_columns, axis=0) # counts arithmetic mean (average)
+    to_ret = np.mean(encoded_columns, axis=0)  # counts arithmetic mean (average)
    return to_ret


def column2vec_weighted_avg(column: pd.Series, model: SentenceTransformer):
    """
    TODO: test that this does what it should
    Convert a column to a vector
    Convert each item in the column to a vector and return the weighted average of all the vectors
    """
    uniq_column = column.value_counts(normalize=True)
    weights = uniq_column.values
-    column_clean = pd.Series(uniq_column.keys()).apply(lambda x: re.sub("[^(0-9 |a-z)]", " ", str(x).lower())).values
+    column_clean = pd.Series(uniq_column.keys()).apply(lambda x: re.sub("[^(0-9 |a-z)]",
+                                                                        " ", str(x).lower())).values
    encoded_columns = model.encode(column_clean)
    to_ret = np.average(encoded_columns, axis=0, weights=weights)  # counts weighted average
    return to_ret


def column2vec_sum(column: pd.Series, model: SentenceTransformer):
    """
    Convert a column to a vector
    Convert each item in the column to a vector and return the sum of all the vectors
    """
    uniq_column = column.unique()
    column_clean = pd.Series(uniq_column).apply(lambda x: re.sub("[^(0-9 |a-z)]",
                                                                 " ", str(x).lower())).values
    encoded_columns = model.encode(column_clean)
-    to_ret = np.average(encoded_columns, axis=0, weights=weights)  # counts weighted average
+    to_ret = sum(encoded_columns)  # sum of values
    return to_ret


def column2vec_weighted_sum(column: pd.Series, model: SentenceTransformer):
    """
    Convert a column to a vector
    Convert each item in the column to a vector and return the weighted sum of all the vectors
    """
    uniq_column = column.value_counts(normalize=True)
    weights = uniq_column.values
    column_clean = pd.Series(uniq_column.keys()).apply(lambda x: re.sub("[^(0-9 |a-z)]",
                                                                        " ", str(x).lower())).values
    encoded_columns = model.encode(column_clean)
    to_ret = 0
    for number, weight in zip(encoded_columns, weights):
        to_ret += number * weight
    return to_ret
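
As a usage illustration (not part of the commit), the strategies above can be compared on a single column; the model name below is an assumed choice, not one mandated by the repository:

# Hedged usage sketch for the column2Vec strategies defined above.
import pandas as pd
from sentence_transformers import SentenceTransformer

from column2Vec.Column2Vec import (
    column2vec_as_sentence,
    column2vec_avg,
    column2vec_weighted_avg,
)

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model choice
column = pd.Series(["Prague", "Brno", "Ostrava", "Prague", "Plzen"])

vec_sentence = column2vec_as_sentence(column, model)   # whole column as one string
vec_avg = column2vec_avg(column, model)                # mean of per-item vectors
vec_weighted = column2vec_weighted_avg(column, model)  # frequency-weighted mean

print(vec_sentence.shape, vec_avg.shape, vec_weighted.shape)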
74 changes: 74 additions & 0 deletions column2Vec/functions.py
@@ -0,0 +1,74 @@
"""
Functions usefull for column2Vec.
"""
from typing import Any

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

from similarity.Comparator import cosine_sim
from similarity.DataFrameMetadataCreator import DataFrameMetadataCreator
from similarity.Types import NONNUMERICAL


def get_data(files: list[str]) -> dict[str, Any]:
    """
    Reads all CSV files whose names are given in files and creates metadata for them.
    Saves only non-numerical columns into a dictionary; the key is the column
    name (suffixed with a table index), the value is the column itself.
    :param files: list of CSV file names
    :return: dictionary of all non-numerical columns across the tables
    """
    result = {}
    index = 0
    for i in files:
        index += 1
        data = pd.read_csv(i)
        metadata_creator = (DataFrameMetadataCreator(data).
                            compute_advanced_structural_types().
                            compute_column_kind())
        metadata1 = metadata_creator.get_metadata()
        column_names = metadata1.get_column_names_by_type(NONNUMERICAL)
        for name in column_names:
            print(f" {i} : {name}")
            result[name + str(index)] = data[name]
    return result


def get_clusters(vectors_to_cluster: dict, n_clusters: int) -> list[list[str]]:
    """
    Creates clusters by KMeans for given vectors.
    :param vectors_to_cluster: dictionary of embeddings keyed by column name
    :param n_clusters: number of clusters we want
    :return: list of clusters; each cluster is a list of column names
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    list_of_vectors = np.array(list(vectors_to_cluster.values()))
    kmeans.fit(list_of_vectors)

    clusters = [[] for _ in range(n_clusters)]  # independent lists, not aliases of one list
    for i in range(n_clusters):
        names = []
        for cluster, name in zip(kmeans.labels_, vectors_to_cluster.keys()):
            if cluster == i:
                names.append(name)
        clusters[i] = names

    return clusters


def compute_distances(vectors: dict):
    """
    Compute distance for each pair of vectors.
    :param vectors: dictionary of embedding vectors keyed by column name
    :return: nested dictionary with pairwise distances (1 - cosine similarity)
    """
    res = {}
    for key1, vec1 in vectors.items():
        res[key1] = {}
        for key2, vec2 in vectors.items():
            res[key1][key2] = 1 - cosine_sim(vec1, vec2)
    return res
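
Chained together, these helpers support an end-to-end comparison; a hedged sketch follows (the CSV paths are hypothetical, and the model name is an assumed choice):

# Read non-numerical columns, embed them, cluster the embeddings,
# and compute pairwise cosine distances.
from sentence_transformers import SentenceTransformer

from column2Vec.Column2Vec import column2vec_avg
from column2Vec.functions import compute_distances, get_clusters, get_data

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model choice
columns = get_data(["data/table_a.csv", "data/table_b.csv"])  # hypothetical paths

vectors = {name: column2vec_avg(col, model) for name, col in columns.items()}
clusters = get_clusters(vectors, n_clusters=3)   # column names grouped by KMeans
distances = compute_distances(vectors)           # 1 - cosine similarity per pair

print(clusters)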
14 changes: 14 additions & 0 deletions column2Vec/generated/Clusters_Average_column2vec_clusters.html

Large diffs are not rendered by default.