diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 82b5950..bd2b1d1 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -82,30 +82,7 @@ jobs:
uses: github/codeql-action/analyze@v3
with:
category: "/language:${{matrix.language}}"
- python-tests:
- name: Run Python Tests
- runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
- steps:
- - name: Checkout repository
- uses: actions/checkout@v2
-
- - name: Set up Python
- uses: actions/setup-python@v2
- with:
- python-version: '3.x'
-
- - name: Install dependencies
- run: |
- python -m pip install --upgrade pip
- pip install pytest
- pip install -r requirements.txt
-
- - name: Run tests
- run: |
- pytest tests/types_test.py
- pytest tests/metadata_test.py
- pytest tests/comparator_test.py
- pytest tests/column2Vec_test.py
+
diff --git a/.github/workflows/column2Vec_test.yml b/.github/workflows/column2Vec_test.yml
new file mode 100644
index 0000000..7aede4b
--- /dev/null
+++ b/.github/workflows/column2Vec_test.yml
@@ -0,0 +1,31 @@
+name: "Column2Vec tests"
+
+on:
+ schedule:
+ - cron: '0 10 * * 6'
+ workflow_dispatch:
+
+
+jobs:
+ python-tests:
+ name: Run Tests for Column2Vec
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.11'
+ cache: 'pip'
+
+ - name: Install dependencies
+ run: |
+ pip install pytest
+ pip install -r requirements.txt
+
+ - name: Run tests
+ run: |
+ pytest test/test_column2Vec.py
+
diff --git a/.github/workflows/py_test.yml b/.github/workflows/py_test.yml
new file mode 100644
index 0000000..e8da178
--- /dev/null
+++ b/.github/workflows/py_test.yml
@@ -0,0 +1,70 @@
+name: "Static analysis & tests"
+
+on:
+ push:
+ branches: [ "master" ]
+ pull_request:
+
+jobs:
+ analysis:
+ runs-on: ubuntu-latest
+ name: Pylint Analysis
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.11'
+ cache: 'pip'
+
+ - name: Install dependencies
+ run: |
+ pip install -r requirements.txt
+ pip install pylint
+
+ - name: Analysing the code with pylint
+ run: |
+ pylint \
+ --fail-under=6.0 \
+ --ignore-patterns=test_.*?py \
+ --max-line-length=180 \
+ $(git ls-files '*.py')
+
+ python-tests:
+ env:
+ TEST_FILES: test/test_types.py test/test_metadata.py test/test_comparator.py
+ name: Run Python Tests
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.11'
+ cache: 'pip'
+
+ - name: Install dependencies
+ run: |
+ pip install -r requirements.txt
+ pip install coverage pytest
+
+ - name: Run tests
+ run: coverage run --source='similarity,column2Vec' -m pytest $TEST_FILES
+
+ - name: Show coverage
+ run: coverage report -m
+
+ - name: Create coverage file
+ run: coverage xml
+
+ - name: Get Cover
+ uses: orgoro/coverage@v3.1
+ with:
+ coverageFile: coverage.xml
+ token: ${{ secrets.GITHUB_TOKEN }}
+ thresholdAll: 0.7
+ thresholdNew: 0.9
diff --git a/categorical.ipynb b/categorical.ipynb
index db0bd7c..bf79b59 100644
--- a/categorical.ipynb
+++ b/categorical.ipynb
@@ -21,7 +21,7 @@
"source": [
"import pandas as pd\n",
"\n",
- "import functions as f\n",
+ "import similarity.functions as f\n",
"import time\n",
"from comparing import ComparatorForDatasets\n",
"from comparing import CategoricalSimilarity\n",
diff --git a/column2Vec/Column2Vec.py b/column2Vec/Column2Vec.py
index 7dd305f..9709c67 100644
--- a/column2Vec/Column2Vec.py
+++ b/column2Vec/Column2Vec.py
@@ -1,3 +1,6 @@
+"""
+This file contains column2Vec implementations.
+"""
import re
import numpy as np
@@ -9,7 +12,8 @@ def column2vec_as_sentence(column: pd.Series, model: SentenceTransformer):
"""
Convert a column to a vector
- Make one string from all the items in the column. Convert string to a vector by sentence transformer.
+ Make one string from all the items in the column.
+ Convert string to a vector by sentence transformer.
"""
sentence = [str(column.tolist()).replace("\'", "").replace("]", "").replace("[", "")]
return model.encode(sentence)[0]
@@ -19,18 +23,22 @@ def column2vec_as_sentence_clean(column: pd.Series, model: SentenceTransformer):
"""
Convert a column to a vector
- Make one string from all the items in the column, clean the column that it will contain only a-z and 0-9.
+ Make one string from all the items in the column, clean the column that
+ it will contain only a-z and 0-9.
Convert string to a vector by sentence transformer.
"""
column_as_str = str(column.tolist()).lower()
sentence = [re.sub("[^(0-9 |a-z)]", " ", column_as_str)]
return model.encode(sentence)[0]
+
def column2vec_as_sentence_clean_uniq(column: pd.Series, model: SentenceTransformer):
"""
Convert a column to a vector
- Make one string from all the items in the column, clean the column that it will contain only a-z and 0-9, it will contains only uniq values.
+ Make one string from all the items in the column,
+ clean the column that it will contain only a-z and
+ 0-9, it will contain only unique values.
Convert string to a vector by sentence transformer.
"""
uniq_column = column.unique()
@@ -38,6 +46,7 @@ def column2vec_as_sentence_clean_uniq(column: pd.Series, model: SentenceTransfor
sentence = [re.sub("[^(0-9 |a-z)]", " ", column_as_str)]
return model.encode(sentence)[0]
+
def column2vec_avg(column: pd.Series, model: SentenceTransformer):
"""
Convert a column to a vector
@@ -45,22 +54,54 @@ def column2vec_avg(column: pd.Series, model: SentenceTransformer):
Convert each item in the column to a vector and return the average of all the vectors
"""
uniq_column = column.unique()
- column_clean = pd.Series(uniq_column).apply(lambda x: re.sub("[^(0-9 |a-z)]", " ", str(x).lower())).values
+ column_clean = pd.Series(uniq_column).apply(lambda x: re.sub("[^(0-9 |a-z)]",
+ " ", str(x).lower())).values
encoded_columns = model.encode(column_clean)
- to_ret = np.mean(encoded_columns, axis=0) # counts arithmetic mean (average)
+ to_ret = np.mean(encoded_columns, axis=0) # counts arithmetic mean (average)
return to_ret
+
def column2vec_weighted_avg(column: pd.Series, model: SentenceTransformer):
"""
- todo tests it does what it should
Convert a column to a vector
Convert each item in the column to a vector and return the weighted average of all the vectors
"""
uniq_column = column.value_counts(normalize=True)
weights = uniq_column.values
- column_clean = pd.Series(uniq_column.keys()).apply(lambda x: re.sub("[^(0-9 |a-z)]", " ", str(x).lower())).values
+ column_clean = pd.Series(uniq_column.keys()).apply(lambda x: re.sub("[^(0-9 |a-z)]",
+ " ", str(x).lower())).values
+ encoded_columns = model.encode(column_clean)
+ to_ret = np.average(encoded_columns, axis=0, weights=weights) # counts weighted average
+ return to_ret
+
+
+def column2vec_sum(column: pd.Series, model: SentenceTransformer):
+ """
+ Convert a column to a vector
+
+ Convert each item in the column to a vector and return the sum of all the vectors
+ """
+ uniq_column = column.unique()
+ column_clean = pd.Series(uniq_column).apply(lambda x: re.sub("[^(0-9 |a-z)]",
+ " ", str(x).lower())).values
encoded_columns = model.encode(column_clean)
- to_ret = np.average(encoded_columns, axis=0, weights=weights) # counts weighted average
+ to_ret = sum(encoded_columns) # sum of values
return to_ret
+
+def column2vec_weighted_sum(column: pd.Series, model: SentenceTransformer):
+ """
+ Convert a column to a vector
+
+ Convert each item in the column to a vector and return the sum of the vectors weighted by relative frequency
+ """
+ uniq_column = column.value_counts(normalize=True)
+ weights = uniq_column.values
+ column_clean = pd.Series(uniq_column.keys()).apply(lambda x: re.sub("[^(0-9 |a-z)]",
+ " ", str(x).lower())).values
+ encoded_columns = model.encode(column_clean)
+ to_ret = 0
+ for number, weight in zip(encoded_columns, weights):
+ to_ret += number * weight
+ return to_ret
diff --git a/column2Vec/functions.py b/column2Vec/functions.py
index e69de29..7ef71dc 100644
--- a/column2Vec/functions.py
+++ b/column2Vec/functions.py
@@ -0,0 +1,74 @@
+"""
+Functions useful for column2Vec.
+"""
+from typing import Any
+
+import numpy as np
+import pandas as pd
+from sklearn.cluster import KMeans
+
+from similarity.Comparator import cosine_sim
+from similarity.DataFrameMetadataCreator import DataFrameMetadataCreator
+from similarity.Types import NONNUMERICAL
+
+
+def get_data(files: list[str]) -> dict[str, Any]:
+ """
+ Reads all csv files (whose names are in files). Creates metadata for them.
+ Saves only nonnumerical columns into a dictionary. Key is the column name
+ followed by the 1-based position of its file in files. Value is the column.
+ :param files: list of names of csv files
+ :return: dictionary of the nonnumerical columns of all tables.
+ """
+ result = {}
+ index = 0
+ for i in files:
+ index += 1
+ data = pd.read_csv(i)
+ metadata_creator = (DataFrameMetadataCreator(data).
+ compute_advanced_structural_types().
+ compute_column_kind())
+ metadata1 = metadata_creator.get_metadata()
+ column_names = metadata1.get_column_names_by_type(NONNUMERICAL)
+ for name in column_names:
+ print(f" {i} : {name}")
+ result[name + str(index)] = data[name]
+ return result
+
+
+def get_clusters(vectors_to_cluster: pd.DataFrame, n_clusters: int) -> list[list[str]]:
+ """
+ Creates clusters by KMeans for given vectors.
+
+ :param vectors_to_cluster: dictionary mapping column names to their embeddings
+ :param n_clusters: number of clusters we want
+ :return: List, for each cluster number it contains list of column names
+ """
+ kmeans = KMeans(n_clusters=n_clusters, random_state=0) # Change n_clusters as needed
+ list_of_vectors = np.array(list(vectors_to_cluster.values()))
+ kmeans.fit(list_of_vectors)
+
+ clusters = [[]] * n_clusters
+ for i in range(n_clusters):
+ names = []
+ for cluster, name in zip(kmeans.labels_, vectors_to_cluster.keys()):
+ if cluster == i:
+ names.append(name)
+ clusters[i] = names
+
+ return clusters
+
+
+def compute_distances(vectors: dict):
+ """
+ Compute distance (1 - cosine_sim) for each pair of vectors.
+
+ :param vectors: dictionary of embedding vectors
+ :return: nested dictionary of distances, indexed as res[key1][key2]
+ """
+ res = {}
+ for key1, vec1 in vectors.items():
+ res[key1] = {}
+ for key2, vec2 in vectors.items():
+ res[key1][key2] = 1 - cosine_sim(vec1, vec2)
+ return res
diff --git a/column2Vec/generated/Clusters_Average_column2vec_clusters.html b/column2Vec/generated/Clusters_Average_column2vec_clusters.html
new file mode 100644
index 0000000..bdbd7e6
--- /dev/null
+++ b/column2Vec/generated/Clusters_Average_column2vec_clusters.html
@@ -0,0 +1,14 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/column2Vec/generated/Clusters_Clean_uniq_sentence_column2vec_vectors.html b/column2Vec/generated/Clusters_Clean_uniq_sentence_column2vec_vectors.html
new file mode 100644
index 0000000..060a8b0
--- /dev/null
+++ b/column2Vec/generated/Clusters_Clean_uniq_sentence_column2vec_vectors.html
@@ -0,0 +1,14 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/column2Vec/generated/Clusters_Cleaned_sentence_column2vec_vectors.html b/column2Vec/generated/Clusters_Cleaned_sentence_column2vec_vectors.html
new file mode 100644
index 0000000..0dacf4f
--- /dev/null
+++ b/column2Vec/generated/Clusters_Cleaned_sentence_column2vec_vectors.html
@@ -0,0 +1,14 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/column2Vec/playground.ipynb b/column2Vec/playground.ipynb
index d345195..f78ee09 100644
--- a/column2Vec/playground.ipynb
+++ b/column2Vec/playground.ipynb
@@ -7,33 +7,34 @@
"import pandas as pd\n",
"from sentence_transformers import SentenceTransformer\n",
"from column2Vec.Column2Vec import column2vec_avg\n",
- "from similarity.Types import Type, NONNUMERICAL\n",
+ "from similarity.Types import NONNUMERICAL\n",
"from similarity.DataFrameMetadataCreator import DataFrameMetadataCreator\n",
"from column2Vec.Column2Vec import column2vec_as_sentence\n",
"from column2Vec.Column2Vec import column2vec_as_sentence_clean\n",
"from column2Vec.Column2Vec import column2vec_as_sentence_clean_uniq\n",
"from column2Vec.Column2Vec import column2vec_weighted_avg\n",
- "import time"
+ "import time\n",
+ "from column2Vec.functions import get_clusters"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
- "end_time": "2024-03-19T12:24:21.922107Z",
- "start_time": "2024-03-19T12:24:17.545278Z"
+ "end_time": "2024-03-26T09:57:59.420458Z",
+ "start_time": "2024-03-26T09:57:59.414725Z"
}
},
"id": "d2f663cd8db4d03b",
- "execution_count": 1
+ "execution_count": 11
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 12,
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
- "end_time": "2024-03-19T12:24:21.927591Z",
- "start_time": "2024-03-19T12:24:21.923224Z"
+ "end_time": "2024-03-26T09:57:59.444903Z",
+ "start_time": "2024-03-26T09:57:59.439511Z"
}
},
"outputs": [],
@@ -96,12 +97,12 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
- "end_time": "2024-03-19T15:06:00.114656Z",
- "start_time": "2024-03-19T15:05:59.282313Z"
+ "end_time": "2024-03-26T09:58:00.132126Z",
+ "start_time": "2024-03-26T09:57:59.501655Z"
}
},
"id": "74ad1f08faa50a70",
- "execution_count": 60
+ "execution_count": 13
},
{
"cell_type": "code",
@@ -134,12 +135,12 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
- "end_time": "2024-03-19T12:24:23.072402Z",
- "start_time": "2024-03-19T12:24:22.934435Z"
+ "end_time": "2024-03-26T09:58:00.137581Z",
+ "start_time": "2024-03-26T09:58:00.133237Z"
}
},
"id": "19c03920fae6aab8",
- "execution_count": 4
+ "execution_count": 14
},
{
"cell_type": "code",
@@ -148,13 +149,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
- " ../data/aircraft-data_nov_dec.csv : reg_city\n",
" ../data/aircraft-data_nov_dec.csv : reg_state\n",
- " ../data/aircraft-data_nov_dec.csv : flight\n",
+ " ../data/aircraft-data_nov_dec.csv : reg_city\n",
" ../data/aircraft-data_nov_dec.csv : tail_number\n",
+ " ../data/aircraft-data_nov_dec.csv : flight\n",
" ../data/aircraft-data_nov_dec.csv : reg_expiration\n",
- " ../data/aircraft-data_nov_dec.csv : reg_owner\n",
" ../data/aircraft-data_nov_dec.csv : manufacturer\n",
+ " ../data/aircraft-data_nov_dec.csv : reg_owner\n",
" ../data/aircraft-data_nov_dec.csv : model\n"
]
},
@@ -162,21 +163,19 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "C:\\Users\\ab032mj\\Desktop\\thesis\\simillarity\\venv\\Lib\\site-packages\\dateutil\\parser\\_parser.py:1207: UnknownTimezoneWarning:\n",
- "\n",
- "tzname AG identified but not understood. Pass `tzinfos` argument in order to correctly return a timezone-aware datetime. In a future version, this will raise an exception.\n",
- "\n"
+ "C:\\Users\\ab032mj\\Desktop\\thesis\\simillarity\\venv\\Lib\\site-packages\\dateutil\\parser\\_parser.py:1207: UnknownTimezoneWarning: tzname AG identified but not understood. Pass `tzinfos` argument in order to correctly return a timezone-aware datetime. In a future version, this will raise an exception.\n",
+ " warnings.warn(\"tzname {tzname} identified but not understood. \"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
+ " ../data/Airplane_Cleaned.csv : Multi Engine\n",
" ../data/Airplane_Cleaned.csv : TP mods\n",
" ../data/Airplane_Cleaned.csv : Engine Type\n",
- " ../data/Airplane_Cleaned.csv : Multi Engine\n",
- " ../data/Airplane_Cleaned.csv : Company\n",
" ../data/Airplane_Cleaned.csv : Model\n",
+ " ../data/Airplane_Cleaned.csv : Company\n",
" ../data/autoscout24-germany-dataset.csv : make\n",
" ../data/autoscout24-germany-dataset.csv : gear\n",
" ../data/autoscout24-germany-dataset.csv : model\n",
@@ -184,26 +183,26 @@
" ../data/autoscout24-germany-dataset.csv : offerType\n",
" ../data/CARS_1.csv : fuel_type\n",
" ../data/CARS_1.csv : transmission_type\n",
- " ../data/CARS_1.csv : car_name\n",
" ../data/CARS_1.csv : body_type\n",
+ " ../data/CARS_1.csv : car_name\n",
+ " ../data/USA_cars_datasets.csv : country\n",
" ../data/USA_cars_datasets.csv : model\n",
- " ../data/USA_cars_datasets.csv : brand\n",
" ../data/USA_cars_datasets.csv : vin\n",
- " ../data/USA_cars_datasets.csv : country\n",
- " ../data/USA_cars_datasets.csv : state\n",
- " ../data/USA_cars_datasets.csv : title_status\n",
+ " ../data/USA_cars_datasets.csv : brand\n",
" ../data/USA_cars_datasets.csv : condition\n",
+ " ../data/USA_cars_datasets.csv : title_status\n",
+ " ../data/USA_cars_datasets.csv : state\n",
" ../data/USA_cars_datasets.csv : color\n",
" ../data/imdb_top_1000.csv : Certificate\n",
- " ../data/imdb_top_1000.csv : Gross\n",
" ../data/imdb_top_1000.csv : Poster_Link\n",
+ " ../data/imdb_top_1000.csv : Gross\n",
+ " ../data/imdb_top_1000.csv : Director\n",
+ " ../data/imdb_top_1000.csv : Star3\n",
" ../data/imdb_top_1000.csv : Star2\n",
+ " ../data/imdb_top_1000.csv : Star1\n",
" ../data/imdb_top_1000.csv : Overview\n",
- " ../data/imdb_top_1000.csv : Director\n",
" ../data/imdb_top_1000.csv : Star4\n",
" ../data/imdb_top_1000.csv : Series_Title\n",
- " ../data/imdb_top_1000.csv : Star1\n",
- " ../data/imdb_top_1000.csv : Star3\n",
" ../data/imdb_top_1000.csv : Genre\n"
]
},
@@ -211,10 +210,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "C:\\Users\\ab032mj\\Desktop\\thesis\\simillarity\\venv\\Lib\\site-packages\\dateutil\\parser\\_parser.py:1207: UnknownTimezoneWarning:\n",
- "\n",
- "tzname ELSIE identified but not understood. Pass `tzinfos` argument in order to correctly return a timezone-aware datetime. In a future version, this will raise an exception.\n",
- "\n"
+ "C:\\Users\\ab032mj\\Desktop\\thesis\\simillarity\\venv\\Lib\\site-packages\\dateutil\\parser\\_parser.py:1207: UnknownTimezoneWarning: tzname ELSIE identified but not understood. Pass `tzinfos` argument in order to correctly return a timezone-aware datetime. In a future version, this will raise an exception.\n",
+ " warnings.warn(\"tzname {tzname} identified but not understood. \"\n"
]
},
{
@@ -223,14 +220,14 @@
"text": [
" ../data/netflix_titles.csv : show_id\n",
" ../data/netflix_titles.csv : cast\n",
- " ../data/netflix_titles.csv : title\n",
" ../data/netflix_titles.csv : description\n",
+ " ../data/netflix_titles.csv : title\n",
" ../data/netflix_titles.csv : director\n",
- " ../data/netflix_titles.csv : listed_in\n",
- " ../data/netflix_titles.csv : duration\n",
" ../data/netflix_titles.csv : type\n",
- " ../data/netflix_titles.csv : rating\n",
" ../data/netflix_titles.csv : country\n",
+ " ../data/netflix_titles.csv : listed_in\n",
+ " ../data/netflix_titles.csv : rating\n",
+ " ../data/netflix_titles.csv : duration\n",
" ../data/netflix_titles.csv : date_added\n"
]
}
@@ -241,12 +238,12 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
- "end_time": "2024-03-19T15:06:36.629839Z",
- "start_time": "2024-03-19T15:06:04.509081Z"
+ "end_time": "2024-03-26T09:58:32.212781Z",
+ "start_time": "2024-03-26T09:58:00.137581Z"
}
},
"id": "cfe57003e670ba15",
- "execution_count": 61
+ "execution_count": 15
},
{
"cell_type": "code",
@@ -256,8 +253,8 @@
"\n",
"\n",
"def plot_clusters(vectors_to_plot: pd.DataFrame, title: str):\n",
- "\n",
- " kmeans = KMeans(n_clusters=12, random_state=0) # Change n_clusters as needed\n",
+ " n_clusters = 12\n",
+ " kmeans = KMeans(n_clusters=n_clusters, random_state=0) # Change n_clusters as needed\n",
" list_of_vectors = np.array(list(vectors_to_plot.values()))\n",
" kmeans.fit(list_of_vectors)\n",
"\n",
@@ -271,18 +268,18 @@
"\n",
" fig = px.scatter(df, x='x', y='y', color='cluster', hover_data=['names'])\n",
" fig.update_layout(title=title)\n",
- " # fig.write_html(title.replace(\" \", \"_\") + \".html\")\n",
+ " fig.write_html(title.replace(\" \", \"_\") + \".html\")\n",
" fig.show()\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
- "end_time": "2024-03-19T15:06:39.490133Z",
- "start_time": "2024-03-19T15:06:39.485011Z"
+ "end_time": "2024-03-26T09:58:32.218154Z",
+ "start_time": "2024-03-26T09:58:32.213907Z"
}
},
"id": "19c881d9f450b556",
- "execution_count": 62
+ "execution_count": 16
},
{
"cell_type": "code",
@@ -291,19 +288,19 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Processing column: reg_city1 1.92%\n",
- "Processing column: reg_state1 3.85%\n",
- "Processing column: flight1 5.77%\n",
- "Processing column: tail_number1 7.69%\n",
+ "Processing column: reg_state1 1.92%\n",
+ "Processing column: reg_city1 3.85%\n",
+ "Processing column: tail_number1 5.77%\n",
+ "Processing column: flight1 7.69%\n",
"Processing column: reg_expiration1 9.62%\n",
- "Processing column: reg_owner1 11.54%\n",
- "Processing column: manufacturer1 13.46%\n",
+ "Processing column: manufacturer1 11.54%\n",
+ "Processing column: reg_owner1 13.46%\n",
"Processing column: model1 15.38%\n",
- "Processing column: TP mods2 17.31%\n",
- "Processing column: Engine Type2 19.23%\n",
- "Processing column: Multi Engine2 21.15%\n",
- "Processing column: Company2 23.08%\n",
- "Processing column: Model2 25.0%\n",
+ "Processing column: Multi Engine2 17.31%\n",
+ "Processing column: TP mods2 19.23%\n",
+ "Processing column: Engine Type2 21.15%\n",
+ "Processing column: Model2 23.08%\n",
+ "Processing column: Company2 25.0%\n",
"Processing column: make3 26.92%\n",
"Processing column: gear3 28.85%\n",
"Processing column: model3 30.77%\n",
@@ -311,39 +308,39 @@
"Processing column: offerType3 34.62%\n",
"Processing column: fuel_type4 36.54%\n",
"Processing column: transmission_type4 38.46%\n",
- "Processing column: car_name4 40.38%\n",
- "Processing column: body_type4 42.31%\n",
- "Processing column: model5 44.23%\n",
- "Processing column: brand5 46.15%\n",
+ "Processing column: body_type4 40.38%\n",
+ "Processing column: car_name4 42.31%\n",
+ "Processing column: country5 44.23%\n",
+ "Processing column: model5 46.15%\n",
"Processing column: vin5 48.08%\n",
- "Processing column: country5 50.0%\n",
- "Processing column: state5 51.92%\n",
+ "Processing column: brand5 50.0%\n",
+ "Processing column: condition5 51.92%\n",
"Processing column: title_status5 53.85%\n",
- "Processing column: condition5 55.77%\n",
+ "Processing column: state5 55.77%\n",
"Processing column: color5 57.69%\n",
"Processing column: Certificate6 59.62%\n",
- "Processing column: Gross6 61.54%\n",
- "Processing column: Poster_Link6 63.46%\n",
- "Processing column: Star26 65.38%\n",
- "Processing column: Overview6 67.31%\n",
- "Processing column: Director6 69.23%\n",
- "Processing column: Star46 71.15%\n",
- "Processing column: Series_Title6 73.08%\n",
- "Processing column: Star16 75.0%\n",
- "Processing column: Star36 76.92%\n",
+ "Processing column: Poster_Link6 61.54%\n",
+ "Processing column: Gross6 63.46%\n",
+ "Processing column: Director6 65.38%\n",
+ "Processing column: Star36 67.31%\n",
+ "Processing column: Star26 69.23%\n",
+ "Processing column: Star16 71.15%\n",
+ "Processing column: Overview6 73.08%\n",
+ "Processing column: Star46 75.0%\n",
+ "Processing column: Series_Title6 76.92%\n",
"Processing column: Genre6 78.85%\n",
"Processing column: show_id7 80.77%\n",
"Processing column: cast7 82.69%\n",
- "Processing column: title7 84.62%\n",
- "Processing column: description7 86.54%\n",
+ "Processing column: description7 84.62%\n",
+ "Processing column: title7 86.54%\n",
"Processing column: director7 88.46%\n",
- "Processing column: listed_in7 90.38%\n",
- "Processing column: duration7 92.31%\n",
- "Processing column: type7 94.23%\n",
+ "Processing column: type7 90.38%\n",
+ "Processing column: country7 92.31%\n",
+ "Processing column: listed_in7 94.23%\n",
"Processing column: rating7 96.15%\n",
- "Processing column: country7 98.08%\n",
+ "Processing column: duration7 98.08%\n",
"Processing column: date_added7 100.0%\n",
- "ELAPSED TIME :529.7594430446625\n"
+ "ELAPSED TIME :549.7098529338837\n"
]
}
],
@@ -353,12 +350,12 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
- "end_time": "2024-03-19T15:15:34.774275Z",
- "start_time": "2024-03-19T15:06:45.010192Z"
+ "end_time": "2024-03-26T10:07:41.940696Z",
+ "start_time": "2024-03-26T09:58:32.218307Z"
}
},
"id": "d18443a1c921f509",
- "execution_count": 63
+ "execution_count": 17
},
{
"cell_type": "code",
@@ -367,41 +364,49 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "C:\\Users\\ab032mj\\Desktop\\thesis\\simillarity\\venv\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1416: FutureWarning:\n",
- "\n",
- "The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n",
- "\n"
+ "C:\\Users\\ab032mj\\Desktop\\thesis\\simillarity\\venv\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n",
+ " super()._check_params_vs_input(X, default_n_init=10)\n"
]
},
+ {
+ "data": {
+ "text/html": " \n "
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
{
"data": {
"application/vnd.plotly.v1+json": {
"data": [
{
"customdata": [
+ [
+ "reg_state1"
+ ],
[
"reg_city1"
],
[
- "reg_state1"
+ "tail_number1"
],
[
"flight1"
],
[
- "tail_number1"
+ "reg_expiration1"
],
[
- "reg_expiration1"
+ "manufacturer1"
],
[
"reg_owner1"
],
[
- "manufacturer1"
+ "model1"
],
[
- "model1"
+ "Multi Engine2"
],
[
"TP mods2"
@@ -410,14 +415,11 @@
"Engine Type2"
],
[
- "Multi Engine2"
+ "Model2"
],
[
"Company2"
],
- [
- "Model2"
- ],
[
"make3"
],
@@ -440,31 +442,31 @@
"transmission_type4"
],
[
- "car_name4"
+ "body_type4"
],
[
- "body_type4"
+ "car_name4"
],
[
- "model5"
+ "country5"
],
[
- "brand5"
+ "model5"
],
[
"vin5"
],
[
- "country5"
+ "brand5"
],
[
- "state5"
+ "condition5"
],
[
"title_status5"
],
[
- "condition5"
+ "state5"
],
[
"color5"
@@ -473,31 +475,31 @@
"Certificate6"
],
[
- "Gross6"
+ "Poster_Link6"
],
[
- "Poster_Link6"
+ "Gross6"
],
[
- "Star26"
+ "Director6"
],
[
- "Overview6"
+ "Star36"
],
[
- "Director6"
+ "Star26"
],
[
- "Star46"
+ "Star16"
],
[
- "Series_Title6"
+ "Overview6"
],
[
- "Star16"
+ "Star46"
],
[
- "Star36"
+ "Series_Title6"
],
[
"Genre6"
@@ -509,28 +511,28 @@
"cast7"
],
[
- "title7"
+ "description7"
],
[
- "description7"
+ "title7"
],
[
"director7"
],
[
- "listed_in7"
+ "type7"
],
[
- "duration7"
+ "country7"
],
[
- "type7"
+ "listed_in7"
],
[
"rating7"
],
[
- "country7"
+ "duration7"
],
[
"date_added7"
@@ -540,58 +542,58 @@
"legendgroup": "",
"marker": {
"color": [
- 1,
- 1,
- 4,
- 4,
- 4,
+ 10,
+ 3,
+ 7,
+ 7,
+ 7,
+ 11,
+ 11,
+ 7,
+ 0,
+ 0,
+ 11,
+ 11,
+ 11,
+ 2,
+ 0,
+ 3,
8,
+ 0,
8,
+ 0,
+ 2,
+ 2,
+ 10,
+ 0,
+ 7,
+ 2,
4,
3,
- 8,
+ 10,
+ 3,
+ 0,
+ 9,
+ 7,
+ 3,
3,
- 8,
- 8,
- 6,
3,
- 1,
- 11,
- 1,
- 11,
3,
- 6,
- 6,
1,
+ 3,
+ 3,
6,
- 4,
- 1,
- 1,
- 9,
- 0,
- 1,
- 1,
- 4,
7,
- 10,
- 2,
- 10,
- 10,
+ 3,
1,
- 10,
- 10,
+ 3,
+ 3,
5,
- 4,
10,
- 1,
- 2,
- 10,
- 5,
+ 6,
0,
- 5,
- 1,
- 1,
- 0
+ 7,
+ 4
],
"coloraxis": "coloraxis",
"symbol": "circle"
@@ -601,113 +603,113 @@
"orientation": "v",
"showlegend": false,
"x": [
- 5.481040000915527,
- 4.461870193481445,
- 4.956617832183838,
- 4.9444499015808105,
- 5.674870491027832,
- 4.4419264793396,
- 4.319740295410156,
- 5.136926174163818,
- 7.437771797180176,
- 4.348060131072998,
- 7.0663018226623535,
- 4.1616387367248535,
- 4.288301467895508,
- 5.361490249633789,
- 6.856499195098877,
- 5.667142868041992,
- 6.075915336608887,
- 6.264303684234619,
- 6.080878257751465,
- 6.997607231140137,
- 4.9815263748168945,
- 5.517688274383545,
- 5.867679119110107,
- 5.311161518096924,
- 4.352320194244385,
- 3.6641652584075928,
- 4.4079413414001465,
- 6.797448635101318,
- 6.221177101135254,
- 6.101926803588867,
- 6.199456691741943,
- 5.258615493774414,
- 3.1090869903564453,
- 5.236301422119141,
- 2.041367292404175,
- 5.144412517547607,
- 5.259392738342285,
- 5.225100040435791,
- 5.1772847175598145,
- 5.25313663482666,
- 2.5723178386688232,
- 4.8922271728515625,
- 3.6730144023895264,
- 5.107095718383789,
- 2.0164794921875,
- 5.152388095855713,
- 2.4399759769439697,
- 5.877944469451904,
- 3.193415880203247,
- 5.954677104949951,
- 3.6659605503082275,
- 6.515915393829346
+ 7.634982109069824,
+ 8.18161392211914,
+ 9.760482788085938,
+ 9.65578556060791,
+ 10.46049690246582,
+ 7.527761459350586,
+ 7.7938385009765625,
+ 9.330303192138672,
+ 8.89563274383545,
+ 8.884279251098633,
+ 5.870842933654785,
+ 8.49361515045166,
+ 7.291825771331787,
+ 6.987878322601318,
+ 8.564809799194336,
+ 8.554859161376953,
+ 6.354427337646484,
+ 8.301039695739746,
+ 6.307615756988525,
+ 8.199542999267578,
+ 6.75204610824585,
+ 6.548539161682129,
+ 6.594710350036621,
+ 8.208972930908203,
+ 9.793411254882812,
+ 6.7602009773254395,
+ 10.585469245910645,
+ 7.192152976989746,
+ 7.594589710235596,
+ 7.73886251449585,
+ 8.795101165771484,
+ 9.152059555053711,
+ 9.864975929260254,
+ 8.501593589782715,
+ 8.485865592956543,
+ 8.46456241607666,
+ 8.433588981628418,
+ 6.663652420043945,
+ 8.49349594116211,
+ 8.021809577941895,
+ 7.3567280769348145,
+ 9.757957458496094,
+ 8.53760814666748,
+ 6.686389923095703,
+ 7.943576335906982,
+ 8.512290954589844,
+ 8.190762519836426,
+ 6.943475246429443,
+ 7.313745498657227,
+ 9.077346801757812,
+ 10.087260246276855,
+ 10.557514190673828
],
"xaxis": "x",
"y": [
- 4.3125786781311035,
- 4.480679988861084,
- 5.934484481811523,
- 6.035068035125732,
- 6.627974510192871,
- 3.8020052909851074,
- 3.5600695610046387,
- 5.560845851898193,
- 4.549907684326172,
- 2.305506467819214,
- 4.657924175262451,
- 3.242453098297119,
- 5.140314102172852,
- 3.107086658477783,
- 4.361886978149414,
- 4.635444641113281,
- 2.251649856567383,
- 4.172117710113525,
- 2.205137014389038,
- 3.932243585586548,
- 2.756131649017334,
- 2.8223607540130615,
- 4.202624320983887,
- 2.893759250640869,
- 6.234554290771484,
- 4.007889747619629,
- 4.486442565917969,
- 2.9157278537750244,
- 6.661800384521484,
- 3.6967668533325195,
- 4.754232406616211,
- 6.072335243225098,
- 5.7372331619262695,
- 4.607232093811035,
- 4.480278015136719,
- 4.682827472686768,
- 4.638975620269775,
- 4.1178507804870605,
- 4.5823140144348145,
- 4.631624698638916,
- 3.6405510902404785,
- 6.063982963562012,
- 5.105567455291748,
- 4.051309585571289,
- 4.475528717041016,
- 4.691291332244873,
- 3.689364194869995,
- 6.186923503875732,
- 2.650754451751709,
- 5.092771530151367,
- 4.2011871337890625,
- 6.588094234466553
+ 11.651568412780762,
+ 12.191211700439453,
+ 11.268287658691406,
+ 11.299283027648926,
+ 11.919140815734863,
+ 10.920798301696777,
+ 10.986883163452148,
+ 11.574243545532227,
+ 13.608915328979492,
+ 13.987488746643066,
+ 11.457459449768066,
+ 10.793388366699219,
+ 10.746611595153809,
+ 12.363808631896973,
+ 13.473563194274902,
+ 12.267362594604492,
+ 13.26131534576416,
+ 12.938798904418945,
+ 13.27975082397461,
+ 13.716704368591309,
+ 12.57845401763916,
+ 12.065139770507812,
+ 11.04773998260498,
+ 12.567636489868164,
+ 10.669133186340332,
+ 12.349822998046875,
+ 12.474716186523438,
+ 13.81960391998291,
+ 11.614741325378418,
+ 12.918475151062012,
+ 12.735498428344727,
+ 9.668839454650879,
+ 11.589250564575195,
+ 11.631767272949219,
+ 11.802452087402344,
+ 11.795424461364746,
+ 11.737223625183105,
+ 9.014378547668457,
+ 11.800704002380371,
+ 11.993047714233398,
+ 9.46713638305664,
+ 11.210518836975098,
+ 10.427387237548828,
+ 9.043390274047852,
+ 11.889760971069336,
+ 11.644381523132324,
+ 9.431979179382324,
+ 10.852152824401855,
+ 9.312481880187988,
+ 12.395952224731445,
+ 12.187356948852539,
+ 12.789667129516602
],
"yaxis": "y",
"type": "scatter"
@@ -1622,25 +1624,72 @@
"plotlyServerURL": "https://plot.ly"
}
},
- "text/html": ""
+ "text/html": ""
},
"metadata": {},
"output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CLUSTER 0\n",
+ "CLUSTER 1\n",
+ "CLUSTER 2\n",
+ "CLUSTER 3\n",
+ "CLUSTER 4\n",
+ "CLUSTER 5\n",
+ "CLUSTER 6\n",
+ "CLUSTER 7\n",
+ "CLUSTER 8\n",
+ "CLUSTER 9\n",
+ "CLUSTER 10\n",
+ "CLUSTER 11\n",
+ "['Multi Engine2', 'TP mods2', 'gear3', 'offerType3', 'transmission_type4', 'model5', 'Certificate6', 'rating7']\n",
+ "['Overview6', 'description7']\n",
+ "['make3', 'body_type4', 'car_name4', 'brand5']\n",
+ "['reg_city1', 'model3', 'title_status5', 'color5', 'Director6', 'Star36', 'Star26', 'Star16', 'Star46', 'Series_Title6', 'cast7', 'title7', 'director7']\n",
+ "['condition5', 'date_added7']\n",
+ "['type7']\n",
+ "['Genre6', 'listed_in7']\n",
+ "['tail_number1', 'flight1', 'reg_expiration1', 'model1', 'vin5', 'Gross6', 'show_id7', 'duration7']\n",
+ "['fuel3', 'fuel_type4']\n",
+ "['Poster_Link6']\n",
+ "['reg_state1', 'country5', 'state5', 'country7']\n",
+ "['manufacturer1', 'reg_owner1', 'Engine Type2', 'Model2', 'Company2']\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\ab032mj\\Desktop\\thesis\\simillarity\\venv\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1416: FutureWarning:\n",
+ "\n",
+ "The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n",
+ "\n"
+ ]
}
],
"source": [
+ "\n",
+ "\n",
"# plot_vectors(vectors_avg, \"Average column2vec vectors\")\n",
- "plot_clusters(vectors_avg, \"Clusters Average column2vec clusters\")"
+ "plot_clusters(vectors_avg, \"Clusters Average column2vec clusters\")\n",
+ "\n",
+ "clusters = get_clusters(vectors_avg, 12)\n",
+ "\n",
+ "for i in clusters:\n",
+ " print(i)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
- "end_time": "2024-03-19T15:15:35.022738Z",
- "start_time": "2024-03-19T15:15:34.775408Z"
+ "end_time": "2024-03-26T10:07:42.866553Z",
+ "start_time": "2024-03-26T10:07:41.941778Z"
}
},
"id": "a1ef23c604e2775f",
- "execution_count": 64
+ "execution_count": 18
},
{
"cell_type": "code",
@@ -1649,19 +1698,19 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Processing column: reg_city1 1.92%\n",
- "Processing column: reg_state1 3.85%\n",
- "Processing column: flight1 5.77%\n",
- "Processing column: tail_number1 7.69%\n",
+ "Processing column: reg_state1 1.92%\n",
+ "Processing column: reg_city1 3.85%\n",
+ "Processing column: tail_number1 5.77%\n",
+ "Processing column: flight1 7.69%\n",
"Processing column: reg_expiration1 9.62%\n",
- "Processing column: reg_owner1 11.54%\n",
- "Processing column: manufacturer1 13.46%\n",
+ "Processing column: manufacturer1 11.54%\n",
+ "Processing column: reg_owner1 13.46%\n",
"Processing column: model1 15.38%\n",
- "Processing column: TP mods2 17.31%\n",
- "Processing column: Engine Type2 19.23%\n",
- "Processing column: Multi Engine2 21.15%\n",
- "Processing column: Company2 23.08%\n",
- "Processing column: Model2 25.0%\n",
+ "Processing column: Multi Engine2 17.31%\n",
+ "Processing column: TP mods2 19.23%\n",
+ "Processing column: Engine Type2 21.15%\n",
+ "Processing column: Model2 23.08%\n",
+ "Processing column: Company2 25.0%\n",
"Processing column: make3 26.92%\n",
"Processing column: gear3 28.85%\n",
"Processing column: model3 30.77%\n",
@@ -1669,39 +1718,39 @@
"Processing column: offerType3 34.62%\n",
"Processing column: fuel_type4 36.54%\n",
"Processing column: transmission_type4 38.46%\n",
- "Processing column: car_name4 40.38%\n",
- "Processing column: body_type4 42.31%\n",
- "Processing column: model5 44.23%\n",
- "Processing column: brand5 46.15%\n",
+ "Processing column: body_type4 40.38%\n",
+ "Processing column: car_name4 42.31%\n",
+ "Processing column: country5 44.23%\n",
+ "Processing column: model5 46.15%\n",
"Processing column: vin5 48.08%\n",
- "Processing column: country5 50.0%\n",
- "Processing column: state5 51.92%\n",
+ "Processing column: brand5 50.0%\n",
+ "Processing column: condition5 51.92%\n",
"Processing column: title_status5 53.85%\n",
- "Processing column: condition5 55.77%\n",
+ "Processing column: state5 55.77%\n",
"Processing column: color5 57.69%\n",
"Processing column: Certificate6 59.62%\n",
- "Processing column: Gross6 61.54%\n",
- "Processing column: Poster_Link6 63.46%\n",
- "Processing column: Star26 65.38%\n",
- "Processing column: Overview6 67.31%\n",
- "Processing column: Director6 69.23%\n",
- "Processing column: Star46 71.15%\n",
- "Processing column: Series_Title6 73.08%\n",
- "Processing column: Star16 75.0%\n",
- "Processing column: Star36 76.92%\n",
+ "Processing column: Poster_Link6 61.54%\n",
+ "Processing column: Gross6 63.46%\n",
+ "Processing column: Director6 65.38%\n",
+ "Processing column: Star36 67.31%\n",
+ "Processing column: Star26 69.23%\n",
+ "Processing column: Star16 71.15%\n",
+ "Processing column: Overview6 73.08%\n",
+ "Processing column: Star46 75.0%\n",
+ "Processing column: Series_Title6 76.92%\n",
"Processing column: Genre6 78.85%\n",
"Processing column: show_id7 80.77%\n",
"Processing column: cast7 82.69%\n",
- "Processing column: title7 84.62%\n",
- "Processing column: description7 86.54%\n",
+ "Processing column: description7 84.62%\n",
+ "Processing column: title7 86.54%\n",
"Processing column: director7 88.46%\n",
- "Processing column: listed_in7 90.38%\n",
- "Processing column: duration7 92.31%\n",
- "Processing column: type7 94.23%\n",
+ "Processing column: type7 90.38%\n",
+ "Processing column: country7 92.31%\n",
+ "Processing column: listed_in7 94.23%\n",
"Processing column: rating7 96.15%\n",
- "Processing column: country7 98.08%\n",
+ "Processing column: duration7 98.08%\n",
"Processing column: date_added7 100.0%\n",
- "ELAPSED TIME :8.40464186668396\n"
+ "ELAPSED TIME :8.791604995727539\n"
]
}
],
@@ -1711,12 +1760,12 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
- "end_time": "2024-03-19T15:15:43.432300Z",
- "start_time": "2024-03-19T15:15:35.024903Z"
+ "end_time": "2024-03-26T10:07:51.663008Z",
+ "start_time": "2024-03-26T10:07:42.867616Z"
}
},
"id": "9ce83d1e71dcc010",
- "execution_count": 65
+ "execution_count": 19
},
{
"cell_type": "code",
@@ -1737,29 +1786,32 @@
"data": [
{
"customdata": [
+ [
+ "reg_state1"
+ ],
[
"reg_city1"
],
[
- "reg_state1"
+ "tail_number1"
],
[
"flight1"
],
[
- "tail_number1"
+ "reg_expiration1"
],
[
- "reg_expiration1"
+ "manufacturer1"
],
[
"reg_owner1"
],
[
- "manufacturer1"
+ "model1"
],
[
- "model1"
+ "Multi Engine2"
],
[
"TP mods2"
@@ -1768,14 +1820,11 @@
"Engine Type2"
],
[
- "Multi Engine2"
+ "Model2"
],
[
"Company2"
],
- [
- "Model2"
- ],
[
"make3"
],
@@ -1798,31 +1847,31 @@
"transmission_type4"
],
[
- "car_name4"
+ "body_type4"
],
[
- "body_type4"
+ "car_name4"
],
[
- "model5"
+ "country5"
],
[
- "brand5"
+ "model5"
],
[
"vin5"
],
[
- "country5"
+ "brand5"
],
[
- "state5"
+ "condition5"
],
[
"title_status5"
],
[
- "condition5"
+ "state5"
],
[
"color5"
@@ -1831,31 +1880,31 @@
"Certificate6"
],
[
- "Gross6"
+ "Poster_Link6"
],
[
- "Poster_Link6"
+ "Gross6"
],
[
- "Star26"
+ "Director6"
],
[
- "Overview6"
+ "Star36"
],
[
- "Director6"
+ "Star26"
],
[
- "Star46"
+ "Star16"
],
[
- "Series_Title6"
+ "Overview6"
],
[
- "Star16"
+ "Star46"
],
[
- "Star36"
+ "Series_Title6"
],
[
"Genre6"
@@ -1867,28 +1916,28 @@
"cast7"
],
[
- "title7"
+ "description7"
],
[
- "description7"
+ "title7"
],
[
"director7"
],
[
- "listed_in7"
+ "type7"
],
[
- "duration7"
+ "country7"
],
[
- "type7"
+ "listed_in7"
],
[
"rating7"
],
[
- "country7"
+ "duration7"
],
[
"date_added7"
@@ -1898,58 +1947,58 @@
"legendgroup": "",
"marker": {
"color": [
+ 6,
+ 0,
+ 10,
+ 10,
+ 10,
+ 1,
+ 1,
+ 1,
+ 7,
+ 10,
+ 9,
1,
1,
+ 4,
+ 3,
+ 0,
+ 9,
7,
+ 9,
+ 3,
+ 4,
+ 4,
7,
4,
- 6,
- 6,
- 6,
- 10,
- 8,
10,
- 6,
- 6,
- 2,
+ 4,
11,
2,
- 8,
+ 6,
10,
- 8,
- 11,
- 2,
- 2,
- 2,
- 2,
7,
10,
- 1,
- 9,
- 0,
10,
+ 0,
+ 0,
+ 6,
+ 0,
+ 5,
+ 6,
+ 5,
+ 8,
10,
- 4,
- 7,
- 1,
- 3,
- 1,
- 1,
- 3,
- 1,
- 1,
+ 6,
5,
- 4,
- 1,
- 3,
- 3,
- 1,
5,
+ 6,
+ 8,
+ 6,
+ 8,
7,
- 5,
10,
- 1,
- 4
+ 10
],
"coloraxis": "coloraxis",
"symbol": "circle"
@@ -1959,113 +2008,113 @@
"orientation": "v",
"showlegend": false,
"x": [
- 2.7740252017974854,
- 2.8364009857177734,
- 1.069262981414795,
- 1.0029431581497192,
- 1.5992205142974854,
- 3.3962671756744385,
- 3.2161507606506348,
- 3.105342149734497,
- -0.26255616545677185,
- 0.6103012561798096,
- -0.6897143125534058,
- 3.189011335372925,
- 2.700456142425537,
- 1.689704418182373,
- -0.5089970827102661,
- 3.4468133449554443,
- 0.2496114820241928,
- -0.20911100506782532,
- 0.22651678323745728,
- -0.5129572749137878,
- 1.8270411491394043,
- 1.6040159463882446,
- 1.6857876777648926,
- 1.5496913194656372,
- 0.8878748416900635,
- -0.515620231628418,
- 3.0048699378967285,
- -0.5076618790626526,
- 1.5513533353805542,
- 0.6620107293128967,
- 0.08456160873174667,
- 1.3856319189071655,
- 0.8733639717102051,
- 1.9646496772766113,
- 4.975734710693359,
- 2.2607388496398926,
- 1.911452293395996,
- 4.4013895988464355,
- 2.2770698070526123,
- 2.3053503036499023,
- 3.4285330772399902,
- 1.2291285991668701,
- 1.5165760517120361,
- 4.615434646606445,
- 5.032024383544922,
- 1.617465615272522,
- 3.7750792503356934,
- 1.0852265357971191,
- 3.424313545227051,
- 0.29817289113998413,
- 2.2570576667785645,
- 1.8820475339889526
+ -0.6300977468490601,
+ -0.42117050290107727,
+ -0.2486436814069748,
+ -0.2788216173648834,
+ -1.0231388807296753,
+ 1.9644521474838257,
+ 1.7976727485656738,
+ 1.9063923358917236,
+ 0.911508321762085,
+ -0.3994556665420532,
+ -1.698248028755188,
+ 2.2699530124664307,
+ 1.9196691513061523,
+ 2.3852696418762207,
+ -1.3150827884674072,
+ 1.4531058073043823,
+ -2.1891512870788574,
+ -0.3801000714302063,
+ -2.2206525802612305,
+ -1.4057995080947876,
+ 2.792318105697632,
+ 2.2648849487304688,
+ 0.4104484021663666,
+ 2.5826117992401123,
+ 0.025007257238030434,
+ 2.4077377319335938,
+ -1.835091471672058,
+ -2.8586337566375732,
+ -0.610065221786499,
+ -1.399946689605713,
+ 0.23535476624965668,
+ 0.632976770401001,
+ -0.7796151041984558,
+ 0.6179103255271912,
+ 0.32792437076568604,
+ 0.42420893907546997,
+ 0.5684399008750916,
+ -0.1674804985523224,
+ 0.4046439230442047,
+ -0.01373551320284605,
+ 0.0037475405260920525,
+ -0.564787745475769,
+ 0.4966707229614258,
+ -0.19300417602062225,
+ -0.10040821880102158,
+ 0.32298508286476135,
+ 0.5930238366127014,
+ -0.189849391579628,
+ 0.21084845066070557,
+ 1.2016559839248657,
+ -1.0955184698104858,
+ -1.4543946981430054
],
"xaxis": "x",
"y": [
- 3.3891963958740234,
- 3.713914394378662,
- 3.3481297492980957,
- 3.386765241622925,
- 4.212575435638428,
- 0.0772617980837822,
- -0.0958557277917862,
- 0.0595986545085907,
- 4.59176778793335,
- -0.37828168272972107,
- 3.275238513946533,
- -0.06835243850946426,
- -0.21176140010356903,
- 0.6041004061698914,
- 1.852208137512207,
- 2.6287810802459717,
- 0.30453425645828247,
- 2.5676887035369873,
- 0.3329438865184784,
- 1.7548259496688843,
- 0.6495657563209534,
- 0.20514854788780212,
- 0.37018677592277527,
- 0.6579046249389648,
- 3.2581090927124023,
- 3.536987066268921,
- 3.740365505218506,
- 0.4170229136943817,
- 4.943604946136475,
- 2.1078155040740967,
- 3.131649971008301,
- 3.806628704071045,
- 2.7448670864105225,
- 2.4906883239746094,
- 2.3012287616729736,
- 2.1632754802703857,
- 2.562673330307007,
- 2.3535079956054688,
- 2.231971502304077,
- 2.644801378250122,
- 1.9668481349945068,
- 3.6226108074188232,
- 2.6755826473236084,
- 2.317932605743408,
- 2.2890031337738037,
- 2.692415714263916,
- 1.7611156702041626,
- 4.269854545593262,
- 1.4409667253494263,
- 4.013367176055908,
- 3.184629440307617,
- 4.555140972137451
+ -0.34463998675346375,
+ -0.3702987730503082,
+ -2.027839183807373,
+ -1.9760643243789673,
+ -2.0544214248657227,
+ 1.2670314311981201,
+ 1.3112242221832275,
+ 1.0823720693588257,
+ -3.1692299842834473,
+ -3.810380697250366,
+ 0.754950225353241,
+ 0.8772135972976685,
+ 1.2372525930404663,
+ -0.5736871957778931,
+ -3.3916871547698975,
+ -0.022917641326785088,
+ 0.21249575912952423,
+ -3.1873059272766113,
+ 0.1714380532503128,
+ -3.4063022136688232,
+ -0.547065794467926,
+ -0.4382999539375305,
+ -3.318624258041382,
+ -0.4890264868736267,
+ -2.125154972076416,
+ -0.7327226996421814,
+ -1.7250908613204956,
+ -0.4163961112499237,
+ -0.15362370014190674,
+ -1.071980595588684,
+ -2.7636942863464355,
+ -2.0271103382110596,
+ -1.607711911201477,
+ -0.5869593620300293,
+ -0.7268936038017273,
+ -0.9733710289001465,
+ -0.5965152382850647,
+ 2.2380130290985107,
+ -1.0552787780761719,
+ 1.706160306930542,
+ 0.7388292551040649,
+ -1.9103707075119019,
+ -1.5025126934051514,
+ 2.287067413330078,
+ 1.8778971433639526,
+ -1.3680455684661865,
+ 0.702415406703949,
+ -0.8708230257034302,
+ 1.0335848331451416,
+ -2.3910341262817383,
+ -1.6667473316192627,
+ -2.2093350887298584
],
"yaxis": "y",
"type": "scatter"
@@ -2980,7 +3029,7 @@
"plotlyServerURL": "https://plot.ly"
}
},
- "text/html": ""
+ "text/html": ""
},
"metadata": {},
"output_type": "display_data"
@@ -2993,12 +3042,12 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
- "end_time": "2024-03-19T15:15:43.635850Z",
- "start_time": "2024-03-19T15:15:43.433390Z"
+ "end_time": "2024-03-26T10:07:51.904467Z",
+ "start_time": "2024-03-26T10:07:51.664082Z"
}
},
"id": "cec56275bccd843a",
- "execution_count": 66
+ "execution_count": 20
},
{
"cell_type": "code",
@@ -3007,19 +3056,19 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Processing column: reg_city1 1.92%\n",
- "Processing column: reg_state1 3.85%\n",
- "Processing column: flight1 5.77%\n",
- "Processing column: tail_number1 7.69%\n",
+ "Processing column: reg_state1 1.92%\n",
+ "Processing column: reg_city1 3.85%\n",
+ "Processing column: tail_number1 5.77%\n",
+ "Processing column: flight1 7.69%\n",
"Processing column: reg_expiration1 9.62%\n",
- "Processing column: reg_owner1 11.54%\n",
- "Processing column: manufacturer1 13.46%\n",
+ "Processing column: manufacturer1 11.54%\n",
+ "Processing column: reg_owner1 13.46%\n",
"Processing column: model1 15.38%\n",
- "Processing column: TP mods2 17.31%\n",
- "Processing column: Engine Type2 19.23%\n",
- "Processing column: Multi Engine2 21.15%\n",
- "Processing column: Company2 23.08%\n",
- "Processing column: Model2 25.0%\n",
+ "Processing column: Multi Engine2 17.31%\n",
+ "Processing column: TP mods2 19.23%\n",
+ "Processing column: Engine Type2 21.15%\n",
+ "Processing column: Model2 23.08%\n",
+ "Processing column: Company2 25.0%\n",
"Processing column: make3 26.92%\n",
"Processing column: gear3 28.85%\n",
"Processing column: model3 30.77%\n",
@@ -3027,39 +3076,39 @@
"Processing column: offerType3 34.62%\n",
"Processing column: fuel_type4 36.54%\n",
"Processing column: transmission_type4 38.46%\n",
- "Processing column: car_name4 40.38%\n",
- "Processing column: body_type4 42.31%\n",
- "Processing column: model5 44.23%\n",
- "Processing column: brand5 46.15%\n",
+ "Processing column: body_type4 40.38%\n",
+ "Processing column: car_name4 42.31%\n",
+ "Processing column: country5 44.23%\n",
+ "Processing column: model5 46.15%\n",
"Processing column: vin5 48.08%\n",
- "Processing column: country5 50.0%\n",
- "Processing column: state5 51.92%\n",
+ "Processing column: brand5 50.0%\n",
+ "Processing column: condition5 51.92%\n",
"Processing column: title_status5 53.85%\n",
- "Processing column: condition5 55.77%\n",
+ "Processing column: state5 55.77%\n",
"Processing column: color5 57.69%\n",
"Processing column: Certificate6 59.62%\n",
- "Processing column: Gross6 61.54%\n",
- "Processing column: Poster_Link6 63.46%\n",
- "Processing column: Star26 65.38%\n",
- "Processing column: Overview6 67.31%\n",
- "Processing column: Director6 69.23%\n",
- "Processing column: Star46 71.15%\n",
- "Processing column: Series_Title6 73.08%\n",
- "Processing column: Star16 75.0%\n",
- "Processing column: Star36 76.92%\n",
+ "Processing column: Poster_Link6 61.54%\n",
+ "Processing column: Gross6 63.46%\n",
+ "Processing column: Director6 65.38%\n",
+ "Processing column: Star36 67.31%\n",
+ "Processing column: Star26 69.23%\n",
+ "Processing column: Star16 71.15%\n",
+ "Processing column: Overview6 73.08%\n",
+ "Processing column: Star46 75.0%\n",
+ "Processing column: Series_Title6 76.92%\n",
"Processing column: Genre6 78.85%\n",
"Processing column: show_id7 80.77%\n",
"Processing column: cast7 82.69%\n",
- "Processing column: title7 84.62%\n",
- "Processing column: description7 86.54%\n",
+ "Processing column: description7 84.62%\n",
+ "Processing column: title7 86.54%\n",
"Processing column: director7 88.46%\n",
- "Processing column: listed_in7 90.38%\n",
- "Processing column: duration7 92.31%\n",
- "Processing column: type7 94.23%\n",
+ "Processing column: type7 90.38%\n",
+ "Processing column: country7 92.31%\n",
+ "Processing column: listed_in7 94.23%\n",
"Processing column: rating7 96.15%\n",
- "Processing column: country7 98.08%\n",
+ "Processing column: duration7 98.08%\n",
"Processing column: date_added7 100.0%\n",
- "ELAPSED TIME :8.255228281021118\n"
+ "ELAPSED TIME :8.641924381256104\n"
]
}
],
@@ -3069,12 +3118,73 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
- "end_time": "2024-03-19T15:15:51.896658Z",
- "start_time": "2024-03-19T15:15:43.636921Z"
+ "end_time": "2024-03-26T10:08:00.551380Z",
+ "start_time": "2024-03-26T10:07:51.905562Z"
}
},
"id": "a52718b90663e30e",
- "execution_count": 67
+ "execution_count": 21
+ },
+ {
+ "cell_type": "code",
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CLUSTER 0\n",
+ "CLUSTER 1\n",
+ "CLUSTER 2\n",
+ "CLUSTER 3\n",
+ "CLUSTER 4\n",
+ "CLUSTER 5\n",
+ "CLUSTER 6\n",
+ "CLUSTER 7\n",
+ "CLUSTER 8\n",
+ "CLUSTER 9\n",
+ "CLUSTER 10\n",
+ "CLUSTER 11\n",
+ "['reg_state1', 'state5']\n",
+ "['make3', 'model3', 'fuel3', 'fuel_type4', 'body_type4', 'car_name4', 'model5', 'brand5', 'type7']\n",
+ "['reg_city1', 'Director6', 'Star36', 'Star26', 'Star16', 'Star46', 'cast7', 'director7', 'country7']\n",
+ "['tail_number1', 'flight1', 'reg_expiration1', 'vin5', 'condition5', 'Poster_Link6', 'Gross6', 'show_id7', 'duration7', 'date_added7']\n",
+ "['manufacturer1', 'reg_owner1', 'model1', 'Engine Type2', 'Model2', 'Company2']\n",
+ "['Overview6', 'Series_Title6', 'Genre6', 'description7', 'title7', 'listed_in7']\n",
+ "['Multi Engine2', 'TP mods2', 'title_status5']\n",
+ "['Certificate6', 'rating7']\n",
+ "['gear3', 'transmission_type4']\n",
+ "['color5']\n",
+ "['offerType3']\n",
+ "['country5']\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\ab032mj\\Desktop\\thesis\\simillarity\\venv\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1416: FutureWarning:\n",
+ "\n",
+ "The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "\n",
+ "clusters = get_clusters(vectors_sentence_clean, 12)\n",
+ "\n",
+ "for i in clusters:\n",
+ " print(i)"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "ExecuteTime": {
+ "end_time": "2024-03-26T10:08:00.629829Z",
+ "start_time": "2024-03-26T10:08:00.552499Z"
+ }
+ },
+ "id": "a1810cdce5d9ca45",
+ "execution_count": 22
},
{
"cell_type": "code",
@@ -3095,29 +3205,32 @@
"data": [
{
"customdata": [
+ [
+ "reg_state1"
+ ],
[
"reg_city1"
],
[
- "reg_state1"
+ "tail_number1"
],
[
"flight1"
],
[
- "tail_number1"
+ "reg_expiration1"
],
[
- "reg_expiration1"
+ "manufacturer1"
],
[
"reg_owner1"
],
[
- "manufacturer1"
+ "model1"
],
[
- "model1"
+ "Multi Engine2"
],
[
"TP mods2"
@@ -3126,14 +3239,11 @@
"Engine Type2"
],
[
- "Multi Engine2"
+ "Model2"
],
[
"Company2"
],
- [
- "Model2"
- ],
[
"make3"
],
@@ -3156,31 +3266,31 @@
"transmission_type4"
],
[
- "car_name4"
+ "body_type4"
],
[
- "body_type4"
+ "car_name4"
],
[
- "model5"
+ "country5"
],
[
- "brand5"
+ "model5"
],
[
"vin5"
],
[
- "country5"
+ "brand5"
],
[
- "state5"
+ "condition5"
],
[
"title_status5"
],
[
- "condition5"
+ "state5"
],
[
"color5"
@@ -3189,31 +3299,31 @@
"Certificate6"
],
[
- "Gross6"
+ "Poster_Link6"
],
[
- "Poster_Link6"
+ "Gross6"
],
[
- "Star26"
+ "Director6"
],
[
- "Overview6"
+ "Star36"
],
[
- "Director6"
+ "Star26"
],
[
- "Star46"
+ "Star16"
],
[
- "Series_Title6"
+ "Overview6"
],
[
- "Star16"
+ "Star46"
],
[
- "Star36"
+ "Series_Title6"
],
[
"Genre6"
@@ -3225,28 +3335,28 @@
"cast7"
],
[
- "title7"
+ "description7"
],
[
- "description7"
+ "title7"
],
[
"director7"
],
[
- "listed_in7"
+ "type7"
],
[
- "duration7"
+ "country7"
],
[
- "type7"
+ "listed_in7"
],
[
"rating7"
],
[
- "country7"
+ "duration7"
],
[
"date_added7"
@@ -3256,58 +3366,58 @@
"legendgroup": "",
"marker": {
"color": [
+ 0,
+ 2,
+ 3,
+ 3,
+ 3,
+ 4,
+ 4,
+ 4,
+ 6,
+ 6,
+ 4,
+ 4,
+ 4,
+ 1,
8,
- 8,
- 10,
+ 1,
+ 1,
10,
- 7,
1,
+ 8,
1,
1,
- 4,
+ 11,
1,
3,
1,
- 1,
- 2,
- 5,
+ 3,
6,
+ 0,
9,
+ 7,
3,
- 9,
+ 3,
+ 2,
+ 2,
+ 2,
+ 2,
5,
2,
+ 5,
+ 5,
+ 3,
2,
+ 5,
+ 5,
2,
+ 1,
2,
- 10,
- 7,
- 8,
- 11,
+ 5,
7,
- 9,
- 10,
- 10,
- 10,
- 3,
- 0,
- 3,
3,
- 0,
- 3,
- 3,
- 6,
- 10,
- 3,
- 0,
- 0,
- 3,
- 6,
- 10,
- 6,
- 10,
- 8,
- 7
+ 3
],
"coloraxis": "coloraxis",
"symbol": "circle"
@@ -3317,113 +3427,113 @@
"orientation": "v",
"showlegend": false,
"x": [
- -0.2607487738132477,
- -1.0079954862594604,
- 0.6057992577552795,
- 0.5344958901405334,
- 0.007245978340506554,
- 2.115837574005127,
- 2.367359161376953,
- 1.825493335723877,
- -0.2870010733604431,
- 2.9802863597869873,
- -0.10476882755756378,
- 2.4541375637054443,
- 2.198890447616577,
- 2.9787535667419434,
- 4.7972283363342285,
- 2.3306944370269775,
- 3.7547733783721924,
- -1.7262934446334839,
- 3.7424700260162354,
- 4.723546504974365,
- 2.612650156021118,
- 3.3930583000183105,
- 2.861128330230713,
- 3.0491740703582764,
- 0.5691105127334595,
- -1.7126460075378418,
- -1.0357009172439575,
- 3.9140195846557617,
- -0.3152306377887726,
- 1.4525127410888672,
- -0.6416651606559753,
- 0.5356868505477905,
- 0.9755063056945801,
- 0.6514178514480591,
- 1.5733528137207031,
- 1.0550673007965088,
- 0.5585827827453613,
- 1.587950348854065,
- 0.9356500506401062,
- 0.9580854773521423,
- 2.4921987056732178,
- 0.49259233474731445,
- 0.44213390350341797,
- 1.650417685508728,
- 1.6093469858169556,
- 0.4413849711418152,
- 2.4170665740966797,
- -0.0848783552646637,
- 2.8719282150268555,
- -0.8826955556869507,
- -0.4407237768173218,
- -0.4333607256412506
+ -0.8933649063110352,
+ -0.39842361211776733,
+ -1.4793410301208496,
+ -1.385990023612976,
+ -2.186568260192871,
+ 0.20130865275859833,
+ 0.09205331653356552,
+ -0.07491744309663773,
+ 0.5163057446479797,
+ 0.5787357687950134,
+ 0.3718307614326477,
+ 0.173736572265625,
+ 0.26832813024520874,
+ 1.4223390817642212,
+ 0.07582058012485504,
+ 1.2557424306869507,
+ -1.1702723503112793,
+ -2.0104966163635254,
+ -1.2018636465072632,
+ 0.13705717027187347,
+ 1.7902134656906128,
+ 1.112935185432434,
+ -2.7951245307922363,
+ 1.321556568145752,
+ -1.3741170167922974,
+ 1.38334321975708,
+ -2.2069263458251953,
+ 1.4237916469573975,
+ -0.8494253158569336,
+ -1.8130989074707031,
+ -0.9841021299362183,
+ -0.9236490726470947,
+ -1.5238884687423706,
+ 0.43331876397132874,
+ 0.30412358045578003,
+ -0.0314553864300251,
+ 0.2996326982975006,
+ 2.353454113006592,
+ -0.13156205415725708,
+ 1.6180741786956787,
+ 2.7325005531311035,
+ -1.6119945049285889,
+ -0.24953196942806244,
+ 2.360980987548828,
+ 1.9116843938827515,
+ -0.15537023544311523,
+ 2.8361775875091553,
+ -0.40099677443504333,
+ 2.635286569595337,
+ -1.0342307090759277,
+ -1.295596718788147,
+ -2.3089394569396973
],
"xaxis": "x",
"y": [
- 0.029110712930560112,
- 0.2831602394580841,
- 0.7355490922927856,
- 0.7714056372642517,
- 1.7808095216751099,
- 1.5367865562438965,
- 1.7146657705307007,
- 1.3714510202407837,
- -2.3006083965301514,
- 2.068046808242798,
- -1.8296335935592651,
- 1.7337384223937988,
- 1.2609519958496094,
- -0.17047174274921417,
- -0.23581287264823914,
- -0.7553197741508484,
- 0.7174517512321472,
- 1.1607658863067627,
- 0.6419799327850342,
- -0.22861970961093903,
- -0.12489606440067291,
- -0.20934484899044037,
- 0.013001663610339165,
- -0.03274914622306824,
- 0.6541449427604675,
- -0.30210813879966736,
- 0.2788488566875458,
- -1.3033324480056763,
- 2.003594160079956,
- 2.6429059505462646,
- -0.9512806534767151,
- 1.3433306217193604,
- 0.3491983711719513,
- -0.4967333674430847,
- -2.549825668334961,
- -0.7152066826820374,
- -0.42618969082832336,
- -1.7045925855636597,
- -0.6547468304634094,
- -0.6604399681091309,
- -2.0904040336608887,
- 1.0396133661270142,
- -0.41867274045944214,
- -1.9949613809585571,
- -2.536947011947632,
- -0.4811083674430847,
- -2.215475082397461,
- 0.9217885136604309,
- -2.3584845066070557,
- -1.1353845596313477,
- -0.043536651879549026,
- 1.4129749536514282
+ -2.4839179515838623,
+ -2.989748239517212,
+ -4.415885925292969,
+ -4.346672058105469,
+ -3.2706820964813232,
+ -0.9454534649848938,
+ -1.2883728742599487,
+ -1.6748652458190918,
+ -5.4596848487854,
+ -5.6513776779174805,
+ -0.24761804938316345,
+ -1.4888640642166138,
+ -0.8605363368988037,
+ -1.9725847244262695,
+ -6.909929275512695,
+ -2.708397626876831,
+ -0.7754024863243103,
+ -5.6967973709106445,
+ -0.8881263136863708,
+ -6.934398174285889,
+ -1.4960343837738037,
+ -2.2084426879882812,
+ -4.651385307312012,
+ -1.8190819025039673,
+ -4.44832181930542,
+ -1.8388243913650513,
+ -2.7469568252563477,
+ -5.604869842529297,
+ -2.437312126159668,
+ -1.3868107795715332,
+ -5.354982852935791,
+ -4.581441402435303,
+ -3.4043173789978027,
+ -4.036744117736816,
+ -3.7389719486236572,
+ -4.098353862762451,
+ -4.029059410095215,
+ -4.093706130981445,
+ -4.036102771759033,
+ -3.6575052738189697,
+ -3.290090560913086,
+ -4.05233097076416,
+ -4.351712226867676,
+ -4.0254034996032715,
+ -3.7103238105773926,
+ -4.058472633361816,
+ -2.5803277492523193,
+ -3.003248929977417,
+ -3.189161539077759,
+ -5.584373474121094,
+ -3.3394620418548584,
+ -3.5453758239746094
],
"yaxis": "y",
"type": "scatter"
@@ -4338,7 +4448,7 @@
"plotlyServerURL": "https://plot.ly"
}
},
- "text/html": ""
+ "text/html": ""
},
"metadata": {},
"output_type": "display_data"
@@ -4351,12 +4461,12 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
- "end_time": "2024-03-19T15:15:52.140066Z",
- "start_time": "2024-03-19T15:15:51.897705Z"
+ "end_time": "2024-03-26T10:08:00.901952Z",
+ "start_time": "2024-03-26T10:08:00.630898Z"
}
},
"id": "1c459935fab2bc93",
- "execution_count": 68
+ "execution_count": 23
},
{
"cell_type": "code",
@@ -4365,19 +4475,19 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Processing column: reg_city1 1.92%\n",
- "Processing column: reg_state1 3.85%\n",
- "Processing column: flight1 5.77%\n",
- "Processing column: tail_number1 7.69%\n",
+ "Processing column: reg_state1 1.92%\n",
+ "Processing column: reg_city1 3.85%\n",
+ "Processing column: tail_number1 5.77%\n",
+ "Processing column: flight1 7.69%\n",
"Processing column: reg_expiration1 9.62%\n",
- "Processing column: reg_owner1 11.54%\n",
- "Processing column: manufacturer1 13.46%\n",
+ "Processing column: manufacturer1 11.54%\n",
+ "Processing column: reg_owner1 13.46%\n",
"Processing column: model1 15.38%\n",
- "Processing column: TP mods2 17.31%\n",
- "Processing column: Engine Type2 19.23%\n",
- "Processing column: Multi Engine2 21.15%\n",
- "Processing column: Company2 23.08%\n",
- "Processing column: Model2 25.0%\n",
+ "Processing column: Multi Engine2 17.31%\n",
+ "Processing column: TP mods2 19.23%\n",
+ "Processing column: Engine Type2 21.15%\n",
+ "Processing column: Model2 23.08%\n",
+ "Processing column: Company2 25.0%\n",
"Processing column: make3 26.92%\n",
"Processing column: gear3 28.85%\n",
"Processing column: model3 30.77%\n",
@@ -4385,39 +4495,39 @@
"Processing column: offerType3 34.62%\n",
"Processing column: fuel_type4 36.54%\n",
"Processing column: transmission_type4 38.46%\n",
- "Processing column: car_name4 40.38%\n",
- "Processing column: body_type4 42.31%\n",
- "Processing column: model5 44.23%\n",
- "Processing column: brand5 46.15%\n",
+ "Processing column: body_type4 40.38%\n",
+ "Processing column: car_name4 42.31%\n",
+ "Processing column: country5 44.23%\n",
+ "Processing column: model5 46.15%\n",
"Processing column: vin5 48.08%\n",
- "Processing column: country5 50.0%\n",
- "Processing column: state5 51.92%\n",
+ "Processing column: brand5 50.0%\n",
+ "Processing column: condition5 51.92%\n",
"Processing column: title_status5 53.85%\n",
- "Processing column: condition5 55.77%\n",
+ "Processing column: state5 55.77%\n",
"Processing column: color5 57.69%\n",
"Processing column: Certificate6 59.62%\n",
- "Processing column: Gross6 61.54%\n",
- "Processing column: Poster_Link6 63.46%\n",
- "Processing column: Star26 65.38%\n",
- "Processing column: Overview6 67.31%\n",
- "Processing column: Director6 69.23%\n",
- "Processing column: Star46 71.15%\n",
- "Processing column: Series_Title6 73.08%\n",
- "Processing column: Star16 75.0%\n",
- "Processing column: Star36 76.92%\n",
+ "Processing column: Poster_Link6 61.54%\n",
+ "Processing column: Gross6 63.46%\n",
+ "Processing column: Director6 65.38%\n",
+ "Processing column: Star36 67.31%\n",
+ "Processing column: Star26 69.23%\n",
+ "Processing column: Star16 71.15%\n",
+ "Processing column: Overview6 73.08%\n",
+ "Processing column: Star46 75.0%\n",
+ "Processing column: Series_Title6 76.92%\n",
"Processing column: Genre6 78.85%\n",
"Processing column: show_id7 80.77%\n",
"Processing column: cast7 82.69%\n",
- "Processing column: title7 84.62%\n",
- "Processing column: description7 86.54%\n",
+ "Processing column: description7 84.62%\n",
+ "Processing column: title7 86.54%\n",
"Processing column: director7 88.46%\n",
- "Processing column: listed_in7 90.38%\n",
- "Processing column: duration7 92.31%\n",
- "Processing column: type7 94.23%\n",
+ "Processing column: type7 90.38%\n",
+ "Processing column: country7 92.31%\n",
+ "Processing column: listed_in7 94.23%\n",
"Processing column: rating7 96.15%\n",
- "Processing column: country7 98.08%\n",
+ "Processing column: duration7 98.08%\n",
"Processing column: date_added7 100.0%\n",
- "ELAPSED TIME :5.922260284423828\n"
+ "ELAPSED TIME :6.345606088638306\n"
]
}
],
@@ -4427,12 +4537,12 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
- "end_time": "2024-03-19T15:15:58.068124Z",
- "start_time": "2024-03-19T15:15:52.141138Z"
+ "end_time": "2024-03-26T10:08:07.252811Z",
+ "start_time": "2024-03-26T10:08:00.903005Z"
}
},
"id": "75f2e3d05b70b94e",
- "execution_count": 69
+ "execution_count": 24
},
{
"cell_type": "code",
@@ -4441,19 +4551,19 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Processing column: reg_city1 1.92%\n",
- "Processing column: reg_state1 3.85%\n",
- "Processing column: flight1 5.77%\n",
- "Processing column: tail_number1 7.69%\n",
+ "Processing column: reg_state1 1.92%\n",
+ "Processing column: reg_city1 3.85%\n",
+ "Processing column: tail_number1 5.77%\n",
+ "Processing column: flight1 7.69%\n",
"Processing column: reg_expiration1 9.62%\n",
- "Processing column: reg_owner1 11.54%\n",
- "Processing column: manufacturer1 13.46%\n",
+ "Processing column: manufacturer1 11.54%\n",
+ "Processing column: reg_owner1 13.46%\n",
"Processing column: model1 15.38%\n",
- "Processing column: TP mods2 17.31%\n",
- "Processing column: Engine Type2 19.23%\n",
- "Processing column: Multi Engine2 21.15%\n",
- "Processing column: Company2 23.08%\n",
- "Processing column: Model2 25.0%\n",
+ "Processing column: Multi Engine2 17.31%\n",
+ "Processing column: TP mods2 19.23%\n",
+ "Processing column: Engine Type2 21.15%\n",
+ "Processing column: Model2 23.08%\n",
+ "Processing column: Company2 25.0%\n",
"Processing column: make3 26.92%\n",
"Processing column: gear3 28.85%\n",
"Processing column: model3 30.77%\n",
@@ -4461,39 +4571,39 @@
"Processing column: offerType3 34.62%\n",
"Processing column: fuel_type4 36.54%\n",
"Processing column: transmission_type4 38.46%\n",
- "Processing column: car_name4 40.38%\n",
- "Processing column: body_type4 42.31%\n",
- "Processing column: model5 44.23%\n",
- "Processing column: brand5 46.15%\n",
+ "Processing column: body_type4 40.38%\n",
+ "Processing column: car_name4 42.31%\n",
+ "Processing column: country5 44.23%\n",
+ "Processing column: model5 46.15%\n",
"Processing column: vin5 48.08%\n",
- "Processing column: country5 50.0%\n",
- "Processing column: state5 51.92%\n",
+ "Processing column: brand5 50.0%\n",
+ "Processing column: condition5 51.92%\n",
"Processing column: title_status5 53.85%\n",
- "Processing column: condition5 55.77%\n",
+ "Processing column: state5 55.77%\n",
"Processing column: color5 57.69%\n",
"Processing column: Certificate6 59.62%\n",
- "Processing column: Gross6 61.54%\n",
- "Processing column: Poster_Link6 63.46%\n",
- "Processing column: Star26 65.38%\n",
- "Processing column: Overview6 67.31%\n",
- "Processing column: Director6 69.23%\n",
- "Processing column: Star46 71.15%\n",
- "Processing column: Series_Title6 73.08%\n",
- "Processing column: Star16 75.0%\n",
- "Processing column: Star36 76.92%\n",
+ "Processing column: Poster_Link6 61.54%\n",
+ "Processing column: Gross6 63.46%\n",
+ "Processing column: Director6 65.38%\n",
+ "Processing column: Star36 67.31%\n",
+ "Processing column: Star26 69.23%\n",
+ "Processing column: Star16 71.15%\n",
+ "Processing column: Overview6 73.08%\n",
+ "Processing column: Star46 75.0%\n",
+ "Processing column: Series_Title6 76.92%\n",
"Processing column: Genre6 78.85%\n",
"Processing column: show_id7 80.77%\n",
"Processing column: cast7 82.69%\n",
- "Processing column: title7 84.62%\n",
- "Processing column: description7 86.54%\n",
+ "Processing column: description7 84.62%\n",
+ "Processing column: title7 86.54%\n",
"Processing column: director7 88.46%\n",
- "Processing column: listed_in7 90.38%\n",
- "Processing column: duration7 92.31%\n",
- "Processing column: type7 94.23%\n",
+ "Processing column: type7 90.38%\n",
+ "Processing column: country7 92.31%\n",
+ "Processing column: listed_in7 94.23%\n",
"Processing column: rating7 96.15%\n",
- "Processing column: country7 98.08%\n",
+ "Processing column: duration7 98.08%\n",
"Processing column: date_added7 100.0%\n",
- "ELAPSED TIME :593.521614074707\n"
+ "ELAPSED TIME :553.5474011898041\n"
]
}
],
@@ -4503,12 +4613,12 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
- "end_time": "2024-03-19T15:25:51.594792Z",
- "start_time": "2024-03-19T15:15:58.069281Z"
+ "end_time": "2024-03-26T10:17:20.804893Z",
+ "start_time": "2024-03-26T10:08:07.253938Z"
}
},
"id": "287c7ad9f3cea37b",
- "execution_count": 70
+ "execution_count": 25
},
{
"cell_type": "code",
@@ -4529,30 +4639,33 @@
"data": [
{
"customdata": [
- [
- "reg_city1"
- ],
[
"reg_state1"
],
[
- "flight1"
+ "reg_city1"
],
[
"tail_number1"
],
+ [
+ "flight1"
+ ],
[
"reg_expiration1"
],
[
- "reg_owner1"
+ "manufacturer1"
],
[
- "manufacturer1"
+ "reg_owner1"
],
[
"model1"
],
+ [
+ "Multi Engine2"
+ ],
[
"TP mods2"
],
@@ -4560,14 +4673,11 @@
"Engine Type2"
],
[
- "Multi Engine2"
+ "Model2"
],
[
"Company2"
],
- [
- "Model2"
- ],
[
"make3"
],
@@ -4590,31 +4700,31 @@
"transmission_type4"
],
[
- "car_name4"
+ "body_type4"
],
[
- "body_type4"
+ "car_name4"
],
[
- "model5"
+ "country5"
],
[
- "brand5"
+ "model5"
],
[
"vin5"
],
[
- "country5"
+ "brand5"
],
[
- "state5"
+ "condition5"
],
[
"title_status5"
],
[
- "condition5"
+ "state5"
],
[
"color5"
@@ -4623,31 +4733,31 @@
"Certificate6"
],
[
- "Gross6"
+ "Poster_Link6"
],
[
- "Poster_Link6"
+ "Gross6"
],
[
- "Star26"
+ "Director6"
],
[
- "Overview6"
+ "Star36"
],
[
- "Director6"
+ "Star26"
],
[
- "Star46"
+ "Star16"
],
[
- "Series_Title6"
+ "Overview6"
],
[
- "Star16"
+ "Star46"
],
[
- "Star36"
+ "Series_Title6"
],
[
"Genre6"
@@ -4659,28 +4769,28 @@
"cast7"
],
[
- "title7"
+ "description7"
],
[
- "description7"
+ "title7"
],
[
"director7"
],
[
- "listed_in7"
+ "type7"
],
[
- "duration7"
+ "country7"
],
[
- "type7"
+ "listed_in7"
],
[
"rating7"
],
[
- "country7"
+ "duration7"
],
[
"date_added7"
@@ -4690,58 +4800,58 @@
"legendgroup": "",
"marker": {
"color": [
- 8,
- 8,
- 9,
+ 10,
+ 10,
+ 5,
+ 5,
9,
- 3,
- 6,
+ 4,
+ 4,
+ 4,
+ 8,
+ 0,
+ 4,
+ 4,
+ 4,
6,
+ 8,
6,
- 4,
+ 3,
+ 1,
+ 3,
+ 8,
6,
- 4,
6,
+ 1,
6,
- 7,
- 4,
- 7,
- 0,
- 10,
- 0,
- 0,
- 7,
5,
- 7,
- 7,
+ 6,
9,
- 11,
8,
- 4,
- 3,
- 7,
- 9,
- 3,
+ 10,
+ 6,
+ 5,
+ 5,
9,
- 1,
2,
- 1,
- 1,
2,
- 1,
- 1,
2,
- 9,
- 1,
2,
+ 11,
+ 2,
+ 11,
+ 7,
+ 5,
+ 2,
+ 11,
+ 11,
2,
- 1,
2,
- 3,
1,
+ 7,
+ 5,
9,
- 8,
- 3
+ 9
],
"coloraxis": "coloraxis",
"symbol": "circle"
@@ -4751,113 +4861,113 @@
"orientation": "v",
"showlegend": false,
"x": [
- -1.2315573692321777,
- -1.3960601091384888,
- 0.16279497742652893,
- 0.336090087890625,
- -0.2639041244983673,
- -2.56018328666687,
- -2.9185597896575928,
- -2.144641160964966,
- 2.4411609172821045,
- -3.4252243041992188,
- 2.3289527893066406,
- -2.9055278301239014,
- -2.4776194095611572,
- -1.5479029417037964,
- 2.1699094772338867,
- -2.1339659690856934,
- 1.3242402076721191,
- -0.5383439064025879,
- 1.4190571308135986,
- 2.1065027713775635,
- -1.4585567712783813,
- -1.735015869140625,
- -1.8539036512374878,
- -1.390334129333496,
- 0.2982367277145386,
- 0.926008939743042,
- -1.4070314168930054,
- 2.5462491512298584,
- -0.8543004989624023,
- -2.2616286277770996,
- 1.3494809865951538,
- -0.6881126165390015,
- 0.34734776616096497,
- -0.11908050626516342,
- -3.8797318935394287,
- -0.22775070369243622,
- -0.06335285305976868,
- -2.9259536266326904,
- -0.38409364223480225,
- -0.5979487895965576,
- -3.3752193450927734,
- 0.15268082916736603,
- 0.294850617647171,
- -3.1962966918945312,
- -3.836763858795166,
- -0.109186090528965,
- -3.3986334800720215,
- -0.7591543197631836,
- 0.08581861108541489,
- 1.2354620695114136,
- -1.6200463771820068,
- -0.33383724093437195
+ -0.93117356300354,
+ -0.5845441818237305,
+ -2.7235045433044434,
+ -2.6017391681671143,
+ -3.0384340286254883,
+ 0.5333382487297058,
+ 0.25806382298469543,
+ -0.09599914401769638,
+ -3.0942459106445312,
+ -3.3379340171813965,
+ 0.9325237274169922,
+ 0.3388823866844177,
+ 0.5669587254524231,
+ 1.4670075178146362,
+ -2.5555593967437744,
+ 1.04537832736969,
+ -1.2160331010818481,
+ -0.6075345277786255,
+ -1.341273546218872,
+ -2.3048923015594482,
+ 2.080313205718994,
+ 1.1715267896652222,
+ -1.4227235317230225,
+ 1.3285086154937744,
+ -2.558462142944336,
+ 1.5313986539840698,
+ -2.51103138923645,
+ -2.766148328781128,
+ -0.9191096425056458,
+ 0.6287733912467957,
+ -1.8047654628753662,
+ -2.1264681816101074,
+ -2.2419049739837646,
+ -0.694290280342102,
+ -0.5716606974601746,
+ -1.0548096895217896,
+ -0.6587052941322327,
+ 1.2156471014022827,
+ -1.102487564086914,
+ 0.6747475862503052,
+ 1.8054462671279907,
+ -2.824810743331909,
+ -1.412682294845581,
+ 1.2583482265472412,
+ 0.967156708240509,
+ -0.9760659337043762,
+ -0.40136075019836426,
+ -0.5862438678741455,
+ 1.6448854207992554,
+ -1.9822865724563599,
+ -2.2009506225585938,
+ -2.8945517539978027
],
"xaxis": "x",
"y": [
- -1.4533166885375977,
- -1.858666181564331,
- -2.41731858253479,
- -2.5061285495758057,
- -3.7297160625457764,
- -2.083111047744751,
- -2.4953696727752686,
- -2.7385475635528564,
- -1.27792227268219,
- -3.1510751247406006,
- -0.9756304621696472,
- -2.4108920097351074,
- -2.533029556274414,
- 0.7466858625411987,
- -0.2979642152786255,
- 0.020160024985671043,
- 1.2635958194732666,
- 1.3030813932418823,
- 1.169091820716858,
- 0.15421932935714722,
- 0.4831574261188507,
- 1.3783798217773438,
- 0.4652738869190216,
- 0.8725593090057373,
- -2.3842854499816895,
- -0.10721741616725922,
- -1.840975284576416,
- 0.0952586904168129,
- -3.5729644298553467,
- -0.5368195176124573,
- -2.155054807662964,
- -2.8722262382507324,
- -1.8498497009277344,
- -1.0395498275756836,
- -0.585018515586853,
- -0.5340877771377563,
- -1.0597599744796753,
- -0.7209460139274597,
- -0.6978242993354797,
- -0.8990448117256165,
- 0.3901121914386749,
- -2.7778234481811523,
- -1.1740888357162476,
- -0.5221556425094604,
- -0.5215041637420654,
- -0.8624324798583984,
- 0.22354581952095032,
- -3.0666046142578125,
- 0.3743482232093811,
- -2.3861749172210693,
- -1.1583610773086548,
- -3.593682289123535
+ -0.8487504720687866,
+ -0.2875222861766815,
+ 0.2438797652721405,
+ 0.210541769862175,
+ -1.0851330757141113,
+ -1.989612340927124,
+ -1.4605934619903564,
+ -1.680659532546997,
+ 1.9580214023590088,
+ 1.8102328777313232,
+ -2.6760451793670654,
+ -1.6934581995010376,
+ -1.8997161388397217,
+ -0.6463617086410522,
+ 2.353159189224243,
+ -0.13706769049167633,
+ 3.0675768852233887,
+ -2.792996644973755,
+ 3.043461561203003,
+ 2.655846118927002,
+ -0.8199979066848755,
+ -0.6571944952011108,
+ -2.1100311279296875,
+ -0.5589699149131775,
+ 0.2689114212989807,
+ -0.7590398192405701,
+ -1.3654488325119019,
+ 2.7915844917297363,
+ -0.8548046350479126,
+ -0.1063171774148941,
+ 0.03021148219704628,
+ 0.7474803328514099,
+ -0.8430166244506836,
+ 0.9680089354515076,
+ 0.6484963297843933,
+ 0.7129985094070435,
+ 0.7789408564567566,
+ 1.7361503839492798,
+ 0.6301454901695251,
+ 0.9279691576957703,
+ 1.0270980596542358,
+ -0.10064753144979477,
+ 0.8464698791503906,
+ 1.6700187921524048,
+ 1.0946271419525146,
+ 0.8363156914710999,
+ 1.941645622253418,
+ -0.8647369146347046,
+ 1.1233822107315063,
+ -0.17551742494106293,
+ -1.0526247024536133,
+ -1.0547256469726562
],
"yaxis": "y",
"type": "scatter"
@@ -5772,7 +5882,7 @@
"plotlyServerURL": "https://plot.ly"
}
},
- "text/html": ""
+ "text/html": ""
},
"metadata": {},
"output_type": "display_data"
@@ -5785,12 +5895,12 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
- "end_time": "2024-03-19T15:25:51.865742Z",
- "start_time": "2024-03-19T15:25:51.595953Z"
+ "end_time": "2024-03-26T10:17:21.266462Z",
+ "start_time": "2024-03-26T10:17:20.806032Z"
}
},
"id": "49fde730403db101",
- "execution_count": 71
+ "execution_count": 26
},
{
"cell_type": "code",
@@ -5811,29 +5921,32 @@
"data": [
{
"customdata": [
+ [
+ "reg_state1"
+ ],
[
"reg_city1"
],
[
- "reg_state1"
+ "tail_number1"
],
[
"flight1"
],
[
- "tail_number1"
+ "reg_expiration1"
],
[
- "reg_expiration1"
+ "manufacturer1"
],
[
"reg_owner1"
],
[
- "manufacturer1"
+ "model1"
],
[
- "model1"
+ "Multi Engine2"
],
[
"TP mods2"
@@ -5842,14 +5955,11 @@
"Engine Type2"
],
[
- "Multi Engine2"
+ "Model2"
],
[
"Company2"
],
- [
- "Model2"
- ],
[
"make3"
],
@@ -5872,31 +5982,31 @@
"transmission_type4"
],
[
- "car_name4"
+ "body_type4"
],
[
- "body_type4"
+ "car_name4"
],
[
- "model5"
+ "country5"
],
[
- "brand5"
+ "model5"
],
[
"vin5"
],
[
- "country5"
+ "brand5"
],
[
- "state5"
+ "condition5"
],
[
"title_status5"
],
[
- "condition5"
+ "state5"
],
[
"color5"
@@ -5905,31 +6015,31 @@
"Certificate6"
],
[
- "Gross6"
+ "Poster_Link6"
],
[
- "Poster_Link6"
+ "Gross6"
],
[
- "Star26"
+ "Director6"
],
[
- "Overview6"
+ "Star36"
],
[
- "Director6"
+ "Star26"
],
[
- "Star46"
+ "Star16"
],
[
- "Series_Title6"
+ "Overview6"
],
[
- "Star16"
+ "Star46"
],
[
- "Star36"
+ "Series_Title6"
],
[
"Genre6"
@@ -5941,28 +6051,28 @@
"cast7"
],
[
- "title7"
+ "description7"
],
[
- "description7"
+ "title7"
],
[
"director7"
],
[
- "listed_in7"
+ "type7"
],
[
- "duration7"
+ "country7"
],
[
- "type7"
+ "listed_in7"
],
[
"rating7"
],
[
- "country7"
+ "duration7"
],
[
"date_added7"
@@ -5972,58 +6082,58 @@
"legendgroup": "",
"marker": {
"color": [
- 4,
- 4,
1,
1,
- 10,
5,
5,
- 10,
- 9,
- 4,
+ 5,
+ 0,
0,
5,
- 10,
+ 7,
+ 7,
2,
+ 5,
0,
4,
- 8,
- 0,
- 8,
- 0,
+ 7,
+ 10,
2,
+ 10,
2,
+ 7,
4,
- 2,
- 1,
- 0,
4,
- 6,
10,
- 4,
- 0,
10,
- 11,
+ 5,
4,
+ 11,
+ 6,
+ 1,
+ 10,
+ 10,
+ 9,
+ 5,
+ 1,
+ 1,
+ 1,
+ 1,
3,
- 4,
- 4,
- 4,
- 4,
- 4,
- 7,
1,
- 4,
- 4,
+ 1,
+ 8,
+ 5,
+ 1,
3,
- 4,
- 7,
- 10,
- 7,
- 4,
- 4,
- 10
+ 1,
+ 1,
+ 8,
+ 1,
+ 8,
+ 8,
+ 5,
+ 11
],
"coloraxis": "coloraxis",
"symbol": "circle"
@@ -6033,113 +6143,113 @@
"orientation": "v",
"showlegend": false,
"x": [
- 2.232069730758667,
- 2.04703950881958,
- 3.685438871383667,
- 3.7534923553466797,
- 4.548223495483398,
- 2.1641151905059814,
- 1.6716158390045166,
- 3.707670211791992,
- 3.551330089569092,
- 0.7382426261901855,
- 2.2600345611572266,
- 1.6929031610488892,
- 3.5000805854797363,
- 0.8841426968574524,
- 1.9816348552703857,
- 2.538231134414673,
- 0.15006133913993835,
- 2.2033517360687256,
- 0.22405526041984558,
- 2.0226693153381348,
- 1.137275218963623,
- 0.6898834705352783,
- 2.1997299194335938,
- 0.7599173784255981,
- 4.115108489990234,
- 1.3826909065246582,
- 2.0932059288024902,
- 2.393995761871338,
- 5.186481475830078,
- 2.962049961090088,
- 2.391953468322754,
- 4.0034589767456055,
- 4.351011276245117,
- 2.7698190212249756,
- 3.9041121006011963,
- 2.769899368286133,
- 2.7911529541015625,
- 2.699028968811035,
- 2.755099058151245,
- 2.786696195602417,
- 3.2102408409118652,
- 3.760199546813965,
- 3.3576271533966064,
- 2.758638858795166,
- 3.8999764919281006,
- 2.8488729000091553,
- 3.1844098567962646,
- 4.427828311920166,
- 2.6792852878570557,
- 2.7276439666748047,
- 1.7881308794021606,
- 5.050660133361816
+ 0.07464970648288727,
+ 0.3129180669784546,
+ 1.8015512228012085,
+ 1.7333874702453613,
+ 2.3830831050872803,
+ 2.6272332668304443,
+ 1.9825297594070435,
+ 1.9731072187423706,
+ 0.8772491812705994,
+ -0.4399683475494385,
+ -1.1831083297729492,
+ 1.889535665512085,
+ 2.636423349380493,
+ -0.574123203754425,
+ 0.37415051460266113,
+ 0.6518558263778687,
+ -1.5904536247253418,
+ 0.34899643063545227,
+ -1.5395406484603882,
+ 0.3759491741657257,
+ -0.7246766686439514,
+ -0.26768624782562256,
+ -0.5958290100097656,
+ 0.3429754078388214,
+ 2.285098075866699,
+ -0.6929587721824646,
+ 3.0599184036254883,
+ 1.3004878759384155,
+ 0.12265949696302414,
+ 0.9976135492324829,
+ 0.4892101585865021,
+ 2.1270360946655273,
+ 2.1105659008026123,
+ 0.9012678861618042,
+ 0.8896762132644653,
+ 0.8739141821861267,
+ 0.8704595565795898,
+ 1.6939094066619873,
+ 0.8959806561470032,
+ 0.7778205275535583,
+ 1.0063170194625854,
+ 1.8169519901275635,
+ 1.3267322778701782,
+ 1.6878050565719604,
+ 0.8405084013938904,
+ 0.9591824412345886,
+ 0.4891231656074524,
+ -0.14022839069366455,
+ 1.0082389116287231,
+ 0.5928480625152588,
+ 2.39751935005188,
+ -0.8518069982528687
],
"xaxis": "x",
"y": [
- 2.690234661102295,
- 2.5165600776672363,
- 3.282524585723877,
- 3.40256667137146,
- 3.580906867980957,
- 1.5391180515289307,
- 0.852472722530365,
- 2.5674898624420166,
- 4.886576175689697,
- 3.5124340057373047,
- 5.190670490264893,
- 0.8154540657997131,
- 2.2167246341705322,
- 2.026197671890259,
- 4.476515769958496,
- 3.4282286167144775,
- 3.0664360523223877,
- 4.188016414642334,
- 3.111924171447754,
- 4.605981349945068,
- 1.835215449333191,
- 1.967597246170044,
- 3.4694032669067383,
- 2.036349058151245,
- 2.589768886566162,
- 3.527857542037964,
- 2.493844509124756,
- 5.637296199798584,
- 3.052475690841675,
- 4.274573802947998,
- 4.028963088989258,
- 3.023559808731079,
- 1.2799495458602905,
- 3.017526388168335,
- -0.001000845804810524,
- 2.961000680923462,
- 3.0660297870635986,
- 2.639333486557007,
- 2.9294049739837646,
- 3.0432934761047363,
- 0.8248129487037659,
- 3.3993475437164307,
- 1.8885663747787476,
- 2.466247081756592,
- 0.011215770617127419,
- 3.013014793395996,
- 0.6786802411079407,
- 3.1485755443573,
- 0.8549591302871704,
- 1.605120062828064,
- 2.9543240070343018,
- 2.55021071434021
+ 1.2225661277770996,
+ 1.3324196338653564,
+ 2.021209239959717,
+ 1.9221495389938354,
+ 2.5465946197509766,
+ 0.16001343727111816,
+ 0.39939454197883606,
+ 1.2751306295394897,
+ 3.6996567249298096,
+ 3.5395219326019287,
+ 1.2964847087860107,
+ 0.9139719009399414,
+ 0.1269349604845047,
+ 0.08850818872451782,
+ 3.0580782890319824,
+ 1.9371856451034546,
+ 0.7239078879356384,
+ 2.692063570022583,
+ 0.7861824631690979,
+ 3.1869115829467773,
+ -0.04748240113258362,
+ 0.01011333242058754,
+ 1.9134701490402222,
+ 1.9545291662216187,
+ 1.5034958124160767,
+ 0.05370209366083145,
+ 2.3742215633392334,
+ 4.004917621612549,
+ 1.2006990909576416,
+ 2.763378858566284,
+ 2.5211269855499268,
+ -0.5236079692840576,
+ 1.8120489120483398,
+ 1.3816590309143066,
+ 1.5251518487930298,
+ 1.502103328704834,
+ 1.396881341934204,
+ -1.523393988609314,
+ 1.5436475276947021,
+ 1.1724165678024292,
+ -0.5919604301452637,
+ 1.9960685968399048,
+ 0.46192702651023865,
+ -1.5119831562042236,
+ 1.0042810440063477,
+ 1.4661548137664795,
+ -0.528108537197113,
+ 1.517184853553772,
+ -0.7407345771789551,
+ 0.20635902881622314,
+ 2.198230028152466,
+ 2.5028865337371826
],
"yaxis": "y",
"type": "scatter"
@@ -7054,7 +7164,7 @@
"plotlyServerURL": "https://plot.ly"
}
},
- "text/html": ""
+ "text/html": ""
},
"metadata": {},
"output_type": "display_data"
@@ -7067,35 +7177,42 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
- "end_time": "2024-03-19T15:25:52.147365Z",
- "start_time": "2024-03-19T15:25:51.868966Z"
+ "end_time": "2024-03-26T10:17:21.637111Z",
+ "start_time": "2024-03-26T10:17:21.267536Z"
}
},
"id": "af662f30dc4f1479",
- "execution_count": 72
+ "execution_count": 27
},
{
"cell_type": "code",
"outputs": [],
- "source": [],
+ "source": [
+ " "
+ ],
"metadata": {
"collapsed": false,
"ExecuteTime": {
- "end_time": "2024-03-19T15:25:52.151973Z",
- "start_time": "2024-03-19T15:25:52.149061Z"
+ "end_time": "2024-03-26T10:17:21.642099Z",
+ "start_time": "2024-03-26T10:17:21.638772Z"
}
},
"id": "f56ac97595bdb926",
- "execution_count": 72
+ "execution_count": 27
},
{
"cell_type": "code",
"outputs": [],
"source": [],
"metadata": {
- "collapsed": false
+ "collapsed": false,
+ "ExecuteTime": {
+ "end_time": "2024-03-26T10:17:21.646460Z",
+ "start_time": "2024-03-26T10:17:21.643260Z"
+ }
},
- "id": "5549cbcc0f37bc4f"
+ "id": "5549cbcc0f37bc4f",
+ "execution_count": 27
}
],
"metadata": {
diff --git a/main.py b/main.py
index ea74ef4..2748652 100644
--- a/main.py
+++ b/main.py
@@ -1,20 +1,7 @@
-# This is a sample Python script.
-# Press Shift+F10 to execute it or replace it with your code.
-# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
-import functions as f
+def compare_datasets(path1, path2):
+ ...
-def print_hi(name):
- database, names = f.load__csv_files_from_folder("data")
- data0 = f.DataFrameMetadataCreator(database[0])
- columns = data0.get_numerical_columns()
- print(data0.column_type)
- data0.create_column_embeddings()
- embediings = data0.column_embeddings
-
-# Press the green button in the gutter to run the script.
if __name__ == '__main__':
- print_hi('PyCharm')
-
-# See PyCharm help at https://www.jetbrains.com/help/pycharm/
+ compare_datasets('data/netflix_titles.csv', 'data/imdb_top_1000.csv')
diff --git a/requirements.txt b/requirements.txt
index 322883e..a3ffea0 100644
Binary files a/requirements.txt and b/requirements.txt differ
diff --git a/similarity/ComparatorByColumn.py b/similarity/ComparatorByColumn.py
index 401a9eb..bec002d 100644
--- a/similarity/ComparatorByColumn.py
+++ b/similarity/ComparatorByColumn.py
@@ -9,7 +9,6 @@
from similarity.DataFrameMetadata import DataFrameMetadata, KindMetadata, CategoricalMetadata
from similarity.Types import DataKind
-
class ComparatorType(ABC):
def __init__(self, weight=1):
self.weight = weight
diff --git a/similarity/DataFrameMetadata.py b/similarity/DataFrameMetadata.py
index 2961bf0..3840f82 100644
--- a/similarity/DataFrameMetadata.py
+++ b/similarity/DataFrameMetadata.py
@@ -52,6 +52,10 @@ def __init__(self, value: Optional[tuple], distribution: Optional[tuple[int, ...
self.nulls = null_values
self.ratio_max_length = ratio_max_length
+ def __str__(self):
+ return f"KindMetadata(value={self.value}, distribution={self.distribution}, longest={self.longest}, shortest={self.shortest}, null_values={self.nulls}, ratio_max_length={self.ratio_max_length})"
+
+
class NonnumericalMetadata:
"""
@@ -69,6 +73,8 @@ def __init__(self, longest: str, shortest: str, avg_length: int):
# todo bigrams trigrams ?
# todo embeddings ?? nebo mame pro cele sloupce ?
+ def __str__(self):
+ return f"NonnumericalMetadata(longest={self.longest}, shortest={self.shortest}, avg_length={self.avg_length})"
class NumericalMetadata:
"""
@@ -86,6 +92,9 @@ def __init__(self, min_value: float | int, max_value: float | int, same_value_le
self.same_value_length = same_value_length
# todo distribution !!!!!!
+ def __str__(self):
+ return f"NumericalMetadata(min_value={self.min_value}, max_value={self.max_value}, range_size={self.range_size}, same_value_length={self.same_value_length})"
+
class DataFrameMetadata:
def __init__(self):
@@ -117,6 +126,11 @@ def get_column_type(self, name):
if name in columns:
return column_type
+ def get_column_kind(self, name):
+ for column_kind, columns in self.column_kind.items():
+ if name in columns:
+ return column_kind
+
def get_column_names_by_type(self, *types):
if NONNUMERICAL in types:
types = list(types)
diff --git a/similarity/Types.py b/similarity/Types.py
index 8572983..b16d1af 100644
--- a/similarity/Types.py
+++ b/similarity/Types.py
@@ -1,3 +1,6 @@
+"""
+This file contains the column type classes and the functions that detect column types and data kinds.
+"""
import re
from enum import Enum
from typing import Any
@@ -65,6 +68,11 @@ def is_constant(column: pd.Series) -> bool:
def series_to_numeric(x: pd.Series):
+ """
+ Apply to_numeric on pd.Series
+ :param x:
+ :return: numeric series
+ """
try:
to_numeric = x.apply(lambda s: pd.to_numeric(s.replace(',', '.'), errors='coerce'))
except AttributeError:
@@ -76,13 +84,15 @@ def is_numerical(x: pd.Series) -> bool:
"""
Decide if column type is numerical.
- Column is numerical if it could be transferred into numeric, and it is float or int, and it is not full nulls
+ Column is numerical if it could be transferred into numeric,
+ and it is float or int, and it is not full nulls
:param x: the type
:return: true if it is numerical, otherwise false
"""
- x.isnull().values.sum() / x.size
- return x.any() and (x.dtype == np.float64 or x.dtype == np.int64) and not (x.isnull().values.sum() / x.size > 0.9)
+ return (x.any() and
+ (x.dtype in (np.float64, np.int64)) and not
+ (x.isnull().values.sum() / x.size > 0.9))
def is_int(x: pd.Series) -> bool:
@@ -101,7 +111,8 @@ def is_human_gen(x: pd.Series) -> bool:
"""
Decide if float number is human generated
- Float is human generated if number of numbers after decimal point is smaller than computer_generated_threshold
+ Float is human generated if number of numbers after decimal
+ point is smaller than computer_generated_threshold
:param x: the series for decide
:return: true if it is human generated, otherwise false
@@ -113,9 +124,9 @@ def floating_length_gt(num: Any, gt: int):
:param num: to decide
:param gt: threshold
"""
- splitted = str(num).split(".")
- if len(splitted) > 1:
- return len(splitted[1]) > gt
+ split = str(num).split(".")
+ if len(split) > 1:
+ return len(split[1]) > gt
return False
return x.apply(lambda s: not floating_length_gt(s, TypeSettings.computer_generated_threshold)).all()
@@ -124,7 +135,7 @@ def floating_length_gt(num: Any, gt: int):
def is_not_numerical(x: pd.Series) -> bool:
"""
Decide if type is not numerical
- The column is not numerical if it is not numerical and it could be transfer to string
+ The column is not numerical if it is not numerical, and it could be transferred to string
:param x: the series for decide
:return: false if it is numerical, otherwise true
@@ -217,15 +228,17 @@ def is_str_phrase(word: str):
def is_sentence(x: pd.Series) -> bool:
"""
- The string is sentence if it starts with uperCasse letter and end with interpunction
+ The string is a sentence if it starts with an uppercase letter and ends with a full stop, exclamation mark or question mark.
:param x: series for decide
:return: true for sentence
"""
def is_str_sentence(word: str):
- return ((word.endswith(".") or word.endswith("!") or word.endswith("?")) and word.count(".") <= 1
- and word.count("!") <= 1 and word.count("?") <= 1) and re.search("^[A-Z]", word)
+ return (((word.endswith(".") or word.endswith("!")
+ or word.endswith("?")) and word.count(".") <= 1
+ and word.count("!") <= 1 and word.count("?") <= 1)
+ and re.search("^[A-Z]", word))
return x.apply(lambda s: is_str_sentence(s)).all()
@@ -252,14 +265,17 @@ def is_str_multiple(word: str):
regex = re.compile('[a-zA-Z0-9]')
word_clean = regex.sub('', word)
res = "".join(dict.fromkeys(word_clean))
- return (word.count(res) == word_clean.count(res) and word_clean.count(res) > 0) or res == '' and res != ' '
+ return ((word.count(res) == word_clean.count(res)
+ and word_clean.count(res) > 0)
+ or res == '' and res != ' ')
return x.apply(lambda s: is_str_multiple(s)).all()
def is_true_multiple(x: pd.Series) -> bool:
"""
- The string is true multiple if the record is split by some sequence and the sequence is the same for all rows
+ The string is true multiple if the record is split
+ by some sequence and the sequence is the same for all rows
:param x: series for decide
:return: true for multiple
@@ -272,7 +288,7 @@ def is_str_multiple(word: str):
return res
result_arr = x.apply(lambda s: is_str_multiple(s)).values
- to_return = [i for i in result_arr if (i != '')]
+ to_return = [i for i in result_arr if i != '']
if len(to_return) == 0:
return True
if to_return[0].replace(" ", "") == '':
@@ -293,18 +309,20 @@ def is_str_date(word: str):
try:
parse(str(word), fuzzy_with_tokens=True) # todo add timezone
return True
- except ParserError:
+ except (ParserError, OverflowError) as e:
element = str(word).strip()
- one_or_two = '(\d{1}|\d{2})'
- two_or_four = '(\d{2}|\d{4})'
- months = '(January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)'
- pattern = r'^(\d{1}|\d{2}|\d{4}),(\d{1}|\d{2}) ' + months # + '$' # 1999,4 Feb 1999,4 February
- pattern = pattern + '|' + r'^' + one_or_two + '\. ' + one_or_two + '\. ' + two_or_four # 11. 4. 1999
- pattern = pattern + '|' + r'^(\d{1}|\d{2}|\d{4}),(\d{1}|\d{2})' + months # 1999,4February 1999,4Feb
- if re.match(pattern, element):
- return True
- else:
- return False
+ one_or_two = r'(\d{1}|\d{2})'
+ two_or_four = r'(\d{2}|\d{4})'
+ months = ('(January|February|March|April|May|June|July|August|'
+ 'September|October|November|December|Jan|Feb'
+ '|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)')
+ # + '$' # 1999,4 Feb 1999,4 February
+ pattern = r'^(\d{1}|\d{2}|\d{4}),(\d{1}|\d{2}) ' + months
+ # 11. 4. 1999
+ pattern = pattern + '|' + r'^' + one_or_two + r'\. ' + one_or_two + r'\. ' + two_or_four
+ # 1999,4February 1999,4Feb
+ pattern = pattern + '|' + r'^(\d{1}|\d{2}|\d{4}),(\d{1}|\d{2})' + months
+ return bool(re.match(pattern, element))
return column.apply(lambda s: is_str_date(s)).all()
@@ -324,11 +342,13 @@ def get_data_kind(column: pd.Series) -> "DataKind":
return DataKind.CONSTANT
if is_categorical(column):
return DataKind.CATEGORICAL
- else:
- return DataKind.UNDEFINED
+ return DataKind.UNDEFINED
class DataKind(Enum):
+ """
+ Represents all data kinds.
+ """
BOOL = "bool"
ID = "id"
CONSTANT = "constant"
@@ -349,8 +369,7 @@ def get_basic_type(column: pd.Series) -> Any:
return DATE
if is_not_numerical(column):
return NONNUMERICAL
- else:
- return UNDEFINED
+ return UNDEFINED
def get_advanced_type(column: pd.Series) -> Any:
@@ -364,19 +383,18 @@ def get_advanced_type(column: pd.Series) -> Any:
if is_numerical(column_num):
if is_int(column_num):
return INT
- else:
- return FLOAT
+ return FLOAT
if is_date(column): # todo what about year?
return DATE
if is_not_numerical(column):
return NONNUMERICAL
- else:
- return UNDEFINED
+ return UNDEFINED
def get_advanced_structural_type(column: pd.Series) -> Any:
"""
- Indicates type of column int, float - human, computer, date, text - word, sentence, phrase article, multiple
+ Indicates type of column int, float - human, computer, date,
+ text - word, sentence, phrase article, multiple
:param column: to indicate
:return: detected type
@@ -385,10 +403,9 @@ def get_advanced_structural_type(column: pd.Series) -> Any:
if is_numerical(column_num):
if is_int(column_num):
return INT
- else:
- if is_human_gen(column_num):
- return HUMAN_GENERATED
- return COMPUTER_GENERATED
+ if is_human_gen(column_num):
+ return HUMAN_GENERATED
+ return COMPUTER_GENERATED
if is_date(column):
return DATE
if is_not_numerical(column):
@@ -408,127 +425,145 @@ def get_advanced_structural_type(column: pd.Series) -> Any:
if is_article(column):
return ARTICLE
return NONNUMERICAL
- else:
- return UNDEFINED
-
-
-# class _Float(Enum):
-# HUMAN_GENERATED = "human_generated"
-# COMPUTER_GENERATED = "computer_generated"
-#
-#
-# class _Numerical(Enum):
-# FLOAT = _Float
-# INT = "int"
-#
-#
-# class _Word(Enum):
-# ALPHABETIC = "alphabetic"
-# ALPHANUMERIC = "alphanumeric"
-# ALL = "all"
-#
-#
-# class _Text(Enum):
-# WORD = _Word
-# SENTENCE = "sentence"
-# PHRASE = "phrase" # max 4 words without punctuation
-# ARTICLE = "article"
-# MULTIPLE_VALUES = "multiple" # genres name : "Action, Adventure, Drama"
-#
-#
-# class _NonNumerical(Enum):
-# TEXT = _Text
-#
-#
-# class Types(Enum):
-# """
-# Enum class representing column type
-# """
-# NUMERICAL = _Numerical
-# NONNUMERICAL = _NonNumerical
-# DATE = "date"
-# UNDEFINED = "undefined"
+ return UNDEFINED
class Type:
+ """
+ Base class for type
+ """
def __init__(self, value):
- self.value = value ## todo add values ?
+ self.value = value
+
+ def __str__(self):
+ return ""
class DATE(Type):
- pass
+ """
+ Represents type date.
+ """
+ def __str__(self):
+ return "DATE"
class UNDEFINED(Type):
- pass
+ """
+ Represents the undefined type.
+ """
+ def __str__(self):
+ return "UNDEFINED"
class NUMERICAL(Type):
- pass
+ """
+ Represents numerical types.
+ """
+ def __str__(self):
+ return "NUMERICAL"
class INT(NUMERICAL):
- pass
+ """
+ Represents INT type.
+ """
+ def __str__(self):
+ return "INT"
class FLOAT(NUMERICAL):
- pass
+ """
+ Represents FLOAT type.
+ """
+ def __str__(self):
+ return "FLOAT"
class HUMAN_GENERATED(FLOAT):
- pass
+ """
+ Represents float, which is probably generated by human.
+ Number of numbers after floating point is small or rounded.
+ """
+ def __str__(self):
+ return "HUMAN_GENERATED"
class COMPUTER_GENERATED(FLOAT):
- pass
+ """
+ Represents float, which is probably generated by computer.
+ Number of numbers after floating point is bigger or not rounded.
+ """
+ def __str__(self):
+ return "COMPUTER_GENERATED"
class NONNUMERICAL(Type):
- pass
+ """
+ Subclass for nonnumerical types
+ """
+ def __str__(self):
+ return "NONNUMERICAL"
class WORD(NONNUMERICAL):
- pass
+ """
+ Word is string without spaces.
+ """
+ def __str__(self):
+ return "WORD"
class ALPHABETIC(WORD):
- pass
+ """
+ This type is WORD, it contains only letters (a-z)
+ """
+ def __str__(self):
+ return "ALPHABETIC"
class ALPHANUMERIC(WORD):
- pass
+ """
+ This type is WORD, it contains only letters (a-z) and numbers (0-9)
+ """
+ def __str__(self):
+ return "ALPHANUMERIC"
class ALL(WORD):
- pass
+ """
+ This type is WORD, it could contain all characters.
+ """
+ def __str__(self):
+ return "ALL"
class SENTENCE(NONNUMERICAL):
- pass
+ """
+ A sentence is a string that ends with a full stop, exclamation mark or question mark. It can contain spaces.
+ """
+ def __str__(self):
+ return "SENTENCE"
class ARTICLE(NONNUMERICAL):
- pass
+ """
+ An article is a string composed of several sentences.
+ """
+ def __str__(self):
+ return "ARTICLE"
class PHRASE(NONNUMERICAL):
- pass
+ """
+ A phrase is a string with spaces that is not a sentence.
+ """
+ def __str__(self):
+ return "PHRASE"
class MULTIPLE_VALUES(NONNUMERICAL):
- pass
-
-
-# def get_super_type(type_: Types) -> Types:
-# if (type_ == Types.NUMERICAL or type_ == Types.NUMERICAL.value.FLOAT or type_ == Types.NUMERICAL.value.INT or
-# type_ == Types.NUMERICAL.value.FLOAT.value.HUMAN_GENERATED or
-# type_ == Types.NUMERICAL.value.FLOAT.value.COMPUTER_GENERATED):
-# return Types.NUMERICAL
-# if type_ == Types.DATE:
-# return Types.DATE
-# if type_ == Types.UNDEFINED:
-# return Types.UNDEFINED
-# return Types.NONNUMERICAL
-#
-#
-# x = Types.NUMERICAL.value
+ """
+ MULTIPLE_VALUES is a string that contains a repeated pattern, e.g. values split by the same separator (Name1|Name2|Name3|Name4).
+ """
+ def __str__(self):
+ return "MULTIPLE_VALUES"