diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 82b5950..bd2b1d1 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -82,30 +82,7 @@ jobs: uses: github/codeql-action/analyze@v3 with: category: "/language:${{matrix.language}}" - python-tests: - name: Run Python Tests - runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: '3.x' - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pytest - pip install -r requirements.txt - - - name: Run tests - run: | - pytest tests/types_test.py - pytest tests/metadata_test.py - pytest tests/comparator_test.py - pytest tests/column2Vec_test.py + diff --git a/.github/workflows/column2Vec_test.yml b/.github/workflows/column2Vec_test.yml new file mode 100644 index 0000000..7aede4b --- /dev/null +++ b/.github/workflows/column2Vec_test.yml @@ -0,0 +1,31 @@ +name: "Column2Vec tests" + +on: + schedule: + - cron: '0 10 * * 6' + workflow_dispatch: + + +jobs: + python-tests: + name: Run Tests for Column2Vec + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install dependencies + run: | + pip install pytest + pip install -r requirements.txt + + - name: Run tests + run: | + pytest test/test_column2Vec.py + diff --git a/.github/workflows/py_test.yml b/.github/workflows/py_test.yml new file mode 100644 index 0000000..e8da178 --- /dev/null +++ b/.github/workflows/py_test.yml @@ -0,0 +1,70 @@ +name: "Static analysis & tests" + +on: + push: + branches: [ "master" ] + pull_request: + +jobs: + analysis: + runs-on: ubuntu-latest + name: Pylint Analysis + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install dependencies + run: | + pip install -r requirements.txt + pip install pylint + + - name: Analysing the code with pylint + run: | + pylint \ + --fail-under=6.0 \ + --ignore-patterns=test_.*?py \ + --max-line-length=180 \ + $(git ls-files '*.py') + + python-tests: + env: + TEST_FILES: test/test_types.py test/test_metadata.py test/test_comparator.py + name: Run Python Tests + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install dependencies + run: | + pip install -r requirements.txt + pip install coverage pytest + + - name: Run tests + run: coverage run --source='similarity,column2Vec' -m pytest $TEST_FILES + + - name: Show coverage + run: coverage report -m + + - name: Create coverage file + run: coverage xml + + - name: Get Cover + uses: orgoro/coverage@v3.1 + with: + coverageFile: coverage.xml + token: ${{ secrets.GITHUB_TOKEN }} + thresholdAll: 0.7 + thresholdNew: 0.9 diff --git a/categorical.ipynb b/categorical.ipynb index db0bd7c..bf79b59 100644 --- a/categorical.ipynb +++ b/categorical.ipynb @@ -21,7 +21,7 @@ "source": [ "import pandas as pd\n", "\n", - "import functions as f\n", + "import similarity.functions as f\n", "import time\n", "from comparing import ComparatorForDatasets\n", "from comparing import CategoricalSimilarity\n", diff --git a/column2Vec/Column2Vec.py b/column2Vec/Column2Vec.py index 7dd305f..9709c67 100644 --- a/column2Vec/Column2Vec.py +++ b/column2Vec/Column2Vec.py @@ -1,3 +1,6 @@ +""" +This file contains column2Vec implementations. +""" import re import numpy as np @@ -9,7 +12,8 @@ def column2vec_as_sentence(column: pd.Series, model: SentenceTransformer): """ Convert a column to a vector - Make one string from all the items in the column. Convert string to a vector by sentence transformer. + Make one string from all the items in the column. + Convert string to a vector by sentence transformer. """ sentence = [str(column.tolist()).replace("\'", "").replace("]", "").replace("[", "")] return model.encode(sentence)[0] @@ -19,18 +23,22 @@ def column2vec_as_sentence_clean(column: pd.Series, model: SentenceTransformer): """ Convert a column to a vector - Make one string from all the items in the column, clean the column that it will contain only a-z and 0-9. + Make one string from all the items in the column, clean the column that + it will contain only a-z and 0-9. Convert string to a vector by sentence transformer. """ column_as_str = str(column.tolist()).lower() sentence = [re.sub("[^(0-9 |a-z)]", " ", column_as_str)] return model.encode(sentence)[0] + def column2vec_as_sentence_clean_uniq(column: pd.Series, model: SentenceTransformer): """ Convert a column to a vector - Make one string from all the items in the column, clean the column that it will contain only a-z and 0-9, it will contains only uniq values. + Make one string from all the items in the column, + clean the column that it will contain only a-z and + 0-9, it will contain only uniq values. Convert string to a vector by sentence transformer. """ uniq_column = column.unique() @@ -38,6 +46,7 @@ def column2vec_as_sentence_clean_uniq(column: pd.Series, model: SentenceTransfor sentence = [re.sub("[^(0-9 |a-z)]", " ", column_as_str)] return model.encode(sentence)[0] + def column2vec_avg(column: pd.Series, model: SentenceTransformer): """ Convert a column to a vector @@ -45,22 +54,54 @@ def column2vec_avg(column: pd.Series, model: SentenceTransformer): Convert each item in the column to a vector and return the average of all the vectors """ uniq_column = column.unique() - column_clean = pd.Series(uniq_column).apply(lambda x: re.sub("[^(0-9 |a-z)]", " ", str(x).lower())).values + column_clean = pd.Series(uniq_column).apply(lambda x: re.sub("[^(0-9 |a-z)]", + " ", str(x).lower())).values encoded_columns = model.encode(column_clean) - to_ret = np.mean(encoded_columns, axis=0) # counts arithmetic mean (average) + to_ret = np.mean(encoded_columns, axis=0) # counts arithmetic mean (average) return to_ret + def column2vec_weighted_avg(column: pd.Series, model: SentenceTransformer): """ - todo tests it does what it should Convert a column to a vector Convert each item in the column to a vector and return the weighted average of all the vectors """ uniq_column = column.value_counts(normalize=True) weights = uniq_column.values - column_clean = pd.Series(uniq_column.keys()).apply(lambda x: re.sub("[^(0-9 |a-z)]", " ", str(x).lower())).values + column_clean = pd.Series(uniq_column.keys()).apply(lambda x: re.sub("[^(0-9 |a-z)]", + " ", str(x).lower())).values + encoded_columns = model.encode(column_clean) + to_ret = np.average(encoded_columns, axis=0, weights=weights) # counts weighted average + return to_ret + + +def column2vec_sum(column: pd.Series, model: SentenceTransformer): + """ + Convert a column to a vector + + Convert each item in the column to a vector and return the average of all the vectors + """ + uniq_column = column.unique() + column_clean = pd.Series(uniq_column).apply(lambda x: re.sub("[^(0-9 |a-z)]", + " ", str(x).lower())).values encoded_columns = model.encode(column_clean) - to_ret = np.average(encoded_columns, axis=0, weights=weights) # counts weighted average + to_ret = sum(encoded_columns) # sum of values return to_ret + +def column2vec_weighted_sum(column: pd.Series, model: SentenceTransformer): + """ + Convert a column to a vector + + Convert each item in the column to a vector and return the weighted average of all the vectors + """ + uniq_column = column.value_counts(normalize=True) + weights = uniq_column.values + column_clean = pd.Series(uniq_column.keys()).apply(lambda x: re.sub("[^(0-9 |a-z)]", + " ", str(x).lower())).values + encoded_columns = model.encode(column_clean) + to_ret = 0 + for number, weight in zip(encoded_columns, weights): + to_ret += number * weight + return to_ret diff --git a/column2Vec/functions.py b/column2Vec/functions.py index e69de29..7ef71dc 100644 --- a/column2Vec/functions.py +++ b/column2Vec/functions.py @@ -0,0 +1,74 @@ +""" +Functions usefull for column2Vec. +""" +from typing import Any + +import numpy as np +import pandas as pd +from sklearn.cluster import KMeans + +from similarity.Comparator import cosine_sim +from similarity.DataFrameMetadataCreator import DataFrameMetadataCreator +from similarity.Types import NONNUMERICAL + + +def get_data(files: list[str]) -> dict[str, Any]: + """ + Reads all csv files (which name is in files). Creates metadata for them. + Save only nonnumerical columns into dictionary. Key is name of column. + Value is column. + :param files: list names of csv files + :return: dictionary of all tables. + """ + result = {} + index = 0 + for i in files: + index += 1 + data = pd.read_csv(i) + metadata_creator = (DataFrameMetadataCreator(data). + compute_advanced_structural_types(). + compute_column_kind()) + metadata1 = metadata_creator.get_metadata() + column_names = metadata1.get_column_names_by_type(NONNUMERICAL) + for name in column_names: + print(f" {i} : {name}") + result[name + str(index)] = data[name] + return result + + +def get_clusters(vectors_to_cluster: pd.DataFrame, n_clusters: int) -> list[list[str]]: + """ + Creates clusters by KMeans for given vectors. + + :param vectors_to_cluster: embeddings for columns + :param n_clusters: number of clusters we want + :return: List, for each cluster number it contains list of column names + """ + kmeans = KMeans(n_clusters=n_clusters, random_state=0) # Change n_clusters as needed + list_of_vectors = np.array(list(vectors_to_cluster.values())) + kmeans.fit(list_of_vectors) + + clusters = [[]] * n_clusters + for i in range(n_clusters): + names = [] + for cluster, name in zip(kmeans.labels_, vectors_to_cluster.keys()): + if cluster == i: + names.append(name) + clusters[i] = names + + return clusters + + +def compute_distances(vectors: dict): + """ + Compute distance for each pair of vectors. + + :param vectors: dictionary of embedding vectors + :return: matrix with distances + """ + res = {} + for key1, vec1 in vectors.items(): + res[key1] = {} + for key2, vec2 in vectors.items(): + res[key1][key2] = 1 - cosine_sim(vec1, vec2) + return res diff --git a/column2Vec/generated/Clusters_Average_column2vec_clusters.html b/column2Vec/generated/Clusters_Average_column2vec_clusters.html new file mode 100644 index 0000000..bdbd7e6 --- /dev/null +++ b/column2Vec/generated/Clusters_Average_column2vec_clusters.html @@ -0,0 +1,14 @@ + + + +
+
+ + \ No newline at end of file diff --git a/column2Vec/generated/Clusters_Clean_uniq_sentence_column2vec_vectors.html b/column2Vec/generated/Clusters_Clean_uniq_sentence_column2vec_vectors.html new file mode 100644 index 0000000..060a8b0 --- /dev/null +++ b/column2Vec/generated/Clusters_Clean_uniq_sentence_column2vec_vectors.html @@ -0,0 +1,14 @@ + + + +
+
+ + \ No newline at end of file diff --git a/column2Vec/generated/Clusters_Cleaned_sentence_column2vec_vectors.html b/column2Vec/generated/Clusters_Cleaned_sentence_column2vec_vectors.html new file mode 100644 index 0000000..0dacf4f --- /dev/null +++ b/column2Vec/generated/Clusters_Cleaned_sentence_column2vec_vectors.html @@ -0,0 +1,14 @@ + + + +
+
+ + \ No newline at end of file diff --git a/column2Vec/playground.ipynb b/column2Vec/playground.ipynb index d345195..f78ee09 100644 --- a/column2Vec/playground.ipynb +++ b/column2Vec/playground.ipynb @@ -7,33 +7,34 @@ "import pandas as pd\n", "from sentence_transformers import SentenceTransformer\n", "from column2Vec.Column2Vec import column2vec_avg\n", - "from similarity.Types import Type, NONNUMERICAL\n", + "from similarity.Types import NONNUMERICAL\n", "from similarity.DataFrameMetadataCreator import DataFrameMetadataCreator\n", "from column2Vec.Column2Vec import column2vec_as_sentence\n", "from column2Vec.Column2Vec import column2vec_as_sentence_clean\n", "from column2Vec.Column2Vec import column2vec_as_sentence_clean_uniq\n", "from column2Vec.Column2Vec import column2vec_weighted_avg\n", - "import time" + "import time\n", + "from column2Vec.functions import get_clusters" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-19T12:24:21.922107Z", - "start_time": "2024-03-19T12:24:17.545278Z" + "end_time": "2024-03-26T09:57:59.420458Z", + "start_time": "2024-03-26T09:57:59.414725Z" } }, "id": "d2f663cd8db4d03b", - "execution_count": 1 + "execution_count": 11 }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 12, "id": "initial_id", "metadata": { "collapsed": true, "ExecuteTime": { - "end_time": "2024-03-19T12:24:21.927591Z", - "start_time": "2024-03-19T12:24:21.923224Z" + "end_time": "2024-03-26T09:57:59.444903Z", + "start_time": "2024-03-26T09:57:59.439511Z" } }, "outputs": [], @@ -96,12 +97,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-19T15:06:00.114656Z", - "start_time": "2024-03-19T15:05:59.282313Z" + "end_time": "2024-03-26T09:58:00.132126Z", + "start_time": "2024-03-26T09:57:59.501655Z" } }, "id": "74ad1f08faa50a70", - "execution_count": 60 + "execution_count": 13 }, { "cell_type": "code", @@ -134,12 +135,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-19T12:24:23.072402Z", - "start_time": "2024-03-19T12:24:22.934435Z" + "end_time": "2024-03-26T09:58:00.137581Z", + "start_time": "2024-03-26T09:58:00.133237Z" } }, "id": "19c03920fae6aab8", - "execution_count": 4 + "execution_count": 14 }, { "cell_type": "code", @@ -148,13 +149,13 @@ "name": "stdout", "output_type": "stream", "text": [ - " ../data/aircraft-data_nov_dec.csv : reg_city\n", " ../data/aircraft-data_nov_dec.csv : reg_state\n", - " ../data/aircraft-data_nov_dec.csv : flight\n", + " ../data/aircraft-data_nov_dec.csv : reg_city\n", " ../data/aircraft-data_nov_dec.csv : tail_number\n", + " ../data/aircraft-data_nov_dec.csv : flight\n", " ../data/aircraft-data_nov_dec.csv : reg_expiration\n", - " ../data/aircraft-data_nov_dec.csv : reg_owner\n", " ../data/aircraft-data_nov_dec.csv : manufacturer\n", + " ../data/aircraft-data_nov_dec.csv : reg_owner\n", " ../data/aircraft-data_nov_dec.csv : model\n" ] }, @@ -162,21 +163,19 @@ "name": "stderr", "output_type": "stream", "text": [ - "C:\\Users\\ab032mj\\Desktop\\thesis\\simillarity\\venv\\Lib\\site-packages\\dateutil\\parser\\_parser.py:1207: UnknownTimezoneWarning:\n", - "\n", - "tzname AG identified but not understood. Pass `tzinfos` argument in order to correctly return a timezone-aware datetime. In a future version, this will raise an exception.\n", - "\n" + "C:\\Users\\ab032mj\\Desktop\\thesis\\simillarity\\venv\\Lib\\site-packages\\dateutil\\parser\\_parser.py:1207: UnknownTimezoneWarning: tzname AG identified but not understood. Pass `tzinfos` argument in order to correctly return a timezone-aware datetime. In a future version, this will raise an exception.\n", + " warnings.warn(\"tzname {tzname} identified but not understood. \"\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ + " ../data/Airplane_Cleaned.csv : Multi Engine\n", " ../data/Airplane_Cleaned.csv : TP mods\n", " ../data/Airplane_Cleaned.csv : Engine Type\n", - " ../data/Airplane_Cleaned.csv : Multi Engine\n", - " ../data/Airplane_Cleaned.csv : Company\n", " ../data/Airplane_Cleaned.csv : Model\n", + " ../data/Airplane_Cleaned.csv : Company\n", " ../data/autoscout24-germany-dataset.csv : make\n", " ../data/autoscout24-germany-dataset.csv : gear\n", " ../data/autoscout24-germany-dataset.csv : model\n", @@ -184,26 +183,26 @@ " ../data/autoscout24-germany-dataset.csv : offerType\n", " ../data/CARS_1.csv : fuel_type\n", " ../data/CARS_1.csv : transmission_type\n", - " ../data/CARS_1.csv : car_name\n", " ../data/CARS_1.csv : body_type\n", + " ../data/CARS_1.csv : car_name\n", + " ../data/USA_cars_datasets.csv : country\n", " ../data/USA_cars_datasets.csv : model\n", - " ../data/USA_cars_datasets.csv : brand\n", " ../data/USA_cars_datasets.csv : vin\n", - " ../data/USA_cars_datasets.csv : country\n", - " ../data/USA_cars_datasets.csv : state\n", - " ../data/USA_cars_datasets.csv : title_status\n", + " ../data/USA_cars_datasets.csv : brand\n", " ../data/USA_cars_datasets.csv : condition\n", + " ../data/USA_cars_datasets.csv : title_status\n", + " ../data/USA_cars_datasets.csv : state\n", " ../data/USA_cars_datasets.csv : color\n", " ../data/imdb_top_1000.csv : Certificate\n", - " ../data/imdb_top_1000.csv : Gross\n", " ../data/imdb_top_1000.csv : Poster_Link\n", + " ../data/imdb_top_1000.csv : Gross\n", + " ../data/imdb_top_1000.csv : Director\n", + " ../data/imdb_top_1000.csv : Star3\n", " ../data/imdb_top_1000.csv : Star2\n", + " ../data/imdb_top_1000.csv : Star1\n", " ../data/imdb_top_1000.csv : Overview\n", - " ../data/imdb_top_1000.csv : Director\n", " ../data/imdb_top_1000.csv : Star4\n", " ../data/imdb_top_1000.csv : Series_Title\n", - " ../data/imdb_top_1000.csv : Star1\n", - " ../data/imdb_top_1000.csv : Star3\n", " ../data/imdb_top_1000.csv : Genre\n" ] }, @@ -211,10 +210,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "C:\\Users\\ab032mj\\Desktop\\thesis\\simillarity\\venv\\Lib\\site-packages\\dateutil\\parser\\_parser.py:1207: UnknownTimezoneWarning:\n", - "\n", - "tzname ELSIE identified but not understood. Pass `tzinfos` argument in order to correctly return a timezone-aware datetime. In a future version, this will raise an exception.\n", - "\n" + "C:\\Users\\ab032mj\\Desktop\\thesis\\simillarity\\venv\\Lib\\site-packages\\dateutil\\parser\\_parser.py:1207: UnknownTimezoneWarning: tzname ELSIE identified but not understood. Pass `tzinfos` argument in order to correctly return a timezone-aware datetime. In a future version, this will raise an exception.\n", + " warnings.warn(\"tzname {tzname} identified but not understood. \"\n" ] }, { @@ -223,14 +220,14 @@ "text": [ " ../data/netflix_titles.csv : show_id\n", " ../data/netflix_titles.csv : cast\n", - " ../data/netflix_titles.csv : title\n", " ../data/netflix_titles.csv : description\n", + " ../data/netflix_titles.csv : title\n", " ../data/netflix_titles.csv : director\n", - " ../data/netflix_titles.csv : listed_in\n", - " ../data/netflix_titles.csv : duration\n", " ../data/netflix_titles.csv : type\n", - " ../data/netflix_titles.csv : rating\n", " ../data/netflix_titles.csv : country\n", + " ../data/netflix_titles.csv : listed_in\n", + " ../data/netflix_titles.csv : rating\n", + " ../data/netflix_titles.csv : duration\n", " ../data/netflix_titles.csv : date_added\n" ] } @@ -241,12 +238,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-19T15:06:36.629839Z", - "start_time": "2024-03-19T15:06:04.509081Z" + "end_time": "2024-03-26T09:58:32.212781Z", + "start_time": "2024-03-26T09:58:00.137581Z" } }, "id": "cfe57003e670ba15", - "execution_count": 61 + "execution_count": 15 }, { "cell_type": "code", @@ -256,8 +253,8 @@ "\n", "\n", "def plot_clusters(vectors_to_plot: pd.DataFrame, title: str):\n", - "\n", - " kmeans = KMeans(n_clusters=12, random_state=0) # Change n_clusters as needed\n", + " n_clusters = 12\n", + " kmeans = KMeans(n_clusters=n_clusters, random_state=0) # Change n_clusters as needed\n", " list_of_vectors = np.array(list(vectors_to_plot.values()))\n", " kmeans.fit(list_of_vectors)\n", "\n", @@ -271,18 +268,18 @@ "\n", " fig = px.scatter(df, x='x', y='y', color='cluster', hover_data=['names'])\n", " fig.update_layout(title=title)\n", - " # fig.write_html(title.replace(\" \", \"_\") + \".html\")\n", + " fig.write_html(title.replace(\" \", \"_\") + \".html\")\n", " fig.show()\n" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-19T15:06:39.490133Z", - "start_time": "2024-03-19T15:06:39.485011Z" + "end_time": "2024-03-26T09:58:32.218154Z", + "start_time": "2024-03-26T09:58:32.213907Z" } }, "id": "19c881d9f450b556", - "execution_count": 62 + "execution_count": 16 }, { "cell_type": "code", @@ -291,19 +288,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing column: reg_city1 1.92%\n", - "Processing column: reg_state1 3.85%\n", - "Processing column: flight1 5.77%\n", - "Processing column: tail_number1 7.69%\n", + "Processing column: reg_state1 1.92%\n", + "Processing column: reg_city1 3.85%\n", + "Processing column: tail_number1 5.77%\n", + "Processing column: flight1 7.69%\n", "Processing column: reg_expiration1 9.62%\n", - "Processing column: reg_owner1 11.54%\n", - "Processing column: manufacturer1 13.46%\n", + "Processing column: manufacturer1 11.54%\n", + "Processing column: reg_owner1 13.46%\n", "Processing column: model1 15.38%\n", - "Processing column: TP mods2 17.31%\n", - "Processing column: Engine Type2 19.23%\n", - "Processing column: Multi Engine2 21.15%\n", - "Processing column: Company2 23.08%\n", - "Processing column: Model2 25.0%\n", + "Processing column: Multi Engine2 17.31%\n", + "Processing column: TP mods2 19.23%\n", + "Processing column: Engine Type2 21.15%\n", + "Processing column: Model2 23.08%\n", + "Processing column: Company2 25.0%\n", "Processing column: make3 26.92%\n", "Processing column: gear3 28.85%\n", "Processing column: model3 30.77%\n", @@ -311,39 +308,39 @@ "Processing column: offerType3 34.62%\n", "Processing column: fuel_type4 36.54%\n", "Processing column: transmission_type4 38.46%\n", - "Processing column: car_name4 40.38%\n", - "Processing column: body_type4 42.31%\n", - "Processing column: model5 44.23%\n", - "Processing column: brand5 46.15%\n", + "Processing column: body_type4 40.38%\n", + "Processing column: car_name4 42.31%\n", + "Processing column: country5 44.23%\n", + "Processing column: model5 46.15%\n", "Processing column: vin5 48.08%\n", - "Processing column: country5 50.0%\n", - "Processing column: state5 51.92%\n", + "Processing column: brand5 50.0%\n", + "Processing column: condition5 51.92%\n", "Processing column: title_status5 53.85%\n", - "Processing column: condition5 55.77%\n", + "Processing column: state5 55.77%\n", "Processing column: color5 57.69%\n", "Processing column: Certificate6 59.62%\n", - "Processing column: Gross6 61.54%\n", - "Processing column: Poster_Link6 63.46%\n", - "Processing column: Star26 65.38%\n", - "Processing column: Overview6 67.31%\n", - "Processing column: Director6 69.23%\n", - "Processing column: Star46 71.15%\n", - "Processing column: Series_Title6 73.08%\n", - "Processing column: Star16 75.0%\n", - "Processing column: Star36 76.92%\n", + "Processing column: Poster_Link6 61.54%\n", + "Processing column: Gross6 63.46%\n", + "Processing column: Director6 65.38%\n", + "Processing column: Star36 67.31%\n", + "Processing column: Star26 69.23%\n", + "Processing column: Star16 71.15%\n", + "Processing column: Overview6 73.08%\n", + "Processing column: Star46 75.0%\n", + "Processing column: Series_Title6 76.92%\n", "Processing column: Genre6 78.85%\n", "Processing column: show_id7 80.77%\n", "Processing column: cast7 82.69%\n", - "Processing column: title7 84.62%\n", - "Processing column: description7 86.54%\n", + "Processing column: description7 84.62%\n", + "Processing column: title7 86.54%\n", "Processing column: director7 88.46%\n", - "Processing column: listed_in7 90.38%\n", - "Processing column: duration7 92.31%\n", - "Processing column: type7 94.23%\n", + "Processing column: type7 90.38%\n", + "Processing column: country7 92.31%\n", + "Processing column: listed_in7 94.23%\n", "Processing column: rating7 96.15%\n", - "Processing column: country7 98.08%\n", + "Processing column: duration7 98.08%\n", "Processing column: date_added7 100.0%\n", - "ELAPSED TIME :529.7594430446625\n" + "ELAPSED TIME :549.7098529338837\n" ] } ], @@ -353,12 +350,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-19T15:15:34.774275Z", - "start_time": "2024-03-19T15:06:45.010192Z" + "end_time": "2024-03-26T10:07:41.940696Z", + "start_time": "2024-03-26T09:58:32.218307Z" } }, "id": "d18443a1c921f509", - "execution_count": 63 + "execution_count": 17 }, { "cell_type": "code", @@ -367,41 +364,49 @@ "name": "stderr", "output_type": "stream", "text": [ - "C:\\Users\\ab032mj\\Desktop\\thesis\\simillarity\\venv\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1416: FutureWarning:\n", - "\n", - "The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n", - "\n" + "C:\\Users\\ab032mj\\Desktop\\thesis\\simillarity\\venv\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n", + " super()._check_params_vs_input(X, default_n_init=10)\n" ] }, + { + "data": { + "text/html": " \n " + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "application/vnd.plotly.v1+json": { "data": [ { "customdata": [ + [ + "reg_state1" + ], [ "reg_city1" ], [ - "reg_state1" + "tail_number1" ], [ "flight1" ], [ - "tail_number1" + "reg_expiration1" ], [ - "reg_expiration1" + "manufacturer1" ], [ "reg_owner1" ], [ - "manufacturer1" + "model1" ], [ - "model1" + "Multi Engine2" ], [ "TP mods2" @@ -410,14 +415,11 @@ "Engine Type2" ], [ - "Multi Engine2" + "Model2" ], [ "Company2" ], - [ - "Model2" - ], [ "make3" ], @@ -440,31 +442,31 @@ "transmission_type4" ], [ - "car_name4" + "body_type4" ], [ - "body_type4" + "car_name4" ], [ - "model5" + "country5" ], [ - "brand5" + "model5" ], [ "vin5" ], [ - "country5" + "brand5" ], [ - "state5" + "condition5" ], [ "title_status5" ], [ - "condition5" + "state5" ], [ "color5" @@ -473,31 +475,31 @@ "Certificate6" ], [ - "Gross6" + "Poster_Link6" ], [ - "Poster_Link6" + "Gross6" ], [ - "Star26" + "Director6" ], [ - "Overview6" + "Star36" ], [ - "Director6" + "Star26" ], [ - "Star46" + "Star16" ], [ - "Series_Title6" + "Overview6" ], [ - "Star16" + "Star46" ], [ - "Star36" + "Series_Title6" ], [ "Genre6" @@ -509,28 +511,28 @@ "cast7" ], [ - "title7" + "description7" ], [ - "description7" + "title7" ], [ "director7" ], [ - "listed_in7" + "type7" ], [ - "duration7" + "country7" ], [ - "type7" + "listed_in7" ], [ "rating7" ], [ - "country7" + "duration7" ], [ "date_added7" @@ -540,58 +542,58 @@ "legendgroup": "", "marker": { "color": [ - 1, - 1, - 4, - 4, - 4, + 10, + 3, + 7, + 7, + 7, + 11, + 11, + 7, + 0, + 0, + 11, + 11, + 11, + 2, + 0, + 3, 8, + 0, 8, + 0, + 2, + 2, + 10, + 0, + 7, + 2, 4, 3, - 8, + 10, + 3, + 0, + 9, + 7, + 3, 3, - 8, - 8, - 6, 3, - 1, - 11, - 1, - 11, 3, - 6, - 6, 1, + 3, + 3, 6, - 4, - 1, - 1, - 9, - 0, - 1, - 1, - 4, 7, - 10, - 2, - 10, - 10, + 3, 1, - 10, - 10, + 3, + 3, 5, - 4, 10, - 1, - 2, - 10, - 5, + 6, 0, - 5, - 1, - 1, - 0 + 7, + 4 ], "coloraxis": "coloraxis", "symbol": "circle" @@ -601,113 +603,113 @@ "orientation": "v", "showlegend": false, "x": [ - 5.481040000915527, - 4.461870193481445, - 4.956617832183838, - 4.9444499015808105, - 5.674870491027832, - 4.4419264793396, - 4.319740295410156, - 5.136926174163818, - 7.437771797180176, - 4.348060131072998, - 7.0663018226623535, - 4.1616387367248535, - 4.288301467895508, - 5.361490249633789, - 6.856499195098877, - 5.667142868041992, - 6.075915336608887, - 6.264303684234619, - 6.080878257751465, - 6.997607231140137, - 4.9815263748168945, - 5.517688274383545, - 5.867679119110107, - 5.311161518096924, - 4.352320194244385, - 3.6641652584075928, - 4.4079413414001465, - 6.797448635101318, - 6.221177101135254, - 6.101926803588867, - 6.199456691741943, - 5.258615493774414, - 3.1090869903564453, - 5.236301422119141, - 2.041367292404175, - 5.144412517547607, - 5.259392738342285, - 5.225100040435791, - 5.1772847175598145, - 5.25313663482666, - 2.5723178386688232, - 4.8922271728515625, - 3.6730144023895264, - 5.107095718383789, - 2.0164794921875, - 5.152388095855713, - 2.4399759769439697, - 5.877944469451904, - 3.193415880203247, - 5.954677104949951, - 3.6659605503082275, - 6.515915393829346 + 7.634982109069824, + 8.18161392211914, + 9.760482788085938, + 9.65578556060791, + 10.46049690246582, + 7.527761459350586, + 7.7938385009765625, + 9.330303192138672, + 8.89563274383545, + 8.884279251098633, + 5.870842933654785, + 8.49361515045166, + 7.291825771331787, + 6.987878322601318, + 8.564809799194336, + 8.554859161376953, + 6.354427337646484, + 8.301039695739746, + 6.307615756988525, + 8.199542999267578, + 6.75204610824585, + 6.548539161682129, + 6.594710350036621, + 8.208972930908203, + 9.793411254882812, + 6.7602009773254395, + 10.585469245910645, + 7.192152976989746, + 7.594589710235596, + 7.73886251449585, + 8.795101165771484, + 9.152059555053711, + 9.864975929260254, + 8.501593589782715, + 8.485865592956543, + 8.46456241607666, + 8.433588981628418, + 6.663652420043945, + 8.49349594116211, + 8.021809577941895, + 7.3567280769348145, + 9.757957458496094, + 8.53760814666748, + 6.686389923095703, + 7.943576335906982, + 8.512290954589844, + 8.190762519836426, + 6.943475246429443, + 7.313745498657227, + 9.077346801757812, + 10.087260246276855, + 10.557514190673828 ], "xaxis": "x", "y": [ - 4.3125786781311035, - 4.480679988861084, - 5.934484481811523, - 6.035068035125732, - 6.627974510192871, - 3.8020052909851074, - 3.5600695610046387, - 5.560845851898193, - 4.549907684326172, - 2.305506467819214, - 4.657924175262451, - 3.242453098297119, - 5.140314102172852, - 3.107086658477783, - 4.361886978149414, - 4.635444641113281, - 2.251649856567383, - 4.172117710113525, - 2.205137014389038, - 3.932243585586548, - 2.756131649017334, - 2.8223607540130615, - 4.202624320983887, - 2.893759250640869, - 6.234554290771484, - 4.007889747619629, - 4.486442565917969, - 2.9157278537750244, - 6.661800384521484, - 3.6967668533325195, - 4.754232406616211, - 6.072335243225098, - 5.7372331619262695, - 4.607232093811035, - 4.480278015136719, - 4.682827472686768, - 4.638975620269775, - 4.1178507804870605, - 4.5823140144348145, - 4.631624698638916, - 3.6405510902404785, - 6.063982963562012, - 5.105567455291748, - 4.051309585571289, - 4.475528717041016, - 4.691291332244873, - 3.689364194869995, - 6.186923503875732, - 2.650754451751709, - 5.092771530151367, - 4.2011871337890625, - 6.588094234466553 + 11.651568412780762, + 12.191211700439453, + 11.268287658691406, + 11.299283027648926, + 11.919140815734863, + 10.920798301696777, + 10.986883163452148, + 11.574243545532227, + 13.608915328979492, + 13.987488746643066, + 11.457459449768066, + 10.793388366699219, + 10.746611595153809, + 12.363808631896973, + 13.473563194274902, + 12.267362594604492, + 13.26131534576416, + 12.938798904418945, + 13.27975082397461, + 13.716704368591309, + 12.57845401763916, + 12.065139770507812, + 11.04773998260498, + 12.567636489868164, + 10.669133186340332, + 12.349822998046875, + 12.474716186523438, + 13.81960391998291, + 11.614741325378418, + 12.918475151062012, + 12.735498428344727, + 9.668839454650879, + 11.589250564575195, + 11.631767272949219, + 11.802452087402344, + 11.795424461364746, + 11.737223625183105, + 9.014378547668457, + 11.800704002380371, + 11.993047714233398, + 9.46713638305664, + 11.210518836975098, + 10.427387237548828, + 9.043390274047852, + 11.889760971069336, + 11.644381523132324, + 9.431979179382324, + 10.852152824401855, + 9.312481880187988, + 12.395952224731445, + 12.187356948852539, + 12.789667129516602 ], "yaxis": "y", "type": "scatter" @@ -1622,25 +1624,72 @@ "plotlyServerURL": "https://plot.ly" } }, - "text/html": "
" + "text/html": "
" }, "metadata": {}, "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CLUSTER 0\n", + "CLUSTER 1\n", + "CLUSTER 2\n", + "CLUSTER 3\n", + "CLUSTER 4\n", + "CLUSTER 5\n", + "CLUSTER 6\n", + "CLUSTER 7\n", + "CLUSTER 8\n", + "CLUSTER 9\n", + "CLUSTER 10\n", + "CLUSTER 11\n", + "['Multi Engine2', 'TP mods2', 'gear3', 'offerType3', 'transmission_type4', 'model5', 'Certificate6', 'rating7']\n", + "['Overview6', 'description7']\n", + "['make3', 'body_type4', 'car_name4', 'brand5']\n", + "['reg_city1', 'model3', 'title_status5', 'color5', 'Director6', 'Star36', 'Star26', 'Star16', 'Star46', 'Series_Title6', 'cast7', 'title7', 'director7']\n", + "['condition5', 'date_added7']\n", + "['type7']\n", + "['Genre6', 'listed_in7']\n", + "['tail_number1', 'flight1', 'reg_expiration1', 'model1', 'vin5', 'Gross6', 'show_id7', 'duration7']\n", + "['fuel3', 'fuel_type4']\n", + "['Poster_Link6']\n", + "['reg_state1', 'country5', 'state5', 'country7']\n", + "['manufacturer1', 'reg_owner1', 'Engine Type2', 'Model2', 'Company2']\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\ab032mj\\Desktop\\thesis\\simillarity\\venv\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1416: FutureWarning:\n", + "\n", + "The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n", + "\n" + ] } ], "source": [ + "\n", + "\n", "# plot_vectors(vectors_avg, \"Average column2vec vectors\")\n", - "plot_clusters(vectors_avg, \"Clusters Average column2vec clusters\")" + "plot_clusters(vectors_avg, \"Clusters Average column2vec clusters\")\n", + "\n", + "clusters = get_clusters(vectors_avg, 12)\n", + "\n", + "for i in clusters:\n", + " print(i)" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-19T15:15:35.022738Z", - "start_time": "2024-03-19T15:15:34.775408Z" + "end_time": "2024-03-26T10:07:42.866553Z", + "start_time": "2024-03-26T10:07:41.941778Z" } }, "id": "a1ef23c604e2775f", - "execution_count": 64 + "execution_count": 18 }, { "cell_type": "code", @@ -1649,19 +1698,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing column: reg_city1 1.92%\n", - "Processing column: reg_state1 3.85%\n", - "Processing column: flight1 5.77%\n", - "Processing column: tail_number1 7.69%\n", + "Processing column: reg_state1 1.92%\n", + "Processing column: reg_city1 3.85%\n", + "Processing column: tail_number1 5.77%\n", + "Processing column: flight1 7.69%\n", "Processing column: reg_expiration1 9.62%\n", - "Processing column: reg_owner1 11.54%\n", - "Processing column: manufacturer1 13.46%\n", + "Processing column: manufacturer1 11.54%\n", + "Processing column: reg_owner1 13.46%\n", "Processing column: model1 15.38%\n", - "Processing column: TP mods2 17.31%\n", - "Processing column: Engine Type2 19.23%\n", - "Processing column: Multi Engine2 21.15%\n", - "Processing column: Company2 23.08%\n", - "Processing column: Model2 25.0%\n", + "Processing column: Multi Engine2 17.31%\n", + "Processing column: TP mods2 19.23%\n", + "Processing column: Engine Type2 21.15%\n", + "Processing column: Model2 23.08%\n", + "Processing column: Company2 25.0%\n", "Processing column: make3 26.92%\n", "Processing column: gear3 28.85%\n", "Processing column: model3 30.77%\n", @@ -1669,39 +1718,39 @@ "Processing column: offerType3 34.62%\n", "Processing column: fuel_type4 36.54%\n", "Processing column: transmission_type4 38.46%\n", - "Processing column: car_name4 40.38%\n", - "Processing column: body_type4 42.31%\n", - "Processing column: model5 44.23%\n", - "Processing column: brand5 46.15%\n", + "Processing column: body_type4 40.38%\n", + "Processing column: car_name4 42.31%\n", + "Processing column: country5 44.23%\n", + "Processing column: model5 46.15%\n", "Processing column: vin5 48.08%\n", - "Processing column: country5 50.0%\n", - "Processing column: state5 51.92%\n", + "Processing column: brand5 50.0%\n", + "Processing column: condition5 51.92%\n", "Processing column: title_status5 53.85%\n", - "Processing column: condition5 55.77%\n", + "Processing column: state5 55.77%\n", "Processing column: color5 57.69%\n", "Processing column: Certificate6 59.62%\n", - "Processing column: Gross6 61.54%\n", - "Processing column: Poster_Link6 63.46%\n", - "Processing column: Star26 65.38%\n", - "Processing column: Overview6 67.31%\n", - "Processing column: Director6 69.23%\n", - "Processing column: Star46 71.15%\n", - "Processing column: Series_Title6 73.08%\n", - "Processing column: Star16 75.0%\n", - "Processing column: Star36 76.92%\n", + "Processing column: Poster_Link6 61.54%\n", + "Processing column: Gross6 63.46%\n", + "Processing column: Director6 65.38%\n", + "Processing column: Star36 67.31%\n", + "Processing column: Star26 69.23%\n", + "Processing column: Star16 71.15%\n", + "Processing column: Overview6 73.08%\n", + "Processing column: Star46 75.0%\n", + "Processing column: Series_Title6 76.92%\n", "Processing column: Genre6 78.85%\n", "Processing column: show_id7 80.77%\n", "Processing column: cast7 82.69%\n", - "Processing column: title7 84.62%\n", - "Processing column: description7 86.54%\n", + "Processing column: description7 84.62%\n", + "Processing column: title7 86.54%\n", "Processing column: director7 88.46%\n", - "Processing column: listed_in7 90.38%\n", - "Processing column: duration7 92.31%\n", - "Processing column: type7 94.23%\n", + "Processing column: type7 90.38%\n", + "Processing column: country7 92.31%\n", + "Processing column: listed_in7 94.23%\n", "Processing column: rating7 96.15%\n", - "Processing column: country7 98.08%\n", + "Processing column: duration7 98.08%\n", "Processing column: date_added7 100.0%\n", - "ELAPSED TIME :8.40464186668396\n" + "ELAPSED TIME :8.791604995727539\n" ] } ], @@ -1711,12 +1760,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-19T15:15:43.432300Z", - "start_time": "2024-03-19T15:15:35.024903Z" + "end_time": "2024-03-26T10:07:51.663008Z", + "start_time": "2024-03-26T10:07:42.867616Z" } }, "id": "9ce83d1e71dcc010", - "execution_count": 65 + "execution_count": 19 }, { "cell_type": "code", @@ -1737,29 +1786,32 @@ "data": [ { "customdata": [ + [ + "reg_state1" + ], [ "reg_city1" ], [ - "reg_state1" + "tail_number1" ], [ "flight1" ], [ - "tail_number1" + "reg_expiration1" ], [ - "reg_expiration1" + "manufacturer1" ], [ "reg_owner1" ], [ - "manufacturer1" + "model1" ], [ - "model1" + "Multi Engine2" ], [ "TP mods2" @@ -1768,14 +1820,11 @@ "Engine Type2" ], [ - "Multi Engine2" + "Model2" ], [ "Company2" ], - [ - "Model2" - ], [ "make3" ], @@ -1798,31 +1847,31 @@ "transmission_type4" ], [ - "car_name4" + "body_type4" ], [ - "body_type4" + "car_name4" ], [ - "model5" + "country5" ], [ - "brand5" + "model5" ], [ "vin5" ], [ - "country5" + "brand5" ], [ - "state5" + "condition5" ], [ "title_status5" ], [ - "condition5" + "state5" ], [ "color5" @@ -1831,31 +1880,31 @@ "Certificate6" ], [ - "Gross6" + "Poster_Link6" ], [ - "Poster_Link6" + "Gross6" ], [ - "Star26" + "Director6" ], [ - "Overview6" + "Star36" ], [ - "Director6" + "Star26" ], [ - "Star46" + "Star16" ], [ - "Series_Title6" + "Overview6" ], [ - "Star16" + "Star46" ], [ - "Star36" + "Series_Title6" ], [ "Genre6" @@ -1867,28 +1916,28 @@ "cast7" ], [ - "title7" + "description7" ], [ - "description7" + "title7" ], [ "director7" ], [ - "listed_in7" + "type7" ], [ - "duration7" + "country7" ], [ - "type7" + "listed_in7" ], [ "rating7" ], [ - "country7" + "duration7" ], [ "date_added7" @@ -1898,58 +1947,58 @@ "legendgroup": "", "marker": { "color": [ + 6, + 0, + 10, + 10, + 10, + 1, + 1, + 1, + 7, + 10, + 9, 1, 1, + 4, + 3, + 0, + 9, 7, + 9, + 3, + 4, + 4, 7, 4, - 6, - 6, - 6, - 10, - 8, 10, - 6, - 6, - 2, + 4, 11, 2, - 8, + 6, 10, - 8, - 11, - 2, - 2, - 2, - 2, 7, 10, - 1, - 9, - 0, 10, + 0, + 0, + 6, + 0, + 5, + 6, + 5, + 8, 10, - 4, - 7, - 1, - 3, - 1, - 1, - 3, - 1, - 1, + 6, 5, - 4, - 1, - 3, - 3, - 1, 5, + 6, + 8, + 6, + 8, 7, - 5, 10, - 1, - 4 + 10 ], "coloraxis": "coloraxis", "symbol": "circle" @@ -1959,113 +2008,113 @@ "orientation": "v", "showlegend": false, "x": [ - 2.7740252017974854, - 2.8364009857177734, - 1.069262981414795, - 1.0029431581497192, - 1.5992205142974854, - 3.3962671756744385, - 3.2161507606506348, - 3.105342149734497, - -0.26255616545677185, - 0.6103012561798096, - -0.6897143125534058, - 3.189011335372925, - 2.700456142425537, - 1.689704418182373, - -0.5089970827102661, - 3.4468133449554443, - 0.2496114820241928, - -0.20911100506782532, - 0.22651678323745728, - -0.5129572749137878, - 1.8270411491394043, - 1.6040159463882446, - 1.6857876777648926, - 1.5496913194656372, - 0.8878748416900635, - -0.515620231628418, - 3.0048699378967285, - -0.5076618790626526, - 1.5513533353805542, - 0.6620107293128967, - 0.08456160873174667, - 1.3856319189071655, - 0.8733639717102051, - 1.9646496772766113, - 4.975734710693359, - 2.2607388496398926, - 1.911452293395996, - 4.4013895988464355, - 2.2770698070526123, - 2.3053503036499023, - 3.4285330772399902, - 1.2291285991668701, - 1.5165760517120361, - 4.615434646606445, - 5.032024383544922, - 1.617465615272522, - 3.7750792503356934, - 1.0852265357971191, - 3.424313545227051, - 0.29817289113998413, - 2.2570576667785645, - 1.8820475339889526 + -0.6300977468490601, + -0.42117050290107727, + -0.2486436814069748, + -0.2788216173648834, + -1.0231388807296753, + 1.9644521474838257, + 1.7976727485656738, + 1.9063923358917236, + 0.911508321762085, + -0.3994556665420532, + -1.698248028755188, + 2.2699530124664307, + 1.9196691513061523, + 2.3852696418762207, + -1.3150827884674072, + 1.4531058073043823, + -2.1891512870788574, + -0.3801000714302063, + -2.2206525802612305, + -1.4057995080947876, + 2.792318105697632, + 2.2648849487304688, + 0.4104484021663666, + 2.5826117992401123, + 0.025007257238030434, + 2.4077377319335938, + -1.835091471672058, + -2.8586337566375732, + -0.610065221786499, + -1.399946689605713, + 0.23535476624965668, + 0.632976770401001, + -0.7796151041984558, + 0.6179103255271912, + 0.32792437076568604, + 0.42420893907546997, + 0.5684399008750916, + -0.1674804985523224, + 0.4046439230442047, + -0.01373551320284605, + 0.0037475405260920525, + -0.564787745475769, + 0.4966707229614258, + -0.19300417602062225, + -0.10040821880102158, + 0.32298508286476135, + 0.5930238366127014, + -0.189849391579628, + 0.21084845066070557, + 1.2016559839248657, + -1.0955184698104858, + -1.4543946981430054 ], "xaxis": "x", "y": [ - 3.3891963958740234, - 3.713914394378662, - 3.3481297492980957, - 3.386765241622925, - 4.212575435638428, - 0.0772617980837822, - -0.0958557277917862, - 0.0595986545085907, - 4.59176778793335, - -0.37828168272972107, - 3.275238513946533, - -0.06835243850946426, - -0.21176140010356903, - 0.6041004061698914, - 1.852208137512207, - 2.6287810802459717, - 0.30453425645828247, - 2.5676887035369873, - 0.3329438865184784, - 1.7548259496688843, - 0.6495657563209534, - 0.20514854788780212, - 0.37018677592277527, - 0.6579046249389648, - 3.2581090927124023, - 3.536987066268921, - 3.740365505218506, - 0.4170229136943817, - 4.943604946136475, - 2.1078155040740967, - 3.131649971008301, - 3.806628704071045, - 2.7448670864105225, - 2.4906883239746094, - 2.3012287616729736, - 2.1632754802703857, - 2.562673330307007, - 2.3535079956054688, - 2.231971502304077, - 2.644801378250122, - 1.9668481349945068, - 3.6226108074188232, - 2.6755826473236084, - 2.317932605743408, - 2.2890031337738037, - 2.692415714263916, - 1.7611156702041626, - 4.269854545593262, - 1.4409667253494263, - 4.013367176055908, - 3.184629440307617, - 4.555140972137451 + -0.34463998675346375, + -0.3702987730503082, + -2.027839183807373, + -1.9760643243789673, + -2.0544214248657227, + 1.2670314311981201, + 1.3112242221832275, + 1.0823720693588257, + -3.1692299842834473, + -3.810380697250366, + 0.754950225353241, + 0.8772135972976685, + 1.2372525930404663, + -0.5736871957778931, + -3.3916871547698975, + -0.022917641326785088, + 0.21249575912952423, + -3.1873059272766113, + 0.1714380532503128, + -3.4063022136688232, + -0.547065794467926, + -0.4382999539375305, + -3.318624258041382, + -0.4890264868736267, + -2.125154972076416, + -0.7327226996421814, + -1.7250908613204956, + -0.4163961112499237, + -0.15362370014190674, + -1.071980595588684, + -2.7636942863464355, + -2.0271103382110596, + -1.607711911201477, + -0.5869593620300293, + -0.7268936038017273, + -0.9733710289001465, + -0.5965152382850647, + 2.2380130290985107, + -1.0552787780761719, + 1.706160306930542, + 0.7388292551040649, + -1.9103707075119019, + -1.5025126934051514, + 2.287067413330078, + 1.8778971433639526, + -1.3680455684661865, + 0.702415406703949, + -0.8708230257034302, + 1.0335848331451416, + -2.3910341262817383, + -1.6667473316192627, + -2.2093350887298584 ], "yaxis": "y", "type": "scatter" @@ -2980,7 +3029,7 @@ "plotlyServerURL": "https://plot.ly" } }, - "text/html": "
" + "text/html": "
" }, "metadata": {}, "output_type": "display_data" @@ -2993,12 +3042,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-19T15:15:43.635850Z", - "start_time": "2024-03-19T15:15:43.433390Z" + "end_time": "2024-03-26T10:07:51.904467Z", + "start_time": "2024-03-26T10:07:51.664082Z" } }, "id": "cec56275bccd843a", - "execution_count": 66 + "execution_count": 20 }, { "cell_type": "code", @@ -3007,19 +3056,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing column: reg_city1 1.92%\n", - "Processing column: reg_state1 3.85%\n", - "Processing column: flight1 5.77%\n", - "Processing column: tail_number1 7.69%\n", + "Processing column: reg_state1 1.92%\n", + "Processing column: reg_city1 3.85%\n", + "Processing column: tail_number1 5.77%\n", + "Processing column: flight1 7.69%\n", "Processing column: reg_expiration1 9.62%\n", - "Processing column: reg_owner1 11.54%\n", - "Processing column: manufacturer1 13.46%\n", + "Processing column: manufacturer1 11.54%\n", + "Processing column: reg_owner1 13.46%\n", "Processing column: model1 15.38%\n", - "Processing column: TP mods2 17.31%\n", - "Processing column: Engine Type2 19.23%\n", - "Processing column: Multi Engine2 21.15%\n", - "Processing column: Company2 23.08%\n", - "Processing column: Model2 25.0%\n", + "Processing column: Multi Engine2 17.31%\n", + "Processing column: TP mods2 19.23%\n", + "Processing column: Engine Type2 21.15%\n", + "Processing column: Model2 23.08%\n", + "Processing column: Company2 25.0%\n", "Processing column: make3 26.92%\n", "Processing column: gear3 28.85%\n", "Processing column: model3 30.77%\n", @@ -3027,39 +3076,39 @@ "Processing column: offerType3 34.62%\n", "Processing column: fuel_type4 36.54%\n", "Processing column: transmission_type4 38.46%\n", - "Processing column: car_name4 40.38%\n", - "Processing column: body_type4 42.31%\n", - "Processing column: model5 44.23%\n", - "Processing column: brand5 46.15%\n", + "Processing column: body_type4 40.38%\n", + "Processing column: car_name4 42.31%\n", + "Processing column: country5 44.23%\n", + "Processing column: model5 46.15%\n", "Processing column: vin5 48.08%\n", - "Processing column: country5 50.0%\n", - "Processing column: state5 51.92%\n", + "Processing column: brand5 50.0%\n", + "Processing column: condition5 51.92%\n", "Processing column: title_status5 53.85%\n", - "Processing column: condition5 55.77%\n", + "Processing column: state5 55.77%\n", "Processing column: color5 57.69%\n", "Processing column: Certificate6 59.62%\n", - "Processing column: Gross6 61.54%\n", - "Processing column: Poster_Link6 63.46%\n", - "Processing column: Star26 65.38%\n", - "Processing column: Overview6 67.31%\n", - "Processing column: Director6 69.23%\n", - "Processing column: Star46 71.15%\n", - "Processing column: Series_Title6 73.08%\n", - "Processing column: Star16 75.0%\n", - "Processing column: Star36 76.92%\n", + "Processing column: Poster_Link6 61.54%\n", + "Processing column: Gross6 63.46%\n", + "Processing column: Director6 65.38%\n", + "Processing column: Star36 67.31%\n", + "Processing column: Star26 69.23%\n", + "Processing column: Star16 71.15%\n", + "Processing column: Overview6 73.08%\n", + "Processing column: Star46 75.0%\n", + "Processing column: Series_Title6 76.92%\n", "Processing column: Genre6 78.85%\n", "Processing column: show_id7 80.77%\n", "Processing column: cast7 82.69%\n", - "Processing column: title7 84.62%\n", - "Processing column: description7 86.54%\n", + "Processing column: description7 84.62%\n", + "Processing column: title7 86.54%\n", "Processing column: director7 88.46%\n", - "Processing column: listed_in7 90.38%\n", - "Processing column: duration7 92.31%\n", - "Processing column: type7 94.23%\n", + "Processing column: type7 90.38%\n", + "Processing column: country7 92.31%\n", + "Processing column: listed_in7 94.23%\n", "Processing column: rating7 96.15%\n", - "Processing column: country7 98.08%\n", + "Processing column: duration7 98.08%\n", "Processing column: date_added7 100.0%\n", - "ELAPSED TIME :8.255228281021118\n" + "ELAPSED TIME :8.641924381256104\n" ] } ], @@ -3069,12 +3118,73 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-19T15:15:51.896658Z", - "start_time": "2024-03-19T15:15:43.636921Z" + "end_time": "2024-03-26T10:08:00.551380Z", + "start_time": "2024-03-26T10:07:51.905562Z" } }, "id": "a52718b90663e30e", - "execution_count": 67 + "execution_count": 21 + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CLUSTER 0\n", + "CLUSTER 1\n", + "CLUSTER 2\n", + "CLUSTER 3\n", + "CLUSTER 4\n", + "CLUSTER 5\n", + "CLUSTER 6\n", + "CLUSTER 7\n", + "CLUSTER 8\n", + "CLUSTER 9\n", + "CLUSTER 10\n", + "CLUSTER 11\n", + "['reg_state1', 'state5']\n", + "['make3', 'model3', 'fuel3', 'fuel_type4', 'body_type4', 'car_name4', 'model5', 'brand5', 'type7']\n", + "['reg_city1', 'Director6', 'Star36', 'Star26', 'Star16', 'Star46', 'cast7', 'director7', 'country7']\n", + "['tail_number1', 'flight1', 'reg_expiration1', 'vin5', 'condition5', 'Poster_Link6', 'Gross6', 'show_id7', 'duration7', 'date_added7']\n", + "['manufacturer1', 'reg_owner1', 'model1', 'Engine Type2', 'Model2', 'Company2']\n", + "['Overview6', 'Series_Title6', 'Genre6', 'description7', 'title7', 'listed_in7']\n", + "['Multi Engine2', 'TP mods2', 'title_status5']\n", + "['Certificate6', 'rating7']\n", + "['gear3', 'transmission_type4']\n", + "['color5']\n", + "['offerType3']\n", + "['country5']\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\ab032mj\\Desktop\\thesis\\simillarity\\venv\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1416: FutureWarning:\n", + "\n", + "The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n", + "\n" + ] + } + ], + "source": [ + "\n", + "clusters = get_clusters(vectors_sentence_clean, 12)\n", + "\n", + "for i in clusters:\n", + " print(i)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-26T10:08:00.629829Z", + "start_time": "2024-03-26T10:08:00.552499Z" + } + }, + "id": "a1810cdce5d9ca45", + "execution_count": 22 }, { "cell_type": "code", @@ -3095,29 +3205,32 @@ "data": [ { "customdata": [ + [ + "reg_state1" + ], [ "reg_city1" ], [ - "reg_state1" + "tail_number1" ], [ "flight1" ], [ - "tail_number1" + "reg_expiration1" ], [ - "reg_expiration1" + "manufacturer1" ], [ "reg_owner1" ], [ - "manufacturer1" + "model1" ], [ - "model1" + "Multi Engine2" ], [ "TP mods2" @@ -3126,14 +3239,11 @@ "Engine Type2" ], [ - "Multi Engine2" + "Model2" ], [ "Company2" ], - [ - "Model2" - ], [ "make3" ], @@ -3156,31 +3266,31 @@ "transmission_type4" ], [ - "car_name4" + "body_type4" ], [ - "body_type4" + "car_name4" ], [ - "model5" + "country5" ], [ - "brand5" + "model5" ], [ "vin5" ], [ - "country5" + "brand5" ], [ - "state5" + "condition5" ], [ "title_status5" ], [ - "condition5" + "state5" ], [ "color5" @@ -3189,31 +3299,31 @@ "Certificate6" ], [ - "Gross6" + "Poster_Link6" ], [ - "Poster_Link6" + "Gross6" ], [ - "Star26" + "Director6" ], [ - "Overview6" + "Star36" ], [ - "Director6" + "Star26" ], [ - "Star46" + "Star16" ], [ - "Series_Title6" + "Overview6" ], [ - "Star16" + "Star46" ], [ - "Star36" + "Series_Title6" ], [ "Genre6" @@ -3225,28 +3335,28 @@ "cast7" ], [ - "title7" + "description7" ], [ - "description7" + "title7" ], [ "director7" ], [ - "listed_in7" + "type7" ], [ - "duration7" + "country7" ], [ - "type7" + "listed_in7" ], [ "rating7" ], [ - "country7" + "duration7" ], [ "date_added7" @@ -3256,58 +3366,58 @@ "legendgroup": "", "marker": { "color": [ + 0, + 2, + 3, + 3, + 3, + 4, + 4, + 4, + 6, + 6, + 4, + 4, + 4, + 1, 8, - 8, - 10, + 1, + 1, 10, - 7, 1, + 8, 1, 1, - 4, + 11, 1, 3, 1, - 1, - 2, - 5, + 3, 6, + 0, 9, + 7, 3, - 9, + 3, + 2, + 2, + 2, + 2, 5, 2, + 5, + 5, + 3, 2, + 5, + 5, 2, + 1, 2, - 10, - 7, - 8, - 11, + 5, 7, - 9, - 10, - 10, - 10, - 3, - 0, - 3, 3, - 0, - 3, - 3, - 6, - 10, - 3, - 0, - 0, - 3, - 6, - 10, - 6, - 10, - 8, - 7 + 3 ], "coloraxis": "coloraxis", "symbol": "circle" @@ -3317,113 +3427,113 @@ "orientation": "v", "showlegend": false, "x": [ - -0.2607487738132477, - -1.0079954862594604, - 0.6057992577552795, - 0.5344958901405334, - 0.007245978340506554, - 2.115837574005127, - 2.367359161376953, - 1.825493335723877, - -0.2870010733604431, - 2.9802863597869873, - -0.10476882755756378, - 2.4541375637054443, - 2.198890447616577, - 2.9787535667419434, - 4.7972283363342285, - 2.3306944370269775, - 3.7547733783721924, - -1.7262934446334839, - 3.7424700260162354, - 4.723546504974365, - 2.612650156021118, - 3.3930583000183105, - 2.861128330230713, - 3.0491740703582764, - 0.5691105127334595, - -1.7126460075378418, - -1.0357009172439575, - 3.9140195846557617, - -0.3152306377887726, - 1.4525127410888672, - -0.6416651606559753, - 0.5356868505477905, - 0.9755063056945801, - 0.6514178514480591, - 1.5733528137207031, - 1.0550673007965088, - 0.5585827827453613, - 1.587950348854065, - 0.9356500506401062, - 0.9580854773521423, - 2.4921987056732178, - 0.49259233474731445, - 0.44213390350341797, - 1.650417685508728, - 1.6093469858169556, - 0.4413849711418152, - 2.4170665740966797, - -0.0848783552646637, - 2.8719282150268555, - -0.8826955556869507, - -0.4407237768173218, - -0.4333607256412506 + -0.8933649063110352, + -0.39842361211776733, + -1.4793410301208496, + -1.385990023612976, + -2.186568260192871, + 0.20130865275859833, + 0.09205331653356552, + -0.07491744309663773, + 0.5163057446479797, + 0.5787357687950134, + 0.3718307614326477, + 0.173736572265625, + 0.26832813024520874, + 1.4223390817642212, + 0.07582058012485504, + 1.2557424306869507, + -1.1702723503112793, + -2.0104966163635254, + -1.2018636465072632, + 0.13705717027187347, + 1.7902134656906128, + 1.112935185432434, + -2.7951245307922363, + 1.321556568145752, + -1.3741170167922974, + 1.38334321975708, + -2.2069263458251953, + 1.4237916469573975, + -0.8494253158569336, + -1.8130989074707031, + -0.9841021299362183, + -0.9236490726470947, + -1.5238884687423706, + 0.43331876397132874, + 0.30412358045578003, + -0.0314553864300251, + 0.2996326982975006, + 2.353454113006592, + -0.13156205415725708, + 1.6180741786956787, + 2.7325005531311035, + -1.6119945049285889, + -0.24953196942806244, + 2.360980987548828, + 1.9116843938827515, + -0.15537023544311523, + 2.8361775875091553, + -0.40099677443504333, + 2.635286569595337, + -1.0342307090759277, + -1.295596718788147, + -2.3089394569396973 ], "xaxis": "x", "y": [ - 0.029110712930560112, - 0.2831602394580841, - 0.7355490922927856, - 0.7714056372642517, - 1.7808095216751099, - 1.5367865562438965, - 1.7146657705307007, - 1.3714510202407837, - -2.3006083965301514, - 2.068046808242798, - -1.8296335935592651, - 1.7337384223937988, - 1.2609519958496094, - -0.17047174274921417, - -0.23581287264823914, - -0.7553197741508484, - 0.7174517512321472, - 1.1607658863067627, - 0.6419799327850342, - -0.22861970961093903, - -0.12489606440067291, - -0.20934484899044037, - 0.013001663610339165, - -0.03274914622306824, - 0.6541449427604675, - -0.30210813879966736, - 0.2788488566875458, - -1.3033324480056763, - 2.003594160079956, - 2.6429059505462646, - -0.9512806534767151, - 1.3433306217193604, - 0.3491983711719513, - -0.4967333674430847, - -2.549825668334961, - -0.7152066826820374, - -0.42618969082832336, - -1.7045925855636597, - -0.6547468304634094, - -0.6604399681091309, - -2.0904040336608887, - 1.0396133661270142, - -0.41867274045944214, - -1.9949613809585571, - -2.536947011947632, - -0.4811083674430847, - -2.215475082397461, - 0.9217885136604309, - -2.3584845066070557, - -1.1353845596313477, - -0.043536651879549026, - 1.4129749536514282 + -2.4839179515838623, + -2.989748239517212, + -4.415885925292969, + -4.346672058105469, + -3.2706820964813232, + -0.9454534649848938, + -1.2883728742599487, + -1.6748652458190918, + -5.4596848487854, + -5.6513776779174805, + -0.24761804938316345, + -1.4888640642166138, + -0.8605363368988037, + -1.9725847244262695, + -6.909929275512695, + -2.708397626876831, + -0.7754024863243103, + -5.6967973709106445, + -0.8881263136863708, + -6.934398174285889, + -1.4960343837738037, + -2.2084426879882812, + -4.651385307312012, + -1.8190819025039673, + -4.44832181930542, + -1.8388243913650513, + -2.7469568252563477, + -5.604869842529297, + -2.437312126159668, + -1.3868107795715332, + -5.354982852935791, + -4.581441402435303, + -3.4043173789978027, + -4.036744117736816, + -3.7389719486236572, + -4.098353862762451, + -4.029059410095215, + -4.093706130981445, + -4.036102771759033, + -3.6575052738189697, + -3.290090560913086, + -4.05233097076416, + -4.351712226867676, + -4.0254034996032715, + -3.7103238105773926, + -4.058472633361816, + -2.5803277492523193, + -3.003248929977417, + -3.189161539077759, + -5.584373474121094, + -3.3394620418548584, + -3.5453758239746094 ], "yaxis": "y", "type": "scatter" @@ -4338,7 +4448,7 @@ "plotlyServerURL": "https://plot.ly" } }, - "text/html": "
" + "text/html": "
" }, "metadata": {}, "output_type": "display_data" @@ -4351,12 +4461,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-19T15:15:52.140066Z", - "start_time": "2024-03-19T15:15:51.897705Z" + "end_time": "2024-03-26T10:08:00.901952Z", + "start_time": "2024-03-26T10:08:00.630898Z" } }, "id": "1c459935fab2bc93", - "execution_count": 68 + "execution_count": 23 }, { "cell_type": "code", @@ -4365,19 +4475,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing column: reg_city1 1.92%\n", - "Processing column: reg_state1 3.85%\n", - "Processing column: flight1 5.77%\n", - "Processing column: tail_number1 7.69%\n", + "Processing column: reg_state1 1.92%\n", + "Processing column: reg_city1 3.85%\n", + "Processing column: tail_number1 5.77%\n", + "Processing column: flight1 7.69%\n", "Processing column: reg_expiration1 9.62%\n", - "Processing column: reg_owner1 11.54%\n", - "Processing column: manufacturer1 13.46%\n", + "Processing column: manufacturer1 11.54%\n", + "Processing column: reg_owner1 13.46%\n", "Processing column: model1 15.38%\n", - "Processing column: TP mods2 17.31%\n", - "Processing column: Engine Type2 19.23%\n", - "Processing column: Multi Engine2 21.15%\n", - "Processing column: Company2 23.08%\n", - "Processing column: Model2 25.0%\n", + "Processing column: Multi Engine2 17.31%\n", + "Processing column: TP mods2 19.23%\n", + "Processing column: Engine Type2 21.15%\n", + "Processing column: Model2 23.08%\n", + "Processing column: Company2 25.0%\n", "Processing column: make3 26.92%\n", "Processing column: gear3 28.85%\n", "Processing column: model3 30.77%\n", @@ -4385,39 +4495,39 @@ "Processing column: offerType3 34.62%\n", "Processing column: fuel_type4 36.54%\n", "Processing column: transmission_type4 38.46%\n", - "Processing column: car_name4 40.38%\n", - "Processing column: body_type4 42.31%\n", - "Processing column: model5 44.23%\n", - "Processing column: brand5 46.15%\n", + "Processing column: body_type4 40.38%\n", + "Processing column: car_name4 42.31%\n", + "Processing column: country5 44.23%\n", + "Processing column: model5 46.15%\n", "Processing column: vin5 48.08%\n", - "Processing column: country5 50.0%\n", - "Processing column: state5 51.92%\n", + "Processing column: brand5 50.0%\n", + "Processing column: condition5 51.92%\n", "Processing column: title_status5 53.85%\n", - "Processing column: condition5 55.77%\n", + "Processing column: state5 55.77%\n", "Processing column: color5 57.69%\n", "Processing column: Certificate6 59.62%\n", - "Processing column: Gross6 61.54%\n", - "Processing column: Poster_Link6 63.46%\n", - "Processing column: Star26 65.38%\n", - "Processing column: Overview6 67.31%\n", - "Processing column: Director6 69.23%\n", - "Processing column: Star46 71.15%\n", - "Processing column: Series_Title6 73.08%\n", - "Processing column: Star16 75.0%\n", - "Processing column: Star36 76.92%\n", + "Processing column: Poster_Link6 61.54%\n", + "Processing column: Gross6 63.46%\n", + "Processing column: Director6 65.38%\n", + "Processing column: Star36 67.31%\n", + "Processing column: Star26 69.23%\n", + "Processing column: Star16 71.15%\n", + "Processing column: Overview6 73.08%\n", + "Processing column: Star46 75.0%\n", + "Processing column: Series_Title6 76.92%\n", "Processing column: Genre6 78.85%\n", "Processing column: show_id7 80.77%\n", "Processing column: cast7 82.69%\n", - "Processing column: title7 84.62%\n", - "Processing column: description7 86.54%\n", + "Processing column: description7 84.62%\n", + "Processing column: title7 86.54%\n", "Processing column: director7 88.46%\n", - "Processing column: listed_in7 90.38%\n", - "Processing column: duration7 92.31%\n", - "Processing column: type7 94.23%\n", + "Processing column: type7 90.38%\n", + "Processing column: country7 92.31%\n", + "Processing column: listed_in7 94.23%\n", "Processing column: rating7 96.15%\n", - "Processing column: country7 98.08%\n", + "Processing column: duration7 98.08%\n", "Processing column: date_added7 100.0%\n", - "ELAPSED TIME :5.922260284423828\n" + "ELAPSED TIME :6.345606088638306\n" ] } ], @@ -4427,12 +4537,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-19T15:15:58.068124Z", - "start_time": "2024-03-19T15:15:52.141138Z" + "end_time": "2024-03-26T10:08:07.252811Z", + "start_time": "2024-03-26T10:08:00.903005Z" } }, "id": "75f2e3d05b70b94e", - "execution_count": 69 + "execution_count": 24 }, { "cell_type": "code", @@ -4441,19 +4551,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing column: reg_city1 1.92%\n", - "Processing column: reg_state1 3.85%\n", - "Processing column: flight1 5.77%\n", - "Processing column: tail_number1 7.69%\n", + "Processing column: reg_state1 1.92%\n", + "Processing column: reg_city1 3.85%\n", + "Processing column: tail_number1 5.77%\n", + "Processing column: flight1 7.69%\n", "Processing column: reg_expiration1 9.62%\n", - "Processing column: reg_owner1 11.54%\n", - "Processing column: manufacturer1 13.46%\n", + "Processing column: manufacturer1 11.54%\n", + "Processing column: reg_owner1 13.46%\n", "Processing column: model1 15.38%\n", - "Processing column: TP mods2 17.31%\n", - "Processing column: Engine Type2 19.23%\n", - "Processing column: Multi Engine2 21.15%\n", - "Processing column: Company2 23.08%\n", - "Processing column: Model2 25.0%\n", + "Processing column: Multi Engine2 17.31%\n", + "Processing column: TP mods2 19.23%\n", + "Processing column: Engine Type2 21.15%\n", + "Processing column: Model2 23.08%\n", + "Processing column: Company2 25.0%\n", "Processing column: make3 26.92%\n", "Processing column: gear3 28.85%\n", "Processing column: model3 30.77%\n", @@ -4461,39 +4571,39 @@ "Processing column: offerType3 34.62%\n", "Processing column: fuel_type4 36.54%\n", "Processing column: transmission_type4 38.46%\n", - "Processing column: car_name4 40.38%\n", - "Processing column: body_type4 42.31%\n", - "Processing column: model5 44.23%\n", - "Processing column: brand5 46.15%\n", + "Processing column: body_type4 40.38%\n", + "Processing column: car_name4 42.31%\n", + "Processing column: country5 44.23%\n", + "Processing column: model5 46.15%\n", "Processing column: vin5 48.08%\n", - "Processing column: country5 50.0%\n", - "Processing column: state5 51.92%\n", + "Processing column: brand5 50.0%\n", + "Processing column: condition5 51.92%\n", "Processing column: title_status5 53.85%\n", - "Processing column: condition5 55.77%\n", + "Processing column: state5 55.77%\n", "Processing column: color5 57.69%\n", "Processing column: Certificate6 59.62%\n", - "Processing column: Gross6 61.54%\n", - "Processing column: Poster_Link6 63.46%\n", - "Processing column: Star26 65.38%\n", - "Processing column: Overview6 67.31%\n", - "Processing column: Director6 69.23%\n", - "Processing column: Star46 71.15%\n", - "Processing column: Series_Title6 73.08%\n", - "Processing column: Star16 75.0%\n", - "Processing column: Star36 76.92%\n", + "Processing column: Poster_Link6 61.54%\n", + "Processing column: Gross6 63.46%\n", + "Processing column: Director6 65.38%\n", + "Processing column: Star36 67.31%\n", + "Processing column: Star26 69.23%\n", + "Processing column: Star16 71.15%\n", + "Processing column: Overview6 73.08%\n", + "Processing column: Star46 75.0%\n", + "Processing column: Series_Title6 76.92%\n", "Processing column: Genre6 78.85%\n", "Processing column: show_id7 80.77%\n", "Processing column: cast7 82.69%\n", - "Processing column: title7 84.62%\n", - "Processing column: description7 86.54%\n", + "Processing column: description7 84.62%\n", + "Processing column: title7 86.54%\n", "Processing column: director7 88.46%\n", - "Processing column: listed_in7 90.38%\n", - "Processing column: duration7 92.31%\n", - "Processing column: type7 94.23%\n", + "Processing column: type7 90.38%\n", + "Processing column: country7 92.31%\n", + "Processing column: listed_in7 94.23%\n", "Processing column: rating7 96.15%\n", - "Processing column: country7 98.08%\n", + "Processing column: duration7 98.08%\n", "Processing column: date_added7 100.0%\n", - "ELAPSED TIME :593.521614074707\n" + "ELAPSED TIME :553.5474011898041\n" ] } ], @@ -4503,12 +4613,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-19T15:25:51.594792Z", - "start_time": "2024-03-19T15:15:58.069281Z" + "end_time": "2024-03-26T10:17:20.804893Z", + "start_time": "2024-03-26T10:08:07.253938Z" } }, "id": "287c7ad9f3cea37b", - "execution_count": 70 + "execution_count": 25 }, { "cell_type": "code", @@ -4529,30 +4639,33 @@ "data": [ { "customdata": [ - [ - "reg_city1" - ], [ "reg_state1" ], [ - "flight1" + "reg_city1" ], [ "tail_number1" ], + [ + "flight1" + ], [ "reg_expiration1" ], [ - "reg_owner1" + "manufacturer1" ], [ - "manufacturer1" + "reg_owner1" ], [ "model1" ], + [ + "Multi Engine2" + ], [ "TP mods2" ], @@ -4560,14 +4673,11 @@ "Engine Type2" ], [ - "Multi Engine2" + "Model2" ], [ "Company2" ], - [ - "Model2" - ], [ "make3" ], @@ -4590,31 +4700,31 @@ "transmission_type4" ], [ - "car_name4" + "body_type4" ], [ - "body_type4" + "car_name4" ], [ - "model5" + "country5" ], [ - "brand5" + "model5" ], [ "vin5" ], [ - "country5" + "brand5" ], [ - "state5" + "condition5" ], [ "title_status5" ], [ - "condition5" + "state5" ], [ "color5" @@ -4623,31 +4733,31 @@ "Certificate6" ], [ - "Gross6" + "Poster_Link6" ], [ - "Poster_Link6" + "Gross6" ], [ - "Star26" + "Director6" ], [ - "Overview6" + "Star36" ], [ - "Director6" + "Star26" ], [ - "Star46" + "Star16" ], [ - "Series_Title6" + "Overview6" ], [ - "Star16" + "Star46" ], [ - "Star36" + "Series_Title6" ], [ "Genre6" @@ -4659,28 +4769,28 @@ "cast7" ], [ - "title7" + "description7" ], [ - "description7" + "title7" ], [ "director7" ], [ - "listed_in7" + "type7" ], [ - "duration7" + "country7" ], [ - "type7" + "listed_in7" ], [ "rating7" ], [ - "country7" + "duration7" ], [ "date_added7" @@ -4690,58 +4800,58 @@ "legendgroup": "", "marker": { "color": [ - 8, - 8, - 9, + 10, + 10, + 5, + 5, 9, - 3, - 6, + 4, + 4, + 4, + 8, + 0, + 4, + 4, + 4, 6, + 8, 6, - 4, + 3, + 1, + 3, + 8, 6, - 4, 6, + 1, 6, - 7, - 4, - 7, - 0, - 10, - 0, - 0, - 7, 5, - 7, - 7, + 6, 9, - 11, 8, - 4, - 3, - 7, - 9, - 3, + 10, + 6, + 5, + 5, 9, - 1, 2, - 1, - 1, 2, - 1, - 1, 2, - 9, - 1, 2, + 11, + 2, + 11, + 7, + 5, + 2, + 11, + 11, 2, - 1, 2, - 3, 1, + 7, + 5, 9, - 8, - 3 + 9 ], "coloraxis": "coloraxis", "symbol": "circle" @@ -4751,113 +4861,113 @@ "orientation": "v", "showlegend": false, "x": [ - -1.2315573692321777, - -1.3960601091384888, - 0.16279497742652893, - 0.336090087890625, - -0.2639041244983673, - -2.56018328666687, - -2.9185597896575928, - -2.144641160964966, - 2.4411609172821045, - -3.4252243041992188, - 2.3289527893066406, - -2.9055278301239014, - -2.4776194095611572, - -1.5479029417037964, - 2.1699094772338867, - -2.1339659690856934, - 1.3242402076721191, - -0.5383439064025879, - 1.4190571308135986, - 2.1065027713775635, - -1.4585567712783813, - -1.735015869140625, - -1.8539036512374878, - -1.390334129333496, - 0.2982367277145386, - 0.926008939743042, - -1.4070314168930054, - 2.5462491512298584, - -0.8543004989624023, - -2.2616286277770996, - 1.3494809865951538, - -0.6881126165390015, - 0.34734776616096497, - -0.11908050626516342, - -3.8797318935394287, - -0.22775070369243622, - -0.06335285305976868, - -2.9259536266326904, - -0.38409364223480225, - -0.5979487895965576, - -3.3752193450927734, - 0.15268082916736603, - 0.294850617647171, - -3.1962966918945312, - -3.836763858795166, - -0.109186090528965, - -3.3986334800720215, - -0.7591543197631836, - 0.08581861108541489, - 1.2354620695114136, - -1.6200463771820068, - -0.33383724093437195 + -0.93117356300354, + -0.5845441818237305, + -2.7235045433044434, + -2.6017391681671143, + -3.0384340286254883, + 0.5333382487297058, + 0.25806382298469543, + -0.09599914401769638, + -3.0942459106445312, + -3.3379340171813965, + 0.9325237274169922, + 0.3388823866844177, + 0.5669587254524231, + 1.4670075178146362, + -2.5555593967437744, + 1.04537832736969, + -1.2160331010818481, + -0.6075345277786255, + -1.341273546218872, + -2.3048923015594482, + 2.080313205718994, + 1.1715267896652222, + -1.4227235317230225, + 1.3285086154937744, + -2.558462142944336, + 1.5313986539840698, + -2.51103138923645, + -2.766148328781128, + -0.9191096425056458, + 0.6287733912467957, + -1.8047654628753662, + -2.1264681816101074, + -2.2419049739837646, + -0.694290280342102, + -0.5716606974601746, + -1.0548096895217896, + -0.6587052941322327, + 1.2156471014022827, + -1.102487564086914, + 0.6747475862503052, + 1.8054462671279907, + -2.824810743331909, + -1.412682294845581, + 1.2583482265472412, + 0.967156708240509, + -0.9760659337043762, + -0.40136075019836426, + -0.5862438678741455, + 1.6448854207992554, + -1.9822865724563599, + -2.2009506225585938, + -2.8945517539978027 ], "xaxis": "x", "y": [ - -1.4533166885375977, - -1.858666181564331, - -2.41731858253479, - -2.5061285495758057, - -3.7297160625457764, - -2.083111047744751, - -2.4953696727752686, - -2.7385475635528564, - -1.27792227268219, - -3.1510751247406006, - -0.9756304621696472, - -2.4108920097351074, - -2.533029556274414, - 0.7466858625411987, - -0.2979642152786255, - 0.020160024985671043, - 1.2635958194732666, - 1.3030813932418823, - 1.169091820716858, - 0.15421932935714722, - 0.4831574261188507, - 1.3783798217773438, - 0.4652738869190216, - 0.8725593090057373, - -2.3842854499816895, - -0.10721741616725922, - -1.840975284576416, - 0.0952586904168129, - -3.5729644298553467, - -0.5368195176124573, - -2.155054807662964, - -2.8722262382507324, - -1.8498497009277344, - -1.0395498275756836, - -0.585018515586853, - -0.5340877771377563, - -1.0597599744796753, - -0.7209460139274597, - -0.6978242993354797, - -0.8990448117256165, - 0.3901121914386749, - -2.7778234481811523, - -1.1740888357162476, - -0.5221556425094604, - -0.5215041637420654, - -0.8624324798583984, - 0.22354581952095032, - -3.0666046142578125, - 0.3743482232093811, - -2.3861749172210693, - -1.1583610773086548, - -3.593682289123535 + -0.8487504720687866, + -0.2875222861766815, + 0.2438797652721405, + 0.210541769862175, + -1.0851330757141113, + -1.989612340927124, + -1.4605934619903564, + -1.680659532546997, + 1.9580214023590088, + 1.8102328777313232, + -2.6760451793670654, + -1.6934581995010376, + -1.8997161388397217, + -0.6463617086410522, + 2.353159189224243, + -0.13706769049167633, + 3.0675768852233887, + -2.792996644973755, + 3.043461561203003, + 2.655846118927002, + -0.8199979066848755, + -0.6571944952011108, + -2.1100311279296875, + -0.5589699149131775, + 0.2689114212989807, + -0.7590398192405701, + -1.3654488325119019, + 2.7915844917297363, + -0.8548046350479126, + -0.1063171774148941, + 0.03021148219704628, + 0.7474803328514099, + -0.8430166244506836, + 0.9680089354515076, + 0.6484963297843933, + 0.7129985094070435, + 0.7789408564567566, + 1.7361503839492798, + 0.6301454901695251, + 0.9279691576957703, + 1.0270980596542358, + -0.10064753144979477, + 0.8464698791503906, + 1.6700187921524048, + 1.0946271419525146, + 0.8363156914710999, + 1.941645622253418, + -0.8647369146347046, + 1.1233822107315063, + -0.17551742494106293, + -1.0526247024536133, + -1.0547256469726562 ], "yaxis": "y", "type": "scatter" @@ -5772,7 +5882,7 @@ "plotlyServerURL": "https://plot.ly" } }, - "text/html": "
" + "text/html": "
" }, "metadata": {}, "output_type": "display_data" @@ -5785,12 +5895,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-19T15:25:51.865742Z", - "start_time": "2024-03-19T15:25:51.595953Z" + "end_time": "2024-03-26T10:17:21.266462Z", + "start_time": "2024-03-26T10:17:20.806032Z" } }, "id": "49fde730403db101", - "execution_count": 71 + "execution_count": 26 }, { "cell_type": "code", @@ -5811,29 +5921,32 @@ "data": [ { "customdata": [ + [ + "reg_state1" + ], [ "reg_city1" ], [ - "reg_state1" + "tail_number1" ], [ "flight1" ], [ - "tail_number1" + "reg_expiration1" ], [ - "reg_expiration1" + "manufacturer1" ], [ "reg_owner1" ], [ - "manufacturer1" + "model1" ], [ - "model1" + "Multi Engine2" ], [ "TP mods2" @@ -5842,14 +5955,11 @@ "Engine Type2" ], [ - "Multi Engine2" + "Model2" ], [ "Company2" ], - [ - "Model2" - ], [ "make3" ], @@ -5872,31 +5982,31 @@ "transmission_type4" ], [ - "car_name4" + "body_type4" ], [ - "body_type4" + "car_name4" ], [ - "model5" + "country5" ], [ - "brand5" + "model5" ], [ "vin5" ], [ - "country5" + "brand5" ], [ - "state5" + "condition5" ], [ "title_status5" ], [ - "condition5" + "state5" ], [ "color5" @@ -5905,31 +6015,31 @@ "Certificate6" ], [ - "Gross6" + "Poster_Link6" ], [ - "Poster_Link6" + "Gross6" ], [ - "Star26" + "Director6" ], [ - "Overview6" + "Star36" ], [ - "Director6" + "Star26" ], [ - "Star46" + "Star16" ], [ - "Series_Title6" + "Overview6" ], [ - "Star16" + "Star46" ], [ - "Star36" + "Series_Title6" ], [ "Genre6" @@ -5941,28 +6051,28 @@ "cast7" ], [ - "title7" + "description7" ], [ - "description7" + "title7" ], [ "director7" ], [ - "listed_in7" + "type7" ], [ - "duration7" + "country7" ], [ - "type7" + "listed_in7" ], [ "rating7" ], [ - "country7" + "duration7" ], [ "date_added7" @@ -5972,58 +6082,58 @@ "legendgroup": "", "marker": { "color": [ - 4, - 4, 1, 1, - 10, 5, 5, - 10, - 9, - 4, + 5, + 0, 0, 5, - 10, + 7, + 7, 2, + 5, 0, 4, - 8, - 0, - 8, - 0, + 7, + 10, 2, + 10, 2, + 7, 4, - 2, - 1, - 0, 4, - 6, 10, - 4, - 0, 10, - 11, + 5, 4, + 11, + 6, + 1, + 10, + 10, + 9, + 5, + 1, + 1, + 1, + 1, 3, - 4, - 4, - 4, - 4, - 4, - 7, 1, - 4, - 4, + 1, + 8, + 5, + 1, 3, - 4, - 7, - 10, - 7, - 4, - 4, - 10 + 1, + 1, + 8, + 1, + 8, + 8, + 5, + 11 ], "coloraxis": "coloraxis", "symbol": "circle" @@ -6033,113 +6143,113 @@ "orientation": "v", "showlegend": false, "x": [ - 2.232069730758667, - 2.04703950881958, - 3.685438871383667, - 3.7534923553466797, - 4.548223495483398, - 2.1641151905059814, - 1.6716158390045166, - 3.707670211791992, - 3.551330089569092, - 0.7382426261901855, - 2.2600345611572266, - 1.6929031610488892, - 3.5000805854797363, - 0.8841426968574524, - 1.9816348552703857, - 2.538231134414673, - 0.15006133913993835, - 2.2033517360687256, - 0.22405526041984558, - 2.0226693153381348, - 1.137275218963623, - 0.6898834705352783, - 2.1997299194335938, - 0.7599173784255981, - 4.115108489990234, - 1.3826909065246582, - 2.0932059288024902, - 2.393995761871338, - 5.186481475830078, - 2.962049961090088, - 2.391953468322754, - 4.0034589767456055, - 4.351011276245117, - 2.7698190212249756, - 3.9041121006011963, - 2.769899368286133, - 2.7911529541015625, - 2.699028968811035, - 2.755099058151245, - 2.786696195602417, - 3.2102408409118652, - 3.760199546813965, - 3.3576271533966064, - 2.758638858795166, - 3.8999764919281006, - 2.8488729000091553, - 3.1844098567962646, - 4.427828311920166, - 2.6792852878570557, - 2.7276439666748047, - 1.7881308794021606, - 5.050660133361816 + 0.07464970648288727, + 0.3129180669784546, + 1.8015512228012085, + 1.7333874702453613, + 2.3830831050872803, + 2.6272332668304443, + 1.9825297594070435, + 1.9731072187423706, + 0.8772491812705994, + -0.4399683475494385, + -1.1831083297729492, + 1.889535665512085, + 2.636423349380493, + -0.574123203754425, + 0.37415051460266113, + 0.6518558263778687, + -1.5904536247253418, + 0.34899643063545227, + -1.5395406484603882, + 0.3759491741657257, + -0.7246766686439514, + -0.26768624782562256, + -0.5958290100097656, + 0.3429754078388214, + 2.285098075866699, + -0.6929587721824646, + 3.0599184036254883, + 1.3004878759384155, + 0.12265949696302414, + 0.9976135492324829, + 0.4892101585865021, + 2.1270360946655273, + 2.1105659008026123, + 0.9012678861618042, + 0.8896762132644653, + 0.8739141821861267, + 0.8704595565795898, + 1.6939094066619873, + 0.8959806561470032, + 0.7778205275535583, + 1.0063170194625854, + 1.8169519901275635, + 1.3267322778701782, + 1.6878050565719604, + 0.8405084013938904, + 0.9591824412345886, + 0.4891231656074524, + -0.14022839069366455, + 1.0082389116287231, + 0.5928480625152588, + 2.39751935005188, + -0.8518069982528687 ], "xaxis": "x", "y": [ - 2.690234661102295, - 2.5165600776672363, - 3.282524585723877, - 3.40256667137146, - 3.580906867980957, - 1.5391180515289307, - 0.852472722530365, - 2.5674898624420166, - 4.886576175689697, - 3.5124340057373047, - 5.190670490264893, - 0.8154540657997131, - 2.2167246341705322, - 2.026197671890259, - 4.476515769958496, - 3.4282286167144775, - 3.0664360523223877, - 4.188016414642334, - 3.111924171447754, - 4.605981349945068, - 1.835215449333191, - 1.967597246170044, - 3.4694032669067383, - 2.036349058151245, - 2.589768886566162, - 3.527857542037964, - 2.493844509124756, - 5.637296199798584, - 3.052475690841675, - 4.274573802947998, - 4.028963088989258, - 3.023559808731079, - 1.2799495458602905, - 3.017526388168335, - -0.001000845804810524, - 2.961000680923462, - 3.0660297870635986, - 2.639333486557007, - 2.9294049739837646, - 3.0432934761047363, - 0.8248129487037659, - 3.3993475437164307, - 1.8885663747787476, - 2.466247081756592, - 0.011215770617127419, - 3.013014793395996, - 0.6786802411079407, - 3.1485755443573, - 0.8549591302871704, - 1.605120062828064, - 2.9543240070343018, - 2.55021071434021 + 1.2225661277770996, + 1.3324196338653564, + 2.021209239959717, + 1.9221495389938354, + 2.5465946197509766, + 0.16001343727111816, + 0.39939454197883606, + 1.2751306295394897, + 3.6996567249298096, + 3.5395219326019287, + 1.2964847087860107, + 0.9139719009399414, + 0.1269349604845047, + 0.08850818872451782, + 3.0580782890319824, + 1.9371856451034546, + 0.7239078879356384, + 2.692063570022583, + 0.7861824631690979, + 3.1869115829467773, + -0.04748240113258362, + 0.01011333242058754, + 1.9134701490402222, + 1.9545291662216187, + 1.5034958124160767, + 0.05370209366083145, + 2.3742215633392334, + 4.004917621612549, + 1.2006990909576416, + 2.763378858566284, + 2.5211269855499268, + -0.5236079692840576, + 1.8120489120483398, + 1.3816590309143066, + 1.5251518487930298, + 1.502103328704834, + 1.396881341934204, + -1.523393988609314, + 1.5436475276947021, + 1.1724165678024292, + -0.5919604301452637, + 1.9960685968399048, + 0.46192702651023865, + -1.5119831562042236, + 1.0042810440063477, + 1.4661548137664795, + -0.528108537197113, + 1.517184853553772, + -0.7407345771789551, + 0.20635902881622314, + 2.198230028152466, + 2.5028865337371826 ], "yaxis": "y", "type": "scatter" @@ -7054,7 +7164,7 @@ "plotlyServerURL": "https://plot.ly" } }, - "text/html": "
" + "text/html": "
" }, "metadata": {}, "output_type": "display_data" @@ -7067,35 +7177,42 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-19T15:25:52.147365Z", - "start_time": "2024-03-19T15:25:51.868966Z" + "end_time": "2024-03-26T10:17:21.637111Z", + "start_time": "2024-03-26T10:17:21.267536Z" } }, "id": "af662f30dc4f1479", - "execution_count": 72 + "execution_count": 27 }, { "cell_type": "code", "outputs": [], - "source": [], + "source": [ + " " + ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-19T15:25:52.151973Z", - "start_time": "2024-03-19T15:25:52.149061Z" + "end_time": "2024-03-26T10:17:21.642099Z", + "start_time": "2024-03-26T10:17:21.638772Z" } }, "id": "f56ac97595bdb926", - "execution_count": 72 + "execution_count": 27 }, { "cell_type": "code", "outputs": [], "source": [], "metadata": { - "collapsed": false + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-26T10:17:21.646460Z", + "start_time": "2024-03-26T10:17:21.643260Z" + } }, - "id": "5549cbcc0f37bc4f" + "id": "5549cbcc0f37bc4f", + "execution_count": 27 } ], "metadata": { diff --git a/main.py b/main.py index ea74ef4..2748652 100644 --- a/main.py +++ b/main.py @@ -1,20 +1,7 @@ -# This is a sample Python script. -# Press Shift+F10 to execute it or replace it with your code. -# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings. -import functions as f +def compare_datasets(path1, path2): + ... -def print_hi(name): - database, names = f.load__csv_files_from_folder("data") - data0 = f.DataFrameMetadataCreator(database[0]) - columns = data0.get_numerical_columns() - print(data0.column_type) - data0.create_column_embeddings() - embediings = data0.column_embeddings - -# Press the green button in the gutter to run the script. if __name__ == '__main__': - print_hi('PyCharm') - -# See PyCharm help at https://www.jetbrains.com/help/pycharm/ + compare_datasets('data/netflix_titles.csv', 'data/imdb_top_1000.csv') diff --git a/requirements.txt b/requirements.txt index 322883e..a3ffea0 100644 Binary files a/requirements.txt and b/requirements.txt differ diff --git a/similarity/ComparatorByColumn.py b/similarity/ComparatorByColumn.py index 401a9eb..bec002d 100644 --- a/similarity/ComparatorByColumn.py +++ b/similarity/ComparatorByColumn.py @@ -9,7 +9,6 @@ from similarity.DataFrameMetadata import DataFrameMetadata, KindMetadata, CategoricalMetadata from similarity.Types import DataKind - class ComparatorType(ABC): def __init__(self, weight=1): self.weight = weight diff --git a/similarity/DataFrameMetadata.py b/similarity/DataFrameMetadata.py index 2961bf0..3840f82 100644 --- a/similarity/DataFrameMetadata.py +++ b/similarity/DataFrameMetadata.py @@ -52,6 +52,10 @@ def __init__(self, value: Optional[tuple], distribution: Optional[tuple[int, ... self.nulls = null_values self.ratio_max_length = ratio_max_length + def __str__(self): + return f"KindMetadata(value={self.value}, distribution={self.distribution}, longest={self.longest}, shortest={self.shortest}, null_values={self.nulls}, ratio_max_length={self.ratio_max_length})" + + class NonnumericalMetadata: """ @@ -69,6 +73,8 @@ def __init__(self, longest: str, shortest: str, avg_length: int): # todo bigrams trigrams ? # todo embeddings ?? nebo mame pro cele sloupce ? + def __str__(self): + return f"NonnumericalMetadata(longest={self.longest}, shortest={self.shortest}, avg_length={self.avg_length})" class NumericalMetadata: """ @@ -86,6 +92,9 @@ def __init__(self, min_value: float | int, max_value: float | int, same_value_le self.same_value_length = same_value_length # todo distribution !!!!!! + def __str__(self): + return f"NumericalMetadata(min_value={self.min_value}, max_value={self.max_value}, range_size={self.range_size}, same_value_length={self.same_value_length})" + class DataFrameMetadata: def __init__(self): @@ -117,6 +126,11 @@ def get_column_type(self, name): if name in columns: return column_type + def get_column_kind(self, name): + for column_kind, columns in self.column_kind.items(): + if name in columns: + return column_kind + def get_column_names_by_type(self, *types): if NONNUMERICAL in types: types = list(types) diff --git a/similarity/Types.py b/similarity/Types.py index 8572983..b16d1af 100644 --- a/similarity/Types.py +++ b/similarity/Types.py @@ -1,3 +1,6 @@ +""" +This files contains all +""" import re from enum import Enum from typing import Any @@ -65,6 +68,11 @@ def is_constant(column: pd.Series) -> bool: def series_to_numeric(x: pd.Series): + """ + Apply to_numeric on pd.Series + :param x: + :return: numeric series + """ try: to_numeric = x.apply(lambda s: pd.to_numeric(s.replace(',', '.'), errors='coerce')) except AttributeError: @@ -76,13 +84,15 @@ def is_numerical(x: pd.Series) -> bool: """ Decide if column type is numerical. - Column is numerical if it could be transferred into numeric, and it is float or int, and it is not full nulls + Column is numerical if it could be transferred into numeric, + and it is float or int, and it is not full nulls :param x: the type :return: true if it is numerical, otherwise false """ - x.isnull().values.sum() / x.size - return x.any() and (x.dtype == np.float64 or x.dtype == np.int64) and not (x.isnull().values.sum() / x.size > 0.9) + return (x.any() and + (x.dtype in (np.float64, np.int64)) and not + (x.isnull().values.sum() / x.size > 0.9)) def is_int(x: pd.Series) -> bool: @@ -101,7 +111,8 @@ def is_human_gen(x: pd.Series) -> bool: """ Decide if float number is human generated - Float is human generated if number of numbers after decimal point is smaller than computer_generated_threshold + Float is human generated if number of numbers after decimal + point is smaller than computer_generated_threshold :param x: the series for decide :return: true if it is human generated, otherwise false @@ -113,9 +124,9 @@ def floating_length_gt(num: Any, gt: int): :param num: to decide :param gt: threshold """ - splitted = str(num).split(".") - if len(splitted) > 1: - return len(splitted[1]) > gt + split = str(num).split(".") + if len(split) > 1: + return len(split[1]) > gt return False return x.apply(lambda s: not floating_length_gt(s, TypeSettings.computer_generated_threshold)).all() @@ -124,7 +135,7 @@ def floating_length_gt(num: Any, gt: int): def is_not_numerical(x: pd.Series) -> bool: """ Decide if type is not numerical - The column is not numerical if it is not numerical and it could be transfer to string + The column is not numerical if it is not numerical, and it could be transferred to string :param x: the series for decide :return: false if it is numerical, otherwise true @@ -217,15 +228,17 @@ def is_str_phrase(word: str): def is_sentence(x: pd.Series) -> bool: """ - The string is sentence if it starts with uperCasse letter and end with interpunction + The string is sentence if it starts with upperCase letter and end with fullstops. :param x: series for decide :return: true for sentence """ def is_str_sentence(word: str): - return ((word.endswith(".") or word.endswith("!") or word.endswith("?")) and word.count(".") <= 1 - and word.count("!") <= 1 and word.count("?") <= 1) and re.search("^[A-Z]", word) + return (((word.endswith(".") or word.endswith("!") + or word.endswith("?")) and word.count(".") <= 1 + and word.count("!") <= 1 and word.count("?") <= 1) + and re.search("^[A-Z]", word)) return x.apply(lambda s: is_str_sentence(s)).all() @@ -252,14 +265,17 @@ def is_str_multiple(word: str): regex = re.compile('[a-zA-Z0-9]') word_clean = regex.sub('', word) res = "".join(dict.fromkeys(word_clean)) - return (word.count(res) == word_clean.count(res) and word_clean.count(res) > 0) or res == '' and res != ' ' + return ((word.count(res) == word_clean.count(res) + and word_clean.count(res) > 0) + or res == '' and res != ' ') return x.apply(lambda s: is_str_multiple(s)).all() def is_true_multiple(x: pd.Series) -> bool: """ - The string is true multiple if the record is split by some sequence and the sequence is the same for all rows + The string is true multiple if the record is split + by some sequence and the sequence is the same for all rows :param x: series for decide :return: true for multiple @@ -272,7 +288,7 @@ def is_str_multiple(word: str): return res result_arr = x.apply(lambda s: is_str_multiple(s)).values - to_return = [i for i in result_arr if (i != '')] + to_return = [i for i in result_arr if i != ''] if len(to_return) == 0: return True if to_return[0].replace(" ", "") == '': @@ -293,18 +309,20 @@ def is_str_date(word: str): try: parse(str(word), fuzzy_with_tokens=True) # todo add timezone return True - except ParserError: + except (ParserError, OverflowError) as e: element = str(word).strip() - one_or_two = '(\d{1}|\d{2})' - two_or_four = '(\d{2}|\d{4})' - months = '(January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)' - pattern = r'^(\d{1}|\d{2}|\d{4}),(\d{1}|\d{2}) ' + months # + '$' # 1999,4 Feb 1999,4 February - pattern = pattern + '|' + r'^' + one_or_two + '\. ' + one_or_two + '\. ' + two_or_four # 11. 4. 1999 - pattern = pattern + '|' + r'^(\d{1}|\d{2}|\d{4}),(\d{1}|\d{2})' + months # 1999,4February 1999,4Feb - if re.match(pattern, element): - return True - else: - return False + one_or_two = r'(\d{1}|\d{2})' + two_or_four = r'(\d{2}|\d{4})' + months = ('(January|February|March|April|May|June|July|August|' + 'September|October|November|December|Jan|Feb' + '|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)') + # + '$' # 1999,4 Feb 1999,4 February + pattern = r'^(\d{1}|\d{2}|\d{4}),(\d{1}|\d{2}) ' + months + # 11. 4. 1999 + pattern = pattern + '|' + r'^' + one_or_two + r'\. ' + one_or_two + r'\. ' + two_or_four + # 1999,4February 1999,4Feb + pattern = pattern + '|' + r'^(\d{1}|\d{2}|\d{4}),(\d{1}|\d{2})' + months + return bool(re.match(pattern, element)) return column.apply(lambda s: is_str_date(s)).all() @@ -324,11 +342,13 @@ def get_data_kind(column: pd.Series) -> "DataKind": return DataKind.CONSTANT if is_categorical(column): return DataKind.CATEGORICAL - else: - return DataKind.UNDEFINED + return DataKind.UNDEFINED class DataKind(Enum): + """ + Represents all data kind. + """ BOOL = "bool" ID = "id" CONSTANT = "constant" @@ -349,8 +369,7 @@ def get_basic_type(column: pd.Series) -> Any: return DATE if is_not_numerical(column): return NONNUMERICAL - else: - return UNDEFINED + return UNDEFINED def get_advanced_type(column: pd.Series) -> Any: @@ -364,19 +383,18 @@ def get_advanced_type(column: pd.Series) -> Any: if is_numerical(column_num): if is_int(column_num): return INT - else: - return FLOAT + return FLOAT if is_date(column): # todo what about year? return DATE if is_not_numerical(column): return NONNUMERICAL - else: - return UNDEFINED + return UNDEFINED def get_advanced_structural_type(column: pd.Series) -> Any: """ - Indicates type of column int, float - human, computer, date, text - word, sentence, phrase article, multiple + Indicates type of column int, float - human, computer, date, + text - word, sentence, phrase article, multiple :param column: to indicate :return: detected type @@ -385,10 +403,9 @@ def get_advanced_structural_type(column: pd.Series) -> Any: if is_numerical(column_num): if is_int(column_num): return INT - else: - if is_human_gen(column_num): - return HUMAN_GENERATED - return COMPUTER_GENERATED + if is_human_gen(column_num): + return HUMAN_GENERATED + return COMPUTER_GENERATED if is_date(column): return DATE if is_not_numerical(column): @@ -408,127 +425,145 @@ def get_advanced_structural_type(column: pd.Series) -> Any: if is_article(column): return ARTICLE return NONNUMERICAL - else: - return UNDEFINED - - -# class _Float(Enum): -# HUMAN_GENERATED = "human_generated" -# COMPUTER_GENERATED = "computer_generated" -# -# -# class _Numerical(Enum): -# FLOAT = _Float -# INT = "int" -# -# -# class _Word(Enum): -# ALPHABETIC = "alphabetic" -# ALPHANUMERIC = "alphanumeric" -# ALL = "all" -# -# -# class _Text(Enum): -# WORD = _Word -# SENTENCE = "sentence" -# PHRASE = "phrase" # max 4 words without punctuation -# ARTICLE = "article" -# MULTIPLE_VALUES = "multiple" # genres name : "Action, Adventure, Drama" -# -# -# class _NonNumerical(Enum): -# TEXT = _Text -# -# -# class Types(Enum): -# """ -# Enum class representing column type -# """ -# NUMERICAL = _Numerical -# NONNUMERICAL = _NonNumerical -# DATE = "date" -# UNDEFINED = "undefined" + return UNDEFINED class Type: + """ + Base class for type + """ def __init__(self, value): - self.value = value ## todo add values ? + self.value = value + + def __str__(self): + return "" class DATE(Type): - pass + """ + Represents type date. + """ + def __str__(self): + return "DATE" class UNDEFINED(Type): - pass + """ + Represents class undefined + """ + def __str__(self): + return "UNDEFINED" class NUMERICAL(Type): - pass + """ + Represents numerical types. + """ + def __str__(self): + return "NUMERICAL" class INT(NUMERICAL): - pass + """ + Represents INT type. + """ + def __str__(self): + return "INT" class FLOAT(NUMERICAL): - pass + """ + Represents FLOAT type. + """ + def __str__(self): + return "FLOAT" class HUMAN_GENERATED(FLOAT): - pass + """ + Represents float, which is probably generated by human. + Number of numbers after floating point is small or rounded. + """ + def __str__(self): + return "HUMAN_GENERATED" class COMPUTER_GENERATED(FLOAT): - pass + """ + Represents float, which is probably generated by computer. + Number of numbers after floating point is bigger or not rounded. + """ + def __str__(self): + return "COMPUTER_GENERATED" class NONNUMERICAL(Type): - pass + """ + Subclass for nonnumerical types + """ + def __str__(self): + return "NONNUMERICAL" class WORD(NONNUMERICAL): - pass + """ + Word is string without spaces. + """ + def __str__(self): + return "WORD" class ALPHABETIC(WORD): - pass + """ + This type is WORD, it contains only letters (a-z) + """ + def __str__(self): + return "ALPHABETIC" class ALPHANUMERIC(WORD): - pass + """ + This type is WORD, it contains only letters (a-z) and numbers (0-9) + """ + def __str__(self): + return "ALPHANUMERIC" class ALL(WORD): - pass + """ + This type is WORD, it could contain all characters. + """ + def __str__(self): + return "ALL" class SENTENCE(NONNUMERICAL): - pass + """ + Sentence is string that ends with fullstops. It could contain spaces. + """ + def __str__(self): + return "SENTENCE" class ARTICLE(NONNUMERICAL): - pass + """ + Article is string composite from sentences. + """ + def __str__(self): + return "ARTICLE" class PHRASE(NONNUMERICAL): - pass + """ + Phrase is string with spaces, but it is not sentence. + """ + def __str__(self): + return "PHRASE" class MULTIPLE_VALUES(NONNUMERICAL): - pass - - -# def get_super_type(type_: Types) -> Types: -# if (type_ == Types.NUMERICAL or type_ == Types.NUMERICAL.value.FLOAT or type_ == Types.NUMERICAL.value.INT or -# type_ == Types.NUMERICAL.value.FLOAT.value.HUMAN_GENERATED or -# type_ == Types.NUMERICAL.value.FLOAT.value.COMPUTER_GENERATED): -# return Types.NUMERICAL -# if type_ == Types.DATE: -# return Types.DATE -# if type_ == Types.UNDEFINED: -# return Types.UNDEFINED -# return Types.NONNUMERICAL -# -# -# x = Types.NUMERICAL.value + """ + MULTIPLE_VALUES is string, and it contains pattern that is repeated. (Name1|Name2|Name3|Name4) + """ + def __str__(self): + return "MULTIPLE_VALUES"