diff --git a/dev/.buildinfo b/dev/.buildinfo index 5a3c0cb3e..065c464a4 100644 --- a/dev/.buildinfo +++ b/dev/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: 2477e942aab9c78438a49ee8fb3d50a8 +config: e111dff184e2451e0a92cf034a83814e tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/dev/_images/sphx_glr_01_encodings_001.png b/dev/_images/sphx_glr_01_encodings_001.png index 0819b4e21..3a8528524 100644 Binary files a/dev/_images/sphx_glr_01_encodings_001.png and b/dev/_images/sphx_glr_01_encodings_001.png differ diff --git a/dev/_images/sphx_glr_01_encodings_thumb.png b/dev/_images/sphx_glr_01_encodings_thumb.png index 96e985853..c7a1afc3c 100644 Binary files a/dev/_images/sphx_glr_01_encodings_thumb.png and b/dev/_images/sphx_glr_01_encodings_thumb.png differ diff --git a/dev/_images/sphx_glr_08_join_aggregation_003.png b/dev/_images/sphx_glr_08_join_aggregation_003.png index e93efe16e..b63a7b4f5 100644 Binary files a/dev/_images/sphx_glr_08_join_aggregation_003.png and b/dev/_images/sphx_glr_08_join_aggregation_003.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_001.png b/dev/_images/sphx_glr_09_interpolation_join_001.png index c229641e3..cc108f1c5 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_001.png and b/dev/_images/sphx_glr_09_interpolation_join_001.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_002.png b/dev/_images/sphx_glr_09_interpolation_join_002.png index 5d6089a58..56e66bc7d 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_002.png and b/dev/_images/sphx_glr_09_interpolation_join_002.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_003.png b/dev/_images/sphx_glr_09_interpolation_join_003.png index 5613c13c5..a3f4a4be6 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_003.png and b/dev/_images/sphx_glr_09_interpolation_join_003.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_thumb.png b/dev/_images/sphx_glr_09_interpolation_join_thumb.png index 3ccd47ec6..d59810f56 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_thumb.png and b/dev/_images/sphx_glr_09_interpolation_join_thumb.png differ diff --git a/dev/_sources/auto_examples/01_encodings.rst.txt b/dev/_sources/auto_examples/01_encodings.rst.txt index 183e70f49..b3883f666 100644 --- a/dev/_sources/auto_examples/01_encodings.rst.txt +++ b/dev/_sources/auto_examples/01_encodings.rst.txt @@ -440,44 +440,45 @@ corresponding columns: .. code-block:: none - array(['supports, support, recruit', 'shelters, centers, sheriff', - 'rockville, fallsgrove, downtown', 'animal, individual, virtual', - 'engineering, mangement, budgeting', 'station, state, estate', - 'facilities, maintenance, accounts', 'silver, spring, ride', - 'behavioral, health, school', 'equipment, management, automotive', - 'patrol, 4th, 6th', 'safety, collision, section', - 'communications, communication, telecommunications', - 'warehouse, delivery, operations', 'highway, welfare, services', - 'gaithersburg, the, clarksburg', 'nicholson, transit, taxicab', - 'development, planning, stormwater', - 'assessment, protective, process', - 'construction, instruction, building', - 'mechanical, special, commercial', - 'technology, systems, telephone', 'district, urban, 3rd', - 'family, sexual, crimes', 'security, mccf, unit', - 'custody, members, customer', + array(['services, highway, service', 'engineering, training, planning', + 'traffic, safety, alcohol', 'sexual, family, crimes', + 'district, payroll, squad', 'construction, instruction, building', + 'silver, spring, ride', 'behavioral, health, school', + 'delivery, operations, special', 'gaithersburg, clarksburg, the', + 'welfare, childhood, children', 'toddlers, custody, members', + 'supports, support, network', 'station, state, estate', + 'director, officers, projects', 'management, fleet, parking', + 'maintenance, facilities, eligibility', + 'processing, programs, accounting', 'security, mc311, mccf', + 'communications, communication, immunization', + 'administration, battalion, administrative', 'patrol, 4th, 6th', + 'procurement, protective, fiscal', + 'development, residential, stormwater', 'investigative, investigations, criminal', - 'administration, administrative, battalion', - 'assistance, eligibility, emergency', 'automated, traffic, office', - 'master, registered, firefighter', 'manager, budget, projects', - 'specialist, special, environmental', 'candidate, police, cadet', - 'officer, office, traffic', 'operator, bus, operations', - 'income, assistance, client', 'liquor, clerk, store', - 'rescuer, recruit, firefighter', 'technician, mechanic, supply', - 'program, programs, procurement', 'lieutenant, captain, chief', - 'community, nurse, unit', 'school, health, room', + 'emergency, centers, center', 'nicholson, transit, taxicab', + 'automotive, assessment, equipment', + 'technology, systems, telephone', 'rockville, twinbrook, library', + 'dietary, security, partnerships', + 'operator, equipment, apprentice', + 'recreation, planning, renovation', + 'master, registered, firefighter', 'candidate, police, of', + 'officer, office, police', 'specialist, special, procurement', + 'program, programs, projects', 'technician, mechanic, supply', + 'legislative, principal, executive', + 'coordinator, coordinating, depot', 'sergeant, cadet, police', + 'recruit, firefighter, rescuer', 'community, health, nurse', + 'warehouse, welfare, caseworker', + 'enforcement, permitting, inspector', 'manager, budget, engineer', + 'school, room, behavioral', 'captain, rescue, chief', + 'supervisory, supervisor, therapist', 'communications, telecommunications, safety', - 'coordinator, services, service', 'accountant, assistant, county', - 'supervisor, supervisory, sergeant', - 'craftsworker, customer, public', 'sheriff, deputy, autobody', - 'enforcement, permitting, inspector', - 'correctional, correction, records', - 'information, technology, recreation', - 'administrative, principal, executive', - 'crossing, purchasing, engineer', 'warehouse, welfare, driver', - 'corporal, erp, behavioral', 'worker, social, leader', - 'librarian, library, telephone', - 'equipment, investment, investigator'], dtype=object) + 'liquor, clerk, store', 'librarian, crossing, library', + 'sheriff, deputy, autobody', + 'information, technology, technologist', + 'lieutenant, attendant, facilities', 'services, service, urban', + 'assistance, assistant, income', + 'correctional, correction, corporal', + 'administrative, administration, administrator'], dtype=object) @@ -559,7 +560,7 @@ Let's look at the cross-validated R2 score of our model: .. code-block:: none - R2 score: mean: 0.922; std: 0.013 + R2 score: mean: 0.919; std: 0.017 @@ -695,7 +696,7 @@ to plot the feature importances. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (1 minutes 16.726 seconds) + **Total running time of the script:** (1 minutes 20.106 seconds) .. _sphx_glr_download_auto_examples_01_encodings.py: diff --git a/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt b/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt index 5244a0a35..b233532e7 100644 --- a/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt +++ b/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt @@ -504,7 +504,7 @@ as a set of latent topics. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 1.903 seconds) + **Total running time of the script:** (0 minutes 1.818 seconds) .. _sphx_glr_download_auto_examples_02_feature_interpretation_with_gapencoder.py: diff --git a/dev/_sources/auto_examples/03_datetime_encoder.rst.txt b/dev/_sources/auto_examples/03_datetime_encoder.rst.txt index 106bb51e9..a3029da7e 100644 --- a/dev/_sources/auto_examples/03_datetime_encoder.rst.txt +++ b/dev/_sources/auto_examples/03_datetime_encoder.rst.txt @@ -610,7 +610,7 @@ and transforms datetime columns by default. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 4.509 seconds) + **Total running time of the script:** (0 minutes 4.276 seconds) .. _sphx_glr_download_auto_examples_03_datetime_encoder.py: diff --git a/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt b/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt index bc8356af6..85eb82281 100644 --- a/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt +++ b/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt @@ -1711,7 +1711,7 @@ introduced into a grid search: .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 19.485 seconds) + **Total running time of the script:** (0 minutes 20.298 seconds) .. _sphx_glr_download_auto_examples_04_fuzzy_joining.py: diff --git a/dev/_sources/auto_examples/05_deduplication.rst.txt b/dev/_sources/auto_examples/05_deduplication.rst.txt index e1647d9a4..4b304247b 100644 --- a/dev/_sources/auto_examples/05_deduplication.rst.txt +++ b/dev/_sources/auto_examples/05_deduplication.rst.txt @@ -335,7 +335,7 @@ or |MinHash|. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 4.891 seconds) + **Total running time of the script:** (0 minutes 5.398 seconds) .. _sphx_glr_download_auto_examples_05_deduplication.py: diff --git a/dev/_sources/auto_examples/06_ken_embeddings.rst.txt b/dev/_sources/auto_examples/06_ken_embeddings.rst.txt index d7c5deb74..a6510c37a 100644 --- a/dev/_sources/auto_examples/06_ken_embeddings.rst.txt +++ b/dev/_sources/auto_examples/06_ken_embeddings.rst.txt @@ -305,7 +305,7 @@ We will start by checking out the available tables with .. code-block:: none - {'games', 'all_entities', 'companies', 'movies', 'albums', 'schools'} + {'games', 'all_entities', 'movies', 'schools', 'companies', 'albums'} @@ -840,7 +840,7 @@ It helped significantly improve the prediction score. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (11 minutes 15.721 seconds) + **Total running time of the script:** (11 minutes 31.752 seconds) .. _sphx_glr_download_auto_examples_06_ken_embeddings.py: diff --git a/dev/_sources/auto_examples/07_multiple_key_join.rst.txt b/dev/_sources/auto_examples/07_multiple_key_join.rst.txt index c74538736..d88d0a20e 100644 --- a/dev/_sources/auto_examples/07_multiple_key_join.rst.txt +++ b/dev/_sources/auto_examples/07_multiple_key_join.rst.txt @@ -1225,7 +1225,7 @@ The results: /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/sklearn/preprocessing/_encoders.py:228: UserWarning: Found unknown categories in columns [1] during transform. These unknown categories will be encoded as all zeros warnings.warn( - 0.58915 + 0.58585 @@ -1243,7 +1243,7 @@ Our final cross-validated accuracy score is 0.58. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (10 minutes 53.468 seconds) + **Total running time of the script:** (12 minutes 0.348 seconds) .. _sphx_glr_download_auto_examples_07_multiple_key_join.py: diff --git a/dev/_sources/auto_examples/08_join_aggregation.rst.txt b/dev/_sources/auto_examples/08_join_aggregation.rst.txt index 035284f67..29a896783 100644 --- a/dev/_sources/auto_examples/08_join_aggregation.rst.txt +++ b/dev/_sources/auto_examples/08_join_aggregation.rst.txt @@ -844,75 +844,75 @@ operation maximizing our validation score. split1_test_score - 0.045640 - 0.041599 - 0.064299 - 0.075415 - 0.082621 + 0.035662 + 0.053087 + 0.072071 + 0.066001 + 0.079685 split2_test_score - 0.071529 - 0.074321 - 0.089585 - 0.088302 - 0.092612 + 0.076336 + 0.082476 + 0.094569 + 0.091759 + 0.091283 split3_test_score - 0.040797 - 0.063690 - 0.069269 - 0.062119 - 0.069033 + 0.039651 + 0.067618 + 0.056371 + 0.073471 + 0.074189 split4_test_score - 0.138299 - 0.121267 - 0.140101 - 0.142540 - 0.140436 + 0.141117 + 0.125026 + 0.144522 + 0.144590 + 0.141754 split5_test_score - 0.106619 - 0.112459 - 0.104979 - 0.108407 - 0.114175 + 0.109087 + 0.110900 + 0.106453 + 0.109212 + 0.114790 split6_test_score - 0.077635 - 0.103037 - 0.107903 - 0.105183 - 0.106193 + 0.082496 + 0.102770 + 0.109770 + 0.110297 + 0.112940 split7_test_score - 0.074042 - 0.067023 - 0.070372 - 0.070241 - 0.076133 + 0.059398 + 0.066302 + 0.060979 + 0.064334 + 0.076729 split8_test_score - 0.110951 - 0.114420 - 0.117400 - 0.113873 - 0.132726 + 0.104940 + 0.111360 + 0.117624 + 0.120712 + 0.119874 split9_test_score - 0.121891 - 0.148754 - 0.175560 - 0.161689 - 0.180554 + 0.106619 + 0.155110 + 0.171715 + 0.164387 + 0.176994 @@ -1003,7 +1003,7 @@ exhaustive histogram over all the possible values of ratings .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 16.742 seconds) + **Total running time of the script:** (0 minutes 17.713 seconds) .. _sphx_glr_download_auto_examples_08_join_aggregation.py: diff --git a/dev/_sources/auto_examples/09_interpolation_join.rst.txt b/dev/_sources/auto_examples/09_interpolation_join.rst.txt index e7bfe79de..a6cab4b84 100644 --- a/dev/_sources/auto_examples/09_interpolation_join.rst.txt +++ b/dev/_sources/auto_examples/09_interpolation_join.rst.txt @@ -350,9 +350,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 42.6 0.0 NaN - 34.805413 - 40.631545 - 0.060178 + 34.198740 + 142.770662 + 0.056624 1 @@ -362,9 +362,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 26.9 NaN NaN - 26.568328 - 96.431877 - 0.331631 + 26.708394 + 71.834684 + 0.453572 2 @@ -374,9 +374,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 41.6 NaN NaN - 31.457290 - 7.708034 - 0.060178 + 31.337142 + 53.627947 + 0.140611 3 @@ -386,9 +386,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 46.6 0.0 NaN - 34.110941 - 46.476880 - 0.060178 + 33.949103 + 93.586318 + 0.140611 4 @@ -398,9 +398,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 36.1 0.0 NaN - 29.202521 - 25.925854 - 0.272614 + 28.957713 + 30.326362 + 0.110605 @@ -580,7 +580,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - -0.047682 + 2.084197 1 @@ -592,7 +592,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - 15.604202 + 13.988828 2 @@ -604,7 +604,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - 6.541363 + 7.269396 3 @@ -616,7 +616,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - 7.067635 + 7.983380 4 @@ -628,7 +628,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - 7.669651 + 7.760605 @@ -678,11 +678,11 @@ States with the lowest average predicted temperatures: Alaska, Montana, North Da state - AK -2.772777 - MT 0.478902 - WA 0.795014 - ND 1.102758 - MN 1.201596 + AK -3.617829 + MT 0.039900 + WA 0.606960 + ND 1.314403 + MN 1.495447 Name: TMAX, dtype: float64 @@ -707,11 +707,11 @@ States with the highest predicted temperatures: Puerto Rico, Virgin Islands, Haw state - LA 21.597141 - FL 24.702922 - HI 26.811835 - VI 29.999518 - PR 30.403917 + LA 21.518561 + FL 24.747995 + HI 26.399745 + VI 30.839051 + PR 31.368437 Name: TMAX, dtype: float64 @@ -787,7 +787,7 @@ It is a generalization of the :func:`~skrub.fuzzy_join`, as :func:`~skrub.fuzzy_ .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 5.022 seconds) + **Total running time of the script:** (0 minutes 5.384 seconds) .. _sphx_glr_download_auto_examples_09_interpolation_join.py: diff --git a/dev/_sources/auto_examples/sg_execution_times.rst.txt b/dev/_sources/auto_examples/sg_execution_times.rst.txt index 0917dcd50..8fc95acb9 100644 --- a/dev/_sources/auto_examples/sg_execution_times.rst.txt +++ b/dev/_sources/auto_examples/sg_execution_times.rst.txt @@ -6,7 +6,7 @@ Computation times ================= -**24:18.466** total execution time for 9 files **from auto_examples**: +**25:47.091** total execution time for 9 files **from auto_examples**: .. container:: @@ -32,30 +32,30 @@ Computation times * - Example - Time - Mem (MB) - * - :ref:`sphx_glr_auto_examples_06_ken_embeddings.py` (``06_ken_embeddings.py``) - - 11:15.721 - - 0.0 * - :ref:`sphx_glr_auto_examples_07_multiple_key_join.py` (``07_multiple_key_join.py``) - - 10:53.468 + - 12:00.348 + - 0.0 + * - :ref:`sphx_glr_auto_examples_06_ken_embeddings.py` (``06_ken_embeddings.py``) + - 11:31.752 - 0.0 * - :ref:`sphx_glr_auto_examples_01_encodings.py` (``01_encodings.py``) - - 01:16.726 + - 01:20.106 - 0.0 * - :ref:`sphx_glr_auto_examples_04_fuzzy_joining.py` (``04_fuzzy_joining.py``) - - 00:19.485 + - 00:20.298 - 0.0 * - :ref:`sphx_glr_auto_examples_08_join_aggregation.py` (``08_join_aggregation.py``) - - 00:16.742 - - 0.0 - * - :ref:`sphx_glr_auto_examples_09_interpolation_join.py` (``09_interpolation_join.py``) - - 00:05.022 + - 00:17.713 - 0.0 * - :ref:`sphx_glr_auto_examples_05_deduplication.py` (``05_deduplication.py``) - - 00:04.891 + - 00:05.398 + - 0.0 + * - :ref:`sphx_glr_auto_examples_09_interpolation_join.py` (``09_interpolation_join.py``) + - 00:05.384 - 0.0 * - :ref:`sphx_glr_auto_examples_03_datetime_encoder.py` (``03_datetime_encoder.py``) - - 00:04.509 + - 00:04.276 - 0.0 * - :ref:`sphx_glr_auto_examples_02_feature_interpretation_with_gapencoder.py` (``02_feature_interpretation_with_gapencoder.py``) - - 00:01.903 + - 00:01.818 - 0.0 diff --git a/dev/_sources/sg_execution_times.rst.txt b/dev/_sources/sg_execution_times.rst.txt index 78d2a6189..eab2c7d8f 100644 --- a/dev/_sources/sg_execution_times.rst.txt +++ b/dev/_sources/sg_execution_times.rst.txt @@ -6,7 +6,7 @@ Computation times ================= -**24:18.466** total execution time for 9 files **from all galleries**: +**25:47.091** total execution time for 9 files **from all galleries**: .. container:: @@ -32,30 +32,30 @@ Computation times * - Example - Time - Mem (MB) - * - :ref:`sphx_glr_auto_examples_06_ken_embeddings.py` (``../examples/06_ken_embeddings.py``) - - 11:15.721 - - 0.0 * - :ref:`sphx_glr_auto_examples_07_multiple_key_join.py` (``../examples/07_multiple_key_join.py``) - - 10:53.468 + - 12:00.348 + - 0.0 + * - :ref:`sphx_glr_auto_examples_06_ken_embeddings.py` (``../examples/06_ken_embeddings.py``) + - 11:31.752 - 0.0 * - :ref:`sphx_glr_auto_examples_01_encodings.py` (``../examples/01_encodings.py``) - - 01:16.726 + - 01:20.106 - 0.0 * - :ref:`sphx_glr_auto_examples_04_fuzzy_joining.py` (``../examples/04_fuzzy_joining.py``) - - 00:19.485 + - 00:20.298 - 0.0 * - :ref:`sphx_glr_auto_examples_08_join_aggregation.py` (``../examples/08_join_aggregation.py``) - - 00:16.742 - - 0.0 - * - :ref:`sphx_glr_auto_examples_09_interpolation_join.py` (``../examples/09_interpolation_join.py``) - - 00:05.022 + - 00:17.713 - 0.0 * - :ref:`sphx_glr_auto_examples_05_deduplication.py` (``../examples/05_deduplication.py``) - - 00:04.891 + - 00:05.398 + - 0.0 + * - :ref:`sphx_glr_auto_examples_09_interpolation_join.py` (``../examples/09_interpolation_join.py``) + - 00:05.384 - 0.0 * - :ref:`sphx_glr_auto_examples_03_datetime_encoder.py` (``../examples/03_datetime_encoder.py``) - - 00:04.509 + - 00:04.276 - 0.0 * - :ref:`sphx_glr_auto_examples_02_feature_interpretation_with_gapencoder.py` (``../examples/02_feature_interpretation_with_gapencoder.py``) - - 00:01.903 + - 00:01.818 - 0.0 diff --git a/dev/auto_examples/01_encodings.html b/dev/auto_examples/01_encodings.html index 0417b4cff..ff6f8be96 100644 --- a/dev/auto_examples/01_encodings.html +++ b/dev/auto_examples/01_encodings.html @@ -797,44 +797,45 @@

A simple prediction pipeline
tv.named_transformers_["high_cardinality"].get_feature_names_out()
 
-
array(['supports, support, recruit', 'shelters, centers, sheriff',
-       'rockville, fallsgrove, downtown', 'animal, individual, virtual',
-       'engineering, mangement, budgeting', 'station, state, estate',
-       'facilities, maintenance, accounts', 'silver, spring, ride',
-       'behavioral, health, school', 'equipment, management, automotive',
-       'patrol, 4th, 6th', 'safety, collision, section',
-       'communications, communication, telecommunications',
-       'warehouse, delivery, operations', 'highway, welfare, services',
-       'gaithersburg, the, clarksburg', 'nicholson, transit, taxicab',
-       'development, planning, stormwater',
-       'assessment, protective, process',
-       'construction, instruction, building',
-       'mechanical, special, commercial',
-       'technology, systems, telephone', 'district, urban, 3rd',
-       'family, sexual, crimes', 'security, mccf, unit',
-       'custody, members, customer',
+
array(['services, highway, service', 'engineering, training, planning',
+       'traffic, safety, alcohol', 'sexual, family, crimes',
+       'district, payroll, squad', 'construction, instruction, building',
+       'silver, spring, ride', 'behavioral, health, school',
+       'delivery, operations, special', 'gaithersburg, clarksburg, the',
+       'welfare, childhood, children', 'toddlers, custody, members',
+       'supports, support, network', 'station, state, estate',
+       'director, officers, projects', 'management, fleet, parking',
+       'maintenance, facilities, eligibility',
+       'processing, programs, accounting', 'security, mc311, mccf',
+       'communications, communication, immunization',
+       'administration, battalion, administrative', 'patrol, 4th, 6th',
+       'procurement, protective, fiscal',
+       'development, residential, stormwater',
        'investigative, investigations, criminal',
-       'administration, administrative, battalion',
-       'assistance, eligibility, emergency', 'automated, traffic, office',
-       'master, registered, firefighter', 'manager, budget, projects',
-       'specialist, special, environmental', 'candidate, police, cadet',
-       'officer, office, traffic', 'operator, bus, operations',
-       'income, assistance, client', 'liquor, clerk, store',
-       'rescuer, recruit, firefighter', 'technician, mechanic, supply',
-       'program, programs, procurement', 'lieutenant, captain, chief',
-       'community, nurse, unit', 'school, health, room',
+       'emergency, centers, center', 'nicholson, transit, taxicab',
+       'automotive, assessment, equipment',
+       'technology, systems, telephone', 'rockville, twinbrook, library',
+       'dietary, security, partnerships',
+       'operator, equipment, apprentice',
+       'recreation, planning, renovation',
+       'master, registered, firefighter', 'candidate, police, of',
+       'officer, office, police', 'specialist, special, procurement',
+       'program, programs, projects', 'technician, mechanic, supply',
+       'legislative, principal, executive',
+       'coordinator, coordinating, depot', 'sergeant, cadet, police',
+       'recruit, firefighter, rescuer', 'community, health, nurse',
+       'warehouse, welfare, caseworker',
+       'enforcement, permitting, inspector', 'manager, budget, engineer',
+       'school, room, behavioral', 'captain, rescue, chief',
+       'supervisory, supervisor, therapist',
        'communications, telecommunications, safety',
-       'coordinator, services, service', 'accountant, assistant, county',
-       'supervisor, supervisory, sergeant',
-       'craftsworker, customer, public', 'sheriff, deputy, autobody',
-       'enforcement, permitting, inspector',
-       'correctional, correction, records',
-       'information, technology, recreation',
-       'administrative, principal, executive',
-       'crossing, purchasing, engineer', 'warehouse, welfare, driver',
-       'corporal, erp, behavioral', 'worker, social, leader',
-       'librarian, library, telephone',
-       'equipment, investment, investigator'], dtype=object)
+       'liquor, clerk, store', 'librarian, crossing, library',
+       'sheriff, deputy, autobody',
+       'information, technology, technologist',
+       'lieutenant, attendant, facilities', 'services, service, urban',
+       'assistance, assistant, income',
+       'correctional, correction, corporal',
+       'administrative, administration, administrator'], dtype=object)
 
-
R2 score:  mean: 0.922; std: 0.013
+
R2 score:  mean: 0.919; std: 0.017
 

The simple pipeline applied on this complex dataset gave us very good results.

@@ -936,7 +937,7 @@

ConclusionTotal running time of the script: (1 minutes 16.726 seconds)

+

Total running time of the script: (1 minutes 20.106 seconds)

-
{'games', 'all_entities', 'companies', 'movies', 'albums', 'schools'}
+
{'games', 'all_entities', 'movies', 'schools', 'companies', 'albums'}
 

The games table is the most relevant to our case. @@ -968,7 +968,7 @@

Plotting the results

It helped significantly improve the prediction score.

-

Total running time of the script: (11 minutes 15.721 seconds)

+

Total running time of the script: (11 minutes 31.752 seconds)