diff --git a/dev/.buildinfo b/dev/.buildinfo index 086b5254..d0fd411c 100644 --- a/dev/.buildinfo +++ b/dev/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: 0419996a16f2a3c75464ed74042897b9 +config: 0630c51683edc6ca083cc9b7428c67b6 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/dev/_images/sphx_glr_01_encodings_001.png b/dev/_images/sphx_glr_01_encodings_001.png index 65d771a1..1aa55b3d 100644 Binary files a/dev/_images/sphx_glr_01_encodings_001.png and b/dev/_images/sphx_glr_01_encodings_001.png differ diff --git a/dev/_images/sphx_glr_01_encodings_thumb.png b/dev/_images/sphx_glr_01_encodings_thumb.png index 203ecae4..fa6cceb5 100644 Binary files a/dev/_images/sphx_glr_01_encodings_thumb.png and b/dev/_images/sphx_glr_01_encodings_thumb.png differ diff --git a/dev/_images/sphx_glr_08_join_aggregation_003.png b/dev/_images/sphx_glr_08_join_aggregation_003.png index 36100ac6..b3dc3988 100644 Binary files a/dev/_images/sphx_glr_08_join_aggregation_003.png and b/dev/_images/sphx_glr_08_join_aggregation_003.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_001.png b/dev/_images/sphx_glr_09_interpolation_join_001.png index b7fb6a23..6360edce 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_001.png and b/dev/_images/sphx_glr_09_interpolation_join_001.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_002.png b/dev/_images/sphx_glr_09_interpolation_join_002.png index 4b4d9870..cbab44ef 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_002.png and b/dev/_images/sphx_glr_09_interpolation_join_002.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_003.png b/dev/_images/sphx_glr_09_interpolation_join_003.png index 7d56525b..a5d928ef 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_003.png and b/dev/_images/sphx_glr_09_interpolation_join_003.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_thumb.png b/dev/_images/sphx_glr_09_interpolation_join_thumb.png index daed39ef..768d05f3 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_thumb.png and b/dev/_images/sphx_glr_09_interpolation_join_thumb.png differ diff --git a/dev/_sources/auto_examples/01_encodings.rst.txt b/dev/_sources/auto_examples/01_encodings.rst.txt index a7acdb1a..c16be2ab 100644 --- a/dev/_sources/auto_examples/01_encodings.rst.txt +++ b/dev/_sources/auto_examples/01_encodings.rst.txt @@ -440,45 +440,44 @@ corresponding columns: .. code-block:: none - array(['station, state, estate', 'services, highway, service', - 'investigative, investigations, criminal', - 'nicholson, transit, taxicab', - 'communications, division, applications', 'silver, spring, ride', - 'training, building, recruit', 'gaithersburg, clarksburg, the', - 'patrol, 4th, 6th', 'director, automated, office', - 'programs, program, preparedness', - 'rockville, twinbrook, downtown', - 'eligibility, assistance, assisted', 'safety, section, collision', - 'management, equipment, automotive', 'security, mc311, mccf', + array(['gaithersburg, clarksburg, the', 'supports, support, sports', + 'district, patrol, 3rd', 'telephone, automated, stormwater', + 'special, section, labor', 'environmental, regulatory, behavioral', + 'welfare, childhood, child', 'maintenance, facilities, finance', + 'services, highway, service', 'station, state, estate', + 'traffic, safety, alcohol', 'security, custody, mcdc', + 'silver, spring, urban', 'family, crimes, major', + 'training, recruit, recruiting', + 'technology, systems, information', + 'building, construction, instruction', + 'nicholson, transit, transport', + 'communications, communication, division', + 'emergency, centers, center', 'delivery, warehouse, liquor', + 'management, mangement, engineering', 'administration, administrative, battalion', - 'behavioral, health, school', 'welfare, children, childhood', - 'protective, procurement, project', - 'development, government, stormwater', 'supports, support, sports', - 'family, animal, robbery', 'emergency, commuter, duplicating', - 'custody, toddlers, mcdc', 'district, urban, 3rd', - 'technology, systems, telephone', 'council, centers, members', - 'delivery, special, operations', - 'maintenance, facilities, finance', 'captain, chief, autobody', - 'liquor, clerk, store', 'officer, office, police', - 'master, registered, water', 'operator, bus, operations', - 'administrative, legislative, principal', - 'technician, mechanic, supply', 'manager, budget, engineer', - 'recreation, renovation, resource', 'school, room, behavioral', - 'coordinator, transit, coordinating', - 'enforcement, permitting, inspector', - 'information, technology, technologist', - 'assistance, income, client', 'therapist, sheriff, plumber', + 'investigative, explosive, investigations', + 'eligibility, assistance, assisted', 'health, school, based', + 'fleet, animal, bureau', 'downtown, rockville, library', + 'accounts, toddlers, council', 'protective, programs, program', + 'officer, office, traffic', 'warehouse, welfare, caseworker', 'firefighter, rescuer, recruit', - 'correctional, correction, regional', - 'accountant, assistant, library', + 'librarian, candidate, psychiatric', 'income, assistance, client', + 'coordinator, services, service', 'manager, iii, management', + 'equipment, investment, investigator', 'operator, bus, operations', + 'specialist, special, quality', + 'enforcement, inspector, permitting', + 'technician, mechanic, supply', 'communications, telecommunications, safety', - 'services, service, aide', 'community, health, nurse', - 'sergeant, cadet, emergency', 'craftsworker, worker, social', - 'specialist, special, procurement', - 'crossing, purchasing, planning', 'warehouse, welfare, driver', - 'corporal, erp, behavioral', 'program, programs, projects', - 'equipment, investment, investigator', - 'lieutenant, attendant, shift'], dtype=object) + 'crossing, parking, guard', + 'administrative, legislative, principal', + 'correctional, correction, corporal', 'school, room, behavioral', + 'community, nurse, health', 'liquor, clerk, store', + 'lieutenant, maintenance, client', 'sheriff, deputy, aide', + 'accountant, assistant, library', 'sergeant, police, cadet', + 'captain, chief, autobody', 'supervisor, supervisory, transit', + 'program, programs, projects', 'environmental, therapist, budget', + 'master, registered, meter', 'information, technology, renovation', + 'planning, senior, background'], dtype=object) @@ -560,7 +559,7 @@ Let's look at the cross-validated R2 score of our model: .. code-block:: none - R2 score: mean: 0.921; std: 0.016 + R2 score: mean: 0.923; std: 0.013 @@ -696,7 +695,7 @@ to plot the feature importances. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (1 minutes 19.568 seconds) + **Total running time of the script:** (1 minutes 8.670 seconds) .. _sphx_glr_download_auto_examples_01_encodings.py: diff --git a/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt b/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt index 694892af..36c38497 100644 --- a/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt +++ b/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt @@ -504,7 +504,7 @@ as a set of latent topics. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 2.041 seconds) + **Total running time of the script:** (0 minutes 1.648 seconds) .. _sphx_glr_download_auto_examples_02_feature_interpretation_with_gapencoder.py: diff --git a/dev/_sources/auto_examples/03_datetime_encoder.rst.txt b/dev/_sources/auto_examples/03_datetime_encoder.rst.txt index d7276e52..9707ebad 100644 --- a/dev/_sources/auto_examples/03_datetime_encoder.rst.txt +++ b/dev/_sources/auto_examples/03_datetime_encoder.rst.txt @@ -610,7 +610,7 @@ and transforms datetime columns by default. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 4.948 seconds) + **Total running time of the script:** (0 minutes 4.149 seconds) .. _sphx_glr_download_auto_examples_03_datetime_encoder.py: diff --git a/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt b/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt index dab97550..142db368 100644 --- a/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt +++ b/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt @@ -1711,7 +1711,7 @@ introduced into a grid search: .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 19.674 seconds) + **Total running time of the script:** (0 minutes 17.135 seconds) .. _sphx_glr_download_auto_examples_04_fuzzy_joining.py: diff --git a/dev/_sources/auto_examples/05_deduplication.rst.txt b/dev/_sources/auto_examples/05_deduplication.rst.txt index 91a2ccc9..6b32cfbd 100644 --- a/dev/_sources/auto_examples/05_deduplication.rst.txt +++ b/dev/_sources/auto_examples/05_deduplication.rst.txt @@ -335,7 +335,7 @@ or |MinHash|. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 4.680 seconds) + **Total running time of the script:** (0 minutes 4.909 seconds) .. _sphx_glr_download_auto_examples_05_deduplication.py: diff --git a/dev/_sources/auto_examples/06_ken_embeddings.rst.txt b/dev/_sources/auto_examples/06_ken_embeddings.rst.txt index 9ff2206f..a326a489 100644 --- a/dev/_sources/auto_examples/06_ken_embeddings.rst.txt +++ b/dev/_sources/auto_examples/06_ken_embeddings.rst.txt @@ -305,7 +305,7 @@ We will start by checking out the available tables with .. code-block:: none - {'companies', 'games', 'all_entities', 'albums', 'schools', 'movies'} + {'all_entities', 'companies', 'movies', 'albums', 'games', 'schools'} @@ -840,7 +840,7 @@ It helped significantly improve the prediction score. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (11 minutes 7.838 seconds) + **Total running time of the script:** (10 minutes 30.856 seconds) .. _sphx_glr_download_auto_examples_06_ken_embeddings.py: diff --git a/dev/_sources/auto_examples/07_multiple_key_join.rst.txt b/dev/_sources/auto_examples/07_multiple_key_join.rst.txt index 71dacf7c..4897f89a 100644 --- a/dev/_sources/auto_examples/07_multiple_key_join.rst.txt +++ b/dev/_sources/auto_examples/07_multiple_key_join.rst.txt @@ -1226,7 +1226,7 @@ The results: /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/sklearn/preprocessing/_encoders.py:228: UserWarning: Found unknown categories in columns [1] during transform. These unknown categories will be encoded as all zeros warnings.warn( - 0.58995 + 0.5878500000000001 @@ -1244,7 +1244,7 @@ Our final cross-validated accuracy score is 0.58. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (11 minutes 29.384 seconds) + **Total running time of the script:** (10 minutes 56.053 seconds) .. _sphx_glr_download_auto_examples_07_multiple_key_join.py: diff --git a/dev/_sources/auto_examples/08_join_aggregation.rst.txt b/dev/_sources/auto_examples/08_join_aggregation.rst.txt index c66ab1fe..8ba3cd56 100644 --- a/dev/_sources/auto_examples/08_join_aggregation.rst.txt +++ b/dev/_sources/auto_examples/08_join_aggregation.rst.txt @@ -844,75 +844,75 @@ operation maximizing our validation score. split1_test_score - 0.047956 - 0.063122 - 0.082469 - 0.057066 - 0.062993 + 0.039475 + 0.047479 + 0.075629 + 0.065258 + 0.070249 split2_test_score - 0.077542 - 0.081570 - 0.094021 - 0.098690 - 0.097006 + 0.075227 + 0.083063 + 0.092384 + 0.096931 + 0.089502 split3_test_score - 0.043531 - 0.065571 - 0.048403 - 0.065745 - 0.069058 + 0.042883 + 0.060734 + 0.061950 + 0.070185 + 0.077840 split4_test_score - 0.146376 - 0.134605 - 0.134260 - 0.144846 - 0.146525 + 0.140699 + 0.133569 + 0.139557 + 0.152427 + 0.143822 split5_test_score - 0.106774 - 0.110818 - 0.108646 - 0.103209 - 0.118163 + 0.113387 + 0.114269 + 0.109349 + 0.106754 + 0.116180 split6_test_score - 0.087637 - 0.101564 - 0.106001 - 0.108448 - 0.104643 + 0.080712 + 0.095461 + 0.107454 + 0.107903 + 0.108229 split7_test_score - 0.069828 - 0.061365 - 0.063359 - 0.078356 - 0.087845 + 0.070899 + 0.067751 + 0.059840 + 0.069572 + 0.078243 split8_test_score - 0.103975 - 0.113042 - 0.124293 - 0.122794 - 0.134883 + 0.103105 + 0.112480 + 0.121120 + 0.124410 + 0.127232 split9_test_score - 0.119554 - 0.151334 - 0.162009 - 0.169056 - 0.191246 + 0.107526 + 0.157958 + 0.171831 + 0.164579 + 0.184743 @@ -1003,7 +1003,7 @@ exhaustive histogram over all the possible values of ratings .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 18.997 seconds) + **Total running time of the script:** (0 minutes 21.351 seconds) .. _sphx_glr_download_auto_examples_08_join_aggregation.py: diff --git a/dev/_sources/auto_examples/09_interpolation_join.rst.txt b/dev/_sources/auto_examples/09_interpolation_join.rst.txt index aca2dea8..bbdf0958 100644 --- a/dev/_sources/auto_examples/09_interpolation_join.rst.txt +++ b/dev/_sources/auto_examples/09_interpolation_join.rst.txt @@ -350,9 +350,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 42.6 0.0 NaN - 34.050713 - 79.557726 - 0.061708 + 33.273594 + 93.598452 + -0.143711 1 @@ -362,9 +362,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 26.9 NaN NaN - 26.447479 - 80.855013 - 0.282900 + 27.421386 + 145.126263 + 0.032536 2 @@ -374,9 +374,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 41.6 NaN NaN - 31.378841 - 29.758804 - 0.061708 + 31.697874 + 29.228029 + -0.146347 3 @@ -386,9 +386,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 46.6 0.0 NaN - 33.341582 - 56.433833 - 0.061708 + 33.469607 + 55.614446 + -0.146347 4 @@ -398,9 +398,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 36.1 0.0 NaN - 29.703763 - 34.348435 - 0.061708 + 29.672556 + 28.401216 + 0.129800 @@ -580,7 +580,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - 1.573918 + 0.100304 1 @@ -592,7 +592,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - 16.273818 + 15.822712 2 @@ -604,7 +604,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - 5.417276 + 5.827000 3 @@ -616,7 +616,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - 6.667582 + 6.568854 4 @@ -628,7 +628,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - 6.430813 + 7.027463 @@ -678,11 +678,11 @@ States with the lowest average predicted temperatures: Alaska, Montana, North Da state - AK -4.848166 - WA -0.189643 - MT 0.126340 - ND 1.035899 - MN 1.665222 + AK -4.202223 + MT 0.497644 + WA 0.735437 + MN 1.459978 + ND 1.460721 Name: TMAX, dtype: float64 @@ -707,11 +707,11 @@ States with the highest predicted temperatures: Puerto Rico, Virgin Islands, Haw state - LA 21.833117 - FL 24.515461 - HI 26.971245 - VI 30.231336 - PR 30.808106 + LA 21.635930 + FL 24.662567 + HI 27.179546 + VI 29.895677 + PR 30.640109 Name: TMAX, dtype: float64 @@ -787,7 +787,7 @@ It is a generalization of the :func:`~skrub.fuzzy_join`, as :func:`~skrub.fuzzy_ .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 5.503 seconds) + **Total running time of the script:** (0 minutes 6.568 seconds) .. _sphx_glr_download_auto_examples_09_interpolation_join.py: diff --git a/dev/_sources/auto_examples/sg_execution_times.rst.txt b/dev/_sources/auto_examples/sg_execution_times.rst.txt index 9fe52d1c..777a2cb2 100644 --- a/dev/_sources/auto_examples/sg_execution_times.rst.txt +++ b/dev/_sources/auto_examples/sg_execution_times.rst.txt @@ -6,7 +6,7 @@ Computation times ================= -**24:52.633** total execution time for 9 files **from auto_examples**: +**23:31.339** total execution time for 9 files **from auto_examples**: .. container:: @@ -33,29 +33,29 @@ Computation times - Time - Mem (MB) * - :ref:`sphx_glr_auto_examples_07_multiple_key_join.py` (``07_multiple_key_join.py``) - - 11:29.384 + - 10:56.053 - 0.0 * - :ref:`sphx_glr_auto_examples_06_ken_embeddings.py` (``06_ken_embeddings.py``) - - 11:07.838 + - 10:30.856 - 0.0 * - :ref:`sphx_glr_auto_examples_01_encodings.py` (``01_encodings.py``) - - 01:19.568 - - 0.0 - * - :ref:`sphx_glr_auto_examples_04_fuzzy_joining.py` (``04_fuzzy_joining.py``) - - 00:19.674 + - 01:08.670 - 0.0 * - :ref:`sphx_glr_auto_examples_08_join_aggregation.py` (``08_join_aggregation.py``) - - 00:18.997 + - 00:21.351 - 0.0 - * - :ref:`sphx_glr_auto_examples_09_interpolation_join.py` (``09_interpolation_join.py``) - - 00:05.503 + * - :ref:`sphx_glr_auto_examples_04_fuzzy_joining.py` (``04_fuzzy_joining.py``) + - 00:17.135 - 0.0 - * - :ref:`sphx_glr_auto_examples_03_datetime_encoder.py` (``03_datetime_encoder.py``) - - 00:04.948 + * - :ref:`sphx_glr_auto_examples_09_interpolation_join.py` (``09_interpolation_join.py``) + - 00:06.568 - 0.0 * - :ref:`sphx_glr_auto_examples_05_deduplication.py` (``05_deduplication.py``) - - 00:04.680 + - 00:04.909 + - 0.0 + * - :ref:`sphx_glr_auto_examples_03_datetime_encoder.py` (``03_datetime_encoder.py``) + - 00:04.149 - 0.0 * - :ref:`sphx_glr_auto_examples_02_feature_interpretation_with_gapencoder.py` (``02_feature_interpretation_with_gapencoder.py``) - - 00:02.041 + - 00:01.648 - 0.0 diff --git a/dev/_sources/install.rst.txt b/dev/_sources/install.rst.txt index bfa3114f..cf572c4f 100644 --- a/dev/_sources/install.rst.txt +++ b/dev/_sources/install.rst.txt @@ -34,6 +34,7 @@ Installing

+ This will not work yet! The package is still waiting to be published on conda-forge. You can follow progress here. .. code:: console @@ -44,6 +45,7 @@ Installing

+ This will not work yet! The package is still waiting to be published on conda-forge. You can follow progress here. .. code:: console diff --git a/dev/_sources/sg_execution_times.rst.txt b/dev/_sources/sg_execution_times.rst.txt index 95e4edfb..00e5a2f1 100644 --- a/dev/_sources/sg_execution_times.rst.txt +++ b/dev/_sources/sg_execution_times.rst.txt @@ -6,7 +6,7 @@ Computation times ================= -**24:52.633** total execution time for 9 files **from all galleries**: +**23:31.339** total execution time for 9 files **from all galleries**: .. container:: @@ -33,29 +33,29 @@ Computation times - Time - Mem (MB) * - :ref:`sphx_glr_auto_examples_07_multiple_key_join.py` (``../examples/07_multiple_key_join.py``) - - 11:29.384 + - 10:56.053 - 0.0 * - :ref:`sphx_glr_auto_examples_06_ken_embeddings.py` (``../examples/06_ken_embeddings.py``) - - 11:07.838 + - 10:30.856 - 0.0 * - :ref:`sphx_glr_auto_examples_01_encodings.py` (``../examples/01_encodings.py``) - - 01:19.568 - - 0.0 - * - :ref:`sphx_glr_auto_examples_04_fuzzy_joining.py` (``../examples/04_fuzzy_joining.py``) - - 00:19.674 + - 01:08.670 - 0.0 * - :ref:`sphx_glr_auto_examples_08_join_aggregation.py` (``../examples/08_join_aggregation.py``) - - 00:18.997 + - 00:21.351 - 0.0 - * - :ref:`sphx_glr_auto_examples_09_interpolation_join.py` (``../examples/09_interpolation_join.py``) - - 00:05.503 + * - :ref:`sphx_glr_auto_examples_04_fuzzy_joining.py` (``../examples/04_fuzzy_joining.py``) + - 00:17.135 - 0.0 - * - :ref:`sphx_glr_auto_examples_03_datetime_encoder.py` (``../examples/03_datetime_encoder.py``) - - 00:04.948 + * - :ref:`sphx_glr_auto_examples_09_interpolation_join.py` (``../examples/09_interpolation_join.py``) + - 00:06.568 - 0.0 * - :ref:`sphx_glr_auto_examples_05_deduplication.py` (``../examples/05_deduplication.py``) - - 00:04.680 + - 00:04.909 + - 0.0 + * - :ref:`sphx_glr_auto_examples_03_datetime_encoder.py` (``../examples/03_datetime_encoder.py``) + - 00:04.149 - 0.0 * - :ref:`sphx_glr_auto_examples_02_feature_interpretation_with_gapencoder.py` (``../examples/02_feature_interpretation_with_gapencoder.py``) - - 00:02.041 + - 00:01.648 - 0.0 diff --git a/dev/auto_examples/01_encodings.html b/dev/auto_examples/01_encodings.html index ee77d716..f9b1fe99 100644 --- a/dev/auto_examples/01_encodings.html +++ b/dev/auto_examples/01_encodings.html @@ -797,45 +797,44 @@

A simple prediction pipeline
tv.named_transformers_["high_cardinality"].get_feature_names_out()
 

-
array(['station, state, estate', 'services, highway, service',
-       'investigative, investigations, criminal',
-       'nicholson, transit, taxicab',
-       'communications, division, applications', 'silver, spring, ride',
-       'training, building, recruit', 'gaithersburg, clarksburg, the',
-       'patrol, 4th, 6th', 'director, automated, office',
-       'programs, program, preparedness',
-       'rockville, twinbrook, downtown',
-       'eligibility, assistance, assisted', 'safety, section, collision',
-       'management, equipment, automotive', 'security, mc311, mccf',
+
array(['gaithersburg, clarksburg, the', 'supports, support, sports',
+       'district, patrol, 3rd', 'telephone, automated, stormwater',
+       'special, section, labor', 'environmental, regulatory, behavioral',
+       'welfare, childhood, child', 'maintenance, facilities, finance',
+       'services, highway, service', 'station, state, estate',
+       'traffic, safety, alcohol', 'security, custody, mcdc',
+       'silver, spring, urban', 'family, crimes, major',
+       'training, recruit, recruiting',
+       'technology, systems, information',
+       'building, construction, instruction',
+       'nicholson, transit, transport',
+       'communications, communication, division',
+       'emergency, centers, center', 'delivery, warehouse, liquor',
+       'management, mangement, engineering',
        'administration, administrative, battalion',
-       'behavioral, health, school', 'welfare, children, childhood',
-       'protective, procurement, project',
-       'development, government, stormwater', 'supports, support, sports',
-       'family, animal, robbery', 'emergency, commuter, duplicating',
-       'custody, toddlers, mcdc', 'district, urban, 3rd',
-       'technology, systems, telephone', 'council, centers, members',
-       'delivery, special, operations',
-       'maintenance, facilities, finance', 'captain, chief, autobody',
-       'liquor, clerk, store', 'officer, office, police',
-       'master, registered, water', 'operator, bus, operations',
-       'administrative, legislative, principal',
-       'technician, mechanic, supply', 'manager, budget, engineer',
-       'recreation, renovation, resource', 'school, room, behavioral',
-       'coordinator, transit, coordinating',
-       'enforcement, permitting, inspector',
-       'information, technology, technologist',
-       'assistance, income, client', 'therapist, sheriff, plumber',
+       'investigative, explosive, investigations',
+       'eligibility, assistance, assisted', 'health, school, based',
+       'fleet, animal, bureau', 'downtown, rockville, library',
+       'accounts, toddlers, council', 'protective, programs, program',
+       'officer, office, traffic', 'warehouse, welfare, caseworker',
        'firefighter, rescuer, recruit',
-       'correctional, correction, regional',
-       'accountant, assistant, library',
+       'librarian, candidate, psychiatric', 'income, assistance, client',
+       'coordinator, services, service', 'manager, iii, management',
+       'equipment, investment, investigator', 'operator, bus, operations',
+       'specialist, special, quality',
+       'enforcement, inspector, permitting',
+       'technician, mechanic, supply',
        'communications, telecommunications, safety',
-       'services, service, aide', 'community, health, nurse',
-       'sergeant, cadet, emergency', 'craftsworker, worker, social',
-       'specialist, special, procurement',
-       'crossing, purchasing, planning', 'warehouse, welfare, driver',
-       'corporal, erp, behavioral', 'program, programs, projects',
-       'equipment, investment, investigator',
-       'lieutenant, attendant, shift'], dtype=object)
+       'crossing, parking, guard',
+       'administrative, legislative, principal',
+       'correctional, correction, corporal', 'school, room, behavioral',
+       'community, nurse, health', 'liquor, clerk, store',
+       'lieutenant, maintenance, client', 'sheriff, deputy, aide',
+       'accountant, assistant, library', 'sergeant, police, cadet',
+       'captain, chief, autobody', 'supervisor, supervisory, transit',
+       'program, programs, projects', 'environmental, therapist, budget',
+       'master, registered, meter', 'information, technology, renovation',
+       'planning, senior, background'], dtype=object)
 
-
R2 score:  mean: 0.921; std: 0.016
+
R2 score:  mean: 0.923; std: 0.013
 

The simple pipeline applied on this complex dataset gave us very good results.

@@ -937,7 +936,7 @@

ConclusionTotal running time of the script: (1 minutes 19.568 seconds)

+

Total running time of the script: (1 minutes 8.670 seconds)

-
{'companies', 'games', 'all_entities', 'albums', 'schools', 'movies'}
+
{'all_entities', 'companies', 'movies', 'albums', 'games', 'schools'}
 

The games table is the most relevant to our case. @@ -968,7 +968,7 @@

Plotting the results

It helped significantly improve the prediction score.

-

Total running time of the script: (11 minutes 7.838 seconds)

+

Total running time of the script: (10 minutes 30.856 seconds)