diff --git a/dev/.buildinfo b/dev/.buildinfo index 6f3da64e..a6b5279f 100644 --- a/dev/.buildinfo +++ b/dev/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: e65101e38a131eedb841b03c0c1c495b +config: 1a1ba6b402fb86cc0386a59eccfbc0a2 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/dev/_images/sphx_glr_01_encodings_001.png b/dev/_images/sphx_glr_01_encodings_001.png index 43e86fa4..e9982168 100644 Binary files a/dev/_images/sphx_glr_01_encodings_001.png and b/dev/_images/sphx_glr_01_encodings_001.png differ diff --git a/dev/_images/sphx_glr_01_encodings_thumb.png b/dev/_images/sphx_glr_01_encodings_thumb.png index 17a360c7..ac67e04d 100644 Binary files a/dev/_images/sphx_glr_01_encodings_thumb.png and b/dev/_images/sphx_glr_01_encodings_thumb.png differ diff --git a/dev/_images/sphx_glr_08_join_aggregation_003.png b/dev/_images/sphx_glr_08_join_aggregation_003.png index 395a3083..3b28be08 100644 Binary files a/dev/_images/sphx_glr_08_join_aggregation_003.png and b/dev/_images/sphx_glr_08_join_aggregation_003.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_001.png b/dev/_images/sphx_glr_09_interpolation_join_001.png index 5f041205..537534c8 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_001.png and b/dev/_images/sphx_glr_09_interpolation_join_001.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_002.png b/dev/_images/sphx_glr_09_interpolation_join_002.png index 99d55fc6..22f496fc 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_002.png and b/dev/_images/sphx_glr_09_interpolation_join_002.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_003.png b/dev/_images/sphx_glr_09_interpolation_join_003.png index e63982ad..5a477379 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_003.png and b/dev/_images/sphx_glr_09_interpolation_join_003.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_thumb.png b/dev/_images/sphx_glr_09_interpolation_join_thumb.png index 44516288..728aaeb3 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_thumb.png and b/dev/_images/sphx_glr_09_interpolation_join_thumb.png differ diff --git a/dev/_sources/auto_examples/01_encodings.rst.txt b/dev/_sources/auto_examples/01_encodings.rst.txt index f0b8a886..376c1cd1 100644 --- a/dev/_sources/auto_examples/01_encodings.rst.txt +++ b/dev/_sources/auto_examples/01_encodings.rst.txt @@ -847,43 +847,43 @@ corresponding columns: .. code-block:: none - array(['gaithersburg, the, clarksburg', 'station, mc311, state', - 'patrol, 5th, 4th', 'behavioral, health, school', - 'division, animal, virtual', - 'administration, battalion, operations', 'silver, spring, ride', - 'protective, programs, program', 'automotive, equipment, budget', - 'delivery, explosive, warehouse', 'human, ombudsman, juvenile', - 'safety, collision, section', 'training, recruit, recruiting', - 'highway, services, service', 'district, 3rd, 1st', - 'communications, communication, commuter', - 'welfare, children, childhood', 'rockville, twinbrook, library', - 'maintenance, facilities, council', - 'engineering, mangement, parking', 'supports, support, sports', - 'technology, systems, telephone', 'personnel, family, crisis', - 'eligibility, assistance, assisted', 'director, automated, office', - 'nicholson, transit, taxicab', 'compliance, accounts, hampden', - 'construction, building, design', 'security, custody, mcdc', - 'investigative, inspections, investigations', - 'operator, equipment, bus', 'manager, investigator, iii', - 'officer, office, traffic', 'school, room, behavioral', - 'master, registered, meter', 'liquor, clerk, store', - 'information, recreation, technology', 'community, health, nurse', - 'specialist, special, resource', - 'coordinator, coordinating, services', - 'supervisory, supervisor, therapist', 'lieutenant, captain, chief', + array(['special, programs, program', 'resources, resource, network', + 'silver, spring, ride', 'station, state, estate', + 'district, squad, payroll', 'management, equipment, automotive', + 'investigative, investigations, criminal', + 'gaithersburg, clarksburg, the', 'engineering, parking, marking', + 'rockville, downtown, library', 'services, highway, service', + 'communications, communication, telecommunications', + 'behavioral, health, school', 'director, automated, office', + 'safety, division, section', 'technology, telephone, systems', + 'welfare, child, childhood', 'nicholson, transit, taxicab', + 'sexual, family, crimes', 'eligibility, assistance, medical', + 'security, mccf, unit', '4th, 6th, 5th', + 'delivery, warehouse, liquor', + 'environmental, regulatory, adolescent', + 'training, administration, recruit', '3rd, patrol, 2nd', + 'maintenance, facilities, council', 'supports, support, sports', + 'building, construction, instruction', + 'toddlers, custody, members', 'warehouse, welfare, driver', + 'program, programs, projects', 'administrative, administration, administrator', - 'enforcement, inspector, force', 'technician, mechanic, supply', - 'correctional, correction, corporal', - 'firefighter, rescuer, recruit', + 'operator, equipment, apprentice', 'officer, office, police', + 'candidate, attendant, master', 'firefighter, rescuer, recruit', + 'school, room, behavioral', 'information, recreation, technology', + 'liquor, clerk, store', 'specialist, special, planning', + 'manager, engineer, investigator', 'worker, social, leader', + 'community, health, nurse', 'technician, mechanic, systems', + 'enforcement, permitting, inspector', + 'coordinator, coordinating, legislative', + 'crossing, library, librarian', + 'correctional, correction, corporal', 'sheriff, deputy, autobody', 'communications, telecommunications, safety', - 'warehouse, welfare, caseworker', - 'crossing, purchasing, background', 'planning, senior, engineer', - 'income, assistance, client', 'sergeant, police, cadet', - 'accountant, assistant, library', 'program, programs, projects', - 'legislative, principal, executive', - 'librarian, psychiatric, employee', - 'customer, urban, representative', 'candidate, sheriff, deputy', - 'environmental, budget, scientist'], dtype=object) + 'lieutenant, shift, commander', 'income, assistance, client', + 'sergeant, emergency, energy', 'principal, executive, examiner', + 'environmental, budget, ombudsman', 'services, service, urban', + 'accountant, assistant, attorney', + 'supervisory, supervisor, therapist', 'captain, rescue, battalion'], + dtype=object) @@ -965,7 +965,7 @@ Let's look at the cross-validated R2 score of our model: .. code-block:: none - R2 score: mean: 0.922; std: 0.014 + R2 score: mean: 0.919; std: 0.012 @@ -1505,7 +1505,7 @@ to plot the feature importances. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (1 minutes 8.940 seconds) + **Total running time of the script:** (1 minutes 29.145 seconds) .. _sphx_glr_download_auto_examples_01_encodings.py: diff --git a/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt b/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt index 95337ca1..a68312cb 100644 --- a/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt +++ b/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt @@ -504,7 +504,7 @@ as a set of latent topics. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 1.618 seconds) + **Total running time of the script:** (0 minutes 2.488 seconds) .. _sphx_glr_download_auto_examples_02_feature_interpretation_with_gapencoder.py: diff --git a/dev/_sources/auto_examples/03_datetime_encoder.rst.txt b/dev/_sources/auto_examples/03_datetime_encoder.rst.txt index d672fbb8..a7c8c0df 100644 --- a/dev/_sources/auto_examples/03_datetime_encoder.rst.txt +++ b/dev/_sources/auto_examples/03_datetime_encoder.rst.txt @@ -610,7 +610,7 @@ and transforms datetime columns by default. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 4.129 seconds) + **Total running time of the script:** (0 minutes 5.359 seconds) .. _sphx_glr_download_auto_examples_03_datetime_encoder.py: diff --git a/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt b/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt index bc7e71c8..58077fa4 100644 --- a/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt +++ b/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt @@ -1732,7 +1732,7 @@ introduced into a grid search: .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 20.409 seconds) + **Total running time of the script:** (0 minutes 24.406 seconds) .. _sphx_glr_download_auto_examples_04_fuzzy_joining.py: diff --git a/dev/_sources/auto_examples/05_deduplication.rst.txt b/dev/_sources/auto_examples/05_deduplication.rst.txt index cae39d3e..8805fabe 100644 --- a/dev/_sources/auto_examples/05_deduplication.rst.txt +++ b/dev/_sources/auto_examples/05_deduplication.rst.txt @@ -335,7 +335,7 @@ or |MinHash|. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 4.789 seconds) + **Total running time of the script:** (0 minutes 5.798 seconds) .. _sphx_glr_download_auto_examples_05_deduplication.py: diff --git a/dev/_sources/auto_examples/06_ken_embeddings.rst.txt b/dev/_sources/auto_examples/06_ken_embeddings.rst.txt index a29e0ad7..8cc44e79 100644 --- a/dev/_sources/auto_examples/06_ken_embeddings.rst.txt +++ b/dev/_sources/auto_examples/06_ken_embeddings.rst.txt @@ -305,7 +305,7 @@ We will start by checking out the available tables with .. code-block:: none - {'movies', 'companies', 'games', 'albums', 'all_entities', 'schools'} + {'movies', 'schools', 'all_entities', 'games', 'albums', 'companies'} @@ -873,7 +873,7 @@ It helped significantly improve the prediction score. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (11 minutes 40.136 seconds) + **Total running time of the script:** (11 minutes 45.643 seconds) .. _sphx_glr_download_auto_examples_06_ken_embeddings.py: diff --git a/dev/_sources/auto_examples/07_multiple_key_join.rst.txt b/dev/_sources/auto_examples/07_multiple_key_join.rst.txt index ec2fdc9b..75d10c73 100644 --- a/dev/_sources/auto_examples/07_multiple_key_join.rst.txt +++ b/dev/_sources/auto_examples/07_multiple_key_join.rst.txt @@ -1256,7 +1256,7 @@ The results: /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/sklearn/preprocessing/_encoders.py:241: UserWarning: Found unknown categories in columns [1] during transform. These unknown categories will be encoded as all zeros warnings.warn( - 0.5908 + 0.5864499999999999 @@ -1274,7 +1274,7 @@ Our final cross-validated accuracy score is 0.58. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (11 minutes 59.788 seconds) + **Total running time of the script:** (11 minutes 43.985 seconds) .. _sphx_glr_download_auto_examples_07_multiple_key_join.py: diff --git a/dev/_sources/auto_examples/08_join_aggregation.rst.txt b/dev/_sources/auto_examples/08_join_aggregation.rst.txt index 4f4f6b62..5f3db1e6 100644 --- a/dev/_sources/auto_examples/08_join_aggregation.rst.txt +++ b/dev/_sources/auto_examples/08_join_aggregation.rst.txt @@ -1284,75 +1284,75 @@ operation maximizing our validation score. split1_test_score - 0.054509 - 0.058128 - 0.071069 - 0.052660 - 0.052842 + 0.059505 + 0.032508 + 0.078928 + 0.069610 + 0.084291 split2_test_score - 0.069220 - 0.093858 - 0.092783 - 0.102946 - 0.094487 + 0.072540 + 0.081928 + 0.084433 + 0.097590 + 0.085638 split3_test_score - 0.045484 - 0.069533 - 0.059923 - 0.063730 - 0.078378 + 0.041792 + 0.063570 + 0.068268 + 0.068114 + 0.077827 split4_test_score - 0.129426 - 0.135126 - 0.142650 - 0.150611 - 0.156610 + 0.144257 + 0.121750 + 0.154879 + 0.154665 + 0.154509 split5_test_score - 0.109764 - 0.111292 - 0.109923 - 0.110233 - 0.115234 + 0.105915 + 0.112408 + 0.109866 + 0.108885 + 0.115138 split6_test_score - 0.083386 - 0.105465 - 0.105645 - 0.107080 - 0.105257 + 0.083759 + 0.098489 + 0.104181 + 0.110296 + 0.105656 split7_test_score - 0.069049 - 0.060614 - 0.059454 - 0.061911 - 0.081074 + 0.068791 + 0.066842 + 0.071446 + 0.064729 + 0.072894 split8_test_score - 0.101018 - 0.115044 - 0.116905 - 0.128505 - 0.118541 + 0.109317 + 0.113322 + 0.121739 + 0.127709 + 0.131357 split9_test_score - 0.118754 - 0.150841 - 0.169428 - 0.167694 - 0.177697 + 0.130825 + 0.161078 + 0.165871 + 0.164596 + 0.183170 @@ -1472,7 +1472,7 @@ exhaustive histogram over all the possible values of ratings .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 21.403 seconds) + **Total running time of the script:** (0 minutes 18.508 seconds) .. _sphx_glr_download_auto_examples_08_join_aggregation.py: diff --git a/dev/_sources/auto_examples/09_interpolation_join.rst.txt b/dev/_sources/auto_examples/09_interpolation_join.rst.txt index 0c31eba3..fb645355 100644 --- a/dev/_sources/auto_examples/09_interpolation_join.rst.txt +++ b/dev/_sources/auto_examples/09_interpolation_join.rst.txt @@ -350,9 +350,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 40.1 0.0 NaN - 29.434572 - 30.716030 - -0.012324 + 28.565912 + 75.458772 + 0.071450 1 @@ -362,9 +362,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 39.6 0.0 NaN - 29.473927 - 14.561546 - 0.030016 + 28.771769 + 22.775682 + 0.138466 2 @@ -374,9 +374,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 20.5 NaN NaN - 25.567408 - 35.869333 - 0.354727 + 25.477924 + 56.348624 + 0.217091 3 @@ -386,9 +386,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 39.4 0.0 NaN - 31.417770 - 21.121055 - -0.015113 + 30.320252 + 28.986260 + 0.071450 4 @@ -398,9 +398,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 37.3 NaN NaN - 28.116925 - 17.026104 - 0.094577 + 26.009567 + 20.209893 + 0.243607 @@ -580,7 +580,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - 2.445391 + 3.218007 1 @@ -592,7 +592,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co TX 32.895951 -97.037200 - 19.006449 + 19.344340 2 @@ -604,7 +604,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co NC 36.097747 -79.937297 - 17.140317 + 17.075899 3 @@ -616,7 +616,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co IL 41.979595 -87.904464 - 16.398789 + 16.210871 4 @@ -628,7 +628,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co TX 32.895951 -97.037200 - 12.513037 + 12.563494 @@ -678,11 +678,11 @@ States with the lowest average predicted temperatures: Alaska, Montana, North Da state - AK -4.647790 - MT 0.650437 - WA 1.048342 - ND 2.101548 - MN 2.144767 + AK -3.906127 + WA 0.975304 + MT 1.018083 + MN 1.998170 + ND 2.287165 Name: TMAX, dtype: float64 @@ -707,11 +707,11 @@ States with the highest predicted temperatures: Puerto Rico, Virgin Islands, Haw state - LA 21.666044 - FL 24.700309 - HI 25.903574 - VI 26.837353 - PR 27.499910 + LA 21.687122 + FL 24.634292 + HI 26.193950 + VI 27.213163 + PR 27.858555 Name: TMAX, dtype: float64 @@ -787,7 +787,7 @@ It is a generalization of the :func:`~skrub.fuzzy_join`, as :func:`~skrub.fuzzy_ .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 5.931 seconds) + **Total running time of the script:** (0 minutes 5.619 seconds) .. _sphx_glr_download_auto_examples_09_interpolation_join.py: diff --git a/dev/_sources/auto_examples/sg_execution_times.rst.txt b/dev/_sources/auto_examples/sg_execution_times.rst.txt index 752f0f5e..9f0f9069 100644 --- a/dev/_sources/auto_examples/sg_execution_times.rst.txt +++ b/dev/_sources/auto_examples/sg_execution_times.rst.txt @@ -6,7 +6,7 @@ Computation times ================= -**25:47.142** total execution time for 9 files **from auto_examples**: +**26:00.950** total execution time for 9 files **from auto_examples**: .. container:: @@ -32,30 +32,30 @@ Computation times * - Example - Time - Mem (MB) - * - :ref:`sphx_glr_auto_examples_07_multiple_key_join.py` (``07_multiple_key_join.py``) - - 11:59.788 - - 0.0 * - :ref:`sphx_glr_auto_examples_06_ken_embeddings.py` (``06_ken_embeddings.py``) - - 11:40.136 + - 11:45.643 - 0.0 - * - :ref:`sphx_glr_auto_examples_01_encodings.py` (``01_encodings.py``) - - 01:08.940 + * - :ref:`sphx_glr_auto_examples_07_multiple_key_join.py` (``07_multiple_key_join.py``) + - 11:43.985 - 0.0 - * - :ref:`sphx_glr_auto_examples_08_join_aggregation.py` (``08_join_aggregation.py``) - - 00:21.403 + * - :ref:`sphx_glr_auto_examples_01_encodings.py` (``01_encodings.py``) + - 01:29.145 - 0.0 * - :ref:`sphx_glr_auto_examples_04_fuzzy_joining.py` (``04_fuzzy_joining.py``) - - 00:20.409 + - 00:24.406 - 0.0 - * - :ref:`sphx_glr_auto_examples_09_interpolation_join.py` (``09_interpolation_join.py``) - - 00:05.931 + * - :ref:`sphx_glr_auto_examples_08_join_aggregation.py` (``08_join_aggregation.py``) + - 00:18.508 - 0.0 * - :ref:`sphx_glr_auto_examples_05_deduplication.py` (``05_deduplication.py``) - - 00:04.789 + - 00:05.798 + - 0.0 + * - :ref:`sphx_glr_auto_examples_09_interpolation_join.py` (``09_interpolation_join.py``) + - 00:05.619 - 0.0 * - :ref:`sphx_glr_auto_examples_03_datetime_encoder.py` (``03_datetime_encoder.py``) - - 00:04.129 + - 00:05.359 - 0.0 * - :ref:`sphx_glr_auto_examples_02_feature_interpretation_with_gapencoder.py` (``02_feature_interpretation_with_gapencoder.py``) - - 00:01.618 + - 00:02.488 - 0.0 diff --git a/dev/_sources/sg_execution_times.rst.txt b/dev/_sources/sg_execution_times.rst.txt index d5e8f653..ae21aa18 100644 --- a/dev/_sources/sg_execution_times.rst.txt +++ b/dev/_sources/sg_execution_times.rst.txt @@ -6,7 +6,7 @@ Computation times ================= -**25:47.142** total execution time for 9 files **from all galleries**: +**26:00.950** total execution time for 9 files **from all galleries**: .. container:: @@ -32,30 +32,30 @@ Computation times * - Example - Time - Mem (MB) - * - :ref:`sphx_glr_auto_examples_07_multiple_key_join.py` (``../examples/07_multiple_key_join.py``) - - 11:59.788 - - 0.0 * - :ref:`sphx_glr_auto_examples_06_ken_embeddings.py` (``../examples/06_ken_embeddings.py``) - - 11:40.136 + - 11:45.643 - 0.0 - * - :ref:`sphx_glr_auto_examples_01_encodings.py` (``../examples/01_encodings.py``) - - 01:08.940 + * - :ref:`sphx_glr_auto_examples_07_multiple_key_join.py` (``../examples/07_multiple_key_join.py``) + - 11:43.985 - 0.0 - * - :ref:`sphx_glr_auto_examples_08_join_aggregation.py` (``../examples/08_join_aggregation.py``) - - 00:21.403 + * - :ref:`sphx_glr_auto_examples_01_encodings.py` (``../examples/01_encodings.py``) + - 01:29.145 - 0.0 * - :ref:`sphx_glr_auto_examples_04_fuzzy_joining.py` (``../examples/04_fuzzy_joining.py``) - - 00:20.409 + - 00:24.406 - 0.0 - * - :ref:`sphx_glr_auto_examples_09_interpolation_join.py` (``../examples/09_interpolation_join.py``) - - 00:05.931 + * - :ref:`sphx_glr_auto_examples_08_join_aggregation.py` (``../examples/08_join_aggregation.py``) + - 00:18.508 - 0.0 * - :ref:`sphx_glr_auto_examples_05_deduplication.py` (``../examples/05_deduplication.py``) - - 00:04.789 + - 00:05.798 + - 0.0 + * - :ref:`sphx_glr_auto_examples_09_interpolation_join.py` (``../examples/09_interpolation_join.py``) + - 00:05.619 - 0.0 * - :ref:`sphx_glr_auto_examples_03_datetime_encoder.py` (``../examples/03_datetime_encoder.py``) - - 00:04.129 + - 00:05.359 - 0.0 * - :ref:`sphx_glr_auto_examples_02_feature_interpretation_with_gapencoder.py` (``../examples/02_feature_interpretation_with_gapencoder.py``) - - 00:01.618 + - 00:02.488 - 0.0 diff --git a/dev/auto_examples/01_encodings.html b/dev/auto_examples/01_encodings.html index 703d026f..839364e1 100644 --- a/dev/auto_examples/01_encodings.html +++ b/dev/auto_examples/01_encodings.html @@ -1194,43 +1194,43 @@

A simple prediction pipeline
tv.named_transformers_["high_cardinality"].get_feature_names_out()
 
-
array(['gaithersburg, the, clarksburg', 'station, mc311, state',
-       'patrol, 5th, 4th', 'behavioral, health, school',
-       'division, animal, virtual',
-       'administration, battalion, operations', 'silver, spring, ride',
-       'protective, programs, program', 'automotive, equipment, budget',
-       'delivery, explosive, warehouse', 'human, ombudsman, juvenile',
-       'safety, collision, section', 'training, recruit, recruiting',
-       'highway, services, service', 'district, 3rd, 1st',
-       'communications, communication, commuter',
-       'welfare, children, childhood', 'rockville, twinbrook, library',
-       'maintenance, facilities, council',
-       'engineering, mangement, parking', 'supports, support, sports',
-       'technology, systems, telephone', 'personnel, family, crisis',
-       'eligibility, assistance, assisted', 'director, automated, office',
-       'nicholson, transit, taxicab', 'compliance, accounts, hampden',
-       'construction, building, design', 'security, custody, mcdc',
-       'investigative, inspections, investigations',
-       'operator, equipment, bus', 'manager, investigator, iii',
-       'officer, office, traffic', 'school, room, behavioral',
-       'master, registered, meter', 'liquor, clerk, store',
-       'information, recreation, technology', 'community, health, nurse',
-       'specialist, special, resource',
-       'coordinator, coordinating, services',
-       'supervisory, supervisor, therapist', 'lieutenant, captain, chief',
+
array(['special, programs, program', 'resources, resource, network',
+       'silver, spring, ride', 'station, state, estate',
+       'district, squad, payroll', 'management, equipment, automotive',
+       'investigative, investigations, criminal',
+       'gaithersburg, clarksburg, the', 'engineering, parking, marking',
+       'rockville, downtown, library', 'services, highway, service',
+       'communications, communication, telecommunications',
+       'behavioral, health, school', 'director, automated, office',
+       'safety, division, section', 'technology, telephone, systems',
+       'welfare, child, childhood', 'nicholson, transit, taxicab',
+       'sexual, family, crimes', 'eligibility, assistance, medical',
+       'security, mccf, unit', '4th, 6th, 5th',
+       'delivery, warehouse, liquor',
+       'environmental, regulatory, adolescent',
+       'training, administration, recruit', '3rd, patrol, 2nd',
+       'maintenance, facilities, council', 'supports, support, sports',
+       'building, construction, instruction',
+       'toddlers, custody, members', 'warehouse, welfare, driver',
+       'program, programs, projects',
        'administrative, administration, administrator',
-       'enforcement, inspector, force', 'technician, mechanic, supply',
-       'correctional, correction, corporal',
-       'firefighter, rescuer, recruit',
+       'operator, equipment, apprentice', 'officer, office, police',
+       'candidate, attendant, master', 'firefighter, rescuer, recruit',
+       'school, room, behavioral', 'information, recreation, technology',
+       'liquor, clerk, store', 'specialist, special, planning',
+       'manager, engineer, investigator', 'worker, social, leader',
+       'community, health, nurse', 'technician, mechanic, systems',
+       'enforcement, permitting, inspector',
+       'coordinator, coordinating, legislative',
+       'crossing, library, librarian',
+       'correctional, correction, corporal', 'sheriff, deputy, autobody',
        'communications, telecommunications, safety',
-       'warehouse, welfare, caseworker',
-       'crossing, purchasing, background', 'planning, senior, engineer',
-       'income, assistance, client', 'sergeant, police, cadet',
-       'accountant, assistant, library', 'program, programs, projects',
-       'legislative, principal, executive',
-       'librarian, psychiatric, employee',
-       'customer, urban, representative', 'candidate, sheriff, deputy',
-       'environmental, budget, scientist'], dtype=object)
+       'lieutenant, shift, commander', 'income, assistance, client',
+       'sergeant, emergency, energy', 'principal, executive, examiner',
+       'environmental, budget, ombudsman', 'services, service, urban',
+       'accountant, assistant, attorney',
+       'supervisory, supervisor, therapist', 'captain, rescue, battalion'],
+      dtype=object)
 
-
R2 score:  mean: 0.922; std: 0.014
+
R2 score:  mean: 0.919; std: 0.012
 

The simple pipeline applied on this complex dataset gave us very good results.

@@ -1736,7 +1736,7 @@

ConclusionTotal running time of the script: (1 minutes 8.940 seconds)

+

Total running time of the script: (1 minutes 29.145 seconds)

-
{'movies', 'companies', 'games', 'albums', 'all_entities', 'schools'}
+
{'movies', 'schools', 'all_entities', 'games', 'albums', 'companies'}
 

The games table is the most relevant to our case. @@ -982,7 +982,7 @@

Plotting the results

It helped significantly improve the prediction score.

-

Total running time of the script: (11 minutes 40.136 seconds)

+

Total running time of the script: (11 minutes 45.643 seconds)