diff --git a/dev/.buildinfo b/dev/.buildinfo index 6f3da64e..a6b5279f 100644 --- a/dev/.buildinfo +++ b/dev/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: e65101e38a131eedb841b03c0c1c495b +config: 1a1ba6b402fb86cc0386a59eccfbc0a2 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/dev/_images/sphx_glr_01_encodings_001.png b/dev/_images/sphx_glr_01_encodings_001.png index 43e86fa4..e9982168 100644 Binary files a/dev/_images/sphx_glr_01_encodings_001.png and b/dev/_images/sphx_glr_01_encodings_001.png differ diff --git a/dev/_images/sphx_glr_01_encodings_thumb.png b/dev/_images/sphx_glr_01_encodings_thumb.png index 17a360c7..ac67e04d 100644 Binary files a/dev/_images/sphx_glr_01_encodings_thumb.png and b/dev/_images/sphx_glr_01_encodings_thumb.png differ diff --git a/dev/_images/sphx_glr_08_join_aggregation_003.png b/dev/_images/sphx_glr_08_join_aggregation_003.png index 395a3083..3b28be08 100644 Binary files a/dev/_images/sphx_glr_08_join_aggregation_003.png and b/dev/_images/sphx_glr_08_join_aggregation_003.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_001.png b/dev/_images/sphx_glr_09_interpolation_join_001.png index 5f041205..537534c8 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_001.png and b/dev/_images/sphx_glr_09_interpolation_join_001.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_002.png b/dev/_images/sphx_glr_09_interpolation_join_002.png index 99d55fc6..22f496fc 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_002.png and b/dev/_images/sphx_glr_09_interpolation_join_002.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_003.png b/dev/_images/sphx_glr_09_interpolation_join_003.png index e63982ad..5a477379 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_003.png and b/dev/_images/sphx_glr_09_interpolation_join_003.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_thumb.png b/dev/_images/sphx_glr_09_interpolation_join_thumb.png index 44516288..728aaeb3 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_thumb.png and b/dev/_images/sphx_glr_09_interpolation_join_thumb.png differ diff --git a/dev/_sources/auto_examples/01_encodings.rst.txt b/dev/_sources/auto_examples/01_encodings.rst.txt index f0b8a886..376c1cd1 100644 --- a/dev/_sources/auto_examples/01_encodings.rst.txt +++ b/dev/_sources/auto_examples/01_encodings.rst.txt @@ -847,43 +847,43 @@ corresponding columns: .. code-block:: none - array(['gaithersburg, the, clarksburg', 'station, mc311, state', - 'patrol, 5th, 4th', 'behavioral, health, school', - 'division, animal, virtual', - 'administration, battalion, operations', 'silver, spring, ride', - 'protective, programs, program', 'automotive, equipment, budget', - 'delivery, explosive, warehouse', 'human, ombudsman, juvenile', - 'safety, collision, section', 'training, recruit, recruiting', - 'highway, services, service', 'district, 3rd, 1st', - 'communications, communication, commuter', - 'welfare, children, childhood', 'rockville, twinbrook, library', - 'maintenance, facilities, council', - 'engineering, mangement, parking', 'supports, support, sports', - 'technology, systems, telephone', 'personnel, family, crisis', - 'eligibility, assistance, assisted', 'director, automated, office', - 'nicholson, transit, taxicab', 'compliance, accounts, hampden', - 'construction, building, design', 'security, custody, mcdc', - 'investigative, inspections, investigations', - 'operator, equipment, bus', 'manager, investigator, iii', - 'officer, office, traffic', 'school, room, behavioral', - 'master, registered, meter', 'liquor, clerk, store', - 'information, recreation, technology', 'community, health, nurse', - 'specialist, special, resource', - 'coordinator, coordinating, services', - 'supervisory, supervisor, therapist', 'lieutenant, captain, chief', + array(['special, programs, program', 'resources, resource, network', + 'silver, spring, ride', 'station, state, estate', + 'district, squad, payroll', 'management, equipment, automotive', + 'investigative, investigations, criminal', + 'gaithersburg, clarksburg, the', 'engineering, parking, marking', + 'rockville, downtown, library', 'services, highway, service', + 'communications, communication, telecommunications', + 'behavioral, health, school', 'director, automated, office', + 'safety, division, section', 'technology, telephone, systems', + 'welfare, child, childhood', 'nicholson, transit, taxicab', + 'sexual, family, crimes', 'eligibility, assistance, medical', + 'security, mccf, unit', '4th, 6th, 5th', + 'delivery, warehouse, liquor', + 'environmental, regulatory, adolescent', + 'training, administration, recruit', '3rd, patrol, 2nd', + 'maintenance, facilities, council', 'supports, support, sports', + 'building, construction, instruction', + 'toddlers, custody, members', 'warehouse, welfare, driver', + 'program, programs, projects', 'administrative, administration, administrator', - 'enforcement, inspector, force', 'technician, mechanic, supply', - 'correctional, correction, corporal', - 'firefighter, rescuer, recruit', + 'operator, equipment, apprentice', 'officer, office, police', + 'candidate, attendant, master', 'firefighter, rescuer, recruit', + 'school, room, behavioral', 'information, recreation, technology', + 'liquor, clerk, store', 'specialist, special, planning', + 'manager, engineer, investigator', 'worker, social, leader', + 'community, health, nurse', 'technician, mechanic, systems', + 'enforcement, permitting, inspector', + 'coordinator, coordinating, legislative', + 'crossing, library, librarian', + 'correctional, correction, corporal', 'sheriff, deputy, autobody', 'communications, telecommunications, safety', - 'warehouse, welfare, caseworker', - 'crossing, purchasing, background', 'planning, senior, engineer', - 'income, assistance, client', 'sergeant, police, cadet', - 'accountant, assistant, library', 'program, programs, projects', - 'legislative, principal, executive', - 'librarian, psychiatric, employee', - 'customer, urban, representative', 'candidate, sheriff, deputy', - 'environmental, budget, scientist'], dtype=object) + 'lieutenant, shift, commander', 'income, assistance, client', + 'sergeant, emergency, energy', 'principal, executive, examiner', + 'environmental, budget, ombudsman', 'services, service, urban', + 'accountant, assistant, attorney', + 'supervisory, supervisor, therapist', 'captain, rescue, battalion'], + dtype=object) @@ -965,7 +965,7 @@ Let's look at the cross-validated R2 score of our model: .. code-block:: none - R2 score: mean: 0.922; std: 0.014 + R2 score: mean: 0.919; std: 0.012 @@ -1505,7 +1505,7 @@ to plot the feature importances. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (1 minutes 8.940 seconds) + **Total running time of the script:** (1 minutes 29.145 seconds) .. _sphx_glr_download_auto_examples_01_encodings.py: diff --git a/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt b/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt index 95337ca1..a68312cb 100644 --- a/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt +++ b/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt @@ -504,7 +504,7 @@ as a set of latent topics. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 1.618 seconds) + **Total running time of the script:** (0 minutes 2.488 seconds) .. _sphx_glr_download_auto_examples_02_feature_interpretation_with_gapencoder.py: diff --git a/dev/_sources/auto_examples/03_datetime_encoder.rst.txt b/dev/_sources/auto_examples/03_datetime_encoder.rst.txt index d672fbb8..a7c8c0df 100644 --- a/dev/_sources/auto_examples/03_datetime_encoder.rst.txt +++ b/dev/_sources/auto_examples/03_datetime_encoder.rst.txt @@ -610,7 +610,7 @@ and transforms datetime columns by default. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 4.129 seconds) + **Total running time of the script:** (0 minutes 5.359 seconds) .. _sphx_glr_download_auto_examples_03_datetime_encoder.py: diff --git a/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt b/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt index bc7e71c8..58077fa4 100644 --- a/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt +++ b/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt @@ -1732,7 +1732,7 @@ introduced into a grid search: .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 20.409 seconds) + **Total running time of the script:** (0 minutes 24.406 seconds) .. _sphx_glr_download_auto_examples_04_fuzzy_joining.py: diff --git a/dev/_sources/auto_examples/05_deduplication.rst.txt b/dev/_sources/auto_examples/05_deduplication.rst.txt index cae39d3e..8805fabe 100644 --- a/dev/_sources/auto_examples/05_deduplication.rst.txt +++ b/dev/_sources/auto_examples/05_deduplication.rst.txt @@ -335,7 +335,7 @@ or |MinHash|. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 4.789 seconds) + **Total running time of the script:** (0 minutes 5.798 seconds) .. _sphx_glr_download_auto_examples_05_deduplication.py: diff --git a/dev/_sources/auto_examples/06_ken_embeddings.rst.txt b/dev/_sources/auto_examples/06_ken_embeddings.rst.txt index a29e0ad7..8cc44e79 100644 --- a/dev/_sources/auto_examples/06_ken_embeddings.rst.txt +++ b/dev/_sources/auto_examples/06_ken_embeddings.rst.txt @@ -305,7 +305,7 @@ We will start by checking out the available tables with .. code-block:: none - {'movies', 'companies', 'games', 'albums', 'all_entities', 'schools'} + {'movies', 'schools', 'all_entities', 'games', 'albums', 'companies'} @@ -873,7 +873,7 @@ It helped significantly improve the prediction score. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (11 minutes 40.136 seconds) + **Total running time of the script:** (11 minutes 45.643 seconds) .. _sphx_glr_download_auto_examples_06_ken_embeddings.py: diff --git a/dev/_sources/auto_examples/07_multiple_key_join.rst.txt b/dev/_sources/auto_examples/07_multiple_key_join.rst.txt index ec2fdc9b..75d10c73 100644 --- a/dev/_sources/auto_examples/07_multiple_key_join.rst.txt +++ b/dev/_sources/auto_examples/07_multiple_key_join.rst.txt @@ -1256,7 +1256,7 @@ The results: /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/sklearn/preprocessing/_encoders.py:241: UserWarning: Found unknown categories in columns [1] during transform. These unknown categories will be encoded as all zeros warnings.warn( - 0.5908 + 0.5864499999999999 @@ -1274,7 +1274,7 @@ Our final cross-validated accuracy score is 0.58. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (11 minutes 59.788 seconds) + **Total running time of the script:** (11 minutes 43.985 seconds) .. _sphx_glr_download_auto_examples_07_multiple_key_join.py: diff --git a/dev/_sources/auto_examples/08_join_aggregation.rst.txt b/dev/_sources/auto_examples/08_join_aggregation.rst.txt index 4f4f6b62..5f3db1e6 100644 --- a/dev/_sources/auto_examples/08_join_aggregation.rst.txt +++ b/dev/_sources/auto_examples/08_join_aggregation.rst.txt @@ -1284,75 +1284,75 @@ operation maximizing our validation score.
tv.named_transformers_["high_cardinality"].get_feature_names_out()
array(['gaithersburg, the, clarksburg', 'station, mc311, state',
- 'patrol, 5th, 4th', 'behavioral, health, school',
- 'division, animal, virtual',
- 'administration, battalion, operations', 'silver, spring, ride',
- 'protective, programs, program', 'automotive, equipment, budget',
- 'delivery, explosive, warehouse', 'human, ombudsman, juvenile',
- 'safety, collision, section', 'training, recruit, recruiting',
- 'highway, services, service', 'district, 3rd, 1st',
- 'communications, communication, commuter',
- 'welfare, children, childhood', 'rockville, twinbrook, library',
- 'maintenance, facilities, council',
- 'engineering, mangement, parking', 'supports, support, sports',
- 'technology, systems, telephone', 'personnel, family, crisis',
- 'eligibility, assistance, assisted', 'director, automated, office',
- 'nicholson, transit, taxicab', 'compliance, accounts, hampden',
- 'construction, building, design', 'security, custody, mcdc',
- 'investigative, inspections, investigations',
- 'operator, equipment, bus', 'manager, investigator, iii',
- 'officer, office, traffic', 'school, room, behavioral',
- 'master, registered, meter', 'liquor, clerk, store',
- 'information, recreation, technology', 'community, health, nurse',
- 'specialist, special, resource',
- 'coordinator, coordinating, services',
- 'supervisory, supervisor, therapist', 'lieutenant, captain, chief',
+array(['special, programs, program', 'resources, resource, network',
+ 'silver, spring, ride', 'station, state, estate',
+ 'district, squad, payroll', 'management, equipment, automotive',
+ 'investigative, investigations, criminal',
+ 'gaithersburg, clarksburg, the', 'engineering, parking, marking',
+ 'rockville, downtown, library', 'services, highway, service',
+ 'communications, communication, telecommunications',
+ 'behavioral, health, school', 'director, automated, office',
+ 'safety, division, section', 'technology, telephone, systems',
+ 'welfare, child, childhood', 'nicholson, transit, taxicab',
+ 'sexual, family, crimes', 'eligibility, assistance, medical',
+ 'security, mccf, unit', '4th, 6th, 5th',
+ 'delivery, warehouse, liquor',
+ 'environmental, regulatory, adolescent',
+ 'training, administration, recruit', '3rd, patrol, 2nd',
+ 'maintenance, facilities, council', 'supports, support, sports',
+ 'building, construction, instruction',
+ 'toddlers, custody, members', 'warehouse, welfare, driver',
+ 'program, programs, projects',
'administrative, administration, administrator',
- 'enforcement, inspector, force', 'technician, mechanic, supply',
- 'correctional, correction, corporal',
- 'firefighter, rescuer, recruit',
+ 'operator, equipment, apprentice', 'officer, office, police',
+ 'candidate, attendant, master', 'firefighter, rescuer, recruit',
+ 'school, room, behavioral', 'information, recreation, technology',
+ 'liquor, clerk, store', 'specialist, special, planning',
+ 'manager, engineer, investigator', 'worker, social, leader',
+ 'community, health, nurse', 'technician, mechanic, systems',
+ 'enforcement, permitting, inspector',
+ 'coordinator, coordinating, legislative',
+ 'crossing, library, librarian',
+ 'correctional, correction, corporal', 'sheriff, deputy, autobody',
'communications, telecommunications, safety',
- 'warehouse, welfare, caseworker',
- 'crossing, purchasing, background', 'planning, senior, engineer',
- 'income, assistance, client', 'sergeant, police, cadet',
- 'accountant, assistant, library', 'program, programs, projects',
- 'legislative, principal, executive',
- 'librarian, psychiatric, employee',
- 'customer, urban, representative', 'candidate, sheriff, deputy',
- 'environmental, budget, scientist'], dtype=object)
+ 'lieutenant, shift, commander', 'income, assistance, client',
+ 'sergeant, emergency, energy', 'principal, executive, examiner',
+ 'environmental, budget, ombudsman', 'services, service, urban',
+ 'accountant, assistant, attorney',
+ 'supervisory, supervisor, therapist', 'captain, rescue, battalion'],
+ dtype=object)
R2 score: mean: 0.922; std: 0.014
+R2 score: mean: 0.919; std: 0.012
The simple pipeline applied on this complex dataset gave us very good results.
@@ -1736,7 +1736,7 @@ ConclusionTotal running time of the script: (1 minutes 8.940 seconds)
+Total running time of the script: (1 minutes 29.145 seconds)