[ENH] Drop numpy array input support #831

Closed
benchmarks/bench_gap_divergence.py (4 changes: 2 additions & 2 deletions)

@@ -208,7 +208,7 @@ def benchmark(max_iter_e_step: int, dataset_name: str):
             (
                 "encoding",
                 TableVectorizer(
-                    high_card_cat_transformer=ModifiedGapEncoder(
+                    high_cardinality_transformer=ModifiedGapEncoder(
                         min_iter=5,
                         max_iter=5,
                         max_iter_e_step=max_iter_e_step,
@@ -234,7 +234,7 @@ def benchmark(max_iter_e_step: int, dataset_name: str):
     results = []
    for pipeline, (_, cv_results) in zip(pipelines, cv_df.iterrows()):
        for modified_gap_encoder in (
-            pipeline["encoding"].named_transformers_["high_card_cat"].fitted_models_
+            pipeline["encoding"].named_transformers_["high_cardinality"].fitted_models_
        ):
            for gap_iter, inner_results in enumerate(
                modified_gap_encoder.benchmark_results_
benchmarks/bench_tablevectorizer_tuning.py (2 changes: 1 addition & 1 deletion)

@@ -59,7 +59,7 @@ def benchmark(
 ):
     tv = TableVectorizer(
         cardinality_threshold=tv_cardinality_threshold,
-        high_card_cat_transformer=MinHashEncoder(n_components=minhash_n_components),
+        high_cardinality_transformer=MinHashEncoder(n_components=minhash_n_components),
     )

     dataset = dataset_map[dataset_name]
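In this benchmark, cardinality_threshold decides which string columns are routed to the transformer passed as high_cardinality_transformer. For context, here is a minimal, hedged sketch of the renamed parameter in use on a toy DataFrame; the column names, threshold value, and data are illustrative only and are not part of the benchmark:

# Illustrative sketch only: toy data, not the benchmark's datasets.
import pandas as pd

from skrub import MinHashEncoder, TableVectorizer

df = pd.DataFrame(
    {
        "city": ["Paris", "London", "Paris", "London"],  # 2 categories -> low cardinality
        "job_title": [  # 4 dirty categories -> high cardinality
            "Senior Developer",
            "Sr. Developer",
            "Data Engineer",
            "Engineer, data",
        ],
        "salary": [50, 60, 55, 58],  # numeric, passed through
    }
)

tv = TableVectorizer(
    # columns with at least 3 unique categories go to the high-cardinality transformer
    cardinality_threshold=3,
    high_cardinality_transformer=MinHashEncoder(n_components=10),
)
X = tv.fit_transform(df)  # "job_title" is MinHash-encoded, "city" is one-hot encoded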
benchmarks/run_on_openml_datasets.py (4 changes: 2 additions & 2 deletions)

@@ -45,14 +45,14 @@

 classification_pipeline = Pipeline(
     [
-        ("vectorizer", TableVectorizer(high_card_cat_transformer=MinHashEncoder())),
+        ("vectorizer", TableVectorizer(high_cardinality_transformer=MinHashEncoder())),
         ("classifier", HistGradientBoostingClassifier()),
     ]
 )

 regression_pipeline = Pipeline(
     [
-        ("vectorizer", TableVectorizer(high_card_cat_transformer=MinHashEncoder())),
+        ("vectorizer", TableVectorizer(high_cardinality_transformer=MinHashEncoder())),
         ("regressor", HistGradientBoostingRegressor()),
     ]
 )
examples/01_encodings.py (4 changes: 2 additions & 2 deletions)

@@ -94,14 +94,14 @@
 # - The |OneHotEncoder| for low cardinality string variables, the columns
 #   ``'gender'``, ``'department'``, ``'department_name'`` and ``'assignment_category'``.

-tv.named_transformers_["low_card_cat"].get_feature_names_out()
+tv.named_transformers_["low_cardinality"].get_feature_names_out()

 ###############################################################################
 # - The |GapEncoder| for high cardinality string columns, ``'employee_position_title'``
 #   and ``'division'``. The |GapEncoder| is a powerful encoder that can handle dirty
 #   categorical columns.

-tv.named_transformers_["high_card_cat"].get_feature_names_out()
+tv.named_transformers_["high_cardinality"].get_feature_names_out()

 ###############################################################################
 # - The |DatetimeEncoder| to the ``'date_first_hired'`` column. The |DatetimeEncoder|
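The example above inspects the sub-transformers fitted by the TableVectorizer through named_transformers_; this change renames the keys from "low_card_cat" / "high_card_cat" to "low_cardinality" / "high_cardinality". Below is a hedged, self-contained sketch of that access pattern; it assumes skrub's fetch_employee_salaries helper, whose return value is expected to expose the employees table as .X:

# Hedged sketch (not part of the diff): reproduces the example's access pattern
# with the renamed keys. Assumes skrub.datasets.fetch_employee_salaries returns
# an object exposing the employees table as .X.
from skrub import TableVectorizer
from skrub.datasets import fetch_employee_salaries

employees = fetch_employee_salaries()
tv = TableVectorizer()
tv.fit(employees.X)

# OneHotEncoder features for the low-cardinality columns
# ('gender', 'department', 'department_name', 'assignment_category'):
print(tv.named_transformers_["low_cardinality"].get_feature_names_out())

# GapEncoder features for the high-cardinality columns
# ('employee_position_title', 'division'):
print(tv.named_transformers_["high_cardinality"].get_feature_names_out())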
examples/FIXME/07_grid_searching_with_the_tablevectorizer.py (10 changes: 5 additions & 5 deletions)

@@ -62,7 +62,7 @@
 from skrub import MinHashEncoder

 tv = TableVectorizer(
-    high_card_cat_transformer=MinHashEncoder(),
+    high_cardinality_transformer=MinHashEncoder(),
 )
 tv.fit(X)

@@ -101,8 +101,8 @@
 # For that, we use the dunder separator, which indicates a nesting layer.
 # That means that for tuning the parameter ``n_components`` of the
 # |GapEncoder| saved in the |TableVectorizer| attribute
-# ``high_card_cat_transformer``, we use the syntax
-# ``tablevectorizer__high_card_cat_transformer__n_components``.
+# ``high_cardinality_transformer``, we use the syntax
+# ``tablevectorizer__high_cardinality_transformer__n_components``.
 #
 # We recommend using the 3-tuple syntax for the column-specific transformers,
 # which allows us to give a name to the assignment (here ``mh_dep_name``).
@@ -114,7 +114,7 @@

 pipeline = make_pipeline(
     TableVectorizer(
-        high_card_cat_transformer=GapEncoder(),
+        high_cardinality_transformer=GapEncoder(),
         specific_transformers=[
             ("mh_dep_name", MinHashEncoder(), ["department_name"]),
         ],
@@ -123,7 +123,7 @@
 )

 params = {
-    "tablevectorizer__high_card_cat_transformer__n_components": [10, 30, 50],
+    "tablevectorizer__high_cardinality_transformer__n_components": [10, 30, 50],
     "tablevectorizer__mh_dep_name__n_components": [25, 50],
 }
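The params dict in this example plugs directly into scikit-learn's GridSearchCV using the dunder parameter names discussed above. The following is a hedged sketch of that final step, assuming X and y are the feature table and target built earlier in the example (they are not shown in this diff):

# Hedged sketch: wiring the dunder parameter names from this example into
# scikit-learn's GridSearchCV. X and y are assumed to be defined earlier
# in the example (feature table and target).
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

from skrub import GapEncoder, MinHashEncoder, TableVectorizer

pipeline = make_pipeline(
    TableVectorizer(
        high_cardinality_transformer=GapEncoder(),
        specific_transformers=[
            ("mh_dep_name", MinHashEncoder(), ["department_name"]),
        ],
    ),
    HistGradientBoostingRegressor(),
)

params = {
    # n_components of the GapEncoder nested in the TableVectorizer
    "tablevectorizer__high_cardinality_transformer__n_components": [10, 30, 50],
    # n_components of the MinHashEncoder assigned to 'department_name'
    "tablevectorizer__mh_dep_name__n_components": [25, 50],
}

grid_search = GridSearchCV(pipeline, params)
grid_search.fit(X, y)
print(grid_search.best_params_)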