[Describe] Align describe to new pandas version (#812)
* [Describe] Align describe to new pandas version

* minor test fix

* update mlrun version

* add dask to requirements

* remove dask

* update numpy version

* debug

* debug

* debug

* remove dask tests

* remove debug code
yonishelach authored Jun 13, 2024
1 parent d692c1a commit 1ca8e5e
Showing 5 changed files with 65 additions and 151 deletions.
39 changes: 20 additions & 19 deletions describe/describe.py
@@ -36,7 +36,7 @@
 )
 from mlrun.datastore import DataItem
 from mlrun.execution import MLClientCtx
-from mlrun.feature_store import FeatureSet, FeatureVector
+from mlrun.feature_store import FeatureSet
 from plotly.subplots import make_subplots
 
 pd.set_option("display.float_format", lambda x: "%.2f" % x)
@@ -234,24 +234,24 @@ def _create_features_histogram_artifacts(
     if label_column is not None and problem_type == "classification":
         all_labels = df[label_column].unique()
     visible = True
-    for (columnName, _) in df.iteritems():
-        if columnName == label_column:
+    for column_name in df.columns:
+        if column_name == label_column:
             continue
 
         if label_column is not None and problem_type == "classification":
             for label in all_labels:
                 sub_fig = go.Histogram(
                     histfunc="count",
-                    x=df.loc[df[label_column] == label][columnName],
+                    x=df.loc[df[label_column] == label][column_name],
                     name=str(label),
                     visible=visible,
                 )
-                figs[f"{columnName}@?@{label}"] = sub_fig
+                figs[f"{column_name}@?@{label}"] = sub_fig
         else:
-            sub_fig = go.Histogram(histfunc="count", x=df[columnName], visible=visible)
-            figs[f"{columnName}@?@{1}"] = sub_fig
+            sub_fig = go.Histogram(histfunc="count", x=df[column_name], visible=visible)
+            figs[f"{column_name}@?@{1}"] = sub_fig
         if visible:
-            first_feature_name = columnName
+            first_feature_name = column_name
             visible = False
 
     fig = go.Figure()
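The motivating change here: DataFrame.iteritems() was deprecated in pandas 1.5 and removed in pandas 2.0, so any loop over it raises AttributeError on a current install. A minimal sketch of the migration pattern this commit applies throughout (the sample frame is illustrative, not from the repo):

    import pandas as pd

    df = pd.DataFrame({"height": [1.6, 1.7], "weight": [55.0, 70.0]})

    # pandas < 2.0 only (raises AttributeError on 2.x):
    # for column_name, column_data in df.iteritems():
    #     ...

    # Forward-compatible: iterate the column labels and index into the frame.
    for column_name in df.columns:
        column_data = df[column_name]
        print(column_name, column_data.tolist())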
@@ -338,7 +338,7 @@ def _create_features_2d_scatter_artifacts(
     Create and log a scatter-2d artifact for each couple of features
     """
     features = [
-        columnName for (columnName, _) in df.iteritems() if columnName != label_column
+        column_name for column_name in df.columns if column_name != label_column
     ]
     max_feature_len = float(max(len(elem) for elem in features))
     if label_column is not None:
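Where the old tuple-unpacking form is preferred, DataFrame.items() is the drop-in replacement: it yields the same (name, Series) pairs and works on both pandas 1.x and 2.x. A short sketch with an illustrative frame and label column:

    import pandas as pd

    df = pd.DataFrame({"f1": [1, 2], "f2": [3, 4], "label": [0, 1]})
    label_column = "label"

    # items() behaves exactly like the removed iteritems()
    features = [name for name, _ in df.items() if name != label_column]
    assert features == ["f1", "f2"]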
@@ -450,11 +450,12 @@ def _create_violin_artifact(
 
     plot_num = 0
 
-    for (columnName, columnData) in df.iteritems():
+    for column_name in df.columns:
+        column_data = df[column_name]
         violin = go.Violin(
-            x=[columnName] * columnData.shape[0],
-            y=columnData,
-            name=columnName,
+            x=[column_name] * column_data.shape[0],
+            y=column_data,
+            name=column_name,
         )
 
         fig.add_trace(
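The violin construction itself is unchanged; only the iteration moved off iteritems(). A self-contained sketch of the per-column pattern, using a single figure rather than the function's subplot grid (sample data is illustrative):

    import pandas as pd
    import plotly.graph_objects as go

    df = pd.DataFrame({"height": [1.6, 1.7, 1.8], "weight": [55.0, 70.0, 80.0]})

    fig = go.Figure()
    for column_name in df.columns:
        column_data = df[column_name]
        fig.add_trace(
            go.Violin(
                x=[column_name] * column_data.shape[0],  # one x-category per feature
                y=column_data,
                name=column_name,
            )
        )
    fig.write_html("violin.html")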
@@ -491,15 +492,15 @@ def _create_imbalance_artifact(
     """
     if label_column:
         if problem_type == "classification":
+            values_column = "count"
             labels_count = df[label_column].value_counts().sort_index()
             df_labels_count = pd.DataFrame(labels_count)
-            df_labels_count.rename(columns={label_column: "Total"}, inplace=True)
             df_labels_count[label_column] = labels_count.index
-            df_labels_count["weights"] = df_labels_count["Total"] / sum(
-                df_labels_count["Total"]
+            df_labels_count.rename(columns={"": values_column}, inplace=True)
+            df_labels_count[values_column] = df_labels_count[values_column] / sum(
+                df_labels_count[values_column]
             )
 
-            fig = px.pie(df_labels_count, names=label_column, values="Total")
+            fig = px.pie(df_labels_count, names=label_column, values=values_column)
         else:
             fig = px.histogram(
                 histfunc="count",
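The renaming above tracks a pandas 2.0 behavior change: Series.value_counts() now returns a Series named "count" (with the index named after the original column), whereas pandas 1.x reused the original column name, which is why the old code renamed the label column to "Total". A sketch of the difference:

    import pandas as pd

    labels = pd.Series(["a", "a", "b"], name="label")
    counts = labels.value_counts()

    # pandas 1.x: counts.name == "label", index unnamed
    # pandas 2.x: counts.name == "count", index named "label"
    df_counts = pd.DataFrame(counts)
    print(df_counts.columns.tolist())               # ["count"] on pandas 2.x
    print(df_counts["count"] / df_counts["count"].sum())  # class weights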
@@ -532,7 +533,7 @@ def _create_corr_artifact(
     """
     if label_column is not None:
         df = df.drop([label_column], axis=1)
-    tblcorr = df.corr()
+    tblcorr = df.corr(numeric_only=True)
     extra_data["correlation-matrix-csv"] = context.log_artifact(
         TableArtifact("correlation-matrix-csv", df=tblcorr, visible=True),
         local_path=f"{plots_dest}/correlation-matrix.csv",
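The numeric_only argument is needed because pandas 2.0 changed the default of DataFrame.corr() from silently dropping non-numeric columns to raising on them, so a frame with, say, a timestamp or string column now fails outright. Passing numeric_only=True restores the pre-2.0 behavior. A minimal sketch:

    import pandas as pd

    df = pd.DataFrame({"x": [1, 2, 3], "y": [2, 4, 6], "kind": ["a", "b", "c"]})

    # pandas 2.x: df.corr() raises ValueError because "kind" is not numeric.
    # numeric_only=True restores the old drop-non-numeric behavior.
    print(df.corr(numeric_only=True))  # correlation over "x" and "y" only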
96 changes: 43 additions & 53 deletions describe/function.yaml

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions describe/item.yaml
@@ -11,7 +11,7 @@ labels:
   author: Davids
 maintainers: []
 marketplaceType: ''
-mlrunVersion: 1.4.1
+mlrunVersion: 1.6.0
 name: describe
 platformVersion: 3.5.3
 spec:
@@ -21,4 +21,4 @@ spec:
   kind: job
   requirements: []
   url: ''
-version: 1.2.0
+version: 1.3.0
1 change: 0 additions & 1 deletion describe/requirements.txt
@@ -1,6 +1,5 @@
 scikit-learn~=1.0.2
 plotly~=5.16.1
 pytest~=7.0.1
-pandas~=1.3.5
 matplotlib~=3.5.1
 seaborn~=0.11.2
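With the pandas~=1.3.5 pin removed, the function runs against whatever pandas the base image provides. A hypothetical guard, not part of this commit, that would make the new assumption explicit at import time:

    import pandas as pd
    from packaging.version import Version

    # Hypothetical check: the describe code above relies on pandas >= 2.0
    # semantics (no iteritems(), value_counts() named "count", corr(numeric_only=...)).
    assert Version(pd.__version__) >= Version("2.0"), pd.__version__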
76 changes: 0 additions & 76 deletions describe/test_describe.py
@@ -271,79 +271,3 @@ def _create_data(n_samples, n_features, n_classes, n_informative, reg=False)
     df["timestamp"] = [pd.Timestamp("2022").now()] * n_samples
     df.to_parquet("artifacts/random_dataset.parquet")
     return df
-
-
-def _create_dask_func(uri):
-    dask_cluster_name = "dask-cluster"
-    dask_cluster = new_function(dask_cluster_name, kind="dask", image="mlrun/ml-models")
-    dask_cluster.spec.remote = False
-    dask_uri = uri
-    dask_cluster.export(dask_uri)
-
-
-def test_import_function_describe_dask():
-    dask_uri = "dask_func.yaml"
-    _create_dask_func(dask_uri)
-    describe_func = import_function("function.yaml")
-    is_test_passed = True
-    _create_data(n_samples=100, n_features=5, n_classes=3, n_informative=3)
-    describe_func.spec.command = "describe_dask.py"
-
-    try:
-        describe_run = describe_func.run(
-            name="task-describe",
-            handler="analyze",
-            inputs={"table": DATA_PATH},
-            params={
-                "label_column": "label",
-                "dask_function": dask_uri,
-                "dask_flag": True,
-            },
-            artifact_path=os.path.abspath("./artifacts"),
-            local=True,
-        )
-
-    except Exception as exception:
-        print(f"- The test failed - raised the following error:\n- {exception}")
-        is_test_passed = False
-    _validate_paths(
-        {
-            "imbalance.html",
-            "imbalance-weights-vec.csv",
-        }
-    )
-    assert is_test_passed
-
-
-def test_code_to_function_describe_dask():
-    dask_uri = "dask_func.yaml"
-    _create_dask_func(dask_uri)
-    describe_func = code_to_function(filename="describe.py", kind="local")
-    is_test_passed = True
-    _create_data(n_samples=100, n_features=5, n_classes=3, n_informative=3)
-    describe_func.spec.command = "describe_dask.py"
-
-    try:
-        describe_run = describe_func.run(
-            name="task-describe",
-            handler="analyze",
-            inputs={"table": DATA_PATH},
-            params={
-                "label_column": "label",
-                "dask_function": dask_uri,
-                "dask_flag": True,
-            },
-            artifact_path=os.path.abspath("./artifacts"),
-            local=True,
-        )
-
-    except Exception as exception:
-        print(f"- The test failed - raised the following error:\n- {exception}")
-        is_test_passed = False
-    _validate_paths(
-        {
-            "imbalance.html",
-            "imbalance-weights-vec.csv",
-        }
-    )
-    assert is_test_passed