Skip to content

Commit

Permalink
Fix implicit ordering of columns in server base
Browse files Browse the repository at this point in the history
Previously, just calling .stack() would reorder the second-level
values, which were then reassigned names in the order those values/tags
were provided in the original multi-level column dataframe. This led to
the wrong feature names being assigned to the wrong data.
  • Loading branch information
milesgranger committed Aug 2, 2019
1 parent 17cdd4b commit e49340d
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 11 deletions.
14 changes: 12 additions & 2 deletions gordo_components/server/views/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,8 +320,18 @@ def multi_lvl_column_dataframe_to_dict(df: pd.DataFrame) -> typing.List[dict]:
# Stack the dataframe so second level column names become second level indexs
df.stack()
# For each column now, unstack the previous second level names (which are now the indexes of the series)
# back into a dataframe with those names, and convert to list
.apply(lambda col: col.unstack().dropna(axis=1).values.tolist())
# back into a dataframe with those names, and convert to list; if it's a Series we'll need to reshape it
.apply(
lambda col: col.reindex(df[col.name].columns, level=1)
.unstack()
.dropna(axis=1)
.values.tolist()
if isinstance(df[col.name], pd.DataFrame)
else col.unstack()
.rename(columns={"": col.name})[col.name]
.values.reshape(-1, 1)
.tolist()
)
)

results: typing.List[dict] = []
Expand Down
15 changes: 8 additions & 7 deletions tests/gordo_components/client/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,21 +412,22 @@ def _endpoint_metadata(name: str, healthy: bool) -> EndpointMetadata:
)


def test_ml_server_dataframe_to_dict_and_back():
@pytest.mark.parametrize("tags", [["C", "A", "B", "D"], tu.SENSORS_STR_LIST])
def test_ml_server_dataframe_to_dict_and_back(tags: typing.List[str]):
"""
Tests the flow of the server creating a dataframe from the model's data, putting into
a dict of string to lists of values, and the client being able to reconstruct it back
a dict of string to df. lists of values, and the client being able to reconstruct it back
to the original dataframe (less the second level names)
"""
# Some synthetic data
original_input = np.random.random((10, len(tu.SENSORTAG_LIST)))
model_output = np.random.random((10, len(tu.SENSORTAG_LIST)))
transformed_model_input = np.random.random((10, len(tu.SENSORTAG_LIST)))
inverse_transformed_model_output = np.random.random((10, len(tu.SENSORTAG_LIST)))
original_input = np.random.random((10, len(tags)))
model_output = np.random.random((10, len(tags)))
transformed_model_input = np.random.random((10, len(tags)))
inverse_transformed_model_output = np.random.random((10, len(tags)))

# Convert this data into a dataframe with multi index columns
df = BaseModelView.make_base_dataframe(
tu.SENSORTAG_LIST,
tags,
original_input,
model_output,
transformed_model_input,
Expand Down
4 changes: 2 additions & 2 deletions tests/gordo_components/server/test_anomaly_view.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,13 @@ def test_anomaly_prediction_endpoint(
# start and end dates, because the server can't know what those are
assert "start" in record
assert (
len(record["start"]) == 0
record["start"][0] is None
if data_to_post is not None
else isinstance(record["start"][0], str)
)
assert "end" in record
assert (
len(record["end"]) == 0
record["end"][0] is None
if data_to_post is not None
else isinstance(record["end"][0], str)
)
Expand Down

0 comments on commit e49340d

Please sign in to comment.