Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

94 feat generalized transform norm and distance fct #95

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 124 additions & 0 deletions src/autora/utils/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import numpy as np


def norms(arr: np.ndarray) -> np.ndarray:
    """
    Calculate the Euclidean (L2) norm of every entry along the first axis.

    Each entry ``arr[i]`` is flattened, no matter how deeply it is nested, and
    reduced to a single number. The result therefore holds one norm per entry
    (hence the plural name).

    Args:
        arr: array whose first axis enumerates the entries, e.g. the output of
            ``autora.utils.transform.to_array``

    Returns:
        a one-dimensional array of length ``len(arr)`` holding the norm of
        each entry

    Examples:
        >>> import pandas as pd
        >>> from autora.utils.transform import to_array

        Simple dataframe with one condition
        >>> df = pd.DataFrame({'x_0': [.2, 2, 3]})

        First transform:
        >>> as_array = to_array(df)
        >>> norms(as_array)
        array([0.2, 2. , 3. ])

        >>> df_two_dim = pd.DataFrame({'x_0': [0, 1, 3], 'x_1': [1, 0, 4]})
        >>> as_array = to_array(df_two_dim)
        >>> norms(as_array)
        array([1., 1., 5.])

        For nested dataframes
        >>> df_nested = pd.DataFrame({
        ...     'x_0': [[0, 0], [0, 1], [1, 0], [3, 4]]
        ... })
        >>> as_array = to_array(df_nested)
        >>> norms(as_array)
        array([0., 1., 1., 5.])

        ... and deeply nested
        >>> df_nested_deep = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 1]], [[3, 0], [0, 4]]]
        ... })
        >>> as_array = to_array(df_nested_deep)
        >>> norms(as_array)
        array([1., 5.])

        ... no matter how many columns
        >>> df_nested_deep_multi_column = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 4]], [[1, 0], [0, 0]]],
        ...     'x_1': [[[0, 3], [0, 0]], [[0, 0], [0, 0]]]
        ... })
        >>> as_array = to_array(df_nested_deep_multi_column)
        >>> norms(as_array)
        array([5., 1.])
    """
    # NOTE(review): a reviewer suggested renaming this function to ``norm`` to
    # mirror ``numpy.linalg.norm``; the name is kept here so that existing
    # imports keep working.
    if not isinstance(arr, np.ndarray) or arr.ndim == 0 or arr.dtype == object:
        # Array-likes, 0-d arrays and object arrays (whose entries may be
        # ragged) cannot be reshaped reliably, so handle them entry by entry.
        return np.array([np.linalg.norm(np.ravel(row)) for row in arr])

    # Collapse everything behind the first axis into one axis. The width is
    # computed explicitly because ``reshape(n, -1)`` is ambiguous when n == 0.
    width = int(np.prod(arr.shape[1:]))
    flat = arr.reshape(arr.shape[0], width)
    return np.linalg.norm(flat, axis=1)


def distances(arr_1: np.ndarray, arr_2: np.ndarray) -> np.ndarray:
    """
    Calculate the Euclidean distance between corresponding entries of two arrays.

    Entries are paired along the first axis. Each pair ``arr_1[i]``,
    ``arr_2[i]`` is flattened, no matter how deeply it is nested, and reduced
    to a single distance.

    Args:
        arr_1: first array; its first axis enumerates the entries
        arr_2: second array, which must have the same shape as ``arr_1``

    Returns:
        a one-dimensional array of length ``len(arr_1)`` holding the distance
        between each pair of entries

    Raises:
        AssertionError: if the two arrays do not have the same shape

    Examples:
        >>> import pandas as pd
        >>> from autora.utils.transform import to_array

        Simple dataframe with one condition
        >>> df_1 = pd.DataFrame({'x_0': [0, 1, 2]})
        >>> df_2 = pd.DataFrame({'x_0': [1, 2, 3]})

        First transform:
        >>> as_array_1 = to_array(df_1)
        >>> as_array_2 = to_array(df_2)
        >>> distances(as_array_1, as_array_2)
        array([1., 1., 1.])

        >>> df_two_dim_1 = pd.DataFrame({'x_0': [0, 1, 3], 'x_1': [1, 0, 4]})
        >>> df_two_dim_2 = pd.DataFrame({'x_0': [0, 1, 3], 'x_1': [1, 1, 4]})
        >>> as_array_1 = to_array(df_two_dim_1)
        >>> as_array_2 = to_array(df_two_dim_2)
        >>> distances(as_array_1, as_array_2)
        array([0., 1., 0.])

        For nested dataframes
        >>> df_nested_1 = pd.DataFrame({
        ...     'x_0': [[0, 0], [0, 2], [0, 2], [0, 10], [4, 0]]
        ... })
        >>> df_nested_2 = pd.DataFrame({
        ...     'x_0': [[1, 0], [0, 0], [0, 5], [0, 6], [0, 3]]
        ... })
        >>> as_array_1 = to_array(df_nested_1)
        >>> as_array_2 = to_array(df_nested_2)
        >>> distances(as_array_1, as_array_2)
        array([1., 2., 3., 4., 5.])

        ... and deeply nested
        >>> df_nested_deep_1 = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 1]], [[6, 0], [0, 10]]]
        ... })
        >>> df_nested_deep_2 = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 1]], [[3, 0], [0, 6]]]
        ... })
        >>> as_array_1 = to_array(df_nested_deep_1)
        >>> as_array_2 = to_array(df_nested_deep_2)
        >>> distances(as_array_1, as_array_2)
        array([0., 5.])

        ... no matter how many columns
        >>> df_nested_deep_multi_column_1 = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 4]], [[1, 0], [0, 0]]],
        ...     'x_1': [[[0, 3], [0, 0]], [[0, 0], [0, 0]]]
        ... })
        >>> df_nested_deep_multi_column_2 = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 4]], [[1, 0], [0, 0]]],
        ...     'x_1': [[[0, 3], [0, 0]], [[0, 0], [0, 0]]]
        ... })
        >>> as_array_1 = to_array(df_nested_deep_multi_column_1)
        >>> as_array_2 = to_array(df_nested_deep_multi_column_2)
        >>> distances(as_array_1, as_array_2)
        array([0., 0.])

    """
    # NOTE(review): a reviewer suggested a name that makes the Euclidean metric
    # explicit (compare SciPy's ``scipy.spatial.distance.euclidean``); the name
    # is kept here so that existing imports keep working.

    # Raise explicitly instead of using an ``assert`` statement: asserts are
    # removed under ``python -O``, and mismatched inputs would then be silently
    # truncated by ``zip``. The exception type and message are unchanged.
    if arr_1.shape != arr_2.shape:
        raise AssertionError("Arrays must have the same shape")

    if arr_1.ndim == 0 or arr_1.dtype == object or arr_2.dtype == object:
        # 0-d arrays and object arrays (whose entries may be ragged) cannot be
        # reshaped reliably, so handle them entry by entry.
        return np.sqrt(
            np.array(
                [np.sum((np.ravel(a) - np.ravel(b)) ** 2) for a, b in zip(arr_1, arr_2)]
            )
        )

    # Collapse everything behind the first axis into one axis. The width is
    # computed explicitly because ``reshape(n, -1)`` is ambiguous when n == 0.
    width = int(np.prod(arr_1.shape[1:]))
    diff = (arr_1 - arr_2).reshape(arr_1.shape[0], width)
    return np.sqrt(np.sum(diff**2, axis=1))
150 changes: 150 additions & 0 deletions src/autora/utils/transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
from typing import Union

import numpy as np
import pandas as pd


def to_array(arr: Union[pd.DataFrame, pd.Series, np.ndarray]) -> np.ndarray:
    """
    Transforms a pandas data frame (or series) to a numpy array

    Nested cells (lists or arrays) are turned into additional array axes
    instead of being kept as Python objects.

    Args:
        arr: the pandas data frame or series; a numpy array is returned unchanged

    Returns:
        a numpy array

    Examples:
        Same result as np.array(df) if rows of df are one dimensional:
        >>> df_one = pd.DataFrame({
        ...     'x_0': [1, 2, 3],
        ...     'x_1': [4, 5, 6],
        ...     'x_2': [7, 8, 9]})
        >>> np.array_equal(np.array(df_one), to_array(df_one))
        True

        If the rows contain lists ...
        >>> df_list = pd.DataFrame({
        ...     'x_0': [[0, 0], [1, 0], [2, 0]],
        ...     'x_1': [[0, 1], [1, 1], [2, 1]],
        ...     'x_2': [[0, 2], [1, 2], [2, 2]]
        ... })
        >>> array_transformed = to_array(df_list)
        >>> array_cast = np.array(df_list)

        the results are not equal:
        >>> np.array_equal(array_transformed, array_cast)
        False

        The cast array contains objects which are hard to work with:
        >>> array_cast
        array([[list([0, 0]), list([0, 1]), list([0, 2])],
               [list([1, 0]), list([1, 1]), list([1, 2])],
               [list([2, 0]), list([2, 1]), list([2, 2])]], dtype=object)

        The transformed array contains vectors (numbers):
        >>> array_transformed
        array([[[0, 0],
                [0, 1],
                [0, 2]],
        <BLANKLINE>
               [[1, 0],
                [1, 1],
                [1, 2]],
        <BLANKLINE>
               [[2, 0],
                [2, 1],
                [2, 2]]])

        ... the same is true for arrays:
        >>> df_array = pd.DataFrame({
        ...     'x_0': [np.array([0, 0]), np.array([1, 0]), np.array([2, 0])],
        ...     'x_1': [np.array([0, 1]), np.array([1, 1]), np.array([2, 1])],
        ...     'x_2': [np.array([0, 2]), np.array([1, 2]), np.array([2, 2])]
        ... })
        >>> array_transformed = to_array(df_array)
        >>> array_cast = np.array(df_array)

        the results are not equal:
        >>> np.array_equal(array_transformed, array_cast)
        False

        The cast array contains objects which are hard to work with:
        >>> array_cast
        array([[array([0, 0]), array([0, 1]), array([0, 2])],
               [array([1, 0]), array([1, 1]), array([1, 2])],
               [array([2, 0]), array([2, 1]), array([2, 2])]], dtype=object)

        The transformed array contains vectors (numbers):
        >>> array_transformed
        array([[[0, 0],
                [0, 1],
                [0, 2]],
        <BLANKLINE>
               [[1, 0],
                [1, 1],
                [1, 2]],
        <BLANKLINE>
               [[2, 0],
                [2, 1],
                [2, 2]]])

        This also works with more nesting:
        >>> df_nested = pd.DataFrame({
        ...     'x_0': [[[0,0],[1,1]], [[0,0],[2,2]]],
        ...     'x_1': [[[1,1],[1,1]], [[1,1],[2,2]]]
        ... })
        >>> to_array(df_nested)
        array([[[[0, 0],
                 [1, 1]],
        <BLANKLINE>
                [[1, 1],
                 [1, 1]]],
        <BLANKLINE>
        <BLANKLINE>
               [[[0, 0],
                 [2, 2]],
        <BLANKLINE>
                [[1, 1],
                 [2, 2]]]])

        A series is handled like a single column:
        >>> to_array(pd.Series([[0, 1], [2, 3]]))
        array([[0, 1],
               [2, 3]])

        When the inner lists don't have the same shape, numpy cannot build a
        regular array from them (recent numpy versions raise an error). In that
        case one can use the flattening version, ``to_array_flatten``
        (ATTENTION: when using the flattening version, information about which
        entry belongs to which condition is lost).
    """
    if isinstance(arr, np.ndarray):
        return arr

    if isinstance(arr, pd.Series):
        # A Series has no ``iterrows``; each of its elements is one entry.
        return np.array(arr.tolist())

    # Unpack every row into a plain list so that numpy sees the nested cells
    # as further dimensions rather than as opaque objects.
    rows = [list(row) for _, row in arr.iterrows()]
    return np.array(rows)


def to_array_flatten(arr: Union[pd.DataFrame, pd.Series, np.ndarray]) -> np.ndarray:
    """
    Flattens elements in a pandas DataFrame to resolve shape inconsistencies.

    Every cell of a row is flattened and the pieces are concatenated, so each
    row of the data frame becomes one flat row of the result. Information about
    which value belongs to which column is lost.

    Args:
        arr: A pandas DataFrame or Series with inconsistent element shapes.
            A numpy array is returned unchanged.

    Returns:
        A numpy array where all elements are flattened.

    Example:
        >>> df_inconsistent = pd.DataFrame({
        ...     'x_0': [0, 2, 4],
        ...     'x_1': [[1, 1], [3, 3], [5, 5]]
        ... })
        >>> to_array_flatten(df_inconsistent)
        array([[0, 1, 1],
               [2, 3, 3],
               [4, 5, 5]])
    """
    # NOTE(review): a reviewer suggested either a separate ``flatten`` function
    # that composes with ``to_array`` or the name ``to_flat_array``; the name
    # is kept here so that existing imports keep working.
    if isinstance(arr, np.ndarray):
        return arr

    if isinstance(arr, pd.Series):
        # A Series has no ``iterrows``; each of its elements is one row.
        return np.array([np.ravel(x) for x in arr])

    # ``np.ravel`` turns scalars into one-element arrays and flattens any
    # nested sequence, so every kind of cell can be concatenated uniformly.
    return np.array(
        [np.concatenate([np.ravel(x) for x in row]) for _, row in arr.iterrows()]
    )
Loading