-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
94 feat generalized transform norm and distance fct #95
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
import numpy as np | ||
|
||
|
||
def norms(arr: np.ndarray) -> np.ndarray:
    """
    Calculate the Euclidean norm of every entry along the first axis of an array.

    Each entry ``arr[i]`` (a scalar, a vector or an array of any depth) is
    flattened and the 2-norm of the resulting vector is computed. The result
    therefore holds one norm per entry, which is why the name is plural.

    Args:
        arr: the array whose first axis enumerates the entries

    Returns:
        a one-dimensional array containing one norm per entry of ``arr``

    Examples:
        >>> import pandas as pd
        >>> from autora.utils.transform import to_array

        Simple dataframe with one condition
        >>> df = pd.DataFrame({'x_0': [.2, 2, 3]})

        First transform:
        >>> as_array = to_array(df)
        >>> norms(as_array)
        array([0.2, 2. , 3. ])

        >>> df_two_dim = pd.DataFrame({'x_0': [0, 1, 3], 'x_1': [1, 0, 4]})
        >>> as_array = to_array(df_two_dim)
        >>> norms(as_array)
        array([1., 1., 5.])

        For nested dataframes
        >>> df_nested = pd.DataFrame({
        ...     'x_0': [[0, 0], [0, 1], [1, 0], [3, 4]]
        ... })
        >>> as_array = to_array(df_nested)
        >>> norms(as_array)
        array([0., 1., 1., 5.])

        ... and deeply nested
        >>> df_nested_deep = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 1]], [[3, 0], [0, 4]]]
        ... })
        >>> as_array = to_array(df_nested_deep)
        >>> norms(as_array)
        array([1., 5.])

        ... no matter how many columns
        >>> df_nested_deep_multi_column = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 4]], [[1, 0], [0, 0]]],
        ...     'x_1': [[[0, 3], [0, 0]], [[0, 0], [0, 0]]]
        ... })
        >>> as_array = to_array(df_nested_deep_multi_column)
        >>> norms(as_array)
        array([5., 1.])
    """
    # Flattening each entry first makes the result independent of how deeply
    # the entry is nested: only the values matter, not their arrangement.
    return np.array([np.linalg.norm(np.ravel(row)) for row in arr])
|
||
|
||
def distances(arr_1: np.ndarray, arr_2: np.ndarray) -> np.ndarray:
    """
    Calculate the Euclidean distance between corresponding entries of two arrays.

    The entries are taken along the first axis: ``arr_1[i]`` is compared with
    ``arr_2[i]``. Each entry is flattened first, so entries may have any
    dimension as long as both arrays have the same shape. Integer and boolean
    data are promoted to floating point before subtracting, so unsigned or
    narrow integer inputs cannot wrap around or overflow.

    Args:
        arr_1: the first array
        arr_2: the second array, with the same shape as ``arr_1``

    Returns:
        a one-dimensional array containing one distance per pair of entries

    Raises:
        AssertionError: if the two arrays do not have the same shape

    Examples:
        >>> import pandas as pd
        >>> from autora.utils.transform import to_array

        Simple dataframe with one condition
        >>> df_1 = pd.DataFrame({'x_0': [0, 1, 2]})
        >>> df_2 = pd.DataFrame({'x_0': [1, 2, 3]})

        First transform:
        >>> as_array_1 = to_array(df_1)
        >>> as_array_2 = to_array(df_2)
        >>> distances(as_array_1, as_array_2)
        array([1., 1., 1.])

        >>> df_two_dim_1 = pd.DataFrame({'x_0': [0, 1, 3], 'x_1': [1, 0, 4]})
        >>> df_two_dim_2 = pd.DataFrame({'x_0': [0, 1, 3], 'x_1': [1, 1, 4]})
        >>> as_array_1 = to_array(df_two_dim_1)
        >>> as_array_2 = to_array(df_two_dim_2)
        >>> distances(as_array_1, as_array_2)
        array([0., 1., 0.])

        For nested dataframes
        >>> df_nested_1 = pd.DataFrame({
        ...     'x_0': [[0, 0], [0, 2], [0, 2], [0, 10], [4, 0]]
        ... })
        >>> df_nested_2 = pd.DataFrame({
        ...     'x_0': [[1, 0], [0, 0], [0, 5], [0, 6], [0, 3]]
        ... })
        >>> as_array_1 = to_array(df_nested_1)
        >>> as_array_2 = to_array(df_nested_2)
        >>> distances(as_array_1, as_array_2)
        array([1., 2., 3., 4., 5.])

        ... and deeply nested
        >>> df_nested_deep_1 = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 1]], [[6, 0], [0, 10]]]
        ... })
        >>> df_nested_deep_2 = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 1]], [[3, 0], [0, 6]]]
        ... })
        >>> as_array_1 = to_array(df_nested_deep_1)
        >>> as_array_2 = to_array(df_nested_deep_2)
        >>> distances(as_array_1, as_array_2)
        array([0., 5.])

        ... no matter how many columns
        >>> df_nested_deep_multi_column_1 = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 4]], [[1, 0], [0, 0]]],
        ...     'x_1': [[[0, 3], [0, 0]], [[0, 0], [0, 0]]]
        ... })
        >>> df_nested_deep_multi_column_2 = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 4]], [[1, 0], [0, 0]]],
        ...     'x_1': [[[0, 3], [0, 0]], [[0, 0], [0, 0]]]
        ... })
        >>> as_array_1 = to_array(df_nested_deep_multi_column_1)
        >>> as_array_2 = to_array(df_nested_deep_multi_column_2)
        >>> distances(as_array_1, as_array_2)
        array([0., 0.])
    """
    # Raise explicitly rather than via `assert`: assert statements are removed
    # under `python -O`, after which `zip` below would silently truncate
    # mismatched inputs. The exception type is kept for existing callers.
    if np.shape(arr_1) != np.shape(arr_2):
        raise AssertionError("Arrays must have the same shape")

    # Per pair of entries: sum of squared differences, then one square root
    # over all sums to obtain the Euclidean distances.
    return np.sqrt(
        np.array(
            [
                np.sum((_flat_float(a) - _flat_float(b)) ** 2)
                for a, b in zip(arr_1, arr_2)
            ]
        )
    )


def _flat_float(entry) -> np.ndarray:
    """Flatten one entry and promote non-floating data to floating point."""
    flat = np.ravel(entry)
    # Differences of unsigned integers wrap around and squares of narrow
    # integers overflow, so do the arithmetic in floating point instead.
    if not np.issubdtype(flat.dtype, np.inexact):
        flat = flat.astype(float)
    return flat
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
from typing import Union | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
|
||
def to_array(arr: Union[pd.DataFrame, pd.Series, np.ndarray]) -> np.ndarray:
    """
    Transforms a pandas data frame to a numpy array, stacking nested entries.

    Args:
        arr: the pandas data frame. A Series is treated like a data frame with a
            single column; a numpy array is returned unchanged.

    Returns:
        a numpy array

    Examples:
        Same result as np.array(df) if rows of df are one dimensional:
        >>> df_one = pd.DataFrame({
        ...     'x_0': [1, 2, 3],
        ...     'x_1': [4, 5, 6],
        ...     'x_2': [7, 8, 9]})
        >>> np.array_equal(np.array(df_one), to_array(df_one))
        True

        If the rows contain lists ...
        >>> df_list = pd.DataFrame({
        ...     'x_0': [[0, 0], [1, 0], [2, 0]],
        ...     'x_1': [[0, 1], [1, 1], [2, 1]],
        ...     'x_2': [[0, 2], [1, 2], [2, 2]]
        ... })
        >>> array_transformed = to_array(df_list)
        >>> array_cast = np.array(df_list)

        the results are not equal:
        >>> np.array_equal(array_transformed, array_cast)
        False

        The cast array contains objects which are hard to work with:
        >>> array_cast
        array([[list([0, 0]), list([0, 1]), list([0, 2])],
               [list([1, 0]), list([1, 1]), list([1, 2])],
               [list([2, 0]), list([2, 1]), list([2, 2])]], dtype=object)

        The transformed array contains vectors (numbers):
        >>> array_transformed
        array([[[0, 0],
                [0, 1],
                [0, 2]],
        <BLANKLINE>
               [[1, 0],
                [1, 1],
                [1, 2]],
        <BLANKLINE>
               [[2, 0],
                [2, 1],
                [2, 2]]])

        ... the same is true for arrays:
        >>> df_array = pd.DataFrame({
        ...     'x_0': [np.array([0, 0]), np.array([1, 0]), np.array([2, 0])],
        ...     'x_1': [np.array([0, 1]), np.array([1, 1]), np.array([2, 1])],
        ...     'x_2': [np.array([0, 2]), np.array([1, 2]), np.array([2, 2])]
        ... })
        >>> array_transformed = to_array(df_array)
        >>> array_cast = np.array(df_array)

        the results are not equal:
        >>> np.array_equal(array_transformed, array_cast)
        False

        The cast array contains objects which are hard to work with:
        >>> array_cast
        array([[array([0, 0]), array([0, 1]), array([0, 2])],
               [array([1, 0]), array([1, 1]), array([1, 2])],
               [array([2, 0]), array([2, 1]), array([2, 2])]], dtype=object)

        The transformed array contains vectors (numbers):
        >>> array_transformed
        array([[[0, 0],
                [0, 1],
                [0, 2]],
        <BLANKLINE>
               [[1, 0],
                [1, 1],
                [1, 2]],
        <BLANKLINE>
               [[2, 0],
                [2, 1],
                [2, 2]]])

        This also works with more nesting:
        >>> df_nested = pd.DataFrame({
        ...     'x_0': [[[0,0],[1,1]], [[0,0],[2,2]]],
        ...     'x_1': [[[1,1],[1,1]], [[1,1],[2,2]]]
        ... })
        >>> to_array(df_nested)
        array([[[[0, 0],
                 [1, 1]],
        <BLANKLINE>
                [[1, 1],
                 [1, 1]]],
        <BLANKLINE>
        <BLANKLINE>
               [[[0, 0],
                 [2, 2]],
        <BLANKLINE>
                [[1, 1],
                 [2, 2]]]])

        A Series is treated like a data frame with a single column:
        >>> to_array(pd.Series([[0, 1], [2, 3]])).shape
        (2, 1, 2)

        When the inner lists don't have the same shape, an error is thrown and one
        can use the flattening version `to_array_flatten` instead (ATTENTION: when
        using the flattening version, information about which entry belongs to
        which condition is lost).
    """
    if isinstance(arr, np.ndarray):
        return arr

    if isinstance(arr, pd.Series):
        # A Series has no `iterrows`; handle it as a one-column data frame
        arr = arr.to_frame()

    # Plain per-row lists let np.array stack the nested entries into additional
    # numeric axes instead of producing an object array.
    _lst = [list(row) for _, row in arr.iterrows()]
    return np.array(_lst)
|
||
|
||
def to_array_flatten(arr: Union[pd.DataFrame, pd.Series, np.ndarray]) -> np.ndarray:
    """
    Flattens elements in a pandas DataFrame to resolve shape inconsistencies.

    Every row becomes one flat vector: scalar cells contribute one value, while
    list, tuple and array cells are flattened and concatenated in column order.

    Args:
        arr: A pandas DataFrame or Series with inconsistent element shapes. A
            Series is treated like a DataFrame with a single column; a numpy
            array is returned unchanged.

    Returns:
        A numpy array where all elements are flattened.

    Example:
        >>> df_inconsistent = pd.DataFrame({
        ...     'x_0': [0, 2, 4],
        ...     'x_1': [[1, 1], [3, 3], [5, 5]]
        ... })
        >>> to_array_flatten(df_inconsistent)
        array([[0, 1, 1],
               [2, 3, 3],
               [4, 5, 5]])
    """
    if isinstance(arr, np.ndarray):
        return arr

    if isinstance(arr, pd.Series):
        # A Series has no `iterrows`; handle it as a one-column data frame
        arr = arr.to_frame()

    # np.ravel turns a scalar into a one-element vector and flattens any
    # sequence, so every cell can be treated the same way.
    return np.array(
        [np.concatenate([np.ravel(x) for x in row]) for _, row in arr.iterrows()]
    )
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps consider renaming this to "norm" (singular), also to mirror the naming convention in numpy (
numpy.linalg.norm
)