Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implemented DataFrame.lookup #1785

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
55 changes: 55 additions & 0 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -10224,6 +10224,61 @@ def from_dict(data, orient="columns", dtype=None, columns=None) -> "DataFrame":
"""
return DataFrame(pd.DataFrame.from_dict(data, orient=orient, dtype=dtype, columns=columns))

def lookup(self, row_labels, col_labels) -> np.ndarray:
    """
    Label-based "fancy indexing" function for DataFrame.

    Given equal-length arrays of row and column labels, return an
    array of the values corresponding to each (row, col) pair.

    .. note:: This method should only be used when the length of `row_labels` is small
        enough, as all the data belonging to the `row_labels` is loaded into the
        driver's memory.

    Parameters
    ----------
    row_labels : sequence
        The row labels to use for lookup.
    col_labels : sequence
        The column labels to use for lookup.

    Returns
    -------
    numpy.ndarray
        The found values.

    Raises
    ------
    ValueError
        If `row_labels` and `col_labels` do not have the same length.

    Examples
    --------
    >>> kdf = ks.DataFrame({'A': [3, 4, 5, 6, 7],
    ...                     'B': [10.0, 20.0, 30.0, 40.0, 50.0],
    ...                     'C': ['a', 'b', 'c', 'd', 'e']})
    >>> kdf
       A     B  C
    0  3  10.0  a
    1  4  20.0  b
    2  5  30.0  c
    3  6  40.0  d
    4  7  50.0  e

    >>> kdf.lookup([0], ["C"])
    array(['a'], dtype=object)

    >>> kdf.lookup([2, 3], ["A", "B"])
    array([ 5., 40.])
    """
    from databricks.koalas.series import Series
    from databricks.koalas.indexes import Index

    # Turn Koalas Series/Index inputs into plain Python lists *before* measuring
    # them, so the length check and the pairing below operate on local data
    # instead of calling len() on a distributed object.
    if isinstance(row_labels, (Series, Index)):
        row_labels = row_labels.to_numpy().tolist()
    if isinstance(col_labels, (Series, Index)):
        col_labels = col_labels.to_numpy().tolist()

    if len(row_labels) != len(col_labels):
        raise ValueError("Row labels must have same size as column labels")

    # One `.loc` access per (row, col) pair; see the note in the docstring about
    # keeping the number of labels small.
    lookups = [
        self.loc[row_label, col_label] for row_label, col_label in zip(row_labels, col_labels)
    ]
    # Route the values through pd.Series so pandas picks the result dtype
    # (e.g. an int and a float come back as a float array, as in the example above).
    return np.asarray(pd.Series(lookups))
itholic marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Out of curiosity, why did we change it to np.asarray(pd.Series(lookups))?


def _to_internal_pandas(self):
"""
Return a pandas DataFrame directly from _internal to avoid overhead of copy.
Expand Down
1 change: 0 additions & 1 deletion databricks/koalas/missing/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ class _MissingPandasLikeDataFrame(object):
interpolate = _unsupported_function("interpolate")
itertuples = _unsupported_function("itertuples")
last = _unsupported_function("last")
lookup = _unsupported_function("lookup")
mode = _unsupported_function("mode")
reindex_like = _unsupported_function("reindex_like")
rename_axis = _unsupported_function("rename_axis")
Expand Down
64 changes: 64 additions & 0 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4019,3 +4019,67 @@ def test_from_dict(self):
pdf = pd.DataFrame.from_dict(data, orient="index", columns=["A", "B", "C", "D"])
kdf = ks.DataFrame.from_dict(data, orient="index", columns=["A", "B", "C", "D"])
self.assert_eq(pdf, kdf)

def test_lookup(self):
    """Compare ``DataFrame.lookup`` between pandas and Koalas for every label container."""
    pdf = pd.DataFrame(
        {
            "A": [3, 4, 5, 6, 7],
            "B": [10.0, 20.0, 30.0, 40.0, 50.0],
            "C": ["a", "b", "c", "d", "e"],
        }
    )
    kdf = ks.from_pandas(pdf)

    # list, then tuple: the same kind of plain container goes to both libraries
    for container in (list, tuple):
        rows, cols = container([0]), container(["C"])
        self.assert_eq(pdf.lookup(rows, cols), kdf.lookup(rows, cols))
        rows, cols = container([0, 3, 4]), container(["A", "C", "A"])
        self.assert_list_eq(pdf.lookup(rows, cols), kdf.lookup(rows, cols))

    # dict: only the keys act as labels, the values are all None
    rows, cols = dict.fromkeys([0]), dict.fromkeys(["C"])
    self.assert_eq(pdf.lookup(rows, cols), kdf.lookup(rows, cols))
    rows, cols = dict.fromkeys([0, 3, 4]), dict.fromkeys(["A", "C", "B"])
    self.assert_list_eq(pdf.lookup(rows, cols), kdf.lookup(rows, cols))

    # Index, then Series: pandas objects on the pandas side, Koalas objects on the other
    for pd_type, ks_type in ((pd.Index, ks.Index), (pd.Series, ks.Series)):
        self.assert_eq(
            pdf.lookup(pd_type([0]), pd_type(["C"])),
            kdf.lookup(ks_type([0]), ks_type(["C"])),
        )
        self.assert_list_eq(
            pdf.lookup(pd_type([0, 3, 4]), pd_type(["A", "C", "A"])),
            kdf.lookup(ks_type([0, 3, 4]), ks_type(["A", "C", "A"])),
        )

    # MultiIndex: row labels become tuples
    pdf.index = pd.MultiIndex.from_tuples(
        [("a", "v"), ("b", "w"), ("c", "x"), ("d", "y"), ("e", "z")]
    )
    kdf = ks.from_pandas(pdf)

    rows, cols = [("a", "v")], ["C"]
    self.assert_eq(pdf.lookup(rows, cols), kdf.lookup(rows, cols))
    rows, cols = [("a", "v"), ("d", "y"), ("e", "z")], ["A", "C", "A"]
    self.assert_list_eq(pdf.lookup(rows, cols), kdf.lookup(rows, cols))

    # Mismatched label counts must be rejected
    err_msg = "Row labels must have same size as column labels"
    with self.assertRaisesRegex(ValueError, err_msg):
        kdf.lookup([0, 3, 4], ["A", "C"])
1 change: 1 addition & 0 deletions docs/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ Indexing, iteration
DataFrame.items
DataFrame.iteritems
DataFrame.iterrows
DataFrame.lookup
DataFrame.keys
DataFrame.pop
DataFrame.tail
Expand Down