Design notes #7
thorwhalen
started this conversation in
General
Replies: 1 comment
-
DataframeKvReader
In a nutshell, I tried several designs for DataframeKvReader and ended up using the "indexed" one: although it has a higher construction overhead, its getter is much faster (the timing output is not reproduced here).
This was produced by the code below:
import pandas as pd
from collections.abc import Mapping
class DataframeKvReaderBasic(Mapping):
    """Read-only mapping from key tuples to the DataFrame rows carrying them.

    A key is the tuple of values found in ``key_columns``; the value for a key
    is the sub-frame, restricted to ``value_columns``, of every row whose key
    columns equal that tuple. Every lookup scans the frame row by row, which
    makes this the simple, unindexed baseline implementation.

    Args:
        df: Source DataFrame; stored as-is (not copied).
        key_columns: Column name, or list of column names, forming the key.
        value_columns: Column name, or list of column names, returned by
            lookups.

    Raises:
        ValueError: If a key or value column is missing from ``df``.
    """

    def __init__(self, df, key_columns, value_columns):
        self.df = df
        # A single column name is accepted and normalized to a one-item list.
        self.key_columns = (
            key_columns if isinstance(key_columns, list) else [key_columns]
        )
        self.value_columns = (
            value_columns if isinstance(value_columns, list) else [value_columns]
        )
        self._validate_columns()

    def _validate_columns(self):
        """Raise ``ValueError`` for the first configured column not in the frame."""
        all_columns = self.df.columns.tolist()
        for col in self.key_columns + self.value_columns:
            if col not in all_columns:
                raise ValueError(f"Column {col} not found in the DataFrame.")

    def _get_key(self, row):
        """Return the key tuple of a single row."""
        return tuple(row[col] for col in self.key_columns)

    @staticmethod
    def _as_key(key):
        """Wrap a non-tuple key into a one-item tuple."""
        return key if isinstance(key, tuple) else (key,)

    def _key_mask(self, key):
        """Return a boolean Series marking the rows whose key tuple equals ``key``.

        Only meaningful for a non-empty frame; callers short-circuit the empty
        case themselves.
        """
        return self.df[self.key_columns].apply(tuple, axis=1) == key

    def __getitem__(self, key):
        """Return the ``value_columns`` of all rows matching ``key``.

        Raises:
            KeyError: If no row carries ``key``.
        """
        key = self._as_key(key)
        # With no rows there is nothing to compare; short-circuit rather than
        # rely on how pandas shapes the result of ``apply`` on an empty frame.
        if self.df.empty:
            raise KeyError(f"Key {key} not found")
        sub_df = self.df[self._key_mask(key)]
        if sub_df.empty:
            raise KeyError(f"Key {key} not found")
        return sub_df[self.value_columns]

    def __iter__(self):
        """Iterate over the distinct key tuples, in order of first appearance."""
        distinct_keys = self.df[self.key_columns].drop_duplicates()
        # itertuples(name=None) yields plain tuples, one per distinct key row.
        return iter(distinct_keys.itertuples(index=False, name=None))

    def __len__(self):
        """Return the number of distinct keys."""
        return self.df[self.key_columns].drop_duplicates().shape[0]

    def __contains__(self, key):
        """Return True when at least one row carries ``key``."""
        if self.df.empty:
            return False
        # The mask alone answers the question; no filtered frame is needed.
        return bool(self._key_mask(self._as_key(key)).any())

    def __repr__(self):
        return f"{type(self).__name__}(df=<{len(self.df)} rows>, key_columns={self.key_columns}, value_columns={self.value_columns})"
class DataframeKvReaderIndexed(Mapping):
    """Read-only mapping from keys to DataFrame rows, backed by a sorted index.

    Construction re-indexes the frame by ``key_columns`` (kept as columns too)
    and sorts that index, so lookups go through ``.loc`` on the index instead
    of scanning rows.

    Args:
        df: Source DataFrame; an indexed, sorted copy is stored in ``self.df``.
        key_columns: Column name, or list of column names, forming the key.
        value_columns: Column name, or list of column names, returned by
            lookups.

    Raises:
        ValueError: If a key or value column is missing from ``df``.
    """

    def __init__(self, df, key_columns, value_columns):
        # A single column name is accepted and normalized to a one-item list.
        self.key_columns = key_columns if isinstance(key_columns, list) else [key_columns]
        self.value_columns = value_columns if isinstance(value_columns, list) else [value_columns]
        # Validate up front so that a misspelled value column is reported as
        # such, rather than later as a misleading "Key ... not found".
        all_columns = df.columns.tolist()
        for col in self.key_columns + self.value_columns:
            if col not in all_columns:
                raise ValueError(f"Column {col} not found in the DataFrame.")
        self.df = df.set_index(key_columns, drop=False).sort_index()

    def __getitem__(self, key):
        """Return the ``value_columns`` of all rows matching ``key`` as a DataFrame.

        Raises:
            KeyError: If ``key`` is not in the index.
        """
        try:
            sub_df = self.df.loc[key, self.value_columns]
        except KeyError as err:
            raise KeyError(f"Key {key} not found") from err
        if isinstance(sub_df, pd.Series):
            # A single matching row comes back as a Series holding one common
            # dtype; turn it into a one-row frame and restore each column's
            # original dtype.
            column_dtypes = self.df[self.value_columns].dtypes.to_dict()
            sub_df = sub_df.to_frame().T.astype(column_dtypes)
        return sub_df

    def __iter__(self):
        """Iterate over the distinct index entries, in sorted index order."""
        return iter(self.df.index.drop_duplicates().tolist())

    def __len__(self):
        """Return the number of distinct index entries (NaN keys included)."""
        # dropna=False keeps the count in step with what __iter__ yields.
        return self.df.index.nunique(dropna=False)

    def __contains__(self, key):
        """Return True when ``key`` is found in the index."""
        return key in self.df.index

    def __repr__(self):
        return f"{type(self).__name__}(df=<{len(self.df)} rows>, key_columns={self.key_columns}, value_columns={self.value_columns})"
# Test cases
def test_DataframeKvReader():
    """Check lookup and iteration on a small frame for each reader class.

    The frame has two rows sharing key ``(1, 4)`` and one row with key
    ``(2, 5)``; every reader must return both ``(1, 4)`` rows and iterate the
    two distinct keys in that order.
    """
    df = pd.DataFrame({
        'A': [1, 2, 1],
        'B': [4, 5, 4],
        'C': [7, 8, 9],
        'D': [10, 11, 12],
    })
    reader_classes = [DataframeKvReaderBasic, DataframeKvReaderIndexed]
    # DataframeKvReaderGroupBy is not defined in the code shown in these notes;
    # exercise it only when it is actually available, instead of failing with
    # a NameError.
    group_by_cls = globals().get('DataframeKvReaderGroupBy')
    if group_by_cls is not None:
        reader_classes.append(group_by_cls)

    key = (1, 4)
    expected_df = pd.DataFrame([{'C': 7, 'D': 10}, {'C': 9, 'D': 12}])
    for reader_cls in reader_classes:
        kv_reader = reader_cls(df, ['A', 'B'], ['C', 'D'])
        # Accessing values; row labels differ per implementation, so compare
        # with the index reset.
        assert kv_reader[key].reset_index(drop=True).equals(expected_df)
        assert list(kv_reader) == [(1, 4), (2, 5)]


test_DataframeKvReader()
import time
import numpy as np
# Large DataFrame test
def generate_large_dataframe(n_rows=10**7, seed=0):
    """Build a reproducible DataFrame of random integers for benchmarking.

    Args:
        n_rows: Number of rows to generate.
        seed: Seed for the random generator; the default reproduces the same
            values as seeding the global NumPy generator with 0.

    Returns:
        A DataFrame with integer columns ``A``, ``B``, ``C`` and ``D``, each
        drawn uniformly from ``[0, 1000)``.
    """
    # A private RandomState yields the same stream as np.random.seed(seed)
    # followed by np.random.randint, without reseeding the global generator.
    rng = np.random.RandomState(seed)
    return pd.DataFrame({
        'A': rng.randint(0, 1000, n_rows),
        'B': rng.randint(0, 1000, n_rows),
        'C': rng.randint(0, 1000, n_rows),
        'D': rng.randint(0, 1000, n_rows),
    })
def test_performance_of_large_dataframe_kv_readers():
    """Print construction and single-lookup timings for each reader class.

    Every implementation is built on the same large random frame and asked
    for the key of that frame's middle row.
    """
    df = generate_large_dataframe()
    implementations = {
        'Basic': DataframeKvReaderBasic,
        'Indexed': DataframeKvReaderIndexed,
    }
    # DataframeKvReaderGroupBy is not defined in the code shown in these notes;
    # benchmark it only when it is actually available, instead of failing with
    # a NameError.
    group_by_cls = globals().get('DataframeKvReaderGroupBy')
    if group_by_cls is not None:
        implementations['GroupBy'] = group_by_cls

    # Probe with the key of the middle row (row 5_000_000 for the default
    # 10**7-row frame). It is the same for every implementation, so compute it
    # once, as plain ints so the printed key stays readable.
    probe_row = df.iloc[len(df) // 2]
    test_key = (int(probe_row['A']), int(probe_row['B']))

    for name, cls in implementations.items():
        print(f"\nTesting {name} implementation...")
        # perf_counter is monotonic and high-resolution, unlike wall-clock time.
        start_time = time.perf_counter()
        kv_reader = cls(df, ['A', 'B'], ['C', 'D'])
        init_time = time.perf_counter() - start_time
        print(f"Initialization time: {init_time:.2f} seconds")
        start_time = time.perf_counter()
        kv_reader[test_key]
        lookup_time = time.perf_counter() - start_time
        print(f"Lookup time for key {test_key}: {lookup_time:.2f} seconds")


test_performance_of_large_dataframe_kv_readers()
Beta Was this translation helpful? Give feedback.
0 replies
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
-
This discussion keeps a referenceable record of some of the work that was done while making some of our design choices.
Beta Was this translation helpful? Give feedback.
All reactions