Commit

simplify docs and example and test
zkurtz committed Nov 28, 2024
1 parent b48dbd2 commit 7232847
Showing 4 changed files with 338 additions and 105 deletions.
85 changes: 22 additions & 63 deletions README.md
````diff
@@ -1,80 +1,39 @@
 # packio
 
-Packio allows you to use a single file to store and retrieve multiple python objects. A typical use case is to define IO methods on an instance of a class that contains multiple types of objects, such as a
-- dictionary
-- data frame
-- string
-- trained ML model (for example, lightgbm and xgboost each have built-in serialization methods for trained models)
-
-When a class contains multiple of these data types, or even multiple instances of the same data type, saving and loading the data associated with a class tends to become unwieldy, requiring the user either to keep track of multiple file paths or to fall back on pickle, which introduces other problems (see below). The goal of packio is to make it as easy as possible to write `save` and `load` methods for such a class while allowing you to keep using all of your favorite object-type-specific serializers (e.g. `to_parquet` for pandas, `json` for dictionaries, `pathlib.Path.write_text` for strings, etc.).
-
-## Why a single file and not a directory?
-
-In a word, *encapsulation*. Copy/move operations with a file are simpler than with a directory, especially when it comes to moving data across platforms such as to/from the cloud. A file is also more tamper-resistant - it's typically harder to accidentally modify the contents of a file than it is for someone to add or remove files or subdirectories in a directory.
-
-## Why not pickle?
-
-Although `pickle` may be the most common approach for serialization of complex python objects, there are strong reasons to dislike pickle. As summarized by Gemini, "Python's pickle module, while convenient, has drawbacks. It poses security risks due to potential code execution vulnerabilities when handling untrusted data. Compatibility issues arise because it's Python-specific and version-dependent. Maintaining pickle can be challenging due to refactoring difficulties and complex debugging." See also [Ben Frederickson](https://www.benfrederickson.com/dont-pickle-your-data/).
-
-## Example
-
-Here is a toy example of a data class with `save` and `from_file` methods powered by `packio`:
-
-```
-from dataclasses import dataclass
-import json
-from pathlib import Path
-
-import pandas as pd
-from packio import Reader, Writer
-
-
-@dataclass
-class MyData:
-    """A simple data class for testing.
-
-    Attributes:
-        documentation: Description of what this class is all about.
-        df: A data frame.
-        lookup: A dictionary.
-    """
-
-    documentation: str
-    df: pd.DataFrame
-    lookup: dict[str, int]
-
-    def save(self, path: Path) -> None:
-        """Save the data class to disk."""
-        with Writer(path) as writer:
-            writer.file("documentation.txt").write_text(self.documentation)
-            self.df.to_parquet(writer.file("df.parquet"))
-            with writer.file("lookup.json").open("w") as f:
-                json.dump(self.lookup, f)
-
-    @classmethod
-    def from_file(cls, path: Path) -> "MyData":
-        """Load the data class from disk."""
-        with Reader(path) as reader:
-            documentation = reader.file("documentation.txt").read_text()
-            df = pd.read_parquet(reader.file("df.parquet"))
-            with reader.file("lookup.json").open() as f:
-                lookup = json.load(f)
-        return cls(documentation=documentation, df=df, lookup=lookup)
-
-
-# Create an instance of the class, save it, and re-load it as a new instance:
-data = MyData(
-    documentation="This is an example.",
-    df=pd.DataFrame({"a": [1, 2], "b": [3, 4]}),
-    lookup={"a": 1, "b": 2},
-)
-data.save(tmp_path / "data.mydata")
-loaded = MyData.from_file(tmp_path / "data.mydata")
-
-# Check that the new class instance matches the old one, at least in terms of its data attributes:
-assert loaded.documentation == data.documentation
-pd.testing.assert_frame_equal(loaded.df, data.df)
-assert loaded.lookup == data.lookup
-```
+Packio allows you to use a single file to store and retrieve multiple python objects. For example:
+```
+import dummio
+import pandas as pd
+from packio import Reader, Writer
+
+# define some objects and an output filepath
+df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+lookup = {"a": 1, "b": 2}
+filepath = tmp_path / "data.packio"
+
+# save both objects to the same filepath
+with Writer(filepath) as writer:
+    df.to_parquet(writer.file("df.parquet"))
+    dummio.json.save(lookup, filepath=writer.file("lookup.json"))
+
+# load the objects from the file
+with Reader(filepath) as reader:
+    df2 = pd.read_parquet(reader.file("df.parquet"))
+    lookup2 = dummio.json.load(reader.file("lookup.json"))
+
+assert df.equals(df2)
+assert lookup == lookup2
+```
+
+[Available on pypi](https://pypi.org/project/packio/): `pip install packio`.
+
+## Why a single file and not a directory?
+
+In a word, *encapsulation*. Copy/move operations with a file are simpler than with a directory, especially when it comes to moving data across platforms such as to/from the cloud. A file is also more tamper-resistant - it's typically harder to accidentally modify the contents of a file than it is for someone to add or remove files or subdirectories in a directory.
+
+## Why not pickle?
+
+Although `pickle` may be the most common approach for serialization of complex python objects, there are strong reasons to dislike pickle. As summarized by Gemini, "Python's pickle module, while convenient, has drawbacks. It poses security risks due to potential code execution vulnerabilities when handling untrusted data. Compatibility issues arise because it's Python-specific and version-dependent. Maintaining pickle can be challenging due to refactoring difficulties and complex debugging." See also [Ben Frederickson](https://www.benfrederickson.com/dont-pickle-your-data/).
 
 ## Development
````
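The security concern in the "Why not pickle?" section above is easy to make concrete. A minimal sketch (the `Evil` class and the harmless `echo` command are illustrative, not from the repo): a pickle payload can name any callable for the unpickler to invoke during load.

```python
import pickle

class Evil:
    """A pickle payload can request that the unpickler call an arbitrary function."""

    def __reduce__(self):
        import os
        # On load, the unpickler calls os.system("echo arbitrary code ran")
        return (os.system, ("echo arbitrary code ran",))

payload = pickle.dumps(Evil())
pickle.loads(payload)  # executes the shell command during deserialization
```

Loading a pickle from an untrusted source is therefore equivalent to running untrusted code, while packio's approach of delegating to format-specific serializers such as parquet and json avoids that particular class of risk.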
5 changes: 4 additions & 1 deletion pyproject.toml
```diff
@@ -1,6 +1,6 @@
 [project]
 name = "packio"
-version = "0.0.4"
+version = "0.0.3"
 description = "IO for multiple python objects to/from a single file"
 authors = [{ name = "Zach Kurtz", email = "[email protected]" }]
 readme = "README.md"
@@ -14,6 +14,9 @@ dev = [
     "pytest >=8.3.2",
     "sphinx>=8.1.3",
     "sphinx-rtd-theme>=3.0.2",
+    "pandas>=2.2.3",
+    "dummio>=1.1.0",
+    "fastparquet>=2024.11.0",
 ]
 
 [project.urls]
```
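The new dev dependencies line up with the simplified example: `pandas` and `dummio` are imported directly in the README and test, and `fastparquet` is presumably included because `DataFrame.to_parquet` needs a separate parquet engine at runtime. A quick sketch of that assumption (the file name is arbitrary):

```python
import pandas as pd

# pandas delegates parquet IO to an engine (pyarrow or fastparquet);
# without one installed, to_parquet raises ImportError.
df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
df.to_parquet("roundtrip.parquet", engine="fastparquet")
assert pd.read_parquet("roundtrip.parquet", engine="fastparquet").equals(df)
```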
58 changes: 18 additions & 40 deletions tests/test_packio.py
```diff
@@ -1,48 +1,26 @@
 """Main tests of packio."""
 
-import json
-from dataclasses import dataclass
-from pathlib import Path
-
+import dummio
+import pandas as pd
 from packio import Reader, Writer
 
 
-@dataclass
-class MyData:
-    """A simple data class for testing.
-
-    Attributes:
-        documentation: Description of what this class is all about.
-        lookup: A dictionary.
-    """
-
-    documentation: str
-    lookup: dict[str, int]
-
-    def save(self, path: Path) -> None:
-        """Save the data class to disk."""
-        with Writer(path) as writer:
-            writer.file("documentation.txt").write_text(self.documentation)
-            with writer.file("lookup.json").open("w") as f:
-                json.dump(self.lookup, f)
-
-    @classmethod
-    def from_file(cls, path: Path) -> "MyData":
-        """Load the data class from disk."""
-        with Reader(path) as reader:
-            documentation = reader.file("documentation.txt").read_text()
-            with reader.file("lookup.json").open() as f:
-                lookup = json.load(f)
-        return cls(documentation=documentation, lookup=lookup)
-
-
-def test_packio(tmp_path):
-    """Test the packio package."""
-    data = MyData(
-        documentation="This is a test.",
-        lookup={"a": 1, "b": 2},
-    )
-    data.save(tmp_path / "data.mydata")
-    loaded = MyData.from_file(tmp_path / "data.mydata")
-    assert loaded.documentation == data.documentation
-    assert loaded.lookup == data.lookup
+def test_io(tmp_path) -> None:
+    """Test packio round-trip IO."""
+    # define some objects
+    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    lookup = {"a": 1, "b": 2}
+
+    # save the objects to a single file
+    filepath = tmp_path / "data.packio"
+    with Writer(filepath) as writer:
+        df.to_parquet(writer.file("df.parquet"))
+        dummio.json.save(lookup, filepath=writer.file("lookup.json"))
+
+    # load the objects from the file
+    with Reader(filepath) as reader:
+        df2 = pd.read_parquet(reader.file("df.parquet"))
+        lookup2 = dummio.json.load(reader.file("lookup.json"))
+
+    assert df.equals(df2)
+    assert lookup == lookup2
```
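For intuition about what `Writer` and `Reader` are doing in the test above, here is a minimal sketch of the general single-file-bundle technique, using the standard-library `zipfile` module. This is an assumption-laden illustration of the idea only, not packio's actual implementation:

```python
import tempfile
import zipfile
from pathlib import Path

def bundle(filepath: Path, members: dict[str, bytes]) -> None:
    """Pack several named blobs into a single zip-backed file."""
    with zipfile.ZipFile(filepath, "w") as zf:
        for name, blob in members.items():
            zf.writestr(name, blob)

def unbundle(filepath: Path) -> dict[str, bytes]:
    """Read every named member of the bundle back into memory."""
    with zipfile.ZipFile(filepath) as zf:
        return {name: zf.read(name) for name in zf.namelist()}

# round trip: two "objects" in, the same two objects out, one file on disk
with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp) / "data.bundle"
    bundle(path, {"lookup.json": b'{"a": 1}', "notes.txt": b"hello"})
    assert unbundle(path) == {"lookup.json": b'{"a": 1}', "notes.txt": b"hello"}
```

The key property is that each serializer still sees an ordinary file path for its member, which is what lets calls like `df.to_parquet(writer.file("df.parquet"))` work unchanged against a member of the bundle.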
