
Commit

simplify docs and example and test
zkurtz committed Nov 28, 2024
1 parent b48dbd2 commit bd0e819
Showing 4 changed files with 338 additions and 105 deletions.
85 changes: 22 additions & 63 deletions README.md
@@ -1,80 +1,39 @@
 # packio
 
-Packio allows you to use a single file to store and retrieve multiple python objects. A typical use case is to define IO methods on an instance of a class that contains multiple types of objects, such as a
-- dictionary
-- data frame
-- string
-- trained ML model (for example, lightgbm and xgboost each have built-in serialization methods for trained models)
-
-When a class contains multiple of these data types, or even multiple instances of the same data type, saving and loading the data associated with a class tends to become unwieldy, requiring the user either to keep track of multiple file paths or to fall back on pickle, which introduces other problems (see below). The goal of packio is to make it as easy as possible to write `save` and `load` methods for such a class while allowing you to keep using all of your favorite object-type-specific serializers (e.g. `to_parquet` for pandas, `json` for dictionaries, `pathlib.Path.write_text` for strings, etc.).
-
-## Why a single file and not a directory?
-
-In a word, *encapsulation*. Copy/move operations on a file are simpler than on a directory, especially when moving data across platforms such as to/from the cloud. A file is also more tamper-resistant: it is typically harder to accidentally modify the contents of a file than it is for someone to add or remove files or subdirectories in a directory.
-
-## Why not pickle?
-
-Although `pickle` may be the most common approach to serializing complex python objects, there are strong reasons to avoid it. As summarized by Gemini, "Python's pickle module, while convenient, has drawbacks. It poses security risks due to potential code execution vulnerabilities when handling untrusted data. Compatibility issues arise because it's Python-specific and version-dependent. Maintaining pickle can be challenging due to refactoring difficulties and complex debugging." See also [Ben Frederickson](https://www.benfrederickson.com/dont-pickle-your-data/).
-
-## Example
-
-Here is a toy example of a data class with `save` and `from_file` methods powered by `packio`:
-
+Packio allows you to use a single file to store and retrieve multiple python objects. For example:
 ```
-from dataclasses import dataclass
-import json
-from pathlib import Path
-
+import dummio
 import pandas as pd
-
 from packio import Reader, Writer
 
+# define some objects and an output filepath (tmp_path is any existing directory, e.g. pytest's tmp_path fixture)
+df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+lookup = {"a": 1, "b": 2}
+filepath = tmp_path / "data.packio"
 
-@dataclass
-class MyData:
-    """A simple data class for testing.
-
-    Attributes:
-        documentation: Description of what this class is all about.
-        df: A data frame.
-        lookup: A dictionary.
-    """
-
-    documentation: str
-    df: pd.DataFrame
-    lookup: dict[str, int]
+# save both objects to the same filepath
+with Writer(filepath) as writer:
+    df.to_parquet(writer.file("df.parquet"))
+    dummio.json.save(lookup, filepath=writer.file("lookup.json"))
 
+# load the objects from the file
+with Reader(filepath) as reader:
+    df2 = pd.read_parquet(reader.file("df.parquet"))
+    lookup2 = dummio.json.load(reader.file("lookup.json"))
 
-    def save(self, path: Path) -> None:
-        """Save the data class to disk."""
-        with Writer(path) as writer:
-            writer.file("documentation.txt").write_text(self.documentation)
-            self.df.to_parquet(writer.file("df.parquet"))
-            with writer.file("lookup.json").open("w") as f:
-                json.dump(self.lookup, f)
+assert df.equals(df2)
+assert lookup == lookup2
+```
 
-    @classmethod
-    def from_file(cls, path: Path) -> "MyData":
-        """Load the data class from disk."""
-        with Reader(path) as reader:
-            documentation = reader.file("documentation.txt").read_text()
-            df = pd.read_parquet(reader.file("df.parquet"))
-            with reader.file("lookup.json").open() as f:
-                lookup = json.load(f)
-            return cls(documentation=documentation, df=df, lookup=lookup)
+[Available on pypi](https://pypi.org/project/packio/): `pip install packio`.
 
-
-# Create an instance of the class, save it, and re-load it as a new instance:
-data = MyData(
-    documentation="This is an example.",
-    df=pd.DataFrame({"a": [1, 2], "b": [3, 4]}),
-    lookup={"a": 1, "b": 2},
-)
-data.save(tmp_path / "data.mydata")
-loaded = MyData.from_file(tmp_path / "data.mydata")
+## Why a single file and not a directory?
 
-# Check that the new class instance matches the old one, at least in terms of its data attributes:
-assert loaded.documentation == data.documentation
-pd.testing.assert_frame_equal(loaded.df, data.df)
-assert loaded.lookup == data.lookup
-```
+In a word, *encapsulation*. Copy/move operations on a file are simpler than on a directory, especially when moving data across platforms such as to/from the cloud. A file is also more tamper-resistant: it is typically harder to accidentally modify the contents of a file than it is for someone to add or remove files or subdirectories in a directory.
 
+## Why not pickle?
+
+Although `pickle` may be the most common approach to serializing complex python objects, there are strong reasons to avoid it. As summarized by Gemini, "Python's pickle module, while convenient, has drawbacks. It poses security risks due to potential code execution vulnerabilities when handling untrusted data. Compatibility issues arise because it's Python-specific and version-dependent. Maintaining pickle can be challenging due to refactoring difficulties and complex debugging." See also [Ben Frederickson](https://www.benfrederickson.com/dont-pickle-your-data/).
 
 ## Development

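The "Why not pickle?" section that this commit moves below the example cites code execution as the core security risk. That risk is easy to demonstrate with nothing but the standard library; the snippet below is illustrative and not part of the commit:

```python
import os
import pickle


class Evil:
    """Unpickling is not passive: pickle calls __reduce__'s payload on load."""

    def __reduce__(self):
        # Instructs pickle to call os.system(...) at load time.
        return (os.system, ("echo arbitrary code ran during unpickling",))


payload = pickle.dumps(Evil())
pickle.loads(payload)  # runs the shell command: never unpickle untrusted bytes
```

Because packio routes each object through an explicit, format-specific serializer (parquet, json, plain text), it sidesteps this class of risk, provided the individual serializers you choose are themselves safe.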
5 changes: 4 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "packio"
-version = "0.0.4"
+version = "0.0.5"
 description = "IO for multiple python objects to/from a single file"
 authors = [{ name = "Zach Kurtz", email = "[email protected]" }]
 readme = "README.md"
@@ -14,6 +14,9 @@ dev = [
     "pytest >=8.3.2",
     "sphinx>=8.1.3",
     "sphinx-rtd-theme>=3.0.2",
+    "pandas>=2.2.3",
+    "dummio>=1.1.0",
+    "fastparquet>=2024.11.0",
 ]
 
 [project.urls]
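The three new dev dependencies track the simplified example: the README and test now round-trip a DataFrame through parquet and a dict through dummio's json helpers, and pandas delegates parquet IO to an installed engine (pyarrow or fastparquet). A quick standalone check of that engine assumption, not taken from the commit:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

# pandas itself cannot write parquet; engine="auto" picks pyarrow if present,
# else fastparquet. Pinning fastparquet in dev deps keeps the example reproducible.
df.to_parquet("data.parquet", engine="fastparquet")
print(pd.read_parquet("data.parquet", engine="fastparquet"))
```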
58 changes: 18 additions & 40 deletions tests/test_packio.py
@@ -1,48 +1,26 @@
 """Main tests of packio."""
 
-import json
-from dataclasses import dataclass
-from pathlib import Path
-
+import dummio
+import pandas as pd
 from packio import Reader, Writer
 
 
-@dataclass
-class MyData:
-    """A simple data class for testing.
-
-    Attributes:
-        documentation: Description of what this class is all about.
-        lookup: A dictionary.
-    """
-
-    documentation: str
-    lookup: dict[str, int]
-
-    def save(self, path: Path) -> None:
-        """Save the data class to disk."""
-        with Writer(path) as writer:
-            writer.file("documentation.txt").write_text(self.documentation)
-            with writer.file("lookup.json").open("w") as f:
-                json.dump(self.lookup, f)
+def test_io(tmp_path) -> None:
+    """Test the packio package."""
+    # define some objects
+    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    lookup = {"a": 1, "b": 2}
 
-    @classmethod
-    def from_file(cls, path: Path) -> "MyData":
-        """Load the data class from disk."""
-        with Reader(path) as reader:
-            documentation = reader.file("documentation.txt").read_text()
-            with reader.file("lookup.json").open() as f:
-                lookup = json.load(f)
-            return cls(documentation=documentation, lookup=lookup)
+    # save the objects to a single file
+    filepath = tmp_path / "data.packio"
+    with Writer(filepath) as writer:
+        df.to_parquet(writer.file("df.parquet"))
+        dummio.json.save(lookup, filepath=writer.file("lookup.json"))
 
-
-def test_packio(tmp_path):
-    """Test the packio package."""
-    data = MyData(
-        documentation="This is a test.",
-        lookup={"a": 1, "b": 2},
-    )
-    data.save(tmp_path / "data.mydata")
-    loaded = MyData.from_file(tmp_path / "data.mydata")
-    assert loaded.documentation == data.documentation
-    assert loaded.lookup == data.lookup
+    # load the objects from the file
+    with Reader(filepath) as reader:
+        df2 = pd.read_parquet(reader.file("df.parquet"))
+        lookup2 = dummio.json.load(reader.file("lookup.json"))
+
+    assert df.equals(df2)
+    assert lookup == lookup2
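The test exercises the `Writer`/`Reader` contract but the commit never shows their implementation: `writer.file(name)` hands out a path to write to, on exit everything lands in the single target file, and `reader.file(name)` recovers the pieces. As a mental model only, here is a hypothetical sketch assuming a zip archive as the single-file container; the real packio internals may differ:

```python
import tempfile
import zipfile
from pathlib import Path


class Writer:
    """Stage files in a temp directory, then pack them into one archive on exit."""

    def __init__(self, path: Path) -> None:
        self.path = Path(path)

    def __enter__(self) -> "Writer":
        self._tmp = tempfile.TemporaryDirectory()
        return self

    def file(self, name: str) -> Path:
        # Serializers just see an ordinary filesystem path.
        return Path(self._tmp.name) / name

    def __exit__(self, *exc) -> None:
        # Pack every staged file into the single target file.
        with zipfile.ZipFile(self.path, "w") as zf:
            for child in Path(self._tmp.name).iterdir():
                zf.write(child, arcname=child.name)
        self._tmp.cleanup()


class Reader:
    """Unpack the archive to a temp directory and hand out paths into it."""

    def __init__(self, path: Path) -> None:
        self.path = Path(path)

    def __enter__(self) -> "Reader":
        self._tmp = tempfile.TemporaryDirectory()
        with zipfile.ZipFile(self.path) as zf:
            zf.extractall(self._tmp.name)
        return self

    def file(self, name: str) -> Path:
        return Path(self._tmp.name) / name

    def __exit__(self, *exc) -> None:
        self._tmp.cleanup()
```

Under this model, `df.to_parquet(writer.file("df.parquet"))` needs no packio-specific support in pandas: each serializer reads or writes a plain path, and the single-file packing stays invisible to it.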
