Merge pull request #9 from zkurtz/add-parquet

add pandas dataframe support
zkurtz · Nov 27, 2024 · 484c063 · 484c063
2 parents 0be1354 + 6575dc9
commit 484c063
Show file tree

Hide file tree

Showing 11 changed files with 475 additions and 41 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -3,35 +3,40 @@ name: CI
 on:
   pull_request:
     branches: [ main ]
-  push:
-    branches: [ main ]
 
 jobs:
   build:
     name: continuous-integration
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.10', '3.12']
+        python-version:
+          - '3.10'
+          - '3.13'
+
     steps:
-    - uses: actions/checkout@v3
-    - uses: actions/setup-python@v4
-      with:
-        python-version: ${{ matrix.python-version }}
+      - name: Clone repo
+        uses: actions/checkout@v4
+
+      - name: Set the python version
+        run: echo "UV_PYTHON=${{ matrix.python-version }}" >> $GITHUB_ENV
+
+      - name: Setup uv
+        uses: astral-sh/setup-uv@v3
+        with:
+          version: "0.5.4"
 
-    - name: Set the python version for UV
-      run: echo "UV_PYTHON=${{ matrix.python-version }}" >> $GITHUB_ENV
+      - name: Install extras
+        run: uv sync --group extras
 
-    - name: Set up uv
-      run: pip install uv
+      - name: Linting check
+        run: uv run ruff check
 
-    - name: Check code quality with Ruff
-      run: |
-        uv run ruff check
-        uv run ruff format --check
+      - name: Formatting check
+        run: uv run ruff format --check
 
-    - name: Check type hints with pyright
-      run: uv run pyright
+      - name: Type checking
+        run: uv run pyright
 
-    - name: Run unit tests with pytest
-      run: uv run pytest
+      - name: Unit tests
+        run: uv run pytest
diff --git a/README.md b/README.md
@@ -1,28 +1,41 @@
 # dummio
 
-IO for dummies! We make IO as easy as possible by implementing the most common and recommendable default options. (Users may pass additional keyword arguments to the underlying IO methods.) For example, instead of
-```
-import json
+IO for dummies! We make IO as easy as possible by providing a unified `save`/`load` interface, using the most common and recommendable default options for IO between various object types and file types. (Users may pass additional keyword arguments to the underlying IO methods.)
+
+## Simple IO calls
 
+dummio simplifies IO calls for some file types. For example, instead of
+```
 with open(file_path, 'r', encoding='utf-8') as file:
     data = json.load(file)
 ```
 you can simply
 ```
-import dummio
-
 data = dummio.json.load(file_path)
 ```
 
+## Standardized IO interface
+
+In some coding applications it is desirable to pass an IO module as an argument to a function. Here it is convenient to pass a dummio submodule, since all dummio submodules have the same `save` and `load` interface, having equivalent signatures (except for differences hidden in `**kwargs`).
+
+## Supported object and file types
+
 So far we support:
 - text
-- json
-- yaml
+- simple dictionaries:
+    - json
+    - yaml
+- pandas dataframes:
+    - csv
+    - parquet
 
-Note that `yaml` is not a required dependency; you may install `dummio` and use it for `json` without bothering with `yaml` installation. Any other IO modules to be added will similarly be optional.
+## Dependencies
+
+dummio has no required dependencies; the packages behind each IO module are optional. For example, calling `from dummio.pandas import df_parquet` without pandas installed raises an `ImportError` with a helpful message asking you to install pandas.
 
 ## Examples
 
+Basic IO methods can be accessed directly as `dummio.text`, `dummio.json`, etc.:
 ```
 import dummio
 
@@ -34,10 +47,6 @@ path = "io_example_file"
 dummio.text.save(text, path=path)
 assert text == dummio.text.load(path)
 
-# JSON
-dummio.json.save(data)
-assert data == dummio.json.load(path)
-
 # YAML
 dummio.yaml.save(data)
 assert data == dummio.yaml.load(path)
@@ -53,7 +62,7 @@ We're [on pypi](https://pypi.org/project/dummio/), so `pip install dummio`.
 git clone [email protected]:zkurtz/dummio.git
 cd dummio
 pip install uv
-uv sync
+uv sync --group extras
 source .venv/bin/activate
 pre-commit install
 ```
diff --git a/dummio/__init__.py b/dummio/__init__.py
@@ -1,5 +1,3 @@
-"""dummio initialization."""
-
 from importlib.metadata import version
 
 __version__ = version("dummio")

diff --git a/dummio/pandas/__init__.py b/dummio/pandas/__init__.py
@@ -0,0 +1,9 @@
+try:
+    import pandas
+
+    del pandas
+except ImportError:
+    raise ImportError("Please install pandas to use dummio.pandas")
+
+from dummio.pandas import df_csv as df_csv
+from dummio.pandas import df_parquet as df_parquet
diff --git a/dummio/pandas/df_csv.py b/dummio/pandas/df_csv.py
@@ -0,0 +1,35 @@
+"""Pandas data frames to/from csv."""
+
+from typing import Any
+
+import pandas as pd
+
+from dummio.constants import PathType
+
+
+def save(
+    data: pd.DataFrame,
+    *,
+    filepath: PathType,
+    **kwargs: Any,
+) -> None:
+    """Save a pandas data frame to a csv file.
+
+    Args:
+        data: Data to save.
+        filepath: Path to save the data.
+        **kwargs: Additional keyword arguments for pandas.DataFrame.to_csv
+    """
+    if "index" not in kwargs and not data.index.name:
+        kwargs["index"] = False
+    data.to_csv(filepath, **kwargs)
+
+
+def load(filepath: PathType, **kwargs: Any) -> pd.DataFrame:
+    """Read a csv file into a pandas data frame.
+
+    Args:
+        filepath: Path to read the data.
+        **kwargs: Additional keyword arguments for pandas.read_csv
+    """
+    return pd.read_csv(filepath, **kwargs)
diff --git a/dummio/pandas/df_parquet.py b/dummio/pandas/df_parquet.py
@@ -0,0 +1,33 @@
+"""Pandas data frames to/from parquet."""
+
+from typing import Any
+
+import pandas as pd
+
+from dummio.constants import PathType
+
+
+def save(
+    data: pd.DataFrame,
+    *,
+    filepath: PathType,
+    **kwargs: Any,
+) -> None:
+    """Save a pandas data frame to a parquet file.
+
+    Args:
+        data: Data to save.
+        filepath: Path to save the data.
+        **kwargs: Additional keyword arguments for pandas.DataFrame.to_parquet
+    """
+    data.to_parquet(filepath, **kwargs)
+
+
+def load(filepath: PathType, **kwargs: Any) -> pd.DataFrame:
+    """Read a parquet file into a pandas data frame.
+
+    Args:
+        filepath: Path to read the data.
+        **kwargs: Additional keyword arguments for pandas.read_parquet
+    """
+    return pd.read_parquet(filepath, **kwargs)
diff --git a/dummio/yaml.py b/dummio/yaml.py
@@ -1,6 +1,9 @@
 """IO for yaml."""
 
-import yaml
+try:
+    import yaml
+except ImportError:
+    raise ImportError("Please install pyyaml to use dummio.yaml")
 
 from dummio.constants import DEFAULT_ENCODING, DEFAULT_WRITE_MODE, AnyDict, PathType
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dummio"
-version = "0.2.0"
+version = "1.0.0"
 description = "Easiest-possible IO for basic file types."
 authors = [{ name = "Zach Kurtz", email = "[email protected]" }]
 readme = "README.md"
@@ -14,7 +14,9 @@ dev = [
   "pytest >=8.3.2",
 ]
 extras = [
-  "pyyaml >=6.0.2",
+  "fastparquet>=2024.11.0",
+  "pandas>=1.5.0",
+  "pyyaml>=6.0.2",
 ]
 
 [project.urls]
@@ -26,8 +28,31 @@ package = true
 [tool.pytest.ini_options]
 testpaths = ["tests",]
 
+[tool.pyright]
+include = ["dummio", "tests"]
+
 [tool.ruff]
 line-length = 120
 
-[tool.pyright]
-include = ["dummio", "tests"]
+[tool.ruff.lint]
+select = [
+  # ruff defaults:
+  "E4", "E7", "E9", "F",
+  "I", # isort (import sorting)
+  "TID", # flake8-tidy-imports
+  "D", # google-style docstrings
+]
+ignore = [
+  "D202", # would require one blank line after the last section of a multi-line docstring
+  "D203", # would require one blank line at start of class docstring
+  "D213", # would require multi-line docstring to start with a hard return
+  "D401", # would require imperative mood in docstring
+  "D413", # would put blank line at end of each multiline docstring
+]
+
+[tool.ruff.lint.flake8-tidy-imports]
+ban-relative-imports = "all"
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["D104"] # would require module-level documentation
+"test_*.py" = ["D"]
diff --git a/tests/test_all_basic.py b/tests/test_all_basic.py
@@ -1,4 +1,4 @@
-"""Main tests of dummio."""
+"""Test IO methods for basic data types."""
 
 from pathlib import Path
 from types import ModuleType

diff --git a/tests/test_pandas.py b/tests/test_pandas.py
@@ -0,0 +1,33 @@
+"""Test IO methods for tabular data types."""
+
+from pathlib import Path
+from types import ModuleType
+
+import pandas as pd
+
+from dummio import pandas as pd_io
+
+
+def dataframe() -> pd.DataFrame:
+    return pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+
+
+def _assert_cycle(*, data: pd.DataFrame, path: Path, module: ModuleType) -> None:
+    """Apply the module save/load cycle on the data and assert that the reloaded data matches the input data."""
+    module.save(data, filepath=path)
+    loaded_data = module.load(path)
+    pd.testing.assert_frame_equal(data, loaded_data)
+
+
+def test_df_io(tmp_path: Path) -> None:
+    """Test the dummio package for IO for tabular data."""
+    modules = [
+        pd_io.df_parquet,
+        pd_io.df_csv,
+    ]
+    for module in modules:
+        _assert_cycle(
+            data=dataframe(),
+            path=tmp_path / "data",
+            module=module,
+        )