From 79955e2be0f26c7efa589f03363168db2236a23b Mon Sep 17 00:00:00 2001 From: Simon Date: Tue, 7 Nov 2023 20:01:16 +0000 Subject: [PATCH] Add data operations for reading from zip, iterating over csv and json records, and writing to parquet (#6) * Add data operations: reading from zip file, json and csv record parsing, writing to parquet * Update version to 0.1 * Fix linting; update to run multiple dependency versions * Run dependency version checks with specific python versions --- .github/workflows/publish.yml | 1 + .github/workflows/test.yml | 3 + Makefile | 26 ++- README.md | 63 ++++++- poetry.lock | 261 +++++++++++++++++++++++++++- pyproject.toml | 27 ++- scripts/test_dependency_versions.sh | 43 +++++ src/pipedata/__init__.py | 2 +- src/pipedata/core/chain.py | 4 +- src/pipedata/ops/__init__.py | 10 ++ src/pipedata/ops/files.py | 16 ++ src/pipedata/ops/records.py | 34 ++++ src/pipedata/ops/storage.py | 51 ++++++ tests/core/test_stream.py | 4 +- tests/ops/__init__.py | 0 tests/ops/test_files.py | 30 ++++ tests/ops/test_pipeline.py | 53 ++++++ tests/ops/test_records.py | 34 ++++ tests/ops/test_storage.py | 122 +++++++++++++ 19 files changed, 773 insertions(+), 11 deletions(-) create mode 100755 scripts/test_dependency_versions.sh create mode 100644 src/pipedata/ops/__init__.py create mode 100644 src/pipedata/ops/files.py create mode 100644 src/pipedata/ops/records.py create mode 100644 src/pipedata/ops/storage.py create mode 100644 tests/ops/__init__.py create mode 100644 tests/ops/test_files.py create mode 100644 tests/ops/test_pipeline.py create mode 100644 tests/ops/test_records.py create mode 100644 tests/ops/test_storage.py diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 4110ab9..4f1ea64 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -30,6 +30,7 @@ jobs: - run: curl -sSL https://install.python-poetry.org | python - -y - run: poetry config virtualenvs.in-project true - run: make test + - run: make test-dep-versions build: needs: diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5557665..247bcff 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -68,3 +68,6 @@ jobs: - name: Run tests run: make test + + - name: Run tests on different dependency versions + run: make test-dep-versions diff --git a/Makefile b/Makefile index 72f5c9e..08ff935 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ sources = src tests .PHONY: prepare prepare: - poetry install + poetry install --with ops .PHONY: lintable @@ -21,13 +21,35 @@ lint: prepare poetry run mypy $(sources) - .PHONY: test test: prepare poetry run coverage run -m pytest poetry run coverage report +.PHONY: test-python-versions +test-python-versions: + poetry env use python3.8 + make test + + poetry env use python3.9 + make test + + poetry env use python3.10 + make test + + poetry env use python3.11 + make test + + poetry env use python3.12 + make test + + +.PHONY: test-dep-versions +test-dep-versions: prepare + ./scripts/test_dependency_versions.sh + + .PHONY: clean clean: rm -rf `find . -name __pycache__` diff --git a/README.md b/README.md index 0bc621f..2eb5079 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,68 @@ Chained operations in Python, applied to data processing. pip install pipedata ``` -## An Example +## Examples + +### Chaining Data Operations + +pipedata.ops provides some operations for streaming data through memory. + +```py +import json +import zipfile + +import pyarrow.parquet as pq + +from pipedata.core import StreamStart +from pipedata.ops import json_records, parquet_writer, zipped_files + + +data1 = [ + {"col1": 1, "col2": "Hello"}, + {"col1": 2, "col2": "world"}, +] +data2 = [ + {"col1": 3, "col2": "!"}, +] + +with zipfile.ZipFile("test_input.json.zip", "w") as zipped: + zipped.writestr("file1.json", json.dumps(data1)) + zipped.writestr("file2.json", json.dumps(data2)) + +result = ( + StreamStart(["test_input.json.zip"]) + .flat_map(zipped_files) + .flat_map(json_records()) + .flat_map(parquet_writer("test_output.parquet")) + .to_list() +) + +table = pq.read_table("test_output.parquet") +print(table.to_pydict()) +#> {'col1': [1, 2, 3], 'col2': ['Hello', 'world', '!']} +``` + +Alternatively, you can construct the pipeline as a chain: + +```py +import pyarrow.parquet as pq + +from pipedata.core import ChainStart, StreamStart +from pipedata.ops import json_records, parquet_writer, zipped_files + +# Running this after input file created in above example +chain = ( + ChainStart() + .flat_map(zipped_files) + .flat_map(json_records()) + .flat_map(parquet_writer("test_output_2.parquet")) +) +result = StreamStart(["test_input.json.zip"]).flat_map(chain).to_list() +table = pq.read_table("test_output_2.parquet") +print(table.to_pydict()) +#> {'col1': [1, 2, 3], 'col2': ['Hello', 'world', '!']} + +``` ### Core Framework diff --git a/poetry.lock b/poetry.lock index f20ff9c..0598078 100644 --- a/poetry.lock +++ b/poetry.lock @@ -145,6 +145,139 @@ files = [ [package.extras] test = ["pytest (>=6)"] +[[package]] +name = "fsspec" +version = "2023.10.0" +description = "File-system specification" +optional = false +python-versions = ">=3.8" +files = [ + {file = "fsspec-2023.10.0-py3-none-any.whl", hash = "sha256:346a8f024efeb749d2a5fca7ba8854474b1ff9af7c3faaf636a4548781136529"}, + {file = "fsspec-2023.10.0.tar.gz", hash = "sha256:330c66757591df346ad3091a53bd907e15348c2ba17d63fd54f5c39c4457d2a5"}, +] + +[package.extras] +abfs = ["adlfs"] +adl = ["adlfs"] +arrow = ["pyarrow (>=1)"] +dask = ["dask", "distributed"] +devel = ["pytest", "pytest-cov"] +dropbox = ["dropbox", "dropboxdrivefs", "requests"] +full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "dask", "distributed", "dropbox", "dropboxdrivefs", "fusepy", "gcsfs", "libarchive-c", "ocifs", "panel", "paramiko", "pyarrow (>=1)", "pygit2", "requests", "s3fs", "smbprotocol", "tqdm"] +fuse = ["fusepy"] +gcs = ["gcsfs"] +git = ["pygit2"] +github = ["requests"] +gs = ["gcsfs"] +gui = ["panel"] +hdfs = ["pyarrow (>=1)"] +http = ["aiohttp (!=4.0.0a0,!=4.0.0a1)", "requests"] +libarchive = ["libarchive-c"] +oci = ["ocifs"] +s3 = ["s3fs"] +sftp = ["paramiko"] +smb = ["smbprotocol"] +ssh = ["paramiko"] +tqdm = ["tqdm"] + +[[package]] +name = "ijson" +version = "3.2.3" +description = "Iterative JSON parser with standard Python iterator interfaces" +optional = false +python-versions = "*" +files = [ + {file = "ijson-3.2.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0a4ae076bf97b0430e4e16c9cb635a6b773904aec45ed8dcbc9b17211b8569ba"}, + {file = "ijson-3.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:cfced0a6ec85916eb8c8e22415b7267ae118eaff2a860c42d2cc1261711d0d31"}, + {file = "ijson-3.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0b9d1141cfd1e6d6643aa0b4876730d0d28371815ce846d2e4e84a2d4f471cf3"}, + {file = "ijson-3.2.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9e0a27db6454edd6013d40a956d008361aac5bff375a9c04ab11fc8c214250b5"}, + {file = "ijson-3.2.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c0d526ccb335c3c13063c273637d8611f32970603dfb182177b232d01f14c23"}, + {file = "ijson-3.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:545a30b3659df2a3481593d30d60491d1594bc8005f99600e1bba647bb44cbb5"}, + {file = "ijson-3.2.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9680e37a10fedb3eab24a4a7e749d8a73f26f1a4c901430e7aa81b5da15f7307"}, + {file = "ijson-3.2.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:2a80c0bb1053055d1599e44dc1396f713e8b3407000e6390add72d49633ff3bb"}, + {file = "ijson-3.2.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f05ed49f434ce396ddcf99e9fd98245328e99f991283850c309f5e3182211a79"}, + {file = "ijson-3.2.3-cp310-cp310-win32.whl", hash = "sha256:b4eb2304573c9fdf448d3fa4a4fdcb727b93002b5c5c56c14a5ffbbc39f64ae4"}, + {file = "ijson-3.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:923131f5153c70936e8bd2dd9dcfcff43c67a3d1c789e9c96724747423c173eb"}, + {file = "ijson-3.2.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:904f77dd3d87736ff668884fe5197a184748eb0c3e302ded61706501d0327465"}, + {file = "ijson-3.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0974444c1f416e19de1e9f567a4560890095e71e81623c509feff642114c1e53"}, + {file = "ijson-3.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c1a4b8eb69b6d7b4e94170aa991efad75ba156b05f0de2a6cd84f991def12ff9"}, + {file = "ijson-3.2.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d052417fd7ce2221114f8d3b58f05a83c1a2b6b99cafe0b86ac9ed5e2fc889df"}, + {file = "ijson-3.2.3-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b8064a85ec1b0beda7dd028e887f7112670d574db606f68006c72dd0bb0e0e2"}, + {file = "ijson-3.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eaac293853f1342a8d2a45ac1f723c860f700860e7743fb97f7b76356df883a8"}, + {file = "ijson-3.2.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6c32c18a934c1dc8917455b0ce478fd7a26c50c364bd52c5a4fb0fc6bb516af7"}, + {file = "ijson-3.2.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:713a919e0220ac44dab12b5fed74f9130f3480e55e90f9d80f58de129ea24f83"}, + {file = "ijson-3.2.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4a3a6a2fbbe7550ffe52d151cf76065e6b89cfb3e9d0463e49a7e322a25d0426"}, + {file = "ijson-3.2.3-cp311-cp311-win32.whl", hash = "sha256:6a4db2f7fb9acfb855c9ae1aae602e4648dd1f88804a0d5cfb78c3639bcf156c"}, + {file = "ijson-3.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:ccd6be56335cbb845f3d3021b1766299c056c70c4c9165fb2fbe2d62258bae3f"}, + {file = "ijson-3.2.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:055b71bbc37af5c3c5861afe789e15211d2d3d06ac51ee5a647adf4def19c0ea"}, + {file = "ijson-3.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c075a547de32f265a5dd139ab2035900fef6653951628862e5cdce0d101af557"}, + {file = "ijson-3.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:457f8a5fc559478ac6b06b6d37ebacb4811f8c5156e997f0d87d708b0d8ab2ae"}, + {file = "ijson-3.2.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9788f0c915351f41f0e69ec2618b81ebfcf9f13d9d67c6d404c7f5afda3e4afb"}, + {file = "ijson-3.2.3-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fa234ab7a6a33ed51494d9d2197fb96296f9217ecae57f5551a55589091e7853"}, + {file = "ijson-3.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bdd0dc5da4f9dc6d12ab6e8e0c57d8b41d3c8f9ceed31a99dae7b2baf9ea769a"}, + {file = "ijson-3.2.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c6beb80df19713e39e68dc5c337b5c76d36ccf69c30b79034634e5e4c14d6904"}, + {file = "ijson-3.2.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:a2973ce57afb142d96f35a14e9cfec08308ef178a2c76b8b5e1e98f3960438bf"}, + {file = "ijson-3.2.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:105c314fd624e81ed20f925271ec506523b8dd236589ab6c0208b8707d652a0e"}, + {file = "ijson-3.2.3-cp312-cp312-win32.whl", hash = "sha256:ac44781de5e901ce8339352bb5594fcb3b94ced315a34dbe840b4cff3450e23b"}, + {file = "ijson-3.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:0567e8c833825b119e74e10a7c29761dc65fcd155f5d4cb10f9d3b8916ef9912"}, + {file = "ijson-3.2.3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:eeb286639649fb6bed37997a5e30eefcacddac79476d24128348ec890b2a0ccb"}, + {file = "ijson-3.2.3-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:396338a655fb9af4ac59dd09c189885b51fa0eefc84d35408662031023c110d1"}, + {file = "ijson-3.2.3-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0e0243d166d11a2a47c17c7e885debf3b19ed136be2af1f5d1c34212850236ac"}, + {file = "ijson-3.2.3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85afdb3f3a5d0011584d4fa8e6dccc5936be51c27e84cd2882fe904ca3bd04c5"}, + {file = "ijson-3.2.3-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:4fc35d569eff3afa76bfecf533f818ecb9390105be257f3f83c03204661ace70"}, + {file = "ijson-3.2.3-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:455d7d3b7a6aacfb8ab1ebcaf697eedf5be66e044eac32508fccdc633d995f0e"}, + {file = "ijson-3.2.3-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:c63f3d57dbbac56cead05b12b81e8e1e259f14ce7f233a8cbe7fa0996733b628"}, + {file = "ijson-3.2.3-cp36-cp36m-win32.whl", hash = "sha256:a4d7fe3629de3ecb088bff6dfe25f77be3e8261ed53d5e244717e266f8544305"}, + {file = "ijson-3.2.3-cp36-cp36m-win_amd64.whl", hash = "sha256:96190d59f015b5a2af388a98446e411f58ecc6a93934e036daa75f75d02386a0"}, + {file = "ijson-3.2.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:35194e0b8a2bda12b4096e2e792efa5d4801a0abb950c48ade351d479cd22ba5"}, + {file = "ijson-3.2.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1053fb5f0b010ee76ca515e6af36b50d26c1728ad46be12f1f147a835341083"}, + {file = "ijson-3.2.3-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:211124cff9d9d139dd0dfced356f1472860352c055d2481459038b8205d7d742"}, + {file = "ijson-3.2.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:92dc4d48e9f6a271292d6079e9fcdce33c83d1acf11e6e12696fb05c5889fe74"}, + {file = "ijson-3.2.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:3dcc33ee56f92a77f48776014ddb47af67c33dda361e84371153c4f1ed4434e1"}, + {file = "ijson-3.2.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:98c6799925a5d1988da4cd68879b8eeab52c6e029acc45e03abb7921a4715c4b"}, + {file = "ijson-3.2.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:4252e48c95cd8ceefc2caade310559ab61c37d82dfa045928ed05328eb5b5f65"}, + {file = "ijson-3.2.3-cp37-cp37m-win32.whl", hash = "sha256:644f4f03349ff2731fd515afd1c91b9e439e90c9f8c28292251834154edbffca"}, + {file = "ijson-3.2.3-cp37-cp37m-win_amd64.whl", hash = "sha256:ba33c764afa9ecef62801ba7ac0319268a7526f50f7601370d9f8f04e77fc02b"}, + {file = "ijson-3.2.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:4b2ec8c2a3f1742cbd5f36b65e192028e541b5fd8c7fd97c1fc0ca6c427c704a"}, + {file = "ijson-3.2.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7dc357da4b4ebd8903e77dbcc3ce0555ee29ebe0747c3c7f56adda423df8ec89"}, + {file = "ijson-3.2.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:bcc51c84bb220ac330122468fe526a7777faa6464e3b04c15b476761beea424f"}, + {file = "ijson-3.2.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8d54b624629f9903005c58d9321a036c72f5c212701bbb93d1a520ecd15e370"}, + {file = "ijson-3.2.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6ea7c7e3ec44742e867c72fd750c6a1e35b112f88a917615332c4476e718d40"}, + {file = "ijson-3.2.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:916acdc5e504f8b66c3e287ada5d4b39a3275fc1f2013c4b05d1ab9933671a6c"}, + {file = "ijson-3.2.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:81815b4184b85ce124bfc4c446d5f5e5e643fc119771c5916f035220ada29974"}, + {file = "ijson-3.2.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:b49fd5fe1cd9c1c8caf6c59f82b08117dd6bea2ec45b641594e25948f48f4169"}, + {file = "ijson-3.2.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:86b3c91fdcb8ffb30556c9669930f02b7642de58ca2987845b04f0d7fe46d9a8"}, + {file = "ijson-3.2.3-cp38-cp38-win32.whl", hash = "sha256:a729b0c8fb935481afe3cf7e0dadd0da3a69cc7f145dbab8502e2f1e01d85a7c"}, + {file = "ijson-3.2.3-cp38-cp38-win_amd64.whl", hash = "sha256:d34e049992d8a46922f96483e96b32ac4c9cffd01a5c33a928e70a283710cd58"}, + {file = "ijson-3.2.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9c2a12dcdb6fa28f333bf10b3a0f80ec70bc45280d8435be7e19696fab2bc706"}, + {file = "ijson-3.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1844c5b57da21466f255a0aeddf89049e730d7f3dfc4d750f0e65c36e6a61a7c"}, + {file = "ijson-3.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2ec3e5ff2515f1c40ef6a94983158e172f004cd643b9e4b5302017139b6c96e4"}, + {file = "ijson-3.2.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:46bafb1b9959872a1f946f8dd9c6f1a30a970fc05b7bfae8579da3f1f988e598"}, + {file = "ijson-3.2.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ab4db9fee0138b60e31b3c02fff8a4c28d7b152040553b6a91b60354aebd4b02"}, + {file = "ijson-3.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f4bc87e69d1997c6a55fff5ee2af878720801ff6ab1fb3b7f94adda050651e37"}, + {file = "ijson-3.2.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:e9fd906f0c38e9f0bfd5365e1bed98d649f506721f76bb1a9baa5d7374f26f19"}, + {file = "ijson-3.2.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:e84d27d1acb60d9102728d06b9650e5b7e5cb0631bd6e3dfadba8fb6a80d6c2f"}, + {file = "ijson-3.2.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2cc04fc0a22bb945cd179f614845c8b5106c0b3939ee0d84ce67c7a61ac1a936"}, + {file = "ijson-3.2.3-cp39-cp39-win32.whl", hash = "sha256:e641814793a037175f7ec1b717ebb68f26d89d82cfd66f36e588f32d7e488d5f"}, + {file = "ijson-3.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:6bd3e7e91d031f1e8cea7ce53f704ab74e61e505e8072467e092172422728b22"}, + {file = "ijson-3.2.3-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:06f9707da06a19b01013f8c65bf67db523662a9b4a4ff027e946e66c261f17f0"}, + {file = "ijson-3.2.3-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be8495f7c13fa1f622a2c6b64e79ac63965b89caf664cc4e701c335c652d15f2"}, + {file = "ijson-3.2.3-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7596b42f38c3dcf9d434dddd50f46aeb28e96f891444c2b4b1266304a19a2c09"}, + {file = "ijson-3.2.3-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbac4e9609a1086bbad075beb2ceec486a3b138604e12d2059a33ce2cba93051"}, + {file = "ijson-3.2.3-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:db2d6341f9cb538253e7fe23311d59252f124f47165221d3c06a7ed667ecd595"}, + {file = "ijson-3.2.3-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:fa8b98be298efbb2588f883f9953113d8a0023ab39abe77fe734b71b46b1220a"}, + {file = "ijson-3.2.3-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:674e585361c702fad050ab4c153fd168dc30f5980ef42b64400bc84d194e662d"}, + {file = "ijson-3.2.3-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd12e42b9cb9c0166559a3ffa276b4f9fc9d5b4c304e5a13668642d34b48b634"}, + {file = "ijson-3.2.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d31e0d771d82def80cd4663a66de277c3b44ba82cd48f630526b52f74663c639"}, + {file = "ijson-3.2.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:7ce4c70c23521179d6da842bb9bc2e36bb9fad1e0187e35423ff0f282890c9ca"}, + {file = "ijson-3.2.3-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:39f551a6fbeed4433c85269c7c8778e2aaea2501d7ebcb65b38f556030642c17"}, + {file = "ijson-3.2.3-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3b14d322fec0de7af16f3ef920bf282f0dd747200b69e0b9628117f381b7775b"}, + {file = "ijson-3.2.3-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7851a341429b12d4527ca507097c959659baf5106c7074d15c17c387719ffbcd"}, + {file = "ijson-3.2.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db3bf1b42191b5cc9b6441552fdcb3b583594cb6b19e90d1578b7cbcf80d0fae"}, + {file = "ijson-3.2.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:6f662dc44362a53af3084d3765bb01cd7b4734d1f484a6095cad4cb0cbfe5374"}, + {file = "ijson-3.2.3.tar.gz", hash = "sha256:10294e9bf89cb713da05bc4790bdff616610432db561964827074898e174f917"}, +] + [[package]] name = "iniconfig" version = "2.0.0" @@ -213,6 +346,84 @@ files = [ {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] +[[package]] +name = "numpy" +version = "1.24.4" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64"}, + {file = "numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1"}, + {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4"}, + {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6"}, + {file = "numpy-1.24.4-cp310-cp310-win32.whl", hash = "sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc"}, + {file = "numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e"}, + {file = "numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810"}, + {file = "numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254"}, + {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7"}, + {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5"}, + {file = "numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d"}, + {file = "numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694"}, + {file = "numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61"}, + {file = "numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f"}, + {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e"}, + {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc"}, + {file = "numpy-1.24.4-cp38-cp38-win32.whl", hash = "sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2"}, + {file = "numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706"}, + {file = "numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400"}, + {file = "numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f"}, + {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9"}, + {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d"}, + {file = "numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835"}, + {file = "numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2"}, + {file = "numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463"}, +] + +[[package]] +name = "numpy" +version = "1.26.1" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = "<3.13,>=3.9" +files = [ + {file = "numpy-1.26.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:82e871307a6331b5f09efda3c22e03c095d957f04bf6bc1804f30048d0e5e7af"}, + {file = "numpy-1.26.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cdd9ec98f0063d93baeb01aad472a1a0840dee302842a2746a7a8e92968f9575"}, + {file = "numpy-1.26.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d78f269e0c4fd365fc2992c00353e4530d274ba68f15e968d8bc3c69ce5f5244"}, + {file = "numpy-1.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ab9163ca8aeb7fd32fe93866490654d2f7dda4e61bc6297bf72ce07fdc02f67"}, + {file = "numpy-1.26.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:78ca54b2f9daffa5f323f34cdf21e1d9779a54073f0018a3094ab907938331a2"}, + {file = "numpy-1.26.1-cp310-cp310-win32.whl", hash = "sha256:d1cfc92db6af1fd37a7bb58e55c8383b4aa1ba23d012bdbba26b4bcca45ac297"}, + {file = "numpy-1.26.1-cp310-cp310-win_amd64.whl", hash = "sha256:d2984cb6caaf05294b8466966627e80bf6c7afd273279077679cb010acb0e5ab"}, + {file = "numpy-1.26.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cd7837b2b734ca72959a1caf3309457a318c934abef7a43a14bb984e574bbb9a"}, + {file = "numpy-1.26.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1c59c046c31a43310ad0199d6299e59f57a289e22f0f36951ced1c9eac3665b9"}, + {file = "numpy-1.26.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d58e8c51a7cf43090d124d5073bc29ab2755822181fcad978b12e144e5e5a4b3"}, + {file = "numpy-1.26.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6081aed64714a18c72b168a9276095ef9155dd7888b9e74b5987808f0dd0a974"}, + {file = "numpy-1.26.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:97e5d6a9f0702c2863aaabf19f0d1b6c2628fbe476438ce0b5ce06e83085064c"}, + {file = "numpy-1.26.1-cp311-cp311-win32.whl", hash = "sha256:b9d45d1dbb9de84894cc50efece5b09939752a2d75aab3a8b0cef6f3a35ecd6b"}, + {file = "numpy-1.26.1-cp311-cp311-win_amd64.whl", hash = "sha256:3649d566e2fc067597125428db15d60eb42a4e0897fc48d28cb75dc2e0454e53"}, + {file = "numpy-1.26.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:1d1bd82d539607951cac963388534da3b7ea0e18b149a53cf883d8f699178c0f"}, + {file = "numpy-1.26.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:afd5ced4e5a96dac6725daeb5242a35494243f2239244fad10a90ce58b071d24"}, + {file = "numpy-1.26.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a03fb25610ef560a6201ff06df4f8105292ba56e7cdd196ea350d123fc32e24e"}, + {file = "numpy-1.26.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dcfaf015b79d1f9f9c9fd0731a907407dc3e45769262d657d754c3a028586124"}, + {file = "numpy-1.26.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:e509cbc488c735b43b5ffea175235cec24bbc57b227ef1acc691725beb230d1c"}, + {file = "numpy-1.26.1-cp312-cp312-win32.whl", hash = "sha256:af22f3d8e228d84d1c0c44c1fbdeb80f97a15a0abe4f080960393a00db733b66"}, + {file = "numpy-1.26.1-cp312-cp312-win_amd64.whl", hash = "sha256:9f42284ebf91bdf32fafac29d29d4c07e5e9d1af862ea73686581773ef9e73a7"}, + {file = "numpy-1.26.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bb894accfd16b867d8643fc2ba6c8617c78ba2828051e9a69511644ce86ce83e"}, + {file = "numpy-1.26.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e44ccb93f30c75dfc0c3aa3ce38f33486a75ec9abadabd4e59f114994a9c4617"}, + {file = "numpy-1.26.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9696aa2e35cc41e398a6d42d147cf326f8f9d81befcb399bc1ed7ffea339b64e"}, + {file = "numpy-1.26.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5b411040beead47a228bde3b2241100454a6abde9df139ed087bd73fc0a4908"}, + {file = "numpy-1.26.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:1e11668d6f756ca5ef534b5be8653d16c5352cbb210a5c2a79ff288e937010d5"}, + {file = "numpy-1.26.1-cp39-cp39-win32.whl", hash = "sha256:d1d2c6b7dd618c41e202c59c1413ef9b2c8e8a15f5039e344af64195459e3104"}, + {file = "numpy-1.26.1-cp39-cp39-win_amd64.whl", hash = "sha256:59227c981d43425ca5e5c01094d59eb14e8772ce6975d4b2fc1e106a833d5ae2"}, + {file = "numpy-1.26.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:06934e1a22c54636a059215d6da99e23286424f316fddd979f5071093b648668"}, + {file = "numpy-1.26.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76ff661a867d9272cd2a99eed002470f46dbe0943a5ffd140f49be84f68ffc42"}, + {file = "numpy-1.26.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:6965888d65d2848e8768824ca8288db0a81263c1efccec881cb35a0d805fcd2f"}, + {file = "numpy-1.26.1.tar.gz", hash = "sha256:c8c6c72d4a9f831f328efb1312642a1cafafaa88981d9ab76368d50d07d93cbe"}, +] + [[package]] name = "packaging" version = "23.2" @@ -265,6 +476,54 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "pyarrow" +version = "14.0.0" +description = "Python library for Apache Arrow" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyarrow-14.0.0-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:4fce1db17efbc453080c5b306f021926de7c636456a128328797e574c151f81a"}, + {file = "pyarrow-14.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:28de7c05b4d7a71ec660360639cc9b65ceb1175e0e9d4dfccd879a1545bc38f7"}, + {file = "pyarrow-14.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1541e9209c094e7f4d7b43fdd9de3a8c71d3069cf6fc03b59bf5774042411849"}, + {file = "pyarrow-14.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c05e6c45d303c80e41ab04996430a0251321f70986ed51213903ea7bc0b7efd"}, + {file = "pyarrow-14.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:426ffec63ab9b4dff23dec51be2150e3a4a99eb38e66c10a70e2c48779fe9c9d"}, + {file = "pyarrow-14.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:968844f591902160bd3c9ee240ce8822a3b4e7de731e91daea76ad43fe0ff062"}, + {file = "pyarrow-14.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:dcedbc0b4ea955c530145acfe99e324875c386419a09db150291a24cb01aeb81"}, + {file = "pyarrow-14.0.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:97993a12aacc781efad9c92d4545a877e803c4d106d34237ec4ce987bec825a3"}, + {file = "pyarrow-14.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:80225768d94024d59a31320374f5e6abf8899866c958dfb4f4ea8e2d9ec91bde"}, + {file = "pyarrow-14.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b61546977a8bd7e3d0c697ede723341ef4737e761af2239aef6e1db447f97727"}, + {file = "pyarrow-14.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42509e6c93b4a1c8ae8ccd939a43f437097783fe130a1991497a6a1abbba026f"}, + {file = "pyarrow-14.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:3eccce331a1392e46573f2ce849a9ee3c074e0d7008e9be0b44566ac149fd6a1"}, + {file = "pyarrow-14.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:ecc463c45f2b6b36431f5f2025842245e8c15afe4d42072230575785f3bb00c6"}, + {file = "pyarrow-14.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:4362ed90def81640addcd521811dd16a13015f0a8255bec324a41262c1524b6c"}, + {file = "pyarrow-14.0.0-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:2fbb7ab62537782c5ab31aa08db0e1f6de92c2c515fdfc0790128384e919adcb"}, + {file = "pyarrow-14.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ad7095f8f0fe0bfa3d3fca1909b8fa15c70e630b0cc1ff8d35e143f5e2704064"}, + {file = "pyarrow-14.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e6602272fce71c0fb64f266e7cdbe51b93b00c22fc1bb57f2b0cb681c4aeedf4"}, + {file = "pyarrow-14.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5b2b8f87951b08a3e72265c8963da3fe4f737bb81290269037e047dd172aa591"}, + {file = "pyarrow-14.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:a1c9675966662a042caebbaafa1ae7fc26291287ebc3da06aa63ad74c323ec30"}, + {file = "pyarrow-14.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:771079fddc0b4440c41af541dbdebc711a7062c93d3c4764476a9442606977db"}, + {file = "pyarrow-14.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:c4096136318de1c4937370c0c365f949961c371201c396d8cc94a353f342069d"}, + {file = "pyarrow-14.0.0-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:6c94056fb5f0ee0bae2206c3f776881e1db2bd0d133d06805755ae7ac5145349"}, + {file = "pyarrow-14.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:687d0df1e08876b2d24d42abae129742fc655367e3fe6700aa4d79fcf2e3215e"}, + {file = "pyarrow-14.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f4054e5ee6c88ca256a67fc8b27f9c59bcd385216346265831d462a6069033f"}, + {file = "pyarrow-14.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:768b962e4c042ab2c96576ca0757935472e220d11af855c7d0be3279d7fced5f"}, + {file = "pyarrow-14.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:77293b1319c7044f68ebfa43db8c929a0a5254ce371f1a0873d343f1460171d0"}, + {file = "pyarrow-14.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d2bc7c53941d85f0133b1bd5a814bca0af213922f50d8a8dc0eed4d9ed477845"}, + {file = "pyarrow-14.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:378955365dd087c285ef4f34ad939d7e551b7715326710e8cd21cfa2ce511bd7"}, + {file = "pyarrow-14.0.0-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:f05e81b4c621e6ad4bcd8f785e3aa1d6c49a935818b809ea6e7bf206a5b1a4e8"}, + {file = "pyarrow-14.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6867f6a8057eaef5a7ac6d27fe5518133f67973c5d4295d79a943458350e7c61"}, + {file = "pyarrow-14.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca54b87c46abdfe027f18f959ca388102bd7326c344838f72244807462d091b2"}, + {file = "pyarrow-14.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:35abf61bd0cc9daca3afc715f6ba74ea83d792fa040025352624204bec66bf6a"}, + {file = "pyarrow-14.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:65c377523b369f7ef1ba02be814e832443bb3b15065010838f02dae5bdc0f53c"}, + {file = "pyarrow-14.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:e8a1e470e4b5f7bda7bede0410291daec55ab69f346d77795d34fd6a45b41579"}, + {file = "pyarrow-14.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:466c1a5a7a4b279cfa363ac34dedd0c3c6af388cec9e6a468ffc095a6627849a"}, + {file = "pyarrow-14.0.0.tar.gz", hash = "sha256:45d3324e1c9871a07de6b4d514ebd73225490963a6dd46c64c465c4b6079fe1e"}, +] + +[package.dependencies] +numpy = ">=1.16.6" + [[package]] name = "pytest" version = "7.4.3" @@ -338,4 +597,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "976fb8e3474a02337b189ad37c618101ddd164a273cf7ef7ebdd8f8c1f04a620" +content-hash = "fb5a8e0542fcdaa20990642e24c07cb80e96e7e39eede751671a94956e3a7d52" diff --git a/pyproject.toml b/pyproject.toml index 19690d7..37dadb6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "pipedata" -version = "0.0.1" +version = "0.1" description = "Framework for building pipelines for data processing" authors = ["Simon Wicks "] readme = "README.md" @@ -35,17 +35,36 @@ packages = [{include = "pipedata", from = "src"}] [tool.poetry.dependencies] python = "^3.8" +[tool.poetry.group.ops.dependencies] +fsspec = [ + { version = ">=0.9.0", python = "<3.12" }, + { version = ">=2022.1.0", python = ">=3.12,<3.13"}, +] +ijson = "^3.0.0" +pyarrow = [ + { version = ">=9.0.0", python = "<3.11" }, + { version = ">=11.0.0", python = ">=3.11,<3.12" }, + { version = ">=14.0.0", python = ">=3.12,<=3.13" }, +] +# We don't have a direct numpy dependency, but pyarrow depends on numpy +# and numpy has python version constraints with python 3.12 +numpy = [ + { version = "<1.25.0", python = "<3.9" }, + { version = "^1.26.0", python = ">=3.12,<3.13" } +] [tool.poetry.group.lint.dependencies] black = "^23.9.1" ruff = "^0.1.3" mypy = "^1.6.0" - [tool.poetry.group.test.dependencies] pytest = "^7.4.2" coverage = "^7.3.2" +[tool.poetry.group.ops] +optional = true + [tool.mypy] strict = true @@ -103,7 +122,9 @@ keep-runtime-typing = true testpaths = "tests" xfail_strict = true filterwarnings = [ - "error" + "error", + "ignore:distutils Version classes:DeprecationWarning", + "ignore:SelectableGroups dict:DeprecationWarning", ] diff --git a/scripts/test_dependency_versions.sh b/scripts/test_dependency_versions.sh new file mode 100755 index 0000000..f59708c --- /dev/null +++ b/scripts/test_dependency_versions.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +set -o errexit # Abort on non-zero exit status +set -o nounset # Abort on unbound variable +set -o pipefail # Abort on non-zero exit in pipeline + +main() { + PYTHON_MINOR_VERSION=$(poetry run python -c 'import sys; version=sys.version_info[:3]; print("{1}".format(*version))') + echo "Python minor version: $PYTHON_MINOR_VERSION" + + # The errors are mostly / all installation errors, + # about building from source. Could lower + # the requirements if able to build from source. + if (( $PYTHON_MINOR_VERSION < "11" )); then + poetry run pip install pyarrow==9.0.0 + poetry run python -m pytest + + poetry run pip install pyarrow==10.0.0 + poetry run python -m pytest + fi + + if (( $PYTHON_MINOR_VERSION < "12" )); then + poetry run pip install pyarrow==11.0.0 + poetry run python -m pytest + + poetry run pip install pyarrow==13.0.0 + poetry run python -m pytest + + poetry run pip install fsspec==0.9.0 + poetry run python -m pytest + fi + + poetry run pip install pyarrow==14.0.0 + poetry run python -m pytest + + poetry run pip install ijson==3.0.0 + poetry run python -m pytest + + poetry run pip install fsspec==2022.1.0 + poetry run python -m pytest +} + +main diff --git a/src/pipedata/__init__.py b/src/pipedata/__init__.py index d8a87a9..d091303 100644 --- a/src/pipedata/__init__.py +++ b/src/pipedata/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.0.1" +__version__ = "0.1" __all__ = [ "__version__", diff --git a/src/pipedata/core/chain.py b/src/pipedata/core/chain.py index f8e5ed1..a4568d1 100644 --- a/src/pipedata/core/chain.py +++ b/src/pipedata/core/chain.py @@ -22,7 +22,7 @@ TOther = TypeVar("TOther") -def _batched(iterable: Iterator[TEnd], n: Optional[int]) -> Iterator[Tuple[TEnd, ...]]: +def batched(iterable: Iterator[TEnd], n: Optional[int]) -> Iterator[Tuple[TEnd, ...]]: """Can be replaced by itertools.batched once using Python 3.12+.""" while (elements := tuple(itertools.islice(iterable, n))) != (): yield elements @@ -160,7 +160,7 @@ def batched_map( @functools.wraps(func) def new_action(previous_step: Iterator[TEnd]) -> Iterator[TOther]: - for elements in _batched(previous_step, n): + for elements in batched(previous_step, n): yield func(elements) return self.flat_map(new_action) diff --git a/src/pipedata/ops/__init__.py b/src/pipedata/ops/__init__.py new file mode 100644 index 0000000..caa604c --- /dev/null +++ b/src/pipedata/ops/__init__.py @@ -0,0 +1,10 @@ +from .files import zipped_files +from .records import csv_records, json_records +from .storage import parquet_writer + +__all__ = [ + "zipped_files", + "csv_records", + "json_records", + "parquet_writer", +] diff --git a/src/pipedata/ops/files.py b/src/pipedata/ops/files.py new file mode 100644 index 0000000..c7af7df --- /dev/null +++ b/src/pipedata/ops/files.py @@ -0,0 +1,16 @@ +import logging +import zipfile +from typing import IO, Iterator + +import fsspec # type: ignore + +logger = logging.getLogger(__name__) + + +def zipped_files(file_refs: Iterator[str]) -> Iterator[IO[bytes]]: + for file_ref in file_refs: + with fsspec.open(file_ref, "rb") as file: + with zipfile.ZipFile(file) as zip_file: + for name in zip_file.namelist(): + with zip_file.open(name) as inner_file: + yield inner_file diff --git a/src/pipedata/ops/records.py b/src/pipedata/ops/records.py new file mode 100644 index 0000000..d862221 --- /dev/null +++ b/src/pipedata/ops/records.py @@ -0,0 +1,34 @@ +import csv +import io +import logging +from typing import IO, Any, Callable, Dict, Iterator, Optional + +import ijson # type: ignore + +logger = logging.getLogger(__name__) + + +def json_records( + json_path: str = "item", multiple_values: Optional[bool] = False +) -> Callable[[Iterator[IO[bytes]]], Iterator[Dict[str, Any]]]: + logger.info(f"Initializing json reader for {json_path}") + + def json_records_func(json_files: Iterator[IO[bytes]]) -> Iterator[Dict[str, Any]]: + for json_file in json_files: + logger.info(f"Reading json file {json_file}") + records = ijson.items(json_file, json_path, multiple_values=multiple_values) + yield from records + + return json_records_func + + +def csv_records() -> Callable[[Iterator[IO[bytes]]], Iterator[Dict[str, Any]]]: + def csv_records_func(csv_paths: Iterator[IO[bytes]]) -> Iterator[Dict[str, Any]]: + for csv_path in csv_paths: + logger.info(f"Reading csv file {csv_path}") + csv_reader = csv.DictReader( + io.TextIOWrapper(csv_path, "utf-8"), delimiter="," + ) + yield from csv_reader + + return csv_records_func diff --git a/src/pipedata/ops/storage.py b/src/pipedata/ops/storage.py new file mode 100644 index 0000000..178f004 --- /dev/null +++ b/src/pipedata/ops/storage.py @@ -0,0 +1,51 @@ +from typing import Any, Callable, Dict, Iterator, Optional + +import pyarrow as pa # type: ignore +import pyarrow.parquet as pq # type: ignore + +from pipedata.core.chain import batched + + +def parquet_writer( + file_path: str, + schema: Optional[pa.Schema] = None, + row_group_length: Optional[int] = None, + max_file_length: Optional[int] = None, +) -> Callable[[Iterator[Dict[str, Any]]], Iterator[str]]: + if row_group_length is None and max_file_length is not None: + row_group_length = max_file_length + + if max_file_length is not None: + if file_path.format(i=1) == file_path: + msg = "When (possibly) writing to multiple files (as the file_length" + msg += " argument is not None), the file_path argument must be a" + msg += " format string that contains a format specifier for the file." + raise ValueError(msg) + + def parquet_writer_func(records: Iterator[Dict[str, Any]]) -> Iterator[str]: + writer = None + file_number = 0 + file_length = 0 + for batch in batched(records, row_group_length): + table = pa.Table.from_pylist(batch, schema=schema) + if writer is None: + formated_file_path = file_path + if max_file_length is not None: + formated_file_path = file_path.format(i=file_number) + writer = pq.ParquetWriter(formated_file_path, table.schema) + + writer.write_table(table) + file_length += len(batch) + + if max_file_length is not None and file_length >= max_file_length: + writer.close() + writer = None + file_length = 0 + file_number += 1 + yield formated_file_path + + if writer is not None: + writer.close() + yield formated_file_path + + return parquet_writer_func diff --git a/tests/core/test_stream.py b/tests/core/test_stream.py index 3d50016..3737cb6 100644 --- a/tests/core/test_stream.py +++ b/tests/core/test_stream.py @@ -12,8 +12,10 @@ def test_stream_to_list() -> None: def test_stream_on_range() -> None: stream = StreamStart(range(3)) result = stream.to_list() - print(stream.get_counts()) assert result == [0, 1, 2] + assert stream.get_counts() == [ + {"name": "_identity", "inputs": 3, "outputs": 3}, + ] def test_repeated_stream() -> None: diff --git a/tests/ops/__init__.py b/tests/ops/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/ops/test_files.py b/tests/ops/test_files.py new file mode 100644 index 0000000..c551f9b --- /dev/null +++ b/tests/ops/test_files.py @@ -0,0 +1,30 @@ +import tempfile +import zipfile +from pathlib import Path + +from pipedata.core import StreamStart +from pipedata.ops.files import zipped_files + + +def test_zipped_files() -> None: + with tempfile.TemporaryDirectory() as temp_dir: + zip_path = Path(temp_dir) / "test.zip" + + with zipfile.ZipFile(zip_path, "w") as zip_file: + zip_file.writestr("test.txt", "Hello, world 1!") + zip_file.writestr("test2.txt", "Hello, world 2!") + zip_file.writestr("test3.txt", "Hello, world 3!") + + result = ( + StreamStart([str(zip_path)]) + .flat_map(zipped_files) + .map(lambda x: x.read().decode("utf-8")) + .to_list() + ) + + expected = [ + "Hello, world 1!", + "Hello, world 2!", + "Hello, world 3!", + ] + assert result == expected diff --git a/tests/ops/test_pipeline.py b/tests/ops/test_pipeline.py new file mode 100644 index 0000000..db8b94d --- /dev/null +++ b/tests/ops/test_pipeline.py @@ -0,0 +1,53 @@ +import json +import tempfile +import zipfile +from pathlib import Path + +import pyarrow.parquet as pq # type: ignore + +from pipedata.core import StreamStart +from pipedata.ops import json_records, parquet_writer, zipped_files + + +def test_zipped_files() -> None: + data1 = [ + {"a": 1, "b": 2}, + {"a": 3, "b": 4}, + ] + data2 = [ + {"a": 5, "b": 6}, + {"a": 7, "b": 8}, + ] + data3 = [ + {"a": 9, "b": 10}, + {"a": 11, "b": 12}, + ] + with tempfile.TemporaryDirectory() as tmpdir: + temp_path = Path(tmpdir) + zip_path = temp_path / "test.zip" + output_path = temp_path / "output.parquet" + + with zipfile.ZipFile(zip_path, "w") as zip_file: + zip_file.writestr("test.txt", json.dumps(data1)) + zip_file.writestr("test2.txt", json.dumps(data2)) + zip_file.writestr("test3.txt", json.dumps(data3)) + + result = ( + StreamStart([str(zip_path)]) + .flat_map(zipped_files) + .flat_map(json_records()) + .flat_map(parquet_writer(str(output_path))) + .to_list() + ) + + expected = [str(output_path)] + assert result == expected + + files = list(temp_path.glob("**/*")) + assert sorted(files) == sorted([output_path, zip_path]) + + table = pq.read_table(output_path) + assert table.to_pydict() == { + "a": [1, 3, 5, 7, 9, 11], + "b": [2, 4, 6, 8, 10, 12], + } diff --git a/tests/ops/test_records.py b/tests/ops/test_records.py new file mode 100644 index 0000000..65bed15 --- /dev/null +++ b/tests/ops/test_records.py @@ -0,0 +1,34 @@ +import io +import json + +from pipedata.core import StreamStart +from pipedata.ops.records import csv_records, json_records + + +def test_json_records() -> None: + json1 = [{"a": 1, "b": 2}, {"a": 3, "b": 4}] + json2 = [{"a": 5, "b": 6}, {"a": 7, "b": 8}] + + file1 = io.BytesIO(json.dumps(json1).encode("utf-8")) + file2 = io.BytesIO(json.dumps(json2).encode("utf-8")) + + result = StreamStart([file1, file2]).flat_map(json_records()).to_list() + expected = json1 + json2 + assert result == expected + + +def test_csv_records() -> None: + csv1 = "a,b\n1,2\n3,4" + csv2 = "a,b\n5,6\n7,8" + + file1 = io.BytesIO(csv1.encode("utf-8")) + file2 = io.BytesIO(csv2.encode("utf-8")) + + result = StreamStart([file1, file2]).flat_map(csv_records()).to_list() + expected = [ + {"a": "1", "b": "2"}, + {"a": "3", "b": "4"}, + {"a": "5", "b": "6"}, + {"a": "7", "b": "8"}, + ] + assert result == expected diff --git a/tests/ops/test_storage.py b/tests/ops/test_storage.py new file mode 100644 index 0000000..326dbce --- /dev/null +++ b/tests/ops/test_storage.py @@ -0,0 +1,122 @@ +import tempfile +from pathlib import Path + +import pyarrow.parquet as pq # type: ignore +import pytest + +from pipedata.core import StreamStart +from pipedata.ops.storage import parquet_writer + + +def test_parquet_simple_storage() -> None: + items = [ + {"a": 1, "b": 2}, + {"a": 3, "b": 4}, + {"a": 5, "b": 6}, + {"a": 7, "b": 8}, + ] + + with tempfile.TemporaryDirectory() as tmpdir: + temp_path = Path(tmpdir) + output_path = temp_path / "test.parquet" + + result = StreamStart(items).flat_map(parquet_writer(str(output_path))).to_list() + + assert result == [str(output_path)] + + files = list(temp_path.glob("**/*")) + assert files == [output_path] + + table = pq.read_table(output_path) + assert table.to_pydict() == { + "a": [1, 3, 5, 7], + "b": [2, 4, 6, 8], + } + + +def test_parquet_batched_storage() -> None: + items = [ + {"a": 1, "b": 2}, + {"a": 3, "b": 4}, + {"a": 5, "b": 6}, + {"a": 7, "b": 8}, + ] + + with tempfile.TemporaryDirectory() as tmpdir: + temp_path = Path(tmpdir) + output_path = temp_path / "test.parquet" + + result = ( + StreamStart(items) + .flat_map(parquet_writer(str(output_path), row_group_length=2)) + .to_list() + ) + + assert result == [str(output_path)] + + files = list(temp_path.glob("**/*")) + assert files == [output_path] + + table = pq.read_table(output_path) + assert table.to_pydict() == { + "a": [1, 3, 5, 7], + "b": [2, 4, 6, 8], + } + + +def test_parquet_multiple_files() -> None: + items = [ + {"a": 1, "b": 2}, + {"a": 3, "b": 4}, + {"a": 5, "b": 6}, + {"a": 7, "b": 8}, + ] + + with tempfile.TemporaryDirectory() as tmpdir: + temp_path = Path(tmpdir) + output_path = temp_path / "test_{i:04d}.parquet" + + result = ( + StreamStart(items) + .flat_map(parquet_writer(str(output_path), max_file_length=2)) + .to_list() + ) + + assert result == [ + str(temp_path / "test_0000.parquet"), + str(temp_path / "test_0001.parquet"), + ] + + files = list(temp_path.glob("**/*")) + expected_files = [ + temp_path / "test_0000.parquet", + temp_path / "test_0001.parquet", + ] + assert files == expected_files + + table1 = pq.read_table(files[0]) + assert table1.to_pydict() == { + "a": [1, 3], + "b": [2, 4], + } + + table2 = pq.read_table(files[1]) + assert table2.to_pydict() == { + "a": [5, 7], + "b": [6, 8], + } + + table = pq.read_table(temp_path) + assert table.to_pydict() == { + "a": [1, 3, 5, 7], + "b": [2, 4, 6, 8], + } + + +def test_parquet_multiple_files_wrong_path() -> None: + with tempfile.TemporaryDirectory() as tmpdir: + temp_path = Path(tmpdir) + output_path = temp_path / "test.parquet" + + with pytest.raises(ValueError): # noqa: PT011 + parquet_writer(str(output_path), max_file_length=2)