-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add data operations for reading from zip, iterating over csv and json…
… records, and writing to parquet (#6) * Add data operations: reading from zip file, json and csv record parsing, writing to parquet * Update version to 0.1 * Fix linting; update to run multiple dependency versions * Run dependency version checks with specific python versions
- Loading branch information
Showing
19 changed files
with
773 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api" | |
|
||
[tool.poetry] | ||
name = "pipedata" | ||
version = "0.0.1" | ||
version = "0.1" | ||
description = "Framework for building pipelines for data processing" | ||
authors = ["Simon Wicks <[email protected]>"] | ||
readme = "README.md" | ||
|
@@ -35,17 +35,36 @@ packages = [{include = "pipedata", from = "src"}] | |
[tool.poetry.dependencies] | ||
python = "^3.8" | ||
|
||
[tool.poetry.group.ops.dependencies] | ||
fsspec = [ | ||
{ version = ">=0.9.0", python = "<3.12" }, | ||
{ version = ">=2022.1.0", python = ">=3.12,<3.13"}, | ||
] | ||
ijson = "^3.0.0" | ||
# Minimum pyarrow per interpreter: older releases do not install cleanly on
# newer Python versions, so the floor rises with the Python version.
pyarrow = [
    { version = ">=9.0.0", python = "<3.11" },
    { version = ">=11.0.0", python = ">=3.11,<3.12" },
    # Exclusive upper bound, matching the fsspec and numpy markers in this group.
    { version = ">=14.0.0", python = ">=3.12,<3.13" },
]
# We don't depend on numpy directly, but pyarrow does, and the numpy releases | ||
# available differ by Python version (notably for Python 3.12), so pin a compatible range here. | ||
numpy = [ | ||
{ version = "<1.25.0", python = "<3.9" }, | ||
{ version = "^1.26.0", python = ">=3.12,<3.13" } | ||
] | ||
|
||
[tool.poetry.group.lint.dependencies] | ||
black = "^23.9.1" | ||
ruff = "^0.1.3" | ||
mypy = "^1.6.0" | ||
|
||
|
||
[tool.poetry.group.test.dependencies] | ||
pytest = "^7.4.2" | ||
coverage = "^7.3.2" | ||
|
||
[tool.poetry.group.ops] | ||
optional = true | ||
|
||
|
||
[tool.mypy] | ||
strict = true | ||
|
@@ -103,7 +122,9 @@ keep-runtime-typing = true | |
testpaths = "tests" | ||
xfail_strict = true | ||
filterwarnings = [ | ||
"error" | ||
"error", | ||
"ignore:distutils Version classes:DeprecationWarning", | ||
"ignore:SelectableGroups dict:DeprecationWarning", | ||
] | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
#!/usr/bin/env bash | ||
|
||
set -o errexit # Abort on non-zero exit status | ||
set -o nounset # Abort on unbound variable | ||
set -o pipefail # Abort on non-zero exit in pipeline | ||
|
||
# Run the test suite against the minimum supported version of each optional
# dependency, choosing the pins that apply to the current Python minor version.
main() {
    # Minor component of the interpreter that poetry runs (e.g. 11 for 3.11).
    PYTHON_MINOR_VERSION=$(poetry run python -c 'import sys; version=sys.version_info[:3]; print("{1}".format(*version))')
    echo "Python minor version: $PYTHON_MINOR_VERSION"

    local requirement

    # The older pins are only exercised on older interpreters: on newer ones
    # the failures are mostly (or entirely) installation errors from having
    # to build from source. The minimum requirements could be lowered if
    # building from source were possible.
    if (( $PYTHON_MINOR_VERSION < 11 )); then
        for requirement in pyarrow==9.0.0 pyarrow==10.0.0; do
            poetry run pip install "$requirement"
            poetry run python -m pytest
        done
    fi

    if (( $PYTHON_MINOR_VERSION < 12 )); then
        for requirement in pyarrow==11.0.0 pyarrow==13.0.0 fsspec==0.9.0; do
            poetry run pip install "$requirement"
            poetry run python -m pytest
        done
    fi

    # Pins that are exercised on every supported interpreter.
    for requirement in pyarrow==14.0.0 ijson==3.0.0 fsspec==2022.1.0; do
        poetry run pip install "$requirement"
        poetry run python -m pytest
    done
}
|
||
main |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
__version__ = "0.0.1" | ||
__version__ = "0.1" | ||
|
||
__all__ = [ | ||
"__version__", | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
from .files import zipped_files | ||
from .records import csv_records, json_records | ||
from .storage import parquet_writer | ||
|
||
__all__ = [ | ||
"zipped_files", | ||
"csv_records", | ||
"json_records", | ||
"parquet_writer", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
import logging | ||
import zipfile | ||
from typing import IO, Iterator | ||
|
||
import fsspec # type: ignore | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def zipped_files(file_refs: Iterator[str]) -> Iterator[IO[bytes]]:
    """Yield a binary stream for every file stored in each referenced zip archive.

    Args:
        file_refs: Paths or URLs of zip archives; each is opened in binary
            mode with ``fsspec.open``.

    Yields:
        An open, readable stream for each file member of each archive, in
        archive order. Directory entries are skipped. Each stream is closed
        as soon as the generator is advanced, so it must be consumed before
        the next item is requested.
    """
    for file_ref in file_refs:
        with fsspec.open(file_ref, "rb") as file:
            with zipfile.ZipFile(file) as zip_file:
                for member in zip_file.infolist():
                    # Archives may list directories as members; opening one
                    # gives an empty stream, which is not a real file.
                    if member.is_dir():
                        continue
                    with zip_file.open(member) as inner_file:
                        yield inner_file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
import csv | ||
import io | ||
import logging | ||
from typing import IO, Any, Callable, Dict, Iterator, Optional | ||
|
||
import ijson # type: ignore | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def json_records(
    json_path: str = "item", multiple_values: Optional[bool] = False
) -> Callable[[Iterator[IO[bytes]]], Iterator[Dict[str, Any]]]:
    """Build a pipeline step that streams records out of JSON byte streams.

    Args:
        json_path: Prefix handed to ``ijson.items`` to select which values
            are emitted as records.
        multiple_values: Forwarded unchanged to ``ijson.items``.

    Returns:
        A function taking an iterator of binary JSON streams and lazily
        yielding every record found in each stream, one stream at a time.
    """
    logger.info(f"Initializing json reader for {json_path}")

    def json_records_func(json_files: Iterator[IO[bytes]]) -> Iterator[Dict[str, Any]]:
        for json_file in json_files:
            logger.info(f"Reading json file {json_file}")
            # ijson parses incrementally, so records are produced without
            # loading the whole document.
            yield from ijson.items(
                json_file,
                json_path,
                multiple_values=multiple_values,
            )

    return json_records_func
|
||
|
||
def csv_records() -> Callable[[Iterator[IO[bytes]]], Iterator[Dict[str, Any]]]:
    """Build a pipeline step that streams rows out of CSV byte streams.

    Returns:
        A function taking an iterator of binary, UTF-8 encoded,
        comma-delimited CSV streams. It lazily yields one dict per data row,
        keyed by that stream's header row.
    """

    def csv_records_func(csv_paths: Iterator[IO[bytes]]) -> Iterator[Dict[str, Any]]:
        for csv_path in csv_paths:
            logger.info("Reading csv file %s", csv_path)
            # newline="" leaves line endings untouched so the csv module can
            # handle them itself; otherwise "\r\n" inside a quoted field
            # would be rewritten to "\n".
            text_stream = io.TextIOWrapper(csv_path, encoding="utf-8", newline="")
            yield from csv.DictReader(text_stream, delimiter=",")

    return csv_records_func
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
from typing import Any, Callable, Dict, Iterator, Optional | ||
|
||
import pyarrow as pa # type: ignore | ||
import pyarrow.parquet as pq # type: ignore | ||
|
||
from pipedata.core.chain import batched | ||
|
||
|
||
def parquet_writer(
    file_path: str,
    schema: Optional[pa.Schema] = None,
    row_group_length: Optional[int] = None,
    max_file_length: Optional[int] = None,
) -> Callable[[Iterator[Dict[str, Any]]], Iterator[str]]:
    """Build a pipeline step that writes records to one or more parquet files.

    Args:
        file_path: Destination path. When ``max_file_length`` is set it must
            be a format string containing an ``{i}`` placeholder, which is
            filled with the zero-based file number.
        schema: Optional schema for the tables; when omitted, the schema of
            the first batch written to each file is used.
        row_group_length: Number of records per batch, and therefore per
            ``write_table`` call. Defaults to ``max_file_length`` when only
            that is given.
        max_file_length: Record count at which the current file is closed and
            a new one is started. The check runs after each whole batch is
            written, so a file can exceed this when ``row_group_length`` does
            not divide it evenly.

    Returns:
        A function that consumes an iterator of records and yields the path
        of each file once it has been completely written and closed.

    Raises:
        ValueError: If ``max_file_length`` is set but formatting ``file_path``
            with a file number leaves it unchanged.
    """
    if row_group_length is None and max_file_length is not None:
        row_group_length = max_file_length

    if max_file_length is not None:
        if file_path.format(i=1) == file_path:
            msg = "When (possibly) writing to multiple files (as the max_file_length"
            msg += " argument is not None), the file_path argument must be a"
            msg += " format string that contains a format specifier for the file."
            raise ValueError(msg)

    def parquet_writer_func(records: Iterator[Dict[str, Any]]) -> Iterator[str]:
        writer = None
        current_path = file_path
        file_number = 0
        file_length = 0
        try:
            # NOTE(review): row_group_length may still be None here; this
            # relies on `batched` accepting None - confirm against its
            # implementation.
            for batch in batched(records, row_group_length):
                table = pa.Table.from_pylist(batch, schema=schema)
                if writer is None:
                    if max_file_length is not None:
                        current_path = file_path.format(i=file_number)
                    writer = pq.ParquetWriter(current_path, table.schema)

                writer.write_table(table)
                file_length += len(batch)

                if max_file_length is not None and file_length >= max_file_length:
                    # Drop the reference before closing so the cleanup below
                    # never closes the same writer twice.
                    finished, writer = writer, None
                    finished.close()
                    file_length = 0
                    file_number += 1
                    yield current_path

            if writer is not None:
                finished, writer = writer, None
                finished.close()
                yield current_path
        finally:
            # Reached with an open writer only if an error occurred or the
            # consumer stopped iterating early; close it so the file handle
            # is not leaked.
            if writer is not None:
                writer.close()

    return parquet_writer_func
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
Oops, something went wrong.