diff --git a/README.md b/README.md index d46933d..f06dbdb 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,11 @@ python tools to assist with standardized data ingestion workflows +## Installation, Usage, and Release Management + ### Install from PyPi -``` +```shell pip install osc-ingest-tools ``` @@ -100,14 +102,14 @@ checks. Enabling automatic formatting via [pre-commit](https://pre-commit.com/) is recommended: -``` +```shell pip install black isort pre-commit pre-commit install ``` To ensure compliance with static check tools, developers may wish to run; -``` +```shell pip install black isort # auto-sort imports isort . @@ -117,7 +119,7 @@ black . Code can then be tested using tox. -``` +```shell # run static checks and tests tox # run only tests @@ -139,7 +141,7 @@ To release a new version of this library, authorized developers should; E.g., -``` +```shell git commit -sm "Release v0.3.4" git tag v0.3.4 git push --follow-tags diff --git a/osc_ingest_trino/__init__.py b/osc_ingest_trino/__init__.py index b4e11d4..f947054 100644 --- a/osc_ingest_trino/__init__.py +++ b/osc_ingest_trino/__init__.py @@ -1,3 +1,5 @@ +"""Functions to simplify use of S3-based data, Pandas dataframes, and Trino SQL tables.""" + from .boto3_utils import attach_s3_bucket, upload_directory_to_s3 from .dotenv_utils import load_credentials_dotenv from .sqlcols import ( diff --git a/osc_ingest_trino/boto3_utils.py b/osc_ingest_trino/boto3_utils.py index 96c03b3..541852f 100644 --- a/osc_ingest_trino/boto3_utils.py +++ b/osc_ingest_trino/boto3_utils.py @@ -1,3 +1,5 @@ +"""AWS S3 interoperability functions.""" + import os from pathlib import Path from typing import Union @@ -12,6 +14,13 @@ def upload_directory_to_s3(path: Union[Path, str], bucket: Bucket, prefix: str, verbose: bool = False) -> None: + """Upload files to an S3 bucket. + + path -- the directory containing all files to be uploaded. + bucket -- the S3 bucket. + prefix -- the prefix prepended to each filename before uploading. 
+ verbose -- if True, print each file uploaded (with its prefix). + """ path = str(path) prefix = str(prefix) for subdir, dirs, files in os.walk(path): @@ -25,6 +34,7 @@ def upload_directory_to_s3(path: Union[Path, str], bucket: Bucket, prefix: str, def attach_s3_bucket(env_var_prefix: str) -> Bucket: + """Return the S3 Bucket resource associated with env_var_prefix (typically from `credentials.env`).""" s3 = boto3.resource( service_name="s3", endpoint_url=os.environ[f"{env_var_prefix}_ENDPOINT"], diff --git a/osc_ingest_trino/dotenv_utils.py b/osc_ingest_trino/dotenv_utils.py index e452333..89510d8 100644 --- a/osc_ingest_trino/dotenv_utils.py +++ b/osc_ingest_trino/dotenv_utils.py @@ -1,3 +1,5 @@ +"""Functions to read credentials files and inject secrets into the environment.""" + import os import pathlib @@ -9,9 +11,10 @@ def load_credentials_dotenv() -> None: - # Load some standard environment variables from a dot-env file, if it exists. - # If no such file can be found, does not fail, and so allows these environment vars to - # be populated in some other way + """Load some standard environment variables from a dot-env file, if it exists. + + If no such file can be found, do not raise, allowing these environment vars to be populated in some other way. 
+ """ dotenv_dir = os.environ.get("CREDENTIAL_DOTENV_DIR", os.environ.get("PWD", "/opt/app-root/src")) dotenv_path = pathlib.Path(dotenv_dir) / "credentials.env" if os.path.exists(dotenv_path): diff --git a/osc_ingest_trino/sqlcols.py b/osc_ingest_trino/sqlcols.py index e6c6e41..fbb0b0c 100644 --- a/osc_ingest_trino/sqlcols.py +++ b/osc_ingest_trino/sqlcols.py @@ -1,3 +1,5 @@ +"""Functions to translate Pandas column names to SQL column names.""" + import re from typing import List, Union, cast @@ -16,6 +18,7 @@ # 63 seems to be a common max column name length def sql_compliant_name(name: Union[List[str], str], maxlen=63) -> Union[List[str], str]: + """Convert name to a SQL-compliant table or column name, abbreviating some common words.""" if isinstance(name, list): return [cast(str, sql_compliant_name(e, maxlen=maxlen)) for e in name] w = str(name).casefold().rstrip().lstrip() @@ -40,6 +43,7 @@ def sql_compliant_name(name: Union[List[str], str], maxlen=63) -> Union[List[str def enforce_sql_column_names(df: pd.DataFrame, inplace: bool = False, maxlen: int = 63) -> pd.DataFrame: + """Ensure that all column names for df are SQL-compliant.""" if not isinstance(df, pd.DataFrame): raise ValueError("df must be a pandas DataFrame") icols = df.columns.to_list() @@ -51,6 +55,7 @@ def enforce_sql_column_names(df: pd.DataFrame, inplace: bool = False, maxlen: in def enforce_partition_column_order(df: pd.DataFrame, pcols: List[str], inplace: bool = False) -> pd.DataFrame: + """Reorder column names of df to match the order given by pcols.""" if not isinstance(df, pd.DataFrame): raise ValueError("df must be a pandas DataFrame") if not isinstance(pcols, list): diff --git a/osc_ingest_trino/sqltypes.py b/osc_ingest_trino/sqltypes.py index 23060ba..b26b25b 100644 --- a/osc_ingest_trino/sqltypes.py +++ b/osc_ingest_trino/sqltypes.py @@ -1,3 +1,5 @@ +"""Functions to translate Pandas dataframes to SQL equivalents.""" + from typing import Dict import pandas as pd @@ -25,6 +27,7 @@ def 
pandas_type_to_sql(pt: str, typemap: Dict[str, str] = {}): + """Return the SQL type corresponding to the pandas type `pt` (using special mappings, if any, from `typemap`).""" if not isinstance(typemap, dict): raise ValueError("typemap must be a dict") # user defined typemap overrides _p2smap @@ -40,6 +43,13 @@ def create_table_schema_pairs( colmap: Dict[str, str] = {}, indent: int = 4, ) -> str: + """Create SQL column, type pairs that can appear in a CREATE TABLE operation. + + df -- the dataframe to be rendered as a SQL table + typemap -- mappings from dtypes to SQL types above and beyond our defaults + colmap -- mappings of df column names to SQL column names if not using defaults + indent -- how many spaces of indent to make our SQL declarations pretty + """ if not isinstance(df, pd.DataFrame): raise ValueError("df must be a pandas DataFrame") if not isinstance(colmap, dict): diff --git a/osc_ingest_trino/trino_utils.py b/osc_ingest_trino/trino_utils.py index 10f9d71..5182cf2 100644 --- a/osc_ingest_trino/trino_utils.py +++ b/osc_ingest_trino/trino_utils.py @@ -1,3 +1,5 @@ +"""Trino interoperability functions.""" + import math import os import uuid @@ -32,6 +34,13 @@ def attach_trino_engine( schema: Optional[str] = None, verbose: Optional[bool] = False, ) -> Engine: + """Return a SQLAlchemy engine object representing a Trino instance. + + env_var_prefix -- a prefix for all environment variables related to the Trino instance. + catalog -- the Trino catalog. + schema -- the Trino schema. + verbose -- if True, print the full string used to connect. + """ sqlstring = "trino://{user}@{host}:{port}".format( user=os.environ[f"{env_var_prefix}_USER"], host=os.environ[f"{env_var_prefix}_HOST"], @@ -57,6 +66,12 @@ def attach_trino_engine( def _do_sql( sql: Union[sqlalchemy.sql.elements.TextClause, str], engine: Engine, verbose: bool = False ) -> Optional[Sequence[Row[Any]]]: + """Execute SQL query, returning the query result. + + sql -- the SQL query. 
+ engine -- the SQLAlchemy engine representing the Trino database. + verbose -- if True, print the values returned from executing the string. + """ if type(sql) is not sqlalchemy.sql.elements.TextClause: sql = text(str(sql)) if verbose: @@ -86,6 +101,22 @@ def fast_pandas_ingest_via_hive( # noqa: C901 colmap: Dict[str, str] = {}, verbose: bool = False, ) -> None: + """Efficiently export a dataframe into a Trino database. + + df -- the dataframe to export. + engine -- the SQLAlchemy engine representing the Trino database. + catalog -- the Trino catalog. + schema -- the Trino schema. + table -- the name of the table created in the schema. + hive_bucket -- the backing store of the Hive metastore. + hive_catalog -- the Hive metastore catalog (where schemas are created). + hive_schema -- the Hive metastore schema (where tables will be created). + partition_columns -- if not empty, defines the partition columns of the table created. + overwrite -- if True, an existing table will be overwritten. + typemap -- used to format types that cannot otherwise be properly inferred. + colmap -- used to format column names that cannot otherwise be properly inferred. + verbose -- if True, print the queries being executed and the results of those queries. 
+ """ uh8 = uuid.uuid4().hex[:8] hive_table = f"ingest_temp_{uh8}" @@ -157,6 +188,8 @@ def fast_pandas_ingest_via_hive( # noqa: C901 class TrinoBatchInsert(object): + """A class used to bundle together basic Trino parameters.""" + def __init__( self, catalog: Optional[str] = None, @@ -165,6 +198,7 @@ def __init__( optimize: bool = False, verbose: bool = False, ): + """Initialize TrinoBatchInsert objects.""" self.catalog = catalog self.schema = schema self.batch_size = batch_size @@ -175,6 +209,7 @@ def __init__( # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html # https://pandas.pydata.org/docs/user_guide/io.html#io-sql-method def __call__(self, sqltbl: Table, dbcxn: Connection, columns: List[str], data_iter: List[Tuple]) -> None: + """Implement `callable` interface for row-by-row insertion.""" fqname = self._full_table_name(sqltbl) batch: List[str] = [] self.ninserts = 0 @@ -201,6 +236,7 @@ def __call__(self, sqltbl: Table, dbcxn: Connection, columns: List[str], data_it print(f"execute optimize: {x}") def _do_insert(self, dbcxn: Connection, fqname: str, batch: List[str]) -> None: + """Implement actual row-by-row insertion of BATCH data into table FQNAME using DBCXN database connection.""" if self.verbose: print(f"inserting {len(batch)} records") TrinoBatchInsert._print_batch(batch) @@ -218,6 +254,7 @@ def _do_insert(self, dbcxn: Connection, fqname: str, batch: List[str]) -> None: print(f"batch insert result: {x}") def _full_table_name(self, sqltbl: Table) -> str: + """Return fully qualified table name for SQLTBL table within this TrinoBatchInsert object.""" # start with table name name: str = f"{sqltbl.name}" # prepend schema - allow override from this class @@ -231,6 +268,7 @@ def _full_table_name(self, sqltbl: Table) -> str: @staticmethod def _sqlform(x: Any) -> str: + """Format the value of x so it can appear in a SQL Values context.""" if x is None: return "NULL" if isinstance(x, str): @@ -254,6 +292,7 @@ def _sqlform(x: Any) -> str: 
@staticmethod def _print_batch(batch: List[str]) -> None: + """For batch, a list of SQL query lines, print up to the first 5 such.""" if len(batch) > 5: print("\n".join(f" {e}" for e in batch[:3])) print(" ...") diff --git a/osc_ingest_trino/unmanaged/__init__.py b/osc_ingest_trino/unmanaged/__init__.py index 09762a0..b0647bf 100644 --- a/osc_ingest_trino/unmanaged/__init__.py +++ b/osc_ingest_trino/unmanaged/__init__.py @@ -1,3 +1,5 @@ +"""Functions to create and clean up unmanaged Hive tables.""" + from .unmanaged_hive_ingest import ( drop_unmanaged_data, drop_unmanaged_table, diff --git a/osc_ingest_trino/unmanaged/unmanaged_hive_ingest.py b/osc_ingest_trino/unmanaged/unmanaged_hive_ingest.py index 852505f..ffc2ffa 100644 --- a/osc_ingest_trino/unmanaged/unmanaged_hive_ingest.py +++ b/osc_ingest_trino/unmanaged/unmanaged_hive_ingest.py @@ -1,3 +1,5 @@ +"""Functions to create, ingest, and drop unmanaged Hive tables.""" + import shutil import uuid @@ -17,6 +19,7 @@ def _remove_trailing_slash(s): + """Remove trailing slash from s.""" s = str(s) if len(s) == 0: return s @@ -26,10 +29,12 @@ def _remove_trailing_slash(s): def _prefix(pfx, schema, table): + """Translate pfx, schema, and table names into S3 bucket name.""" return _remove_trailing_slash(pfx).format(schema=schema, table=table) def drop_unmanaged_table(catalog, schema, table, engine, bucket, prefix=_default_prefix, verbose=False): + """Drop catalog.schema.table from Hive metastore and also delete its S3 backing store.""" sql = text(f"drop table if exists {catalog}.{schema}.{table}") with engine.begin() as cxn: qres = cxn.execute(sql) @@ -40,6 +45,7 @@ def drop_unmanaged_table(catalog, schema, table, engine, bucket, prefix=_default def drop_unmanaged_data(schema, table, bucket, prefix=_default_prefix, verbose=False): + """Delete data that may have been orphaned when its table was dropped in Hive metastore.""" dres = bucket.objects.filter(Prefix=f"{_prefix(prefix, schema, table)}/").delete() if verbose: 
print(dres) @@ -49,6 +55,7 @@ def drop_unmanaged_data(schema, table, bucket, prefix=_default_prefix, verbose=F def ingest_unmanaged_parquet( df, schema, table, bucket, partition_columns=[], append=True, workdir="/tmp", prefix=_default_prefix, verbose=False ): + """Ingest data from df into Hive metastore table with backing store bucket.""" if not isinstance(df, pd.DataFrame): raise ValueError("df must be a pandas DataFrame") if not isinstance(partition_columns, list): @@ -83,6 +90,7 @@ def ingest_unmanaged_parquet( def unmanaged_parquet_tabledef( df, catalog, schema, table, bucket, partition_columns=[], typemap={}, colmap={}, verbose=False ): + """Return a SQL string that would create a table suitable for ingesting df into Hive metastore backed by bucket.""" if not isinstance(df, pd.DataFrame): raise ValueError("df must be a pandas DataFrame") if not isinstance(partition_columns, list): diff --git a/pdm.lock b/pdm.lock index 4ace794..4f379f0 100644 --- a/pdm.lock +++ b/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "dev", "docs", "lint", "test", "tox"] strategy = ["cross_platform", "inherit_metadata"] lock_version = "4.4.1" -content_hash = "sha256:c1c9f059f936bec2b9f01f18c2019afb47acf240a23d5bdf76956f6c8f8d7a1f" +content_hash = "sha256:0d759e19350107e0786640c7eb1f10c4d95fd05a77564f57218bc2011ec94a43" [[package]] name = "alabaster" @@ -57,25 +57,35 @@ files = [ {file = "blinker-1.8.1.tar.gz", hash = "sha256:da44ec748222dcd0105ef975eed946da197d5bdf8bafb6aa92f5bc89da63fa25"}, ] +[[package]] +name = "boto" +version = "2.49.0" +summary = "Amazon Web Services Library" +groups = ["default"] +files = [ + {file = "boto-2.49.0-py2.py3-none-any.whl", hash = "sha256:147758d41ae7240dc989f0039f27da8ca0d53734be0eb869ef16e3adcfa462e8"}, + {file = "boto-2.49.0.tar.gz", hash = "sha256:ea0d3b40a2d852767be77ca343b58a9e3a4b00d9db440efb8da74b4e58025e5a"}, +] + [[package]] name = "boto3" -version = "1.34.93" +version = "1.34.96" requires_python = ">=3.8" summary = "The AWS SDK for Python" 
groups = ["default"] dependencies = [ - "botocore<1.35.0,>=1.34.93", + "botocore<1.35.0,>=1.34.96", "jmespath<2.0.0,>=0.7.1", "s3transfer<0.11.0,>=0.10.0", ] files = [ - {file = "boto3-1.34.93-py3-none-any.whl", hash = "sha256:b59355bf4a1408563969526f314611dbeacc151cf90ecb22af295dcc4fe18def"}, - {file = "boto3-1.34.93.tar.gz", hash = "sha256:e39516e4ca21612932599819662759c04485d53ca457996a913163da11f052a4"}, + {file = "boto3-1.34.96-py3-none-any.whl", hash = "sha256:fe3d039631074a96374a354764641b6623036b6ea15381b8a04ac0a193b8c1e0"}, + {file = "boto3-1.34.96.tar.gz", hash = "sha256:42ea7d46688e7cb27259780b9da2cddcfaf2763ff5d327f4d54eac12edba8e72"}, ] [[package]] name = "botocore" -version = "1.34.93" +version = "1.34.96" requires_python = ">=3.8" summary = "Low-level, data-driven core of boto 3." groups = ["default"] @@ -86,8 +96,8 @@ dependencies = [ "urllib3<1.27,>=1.25.4; python_version < \"3.10\"", ] files = [ - {file = "botocore-1.34.93-py3-none-any.whl", hash = "sha256:6fbd5a53a2adc9b3d4ebd90ae0ede83a91a41d96231f8a5984051f75495f246d"}, - {file = "botocore-1.34.93.tar.gz", hash = "sha256:79d39b0b87e962991c6dd55e78ce15155099f6fb741be88b1b8a456a702cc150"}, + {file = "botocore-1.34.96-py3-none-any.whl", hash = "sha256:4c307f5772286f1ab58a91220ea8e180416a2ea0cc7e76983a6984e4ef8c212d"}, + {file = "botocore-1.34.96.tar.gz", hash = "sha256:00e917cd8152d902a4771b9e1e4d0cf1ee096c90027ee35f2a76b6d394e2ada5"}, ] [[package]] @@ -117,8 +127,7 @@ name = "cffi" version = "1.16.0" requires_python = ">=3.8" summary = "Foreign Function Interface for Python calling C code." 
-groups = ["test"] -marker = "platform_python_implementation == \"PyPy\"" +groups = ["default", "test"] dependencies = [ "pycparser", ] @@ -380,6 +389,50 @@ files = [ {file = "coverage-7.5.0.tar.gz", hash = "sha256:cf62d17310f34084c59c01e027259076479128d11e4661bb6c9acb38c5e19bb8"}, ] +[[package]] +name = "cryptography" +version = "42.0.5" +requires_python = ">=3.7" +summary = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." +groups = ["default"] +dependencies = [ + "cffi>=1.12; platform_python_implementation != \"PyPy\"", +] +files = [ + {file = "cryptography-42.0.5-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:a30596bae9403a342c978fb47d9b0ee277699fa53bbafad14706af51fe543d16"}, + {file = "cryptography-42.0.5-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:b7ffe927ee6531c78f81aa17e684e2ff617daeba7f189f911065b2ea2d526dec"}, + {file = "cryptography-42.0.5-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2424ff4c4ac7f6b8177b53c17ed5d8fa74ae5955656867f5a8affaca36a27abb"}, + {file = "cryptography-42.0.5-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:329906dcc7b20ff3cad13c069a78124ed8247adcac44b10bea1130e36caae0b4"}, + {file = "cryptography-42.0.5-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:b03c2ae5d2f0fc05f9a2c0c997e1bc18c8229f392234e8a0194f202169ccd278"}, + {file = "cryptography-42.0.5-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f8837fe1d6ac4a8052a9a8ddab256bc006242696f03368a4009be7ee3075cdb7"}, + {file = "cryptography-42.0.5-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:0270572b8bd2c833c3981724b8ee9747b3ec96f699a9665470018594301439ee"}, + {file = "cryptography-42.0.5-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:b8cac287fafc4ad485b8a9b67d0ee80c66bf3574f655d3b97ef2e1082360faf1"}, + {file = "cryptography-42.0.5-cp37-abi3-musllinux_1_2_aarch64.whl", hash = 
"sha256:16a48c23a62a2f4a285699dba2e4ff2d1cff3115b9df052cdd976a18856d8e3d"}, + {file = "cryptography-42.0.5-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:2bce03af1ce5a5567ab89bd90d11e7bbdff56b8af3acbbec1faded8f44cb06da"}, + {file = "cryptography-42.0.5-cp37-abi3-win32.whl", hash = "sha256:b6cd2203306b63e41acdf39aa93b86fb566049aeb6dc489b70e34bcd07adca74"}, + {file = "cryptography-42.0.5-cp37-abi3-win_amd64.whl", hash = "sha256:98d8dc6d012b82287f2c3d26ce1d2dd130ec200c8679b6213b3c73c08b2b7940"}, + {file = "cryptography-42.0.5-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:5e6275c09d2badf57aea3afa80d975444f4be8d3bc58f7f80d2a484c6f9485c8"}, + {file = "cryptography-42.0.5-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4985a790f921508f36f81831817cbc03b102d643b5fcb81cd33df3fa291a1a1"}, + {file = "cryptography-42.0.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7cde5f38e614f55e28d831754e8a3bacf9ace5d1566235e39d91b35502d6936e"}, + {file = "cryptography-42.0.5-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:7367d7b2eca6513681127ebad53b2582911d1736dc2ffc19f2c3ae49997496bc"}, + {file = "cryptography-42.0.5-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:cd2030f6650c089aeb304cf093f3244d34745ce0cfcc39f20c6fbfe030102e2a"}, + {file = "cryptography-42.0.5-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:a2913c5375154b6ef2e91c10b5720ea6e21007412f6437504ffea2109b5a33d7"}, + {file = "cryptography-42.0.5-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:c41fb5e6a5fe9ebcd58ca3abfeb51dffb5d83d6775405305bfa8715b76521922"}, + {file = "cryptography-42.0.5-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:3eaafe47ec0d0ffcc9349e1708be2aaea4c6dd4978d76bf6eb0cb2c13636c6fc"}, + {file = "cryptography-42.0.5-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1b95b98b0d2af784078fa69f637135e3c317091b615cd0905f8b8a087e86fa30"}, + {file = "cryptography-42.0.5-cp39-abi3-win32.whl", hash = 
"sha256:1f71c10d1e88467126f0efd484bd44bca5e14c664ec2ede64c32f20875c0d413"}, + {file = "cryptography-42.0.5-cp39-abi3-win_amd64.whl", hash = "sha256:a011a644f6d7d03736214d38832e030d8268bcff4a41f728e6030325fea3e400"}, + {file = "cryptography-42.0.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:9481ffe3cf013b71b2428b905c4f7a9a4f76ec03065b05ff499bb5682a8d9ad8"}, + {file = "cryptography-42.0.5-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:ba334e6e4b1d92442b75ddacc615c5476d4ad55cc29b15d590cc6b86efa487e2"}, + {file = "cryptography-42.0.5-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ba3e4a42397c25b7ff88cdec6e2a16c2be18720f317506ee25210f6d31925f9c"}, + {file = "cryptography-42.0.5-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:111a0d8553afcf8eb02a4fea6ca4f59d48ddb34497aa8706a6cf536f1a5ec576"}, + {file = "cryptography-42.0.5-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:cd65d75953847815962c84a4654a84850b2bb4aed3f26fadcc1c13892e1e29f6"}, + {file = "cryptography-42.0.5-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:e807b3188f9eb0eaa7bbb579b462c5ace579f1cedb28107ce8b48a9f7ad3679e"}, + {file = "cryptography-42.0.5-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f12764b8fffc7a123f641d7d049d382b73f96a34117e0b637b80643169cec8ac"}, + {file = "cryptography-42.0.5-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:37dd623507659e08be98eec89323469e8c7b4c1407c85112634ae3dbdb926fdd"}, + {file = "cryptography-42.0.5.tar.gz", hash = "sha256:6fe07eec95dfd477eb9530aef5bead34fec819b3aaf6c5bd6d20565da607bfe1"}, +] + [[package]] name = "dep-logic" version = "0.2.0" @@ -429,13 +482,13 @@ files = [ [[package]] name = "filelock" -version = "3.13.4" +version = "3.14.0" requires_python = ">=3.8" summary = "A platform independent file lock." 
groups = ["dev", "lint", "test", "tox"] files = [ - {file = "filelock-3.13.4-py3-none-any.whl", hash = "sha256:404e5e9253aa60ad457cae1be07c0f0ca90a63931200a47d9b6a6af84fd7b45f"}, - {file = "filelock-3.13.4.tar.gz", hash = "sha256:d13f466618bfde72bd2c18255e269f72542c6e70e7bac83a0232d6b1cc5c8cf4"}, + {file = "filelock-3.14.0-py3-none-any.whl", hash = "sha256:43339835842f110ca7ae60f1e1c160714c5a6afd15a2873419ab185334975c0f"}, + {file = "filelock-3.14.0.tar.gz", hash = "sha256:6ea72da3be9b8c82afd3edcf99f2fffbb5076335a5ae4d03248bb5b6c3eae78a"}, ] [[package]] @@ -650,7 +703,7 @@ name = "jinja2" version = "3.1.3" requires_python = ">=3.7" summary = "A very fast and expressive template engine." -groups = ["docs"] +groups = ["default", "docs"] dependencies = [ "MarkupSafe>=2.0", ] @@ -689,7 +742,7 @@ name = "markupsafe" version = "2.1.5" requires_python = ">=3.7" summary = "Safely add untrusted strings to HTML/XML markup." -groups = ["docs"] +groups = ["default", "docs"] files = [ {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc"}, {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5"}, @@ -745,6 +798,45 @@ files = [ {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, ] +[[package]] +name = "moto" +version = "5.0.6" +requires_python = ">=3.8" +summary = "" +groups = ["default"] +dependencies = [ + "Jinja2>=2.10.1", + "boto3>=1.9.201", + "botocore>=1.14.0", + "cryptography>=3.3.1", + "python-dateutil<3.0.0,>=2.1", + "requests>=2.5", + "responses>=0.15.0", + "werkzeug!=2.2.0,!=2.2.1,>=0.5", + "xmltodict", +] +files = [ + {file = "moto-5.0.6-py2.py3-none-any.whl", hash = "sha256:ca1e22831a741733b581ff2ef4d6ae2e1c6db1eab97af1b78b86ca2c6e88c609"}, + {file = "moto-5.0.6.tar.gz", hash = 
"sha256:ad8b23f2b555ad694da8b2432a42b6d96beaaf67a4e7d932196a72193a2eee2c"}, +] + +[[package]] +name = "moto" +version = "5.0.6" +extras = ["s3"] +requires_python = ">=3.8" +summary = "" +groups = ["default"] +dependencies = [ + "PyYAML>=5.1", + "moto==5.0.6", + "py-partiql-parser==0.5.4", +] +files = [ + {file = "moto-5.0.6-py2.py3-none-any.whl", hash = "sha256:ca1e22831a741733b581ff2ef4d6ae2e1c6db1eab97af1b78b86ca2c6e88c609"}, + {file = "moto-5.0.6.tar.gz", hash = "sha256:ad8b23f2b555ad694da8b2432a42b6d96beaaf67a4e7d932196a72193a2eee2c"}, +] + [[package]] name = "msgpack" version = "1.0.8" @@ -799,6 +891,20 @@ files = [ {file = "msgpack-1.0.8.tar.gz", hash = "sha256:95c02b0e27e706e48d0e5426d1710ca78e0f0628d6e89d5b5a5b91a5f12274f3"}, ] +[[package]] +name = "mypy-boto3-s3" +version = "1.34.91" +requires_python = ">=3.8" +summary = "Type annotations for boto3.S3 1.34.91 service generated with mypy-boto3-builder 7.24.0" +groups = ["default"] +dependencies = [ + "typing-extensions>=4.1.0; python_version < \"3.12\"", +] +files = [ + {file = "mypy_boto3_s3-1.34.91-py3-none-any.whl", hash = "sha256:0d37161fd0cd7ebf194cf9ccadb9101bf5c9b2426c2d00677b7e644d6f2298e4"}, + {file = "mypy_boto3_s3-1.34.91.tar.gz", hash = "sha256:70c8bad00db70704fb7ac0ee1440c7eb0587578ae9a2b00997f29f17f60f45e7"}, +] + [[package]] name = "nodeenv" version = "1.8.0" @@ -1035,13 +1141,22 @@ files = [ {file = "pre_commit-3.7.0.tar.gz", hash = "sha256:e209d61b8acdcf742404408531f0c37d49d2c734fd7cff2d6076083d191cb060"}, ] +[[package]] +name = "py-partiql-parser" +version = "0.5.4" +summary = "Pure Python PartiQL Parser" +groups = ["default"] +files = [ + {file = "py_partiql_parser-0.5.4-py2.py3-none-any.whl", hash = "sha256:3dc4295a47da9587681a96b35c6e151886fdbd0a4acbe0d97c4c68e5f689d315"}, + {file = "py_partiql_parser-0.5.4.tar.gz", hash = "sha256:72e043919538fa63edae72fb59afc7e3fd93adbde656718a7d2b4666f23dd114"}, +] + [[package]] name = "pycparser" version = "2.22" requires_python = ">=3.8" summary = 
"C parser in Python" -groups = ["test"] -marker = "platform_python_implementation == \"PyPy\"" +groups = ["default", "test"] files = [ {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, @@ -1172,7 +1287,7 @@ name = "pyyaml" version = "6.0.1" requires_python = ">=3.6" summary = "YAML parser and emitter for Python" -groups = ["lint"] +groups = ["default", "lint"] files = [ {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"}, {file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"}, @@ -1235,6 +1350,22 @@ files = [ {file = "resolvelib-1.0.1.tar.gz", hash = "sha256:04ce76cbd63fded2078ce224785da6ecd42b9564b1390793f64ddecbe997b309"}, ] +[[package]] +name = "responses" +version = "0.25.0" +requires_python = ">=3.8" +summary = "A utility library for mocking out the `requests` Python library." 
+groups = ["default"] +dependencies = [ + "pyyaml", + "requests<3.0,>=2.30.0", + "urllib3<3.0,>=1.25.10", +] +files = [ + {file = "responses-0.25.0-py3-none-any.whl", hash = "sha256:2f0b9c2b6437db4b528619a77e5d565e4ec2a9532162ac1a131a83529db7be1a"}, + {file = "responses-0.25.0.tar.gz", hash = "sha256:01ae6a02b4f34e39bffceb0fc6786b67a25eae919c6368d05eabc8d9576c2a66"}, +] + [[package]] name = "rich" version = "13.7.1" @@ -1583,14 +1714,14 @@ files = [ [[package]] name = "truststore" -version = "0.8.0" -requires_python = ">= 3.10" +version = "0.9.0" +requires_python = ">=3.10" summary = "Verify certificates using native system trust stores" groups = ["test"] marker = "python_version >= \"3.10\"" files = [ - {file = "truststore-0.8.0-py3-none-any.whl", hash = "sha256:e37a5642ae9fc48caa8f120b6283d77225d600d224965a672c9e8ef49ce4bb4c"}, - {file = "truststore-0.8.0.tar.gz", hash = "sha256:dc70da89634944a579bfeec70a7a4523c53ffdb3cf52d1bb4a431fda278ddb96"}, + {file = "truststore-0.9.0-py3-none-any.whl", hash = "sha256:87ec7718ae0c0c9f100a040b86a8a3c93e258fb03e31bd3a8cc45948de2a3805"}, + {file = "truststore-0.9.0.tar.gz", hash = "sha256:8876ce1ece1187f523d1ac0c975aa91cf2320c6cd7f20c3a35a7811f49ec2e37"}, ] [[package]] @@ -1657,7 +1788,7 @@ files = [ [[package]] name = "virtualenv" -version = "20.26.0" +version = "20.26.1" requires_python = ">=3.7" summary = "Virtual Python Environment builder" groups = ["dev", "lint", "test", "tox"] @@ -1667,8 +1798,33 @@ dependencies = [ "platformdirs<5,>=3.9.1", ] files = [ - {file = "virtualenv-20.26.0-py3-none-any.whl", hash = "sha256:0846377ea76e818daaa3e00a4365c018bc3ac9760cbb3544de542885aad61fb3"}, - {file = "virtualenv-20.26.0.tar.gz", hash = "sha256:ec25a9671a5102c8d2657f62792a27b48f016664c6873f6beed3800008577210"}, + {file = "virtualenv-20.26.1-py3-none-any.whl", hash = "sha256:7aa9982a728ae5892558bff6a2839c00b9ed145523ece2274fad6f414690ae75"}, + {file = "virtualenv-20.26.1.tar.gz", hash = 
"sha256:604bfdceaeece392802e6ae48e69cec49168b9c5f4a44e483963f9242eb0e78b"}, +] + +[[package]] +name = "werkzeug" +version = "3.0.2" +requires_python = ">=3.8" +summary = "The comprehensive WSGI web application library." +groups = ["default"] +dependencies = [ + "MarkupSafe>=2.1.1", +] +files = [ + {file = "werkzeug-3.0.2-py3-none-any.whl", hash = "sha256:3aac3f5da756f93030740bc235d3e09449efcf65f2f55e3602e1d851b8f48795"}, + {file = "werkzeug-3.0.2.tar.gz", hash = "sha256:e39b645a6ac92822588e7b39a692e7828724ceae0b0d702ef96701f90e70128d"}, +] + +[[package]] +name = "xmltodict" +version = "0.13.0" +requires_python = ">=3.4" +summary = "Makes working with XML feel like you are working with JSON" +groups = ["default"] +files = [ + {file = "xmltodict-0.13.0-py2.py3-none-any.whl", hash = "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852"}, + {file = "xmltodict-0.13.0.tar.gz", hash = "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56"}, ] [[package]] diff --git a/pyproject.toml b/pyproject.toml index ecb5a5c..20c7946 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,11 +29,14 @@ classifiers = [ ] dependencies = [ - "trino[sqlalchemy]>=0.323", - "sqlalchemy>=2.0", - "pandas", - "python-dotenv", - "boto3" + "trino[sqlalchemy]>=0.323", + "sqlalchemy>=2.0", + "pandas", + "python-dotenv", + "boto3", + "moto[s3]>=5.0.6", + "boto>=2.49.0", + "mypy-boto3-s3>=1.34.91", ] [project.urls] @@ -66,7 +69,7 @@ lint = ["pre-commit"] [tool.pytest.ini_options] testpaths = "tests/" -addopts = "-v --cov --cov-report html --cov-report term-missing --cov-fail-under 70" +addopts = "-v --cov --cov-report html --cov-report term-missing --cov-fail-under 50" [tool.black] line-length = 120 @@ -90,7 +93,7 @@ warn_unreachable = true ignore_missing_imports = true [tool.coverage.run] -source = ["src"] +source = ["osc_ingest_trino"] omit = ["tests/*"] relative_files = true diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 
0000000..f2a49c7 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Empty file to allow pytest-cov to collect data.""" diff --git a/tests/test_trino_utils.py b/tests/test_trino_utils.py index e750cd0..4a85e95 100644 --- a/tests/test_trino_utils.py +++ b/tests/test_trino_utils.py @@ -1,8 +1,44 @@ import math from datetime import datetime from unittest import mock +from unittest.mock import patch -from osc_ingest_trino import TrinoBatchInsert, attach_trino_engine +import boto3 +import botocore +import moto +import pytest +from botocore.exceptions import ClientError +from moto import mock_aws + +from osc_ingest_trino import ( + TrinoBatchInsert, + attach_trino_engine, + pandas_type_to_sql, + sql_compliant_name, +) +from osc_ingest_trino.unmanaged import unmanaged_parquet_tabledef + +# from os_bucket import OSBucket + + +class MyModel: + def __init__(self, name, value): + self.name = name + self.value = value + + def save(self): + s3 = boto3.client("s3", region_name="us-east-1") + s3.put_object(Bucket="mybucket", Key=self.name, Body=self.value) + + +def test_sql_compliant_name(): + foo_name = sql_compliant_name("foo") + assert foo_name == "foo" + + +def test_pandas_type_to_sql(): + integer_type = pandas_type_to_sql("int32") + assert integer_type == "integer" @mock.patch("osc_ingest_trino.trino_utils.trino.auth.JWTAuthentication") @@ -25,6 +61,9 @@ def test_attach_trino_engine(mock_engine, mock_trino_auth, monkeypatch): ) +# from os_bucket import OSBucket + + def test_trino_batch_insert(): # mock up an sqlalchemy table tbl = mock.MagicMock() @@ -70,3 +109,18 @@ def test_trino_pandas_insert(): index=False, method=TrinoBatchInsert(batch_size=5, verbose=True), ) + + +@mock_aws +def test_unmanaged_parquet_tabledef(): + import pandas as pd + + df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}).convert_dtypes() + + conn = boto3.resource("s3", region_name="us-east-1") + # We need to create the bucket since this is all in Moto's 'virtual' AWS account + bucket = 
conn.Bucket("mybucket") + bucket.create() + + tabledef = unmanaged_parquet_tabledef(df, "catalog", "schema", "table", bucket, partition_columns=["a", "b"]) + print(tabledef)