Commit
* add several new utility functions
* normalize version
* fixing bugs from RCs
* 0.2.0
* add prefix param
* 0.2.0
1 parent 1576952 · commit 0167dcb
Showing 7 changed files with 189 additions and 7 deletions.
@@ -0,0 +1,29 @@
import os
import boto3

__all__ = [
    "upload_directory_to_s3",
    "attach_s3_bucket",
]

def upload_directory_to_s3(path, bucket, prefix, verbose=False):
    path = str(path)
    prefix = str(prefix)
    for subdir, dirs, files in os.walk(path):
        for f in files:
            # map the local directory prefix onto the S3 key prefix
            pfx = subdir.replace(path, prefix)
            src = os.path.join(subdir, f)
            dst = os.path.join(pfx, f)
            if verbose:
                print(f'{src} --> {dst}')
            bucket.upload_file(src, dst)

def attach_s3_bucket(env_var_prefix):
    # build an S3 resource from connection parameters held in environment
    # variables named <env_var_prefix>_ENDPOINT, _ACCESS_KEY, _SECRET_KEY
    s3 = boto3.resource(
        service_name="s3",
        endpoint_url=os.environ[f"{env_var_prefix}_ENDPOINT"],
        aws_access_key_id=os.environ[f"{env_var_prefix}_ACCESS_KEY"],
        aws_secret_access_key=os.environ[f"{env_var_prefix}_SECRET_KEY"],
    )
    return s3.Bucket(os.environ[f"{env_var_prefix}_BUCKET"])
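A minimal usage sketch for the two helpers above, assuming the connection details live in environment variables under a hypothetical "MYS3" prefix; the local directory and key prefix are likewise placeholders:

    # hypothetical: requires MYS3_ENDPOINT, MYS3_ACCESS_KEY,
    # MYS3_SECRET_KEY, and MYS3_BUCKET to be set in the environment
    bucket = attach_s3_bucket("MYS3")
    # mirror ./data into the bucket under the key prefix 'raw/data'
    upload_directory_to_s3("./data", bucket, "raw/data", verbose=True)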
@@ -0,0 +1,16 @@
import os
import pathlib
from dotenv import load_dotenv

__all__ = [
    "load_credentials_dotenv",
]

def load_credentials_dotenv():
    # Load some standard environment variables from a dot-env file, if it exists.
    # If no such file can be found, this does not fail, which allows the same
    # environment variables to be populated in some other way.
    dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', os.environ.get('PWD', '/opt/app-root/src'))
    dotenv_path = pathlib.Path(dotenv_dir) / 'credentials.env'
    if os.path.exists(dotenv_path):
        load_dotenv(dotenv_path=dotenv_path, override=True)
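A sketch of the intended call pattern, assuming a credentials.env file sits in the directory named by CREDENTIAL_DOTENV_DIR (the file contents shown are hypothetical):

    # credentials.env might contain, for example:
    #   TRINO_USER=jdoe
    #   TRINO_PASSWD=<jwt token>
    load_credentials_dotenv()  # silently does nothing if credentials.env is absent
    # downstream helpers can now read os.environ['TRINO_USER'], etc.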
@@ -0,0 +1,110 @@
import os
import shutil
import uuid

import trino
import pandas as pd
from sqlalchemy.engine import create_engine

from .boto3_utils import upload_directory_to_s3
from .sqltypes import create_table_schema_pairs

__all__ = [
    "attach_trino_engine",
    "drop_unmanaged_table",
    "drop_unmanaged_data",
    "ingest_unmanaged_parquet",
    "unmanaged_parquet_tabledef",
]

# default S3 key prefix for the backing data of unmanaged (external) tables
_default_prefix = 'trino/{schema}/{table}'

def _remove_trailing_slash(s):
    s = str(s)
    if len(s) == 0: return s
    if (s[-1] != '/'): return s
    return _remove_trailing_slash(s[:-1])

def _prefix(pfx, schema, table):
    return _remove_trailing_slash(pfx).format(schema=schema, table=table)

def attach_trino_engine(env_var_prefix='TRINO'):
    sqlstring = 'trino://{user}@{host}:{port}/'.format(
        user=os.environ[f'{env_var_prefix}_USER'],
        host=os.environ[f'{env_var_prefix}_HOST'],
        port=os.environ[f'{env_var_prefix}_PORT']
    )
    sqlargs = {
        'auth': trino.auth.JWTAuthentication(os.environ[f'{env_var_prefix}_PASSWD']),
        'http_scheme': 'https'
    }
    engine = create_engine(sqlstring, connect_args=sqlargs)
    # connect eagerly so that bad credentials or endpoints fail here, not later
    engine.connect()
    return engine

def drop_unmanaged_table(catalog, schema, table, engine, bucket, prefix=_default_prefix, verbose=False):
    # drop the table in trino, then delete its backing objects from S3
    sql = f'drop table if exists {catalog}.{schema}.{table}'
    qres = engine.execute(sql)
    dres = bucket.objects \
        .filter(Prefix=f'{_prefix(prefix, schema, table)}/') \
        .delete()
    if verbose:
        print(dres)
    return qres

def drop_unmanaged_data(schema, table, bucket, prefix=_default_prefix, verbose=False):
    # delete the backing objects from S3 without touching the table definition
    dres = bucket.objects \
        .filter(Prefix=f'{_prefix(prefix, schema, table)}/') \
        .delete()
    if verbose: print(dres)
    return dres

def ingest_unmanaged_parquet(df, schema, table, bucket, partition_columns=[], append=True, workdir='/tmp', prefix=_default_prefix, verbose=False):
    if not isinstance(df, pd.DataFrame):
        raise ValueError("df must be a pandas DataFrame")
    if not isinstance(partition_columns, list):
        raise ValueError("partition_columns must be a list of column names")

    s3pfx = _prefix(prefix, schema, table)

    if not append:
        dres = bucket.objects.filter(Prefix=f'{s3pfx}/').delete()
        if verbose: print(dres)

    if len(partition_columns) > 0:
        # tell pandas to write a directory tree, using partitions
        tmp = f'{workdir}/{table}'
        # pandas does not clean out the destination directory for you:
        shutil.rmtree(tmp, ignore_errors=True)
        df.to_parquet(tmp,
                      partition_cols=partition_columns,
                      index=False)
        # upload the tree onto S3
        upload_directory_to_s3(tmp, bucket, s3pfx, verbose=verbose)
    else:
        # do not use partitions: a single parquet file is created
        parquet = f'{uuid.uuid4().hex}.parquet'
        tmp = f'{workdir}/{parquet}'
        df.to_parquet(tmp, index=False)
        dst = f'{s3pfx}/{parquet}'
        if verbose: print(f'{tmp} --> {dst}')
        bucket.upload_file(tmp, dst)

def unmanaged_parquet_tabledef(df, catalog, schema, table, bucket, partition_columns=[], verbose=False):
    # generate a 'create table' statement declaring an external parquet table
    # over the data uploaded by ingest_unmanaged_parquet
    if not isinstance(df, pd.DataFrame):
        raise ValueError("df must be a pandas DataFrame")
    if not isinstance(partition_columns, list):
        raise ValueError("partition_columns must be a list of column names")

    columnschema = create_table_schema_pairs(df)

    tabledef = f"create table if not exists {catalog}.{schema}.{table} (\n"
    tabledef += f"{columnschema}\n"
    tabledef += ") with (\n format = 'parquet',\n"
    if len(partition_columns) > 0:
        tabledef += f" partitioned_by = array{partition_columns},\n"
    tabledef += f" external_location = 's3a://{bucket.name}/trino/{schema}/{table}/'\n)"

    if verbose: print(tabledef)
    return tabledef
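A hedged end-to-end sketch of how these pieces fit together: attach an engine and a bucket, push a DataFrame to S3 as parquet, then register the external table in Trino. The catalog, schema, and table names are hypothetical, and the bucket is assumed to come from attach_s3_bucket in boto3_utils above:

    import pandas as pd

    df = pd.DataFrame({"year": [2020, 2021], "value": [1.5, 2.5]})

    engine = attach_trino_engine()     # expects TRINO_USER/_HOST/_PORT/_PASSWD
    bucket = attach_s3_bucket("MYS3")  # hypothetical env var prefix

    # replace any previous copy of the data, then upload partitioned parquet
    drop_unmanaged_data("demo", "mytable", bucket, verbose=True)
    ingest_unmanaged_parquet(df, "demo", "mytable", bucket,
                             partition_columns=["year"], append=False)

    # declare the external table over the uploaded files
    sql = unmanaged_parquet_tabledef(df, "hive", "demo", "mytable", bucket,
                                     partition_columns=["year"])
    engine.execute(sql)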