From a521afc7e3aca0b6ca389052d7551cbe6fcec3c6 Mon Sep 17 00:00:00 2001 From: hellcassius Date: Tue, 27 Jun 2023 12:14:13 -0300 Subject: [PATCH 01/59] add stpl rdo/rho capture --- .../rj_smtr/br_rj_riodejaneiro_rdo/flows.py | 101 +++++++++++++++++- 1 file changed, 99 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py index 55132f4c3..6bc8c3804 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py @@ -97,7 +97,7 @@ ) with Flow( - "SMTR: RHO - Captura", + "SMTR: SPPO RHO - Captura", code_owners=["caio", "fernanda"], ) as captura_sppo_rho: # SETUP @@ -145,7 +145,7 @@ captura_sppo_rho.schedule = every_day with Flow( - "SMTR: RDO - Captura", + "SMTR: SPPO RDO - Captura", code_owners=["caio", "fernanda"], ) as captura_sppo_rdo: # SETUP @@ -193,6 +193,103 @@ captura_sppo_rdo.schedule = every_day +with Flow( + "SMTR: STPL RHO - Captura", + code_owners=["caio", "fernanda"], +) as captura_stpl_rho: + # SETUP + transport_mode = Parameter("transport_mode", "STPL") + report_type = Parameter("report_type", "RHO") + dump = Parameter("dump", False) + table_id = Parameter("table_id", constants.STPL_RHO_TABLE_ID.value) + materialize = Parameter("materialize", False) + + rename_run = rename_current_flow_run_now_time( + prefix=f"Captura FTP - {transport_mode.run()}-{report_type.run()} ", + now_time=get_current_timestamp(), + wait=None, + ) + # EXTRACT + files = get_file_paths_from_ftp( + transport_mode=transport_mode, report_type=report_type, dump=dump + ) + download_files = check_files_for_download( + files=files, dataset_id=constants.RDO_DATASET_ID.value, table_id=table_id + ) + updated_info = download_and_save_local_from_ftp.map(file_info=download_files) + # TRANSFORM + treated_path, raw_path, partitions, status = pre_treatment_br_rj_riodejaneiro_rdo( + files=updated_info + ) + # LOAD + errors = bq_upload.map( + dataset_id=unmapped(constants.RDO_DATASET_ID.value), + table_id=unmapped(table_id), + filepath=treated_path, + raw_filepath=raw_path, + partitions=partitions, + status=status, + ) + set_redis = update_rdo_redis( + download_files=download_files, table_id=table_id, errors=errors + ) + +captura_stpl_rho.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) +captura_stpl_rho.run_config = KubernetesRun( + image=emd_constants.DOCKER_IMAGE.value, + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], +) +captura_stpl_rho.schedule = every_day + +with Flow( + "SMTR: STPL RDO - Captura", + code_owners=["caio", "fernanda"], +) as captura_stpl_rdo: + # SETUP + transport_mode = Parameter("transport_mode", "STPL") + report_type = Parameter("report_type", "RDO") + dump = Parameter("dump", False) + table_id = Parameter("table_id", constants.STPL_RDO_TABLE_ID.value) + materialize = Parameter("materialize", False) + + rename_run = rename_current_flow_run_now_time( + prefix=f"Captura FTP - {transport_mode.run()}-{report_type.run()} ", + now_time=get_current_timestamp(), + wait=None, + ) + # EXTRACT + files = get_file_paths_from_ftp( + transport_mode=transport_mode, report_type=report_type, dump=dump + ) + download_files = check_files_for_download( + files=files, dataset_id=constants.RDO_DATASET_ID.value, table_id=table_id + ) + updated_info = download_and_save_local_from_ftp.map(file_info=download_files) + # TRANSFORM + treated_path, raw_path, partitions, status = pre_treatment_br_rj_riodejaneiro_rdo( + files=updated_info + ) + # LOAD + errors = bq_upload.map( + 
dataset_id=unmapped(constants.RDO_DATASET_ID.value), + table_id=unmapped(table_id), + filepath=treated_path, + raw_filepath=raw_path, + partitions=partitions, + status=status, + ) + set_redis = update_rdo_redis( + download_files=download_files, table_id=table_id, errors=errors + ) + +captura_stpl_rdo.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) +captura_stpl_rdo.run_config = KubernetesRun( + image=emd_constants.DOCKER_IMAGE.value, + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], +) +captura_stpl_rdo.schedule = every_day + + # captura_sppo_rho = deepcopy(captura_sppo_rdo) # captura_sppo_rho.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) # captura_sppo_rho.run_config = KubernetesRun(image=emd_constants.DOCKER_IMAGE.value) From df294ea8062452caab11b4d078fde26fb38e1271 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Tue, 27 Jun 2023 12:28:33 -0300 Subject: [PATCH 02/59] fix pylint set_redis_rdo_files --- pipelines/rj_smtr/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 907496863..a2ce4651b 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -210,8 +210,8 @@ def set_redis_rdo_files(redis_client, dataset_id: str, table_id: str): """ try: content = redis_client.get(f"{dataset_id}.{table_id}")["files"] - except (TypeError) as e: - log(f"Caught error {e}. Will set unexisting key") + except (TypeError) as error: + log(f"Caught error {error}. Will set unexisting key") # set key to empty dict for filling later redis_client.set(f"{dataset_id}.{table_id}", {"files": []}) content = redis_client.get(f"{dataset_id}.{table_id}") From 87fc63906cf4f6276ed335fb6c90680880c6e253 Mon Sep 17 00:00:00 2001 From: hellcassius Date: Tue, 27 Jun 2023 13:30:18 -0300 Subject: [PATCH 03/59] minor fix on error logging --- pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py index af27523c0..987bb8abd 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py @@ -239,7 +239,7 @@ def update_rdo_redis( download_files: list, table_id: str, dataset_id: str = constants.RDO_DATASET_ID.value, - errors=None, + errors: list = None, wait=None, # pylint: disable=W0613 ): """ @@ -260,7 +260,7 @@ def update_rdo_redis( redis_client = get_redis_client() content = redis_client.get(key) # get current redis state if errors: - log(f"Received errors:\n {errors}") + log(f"Received {len(errors)} errors:\n {errors[:10]}\n...") merge_file_info_and_errors(download_files, errors) log(f"content is:\n{content['files'][:5]}") insert_content = [ From cf0067f5e50e70498bd3b1a429937646f9f4d8df Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 6 Jul 2023 16:48:17 -0300 Subject: [PATCH 04/59] Revert "chore: upgrade prefect to 1.4" This reverts commit 857d23f3c24bbd6b8dc5da424eddfaf62249ccff. --- poetry.lock | 415 ++++++++++++++++++++++++++++++++++------ pyproject.toml | 2 +- requirements-deploy.txt | 2 +- requirements-test.txt | 2 +- 4 files changed, 363 insertions(+), 58 deletions(-) diff --git a/poetry.lock b/poetry.lock index 44c88b88d..30de91281 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,9 +1,10 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. 
[[package]] name = "adal" version = "1.2.7" description = "Note: This library is already replaced by MSAL Python, available here: https://pypi.org/project/msal/ .ADAL Python remains available here as a legacy. The ADAL for Python library makes it easy for python application to authenticate to Azure Active Directory (AAD) in order to access AAD protected web resources." +category = "main" optional = false python-versions = "*" files = [ @@ -21,6 +22,7 @@ requests = ">=2.0.0,<3" name = "affine" version = "2.3.1" description = "Matrices describing affine transformation of the plane." +category = "main" optional = false python-versions = "*" files = [ @@ -35,6 +37,7 @@ test = ["coveralls", "flake8", "pydocstyle", "pytest (>=4.6)", "pytest-cov"] name = "aiobotocore" version = "2.4.0" description = "Async client for aws services using botocore and aiohttp" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -56,6 +59,7 @@ boto3 = ["boto3 (>=1.24.59,<1.24.60)"] name = "aiohttp" version = "3.8.3" description = "Async http client/server framework (asyncio)" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -164,6 +168,7 @@ speedups = ["Brotli", "aiodns", "cchardet"] name = "aioitertools" version = "0.11.0" description = "itertools and builtins for AsyncIO and mixed iterables" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -178,6 +183,7 @@ typing_extensions = {version = ">=4.0", markers = "python_version < \"3.10\""} name = "aiokafka" version = "0.7.2" description = "Kafka integration with asyncio." +category = "main" optional = false python-versions = "*" files = [ @@ -214,6 +220,7 @@ snappy = ["python-snappy (>=0.5)"] name = "aiosignal" version = "1.2.0" description = "aiosignal: a list of registered asynchronous callbacks" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -228,6 +235,7 @@ frozenlist = ">=1.1.0" name = "alembic" version = "1.8.1" description = "A database migration tool for SQLAlchemy." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -248,6 +256,7 @@ tz = ["python-dateutil"] name = "anyio" version = "3.6.1" description = "High level compatibility layer for multiple asynchronous event loop implementations" +category = "main" optional = false python-versions = ">=3.6.2" files = [ @@ -268,6 +277,7 @@ trio = ["trio (>=0.16)"] name = "appdirs" version = "1.4.4" description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +category = "main" optional = false python-versions = "*" files = [ @@ -279,6 +289,7 @@ files = [ name = "apscheduler" version = "3.6.3" description = "In-process task scheduler with Cron-like capabilities" +category = "main" optional = false python-versions = "*" files = [ @@ -309,6 +320,7 @@ zookeeper = ["kazoo"] name = "argcomplete" version = "2.0.0" description = "Bash tab completion for argparse" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -323,6 +335,7 @@ test = ["coverage", "flake8", "pexpect", "wheel"] name = "astroid" version = "2.11.7" description = "An abstract syntax tree for Python with inference support." 
+category = "dev" optional = false python-versions = ">=3.6.2" files = [ @@ -340,6 +353,7 @@ wrapt = ">=1.11,<2" name = "async-timeout" version = "4.0.2" description = "Timeout context manager for asyncio programs" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -351,6 +365,7 @@ files = [ name = "atomicwrites" version = "1.4.1" description = "Atomic file writes." +category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -361,6 +376,7 @@ files = [ name = "attrs" version = "22.1.0" description = "Classes Without Boilerplate" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -378,6 +394,7 @@ tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy name = "azure-common" version = "1.1.28" description = "Microsoft Azure Client Library for Python (Common)" +category = "main" optional = false python-versions = "*" files = [ @@ -389,6 +406,7 @@ files = [ name = "azure-core" version = "1.25.1" description = "Microsoft Azure Core Library for Python" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -405,6 +423,7 @@ typing-extensions = ">=4.0.1" name = "azure-graphrbac" version = "0.61.1" description = "Microsoft Azure Graph RBAC Client Library for Python" +category = "main" optional = false python-versions = "*" files = [ @@ -421,6 +440,7 @@ msrestazure = ">=0.4.32,<2.0.0" name = "azure-mgmt-authorization" version = "2.0.0" description = "Microsoft Azure Authorization Management Client Library for Python" +category = "main" optional = false python-versions = "*" files = [ @@ -437,6 +457,7 @@ msrest = ">=0.6.21" name = "azure-mgmt-containerregistry" version = "10.0.0" description = "Microsoft Azure Container Registry Client Library for Python" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -453,6 +474,7 @@ msrest = ">=0.6.21" name = "azure-mgmt-core" version = "1.3.2" description = "Microsoft Azure Management Core Library for Python" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -467,6 +489,7 @@ azure-core = ">=1.24.0,<2.0.0" name = "azure-mgmt-keyvault" version = "10.1.0" description = "Microsoft Azure Key Vault Management Client Library for Python" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -483,6 +506,7 @@ msrest = ">=0.6.21" name = "azure-mgmt-resource" version = "21.1.0" description = "Microsoft Azure Resource Management Client Library for Python" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -499,6 +523,7 @@ msrest = ">=0.6.21" name = "azure-mgmt-storage" version = "20.0.0" description = "Microsoft Azure Storage Management Client Library for Python" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -515,6 +540,7 @@ msrest = ">=0.6.21" name = "azureml-core" version = "1.45.0.post2" description = "Azure Machine Learning core packages, modules, and classes" +category = "main" optional = false python-versions = ">=3.6,< 4.0" files = [ @@ -534,7 +560,7 @@ azure-mgmt-resource = ">=15.0.0,<22.0.0" azure-mgmt-storage = ">=16.0.0,<=20.0.0" "backports.tempfile" = "*" contextlib2 = "<22.0.0" -cryptography = "<1.9 || >1.9,<2.0.dev0 || >=2.3.dev0,<38.0.0" +cryptography = "<1.9 || >1.9,<2.0.0 || >=2.3.0,<38.0.0" docker = "<6.0.0" humanfriendly = ">=4.7,<11.0" jmespath = "<2.0.0" @@ -561,6 +587,7 @@ urllib3 = ">=1.23,<2.0.0" name = "backports.tempfile" version = "1.0" description = "Backport of new features in Python's 
tempfile module" +category = "main" optional = false python-versions = "*" files = [ @@ -575,6 +602,7 @@ files = [ name = "backports.weakref" version = "1.0.post1" description = "Backport of new features in Python's weakref module" +category = "main" optional = false python-versions = "*" files = [ @@ -586,6 +614,7 @@ files = [ name = "backports.zoneinfo" version = "0.2.1" description = "Backport of the standard library zoneinfo module" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -614,6 +643,7 @@ tzdata = ["tzdata"] name = "basedosdados" version = "1.7.0b5" description = "Organizar e facilitar o acesso a dados brasileiros através de tabelas públicas no BigQuery." +category = "main" optional = false python-versions = ">=3.7.1,<3.11" files = [ @@ -647,6 +677,7 @@ tqdm = "4.50.2" name = "bcrypt" version = "4.0.0" description = "Modern password hashing for your software and your servers" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -672,6 +703,7 @@ typecheck = ["mypy"] name = "beautifulsoup4" version = "4.11.1" description = "Screen-scraping library" +category = "main" optional = false python-versions = ">=3.6.0" files = [ @@ -690,6 +722,7 @@ lxml = ["lxml"] name = "black" version = "20.8b1" description = "The uncompromising code formatter." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -714,6 +747,7 @@ d = ["aiohttp (>=3.3.2)", "aiohttp-cors"] name = "boto3" version = "1.24.59" description = "The AWS SDK for Python" +category = "main" optional = false python-versions = ">= 3.7" files = [ @@ -733,6 +767,7 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] name = "botocore" version = "1.27.59" description = "Low-level, data-driven core of boto 3." +category = "main" optional = false python-versions = ">= 3.7" files = [ @@ -752,6 +787,7 @@ crt = ["awscrt (==0.14.0)"] name = "bs4" version = "0.0.1" description = "Dummy package for Beautiful Soup" +category = "main" optional = false python-versions = "*" files = [ @@ -765,6 +801,7 @@ beautifulsoup4 = "*" name = "cachetools" version = "4.2.2" description = "Extensible memoizing collections and decorators" +category = "main" optional = false python-versions = "~=3.5" files = [ @@ -776,6 +813,7 @@ files = [ name = "certifi" version = "2022.9.24" description = "Python package for providing Mozilla's CA Bundle." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -787,6 +825,7 @@ files = [ name = "cffi" version = "1.15.1" description = "Foreign Function Interface for Python calling C code." +category = "main" optional = false python-versions = "*" files = [ @@ -863,6 +902,7 @@ pycparser = "*" name = "cfgv" version = "3.3.1" description = "Validate configuration and produce human readable error messages." +category = "dev" optional = false python-versions = ">=3.6.1" files = [ @@ -874,6 +914,7 @@ files = [ name = "cftime" version = "1.6.2" description = "Time-handling functionality from netcdf4-python" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -907,6 +948,7 @@ numpy = ">1.13.3" name = "charset-normalizer" version = "2.0.12" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
+category = "main" optional = false python-versions = ">=3.5.0" files = [ @@ -921,6 +963,7 @@ unicode-backport = ["unicodedata2"] name = "ckanapi" version = "4.6" description = "A command line interface and Python module for accessing the CKAN Action API" +category = "main" optional = false python-versions = "*" files = [ @@ -938,6 +981,7 @@ six = ">=1.9,<2.0" name = "click" version = "8.0.3" description = "Composable command line interface toolkit" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -952,6 +996,7 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "click-plugins" version = "1.1.1" description = "An extension module for click to enable registering CLI commands via setuptools entry-points." +category = "main" optional = false python-versions = "*" files = [ @@ -969,6 +1014,7 @@ dev = ["coveralls", "pytest (>=3.6)", "pytest-cov", "wheel"] name = "cligj" version = "0.7.2" description = "Click params for commmand line interfaces to GeoJSON" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, <4" files = [ @@ -986,6 +1032,7 @@ test = ["pytest-cov"] name = "cloudpickle" version = "2.2.0" description = "Extended pickling support for Python objects" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -997,6 +1044,7 @@ files = [ name = "colorama" version = "0.4.5" description = "Cross-platform colored terminal text." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -1008,6 +1056,7 @@ files = [ name = "contextlib2" version = "21.6.0" description = "Backports and enhancements for the contextlib module" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1019,6 +1068,7 @@ files = [ name = "coverage" version = "7.0.0" description = "Code coverage measurement for Python" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1085,6 +1135,7 @@ toml = ["tomli"] name = "croniter" version = "1.3.7" description = "croniter provides iteration for datetime object with cron like format" +category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -1099,6 +1150,7 @@ python-dateutil = "*" name = "cryptography" version = "37.0.4" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
+category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1141,6 +1193,7 @@ test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0 name = "cx-oracle" version = "8.3.0" description = "Python interface to Oracle" +category = "main" optional = false python-versions = "*" files = [ @@ -1166,6 +1219,7 @@ files = [ name = "cycler" version = "0.11.0" description = "Composable style cycles" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1177,6 +1231,7 @@ files = [ name = "dask" version = "2022.9.1" description = "Parallel PyData with Task Scheduling" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1204,6 +1259,7 @@ test = ["pandas[test]", "pre-commit", "pytest", "pytest-rerunfailures", "pytest- name = "databricks-cli" version = "0.17.3" description = "A command line interface for Databricks" +category = "main" optional = false python-versions = "*" files = [ @@ -1223,6 +1279,7 @@ tabulate = ">=0.7.7" name = "db-dtypes" version = "1.0.4" description = "Pandas Data Types for SQL systems (BigQuery, Spanner)" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1240,6 +1297,7 @@ pyarrow = ">=3.0.0,<10.0dev" name = "dbt-client" version = "0.1.3" description = "A simple client for DBT RPC instances" +category = "main" optional = false python-versions = ">=3.8,<4.0" files = [ @@ -1254,6 +1312,7 @@ requests = ">=2.26.0,<3.0.0" name = "deprecated" version = "1.2.13" description = "Python @deprecated decorator to deprecate old python classes, functions or methods." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -1271,6 +1330,7 @@ dev = ["PyTest", "PyTest (<5)", "PyTest-Cov", "PyTest-Cov (<2.6)", "bump2version name = "dill" version = "0.3.5.1" description = "serialize all of python" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" files = [ @@ -1285,6 +1345,7 @@ graph = ["objgraph (>=1.7.2)"] name = "distlib" version = "0.3.6" description = "Distribution utilities" +category = "main" optional = false python-versions = "*" files = [ @@ -1296,6 +1357,7 @@ files = [ name = "distributed" version = "2022.9.1" description = "Distributed scheduler for Dask" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1324,6 +1386,7 @@ zict = ">=0.1.3" name = "docker" version = "5.0.3" description = "A Python library for the Docker Engine API." 
+category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1344,6 +1407,7 @@ tls = ["cryptography (>=3.4.7)", "idna (>=2.0.0)", "pyOpenSSL (>=17.5.0)"] name = "docopt" version = "0.6.2" description = "Pythonic argument parser, that will make you smile" +category = "main" optional = false python-versions = "*" files = [ @@ -1354,6 +1418,7 @@ files = [ name = "earthengine-api" version = "0.1.334" description = "Earth Engine Python API" +category = "main" optional = false python-versions = "*" files = [ @@ -1374,6 +1439,7 @@ six = "*" name = "elastic-transport" version = "8.4.0" description = "Transport classes and utilities shared among Python Elastic client libraries" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1392,6 +1458,7 @@ develop = ["aiohttp", "mock", "pytest", "pytest-asyncio", "pytest-cov", "pytest- name = "elasticsearch" version = "8.4.2" description = "Python client for Elasticsearch" +category = "main" optional = false python-versions = ">=3.6, <4" files = [ @@ -1410,6 +1477,7 @@ requests = ["requests (>=2.4.0,<3.0.0)"] name = "entrypoints" version = "0.4" description = "Discover and load entry points from installed packages." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1421,6 +1489,7 @@ files = [ name = "fastapi" version = "0.85.0" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1442,6 +1511,7 @@ test = ["anyio[trio] (>=3.2.1,<4.0.0)", "black (==22.8.0)", "databases[sqlite] ( name = "fastavro" version = "1.5.1" description = "Fast read/write of AVRO files" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1470,6 +1540,7 @@ zstandard = ["zstandard"] name = "filelock" version = "3.8.0" description = "A platform independent file lock." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1485,6 +1556,7 @@ testing = ["covdefaults (>=2.2)", "coverage (>=6.4.2)", "pytest (>=7.1.2)", "pyt name = "fiona" version = "1.8.21" description = "Fiona reads and writes spatial data files" +category = "main" optional = false python-versions = "*" files = [ @@ -1521,6 +1593,7 @@ test = ["boto3 (>=1.2.4)", "mock", "pytest (>=3)", "pytest-cov"] name = "flake8" version = "4.0.1" description = "the modular source code checker: pep8 pyflakes and co" +category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1537,6 +1610,7 @@ pyflakes = ">=2.4.0,<2.5.0" name = "flask" version = "2.2.2" description = "A simple framework for building complex web applications." 
+category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1559,6 +1633,7 @@ dotenv = ["python-dotenv"] name = "fonttools" version = "4.37.3" description = "Tools to manipulate font files" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1584,6 +1659,7 @@ woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"] name = "frozenlist" version = "1.3.1" description = "A list-like structure which implements collections.abc.MutableSequence" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1652,6 +1728,7 @@ files = [ name = "fsspec" version = "2022.8.2" description = "File-system specification" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1686,6 +1763,7 @@ tqdm = ["tqdm"] name = "future" version = "0.18.2" description = "Clean single-source support for Python 3 and 2" +category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -1696,6 +1774,7 @@ files = [ name = "geobr" version = "0.1.10" description = "geobr: Download Official Spatial Data Sets of Brazil" +category = "main" optional = false python-versions = "*" files = [ @@ -1711,6 +1790,7 @@ shapely = ">=1.7.0,<2.0.0" name = "geographiclib" version = "2.0" description = "The geodesic routines from GeographicLib" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1722,6 +1802,7 @@ files = [ name = "geojson" version = "2.5.0" description = "Python bindings and utilities for GeoJSON" +category = "main" optional = false python-versions = "*" files = [ @@ -1733,6 +1814,7 @@ files = [ name = "geojsplit" version = "0.1.2" description = "A python implementation of the npm package geojsplit. Used to split GeoJSON files into smaller pieces." 
+category = "main" optional = false python-versions = ">=3.6,<4.0" files = [ @@ -1752,6 +1834,7 @@ docs = ["sphinx (>=2.2,<3.0)", "sphinx_rtd_theme (>=0.4.3,<0.5.0)"] name = "geopandas" version = "0.7.0" description = "Geographic pandas extensions" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -1769,6 +1852,7 @@ shapely = "*" name = "geopy" version = "2.3.0" description = "Python Geocoding Toolbox" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1792,6 +1876,7 @@ timezone = ["pytz"] name = "gitdb" version = "4.0.9" description = "Git Object Database" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1806,6 +1891,7 @@ smmap = ">=3.0.1,<6" name = "gitpython" version = "3.1.27" description = "GitPython is a python library used to interact with Git repositories" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1820,6 +1906,7 @@ gitdb = ">=4.0.1,<5" name = "google-api-core" version = "1.34.0" description = "Google API client core library" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1844,6 +1931,7 @@ grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0dev)"] name = "google-api-python-client" version = "2.69.0" description = "Google API Client Library for Python" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1852,7 +1940,7 @@ files = [ ] [package.dependencies] -google-api-core = ">=1.31.5,<2.0.dev0 || >2.3.0,<3.0.0dev" +google-api-core = ">=1.31.5,<2.0.0 || >2.3.0,<3.0.0dev" google-auth = ">=1.19.0,<3.0.0dev" google-auth-httplib2 = ">=0.1.0" httplib2 = ">=0.15.0,<1dev" @@ -1862,6 +1950,7 @@ uritemplate = ">=3.0.1,<5" name = "google-auth" version = "2.11.1" description = "Google Authentication Library" +category = "main" optional = false python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*" files = [ @@ -1885,6 +1974,7 @@ reauth = ["pyu2f (>=0.1.5)"] name = "google-auth-httplib2" version = "0.1.0" description = "Google Authentication Library: httplib2 transport" +category = "main" optional = false python-versions = "*" files = [ @@ -1901,6 +1991,7 @@ six = "*" name = "google-auth-oauthlib" version = "0.5.3" description = "Google Authentication Library" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1919,6 +2010,7 @@ tool = ["click (>=6.0.0)"] name = "google-cloud-bigquery" version = "2.30.1" description = "Google BigQuery API client library" +category = "main" optional = false python-versions = ">=3.6, <3.11" files = [ @@ -1950,6 +2042,7 @@ tqdm = ["tqdm (>=4.7.4,<5.0.0dev)"] name = "google-cloud-bigquery-connection" version = "1.7.3" description = "" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1958,7 +2051,7 @@ files = [ ] [package.dependencies] -google-api-core = {version = ">=1.32.0,<2.0.dev0 || >=2.8.dev0,<3.0.0dev", extras = ["grpc"]} +google-api-core = {version = ">=1.32.0,<2.0.0 || >=2.8.0,<3.0.0dev", extras = ["grpc"]} grpc-google-iam-v1 = ">=0.12.4,<1.0.0dev" proto-plus = ">=1.22.0,<2.0.0dev" protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<5.0.0dev" @@ -1967,6 +2060,7 @@ protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4 name = "google-cloud-bigquery-storage" version = "1.1.0" description = "BigQuery Storage API API client library" +category = "main" optional = false python-versions = 
">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*" files = [ @@ -1986,6 +2080,7 @@ pyarrow = ["pyarrow (>=0.15.0)"] name = "google-cloud-core" version = "2.3.2" description = "Google Cloud API client core library" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1994,7 +2089,7 @@ files = [ ] [package.dependencies] -google-api-core = ">=1.31.6,<2.0.dev0 || >2.3.0,<3.0.0dev" +google-api-core = ">=1.31.6,<2.0.0 || >2.3.0,<3.0.0dev" google-auth = ">=1.25.0,<3.0dev" [package.extras] @@ -2004,6 +2099,7 @@ grpc = ["grpcio (>=1.38.0,<2.0dev)"] name = "google-cloud-storage" version = "1.42.3" description = "Google Cloud Storage API client library" +category = "main" optional = false python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*" files = [ @@ -2024,6 +2120,7 @@ six = "*" name = "google-crc32c" version = "1.5.0" description = "A python wrapper of the C library 'Google CRC32C'" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2104,6 +2201,7 @@ testing = ["pytest"] name = "google-resumable-media" version = "2.3.3" description = "Utilities for Google Media Downloads and Resumable Uploads" +category = "main" optional = false python-versions = ">= 3.6" files = [ @@ -2122,6 +2220,7 @@ requests = ["requests (>=2.18.0,<3.0.0dev)"] name = "googleapis-common-protos" version = "1.56.4" description = "Common protobufs used in Google APIs" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2140,6 +2239,7 @@ grpc = ["grpcio (>=1.0.0,<2.0.0dev)"] name = "greenlet" version = "1.1.3" description = "Lightweight in-process concurrent programming" +category = "main" optional = false python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*" files = [ @@ -2206,6 +2306,7 @@ docs = ["Sphinx"] name = "grpc-google-iam-v1" version = "0.12.4" description = "IAM API client library" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -2221,6 +2322,7 @@ grpcio = ">=1.0.0,<2.0.0dev" name = "grpcio" version = "1.49.1" description = "HTTP/2-based RPC framework" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2281,6 +2383,7 @@ protobuf = ["grpcio-tools (>=1.49.1)"] name = "grpcio-status" version = "1.48.2" description = "Status proto mapping for gRPC" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -2297,6 +2400,7 @@ protobuf = ">=3.12.0" name = "gspread" version = "5.5.0" description = "Google Spreadsheets Python API" +category = "main" optional = false python-versions = ">=3.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -2312,6 +2416,7 @@ google-auth-oauthlib = ">=0.4.1" name = "gunicorn" version = "20.1.0" description = "WSGI HTTP Server for UNIX" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -2332,6 +2437,7 @@ tornado = ["tornado (>=0.2)"] name = "h11" version = "0.14.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2343,6 +2449,7 @@ files = [ name = "h5py" version = "3.8.0" description = "Read and write HDF5 files from Python" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2380,6 +2487,7 @@ numpy = ">=1.14.5" name = "haversine" version = "2.8.0" description = "Calculate the distance between 2 points on Earth." 
+category = "main" optional = false python-versions = ">=3.5" files = [ @@ -2391,6 +2499,7 @@ files = [ name = "heapdict" version = "1.0.1" description = "a heap with decrease-key and increase-key operations" +category = "main" optional = false python-versions = "*" files = [ @@ -2402,6 +2511,7 @@ files = [ name = "html5lib" version = "1.1" description = "HTML parser based on the WHATWG HTML specification" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -2423,6 +2533,7 @@ lxml = ["lxml"] name = "httplib2" version = "0.20.4" description = "A comprehensive HTTP client library." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -2437,6 +2548,7 @@ pyparsing = {version = ">=2.4.2,<3.0.0 || >3.0.0,<3.0.1 || >3.0.1,<3.0.2 || >3.0 name = "humanfriendly" version = "10.0" description = "Human friendly output for text interfaces using Python" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -2451,6 +2563,7 @@ pyreadline3 = {version = "*", markers = "sys_platform == \"win32\" and python_ve name = "hvac" version = "0.11.2" description = "HashiCorp Vault API client" +category = "main" optional = false python-versions = ">=2.7" files = [ @@ -2469,6 +2582,7 @@ parser = ["pyhcl (>=0.3.10)"] name = "identify" version = "2.5.5" description = "File identification library for Python" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -2483,6 +2597,7 @@ license = ["ukkonen"] name = "idna" version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -2494,6 +2609,7 @@ files = [ name = "ijson" version = "2.6.1" description = "Iterative JSON parser with a standard Python iterator interface" +category = "main" optional = false python-versions = "*" files = [ @@ -2515,6 +2631,7 @@ files = [ name = "importlib-metadata" version = "4.12.0" description = "Read metadata from Python packages" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2534,6 +2651,7 @@ testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs name = "importlib-resources" version = "5.9.0" description = "Read resources from Python packages" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2552,6 +2670,7 @@ testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", name = "iniconfig" version = "1.1.1" description = "iniconfig: brain-dead simple config-ini parsing" +category = "dev" optional = false python-versions = "*" files = [ @@ -2563,6 +2682,7 @@ files = [ name = "isodate" version = "0.6.1" description = "An ISO 8601 date/time/duration parser and formatter" +category = "main" optional = false python-versions = "*" files = [ @@ -2577,6 +2697,7 @@ six = "*" name = "isort" version = "5.10.1" description = "A Python utility / library to sort Python imports." +category = "dev" optional = false python-versions = ">=3.6.1,<4.0" files = [ @@ -2594,6 +2715,7 @@ requirements-deprecated-finder = ["pip-api", "pipreqs"] name = "itsdangerous" version = "2.1.2" description = "Safely pass data to untrusted environments and back." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2605,6 +2727,7 @@ files = [ name = "jeepney" version = "0.8.0" description = "Low-level, pure Python DBus protocol wrapper." 
+category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2620,6 +2743,7 @@ trio = ["async_generator", "trio"] name = "jinja2" version = "3.0.3" description = "A very fast and expressive template engine." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -2637,6 +2761,7 @@ i18n = ["Babel (>=2.7)"] name = "jmespath" version = "1.0.1" description = "JSON Matching Expressions" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2648,6 +2773,7 @@ files = [ name = "joblib" version = "1.2.0" description = "Lightweight pipelining with Python functions" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2659,6 +2785,7 @@ files = [ name = "jsonpickle" version = "2.2.0" description = "Python library for serializing any arbitrary object graph into JSON" +category = "main" optional = false python-versions = ">=2.7" files = [ @@ -2675,6 +2802,7 @@ testing-libs = ["simplejson", "ujson", "yajl"] name = "kafka-python" version = "2.0.2" description = "Pure Python client for Apache Kafka" +category = "main" optional = false python-versions = "*" files = [ @@ -2689,6 +2817,7 @@ crc32c = ["crc32c"] name = "kaleido" version = "0.2.1" description = "Static image export for web-based visualization libraries with zero dependencies" +category = "main" optional = false python-versions = "*" files = [ @@ -2704,6 +2833,7 @@ files = [ name = "kiwisolver" version = "1.4.4" description = "A fast implementation of the Cassowary constraint solver" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2781,6 +2911,7 @@ files = [ name = "knack" version = "0.9.0" description = "A Command-Line Interface framework" +category = "main" optional = false python-versions = "*" files = [ @@ -2799,6 +2930,7 @@ tabulate = "*" name = "kubernetes" version = "24.2.0" description = "Kubernetes python client" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -2816,7 +2948,7 @@ requests-oauthlib = "*" setuptools = ">=21.0.0" six = ">=1.9.0" urllib3 = ">=1.24.2" -websocket-client = ">=0.32.0,<0.40.0 || >0.40.0,<0.41.dev0 || >=0.43.dev0" +websocket-client = ">=0.32.0,<0.40.0 || >0.40.0,<0.41.0 || >=0.43.0" [package.extras] adal = ["adal (>=1.0.2)"] @@ -2825,6 +2957,7 @@ adal = ["adal (>=1.0.2)"] name = "lazy-object-proxy" version = "1.7.1" description = "A fast and thorough lazy object proxy." +category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -2871,6 +3004,7 @@ files = [ name = "locket" version = "1.0.0" description = "File-based locks for Python on Linux and Windows" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -2882,6 +3016,7 @@ files = [ name = "loguru" version = "0.6.0" description = "Python logging made (stupidly) simple" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -2900,6 +3035,7 @@ dev = ["Sphinx (>=4.1.1)", "black (>=19.10b0)", "colorama (>=0.3.4)", "docutils name = "lxml" version = "4.9.1" description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" files = [ @@ -2985,6 +3121,7 @@ source = ["Cython (>=0.29.7)"] name = "mako" version = "1.2.3" description = "A super-fast templating language that borrows the best ideas from the existing templating languages." 
+category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3000,10 +3137,29 @@ babel = ["Babel"] lingua = ["lingua"] testing = ["pytest"] +[[package]] +name = "markdown" +version = "3.3.7" +description = "Python implementation of Markdown." +category = "dev" +optional = false +python-versions = ">=3.6" +files = [ + {file = "Markdown-3.3.7-py3-none-any.whl", hash = "sha256:f5da449a6e1c989a4cea2631aa8ee67caa5a2ef855d551c88f9e309f4634c621"}, + {file = "Markdown-3.3.7.tar.gz", hash = "sha256:cbb516f16218e643d8e0a95b309f77eb118cb138d39a4f27851e6a63581db874"}, +] + +[package.dependencies] +importlib-metadata = {version = ">=4.4", markers = "python_version < \"3.10\""} + +[package.extras] +testing = ["coverage", "pyyaml"] + [[package]] name = "markupsafe" version = "2.1.1" description = "Safely add untrusted strings to HTML/XML markup." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3053,6 +3209,7 @@ files = [ name = "marshmallow" version = "3.18.0" description = "A lightweight library for converting complex datatypes to and from native Python datatypes." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3073,6 +3230,7 @@ tests = ["pytest", "pytz", "simplejson"] name = "marshmallow-oneofschema" version = "3.0.1" description = "marshmallow multiplexing schema" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3092,6 +3250,7 @@ tests = ["mock", "pytest"] name = "matplotlib" version = "3.5.1" description = "Python plotting package" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3146,6 +3305,7 @@ python-dateutil = ">=2.7" name = "mccabe" version = "0.6.1" description = "McCabe checker, plugin for flake8" +category = "dev" optional = false python-versions = "*" files = [ @@ -3157,6 +3317,7 @@ files = [ name = "mlflow" version = "1.30.0" description = "MLflow: A Platform for ML Development and Productionization" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3210,6 +3371,7 @@ sqlserver = ["mlflow-dbstore"] name = "mlserver" version = "1.1.0" description = "ML server" +category = "main" optional = false python-versions = "*" files = [ @@ -3229,7 +3391,7 @@ py-grpc-prometheus = "*" python-dotenv = "*" starlette-exporter = "*" uvicorn = "*" -uvloop = {version = "*", markers = "(sys_platform != \"win32\" and sys_platform != \"cygwin\") and platform_python_implementation != \"PyPy\""} +uvloop = {version = "*", markers = "sys_platform != \"win32\" and sys_platform != \"cygwin\" and platform_python_implementation != \"PyPy\""} [package.extras] all = ["orjson"] @@ -3238,6 +3400,7 @@ all = ["orjson"] name = "mlserver-mlflow" version = "1.1.0" description = "MLflow runtime for MLServer" +category = "main" optional = false python-versions = "*" files = [ @@ -3253,6 +3416,7 @@ mlserver = "*" name = "more-itertools" version = "8.14.0" description = "More routines for operating on iterables, beyond itertools" +category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -3264,6 +3428,7 @@ files = [ name = "msal" version = "1.19.0" description = "The Microsoft Authentication Library (MSAL) for Python library enables your app to access the Microsoft Cloud by supporting authentication of users with Microsoft Azure Active Directory accounts (AAD) and Microsoft Accounts (MSA) using industry standard OAuth2 and OpenID Connect." 
+category = "main" optional = false python-versions = "*" files = [ @@ -3280,6 +3445,7 @@ requests = ">=2.0.0,<3" name = "msal-extensions" version = "1.0.0" description = "Microsoft Authentication Library extensions (MSAL EX) provides a persistence API that can save your data on disk, encrypted on Windows, macOS and Linux. Concurrent data access will be coordinated by a file lock mechanism." +category = "main" optional = false python-versions = "*" files = [ @@ -3298,6 +3464,7 @@ portalocker = [ name = "msgpack" version = "1.0.4" description = "MessagePack serializer" +category = "main" optional = false python-versions = "*" files = [ @@ -3359,6 +3526,7 @@ files = [ name = "msrest" version = "0.7.1" description = "AutoRest swagger generator Python client runtime." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3380,6 +3548,7 @@ async = ["aiodns", "aiohttp (>=3.0)"] name = "msrestazure" version = "0.6.4" description = "AutoRest swagger generator Python client runtime. Azure-specific module." +category = "main" optional = false python-versions = "*" files = [ @@ -3396,6 +3565,7 @@ six = "*" name = "multidict" version = "6.0.2" description = "multidict implementation" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3464,6 +3634,7 @@ files = [ name = "munch" version = "2.5.0" description = "A dot-accessible dictionary (a la JavaScript objects)" +category = "main" optional = false python-versions = "*" files = [ @@ -3482,6 +3653,7 @@ yaml = ["PyYAML (>=5.1.0)"] name = "mypy-extensions" version = "0.4.3" description = "Experimental type system extensions for programs checked with the mypy typechecker." +category = "main" optional = false python-versions = "*" files = [ @@ -3493,8 +3665,9 @@ files = [ name = "ndg-httpsclient" version = "0.5.1" description = "Provides enhanced HTTPS support for httplib and urllib2 using PyOpenSSL" +category = "main" optional = false -python-versions = ">=2.7,<3.0.dev0 || >=3.4.dev0" +python-versions = ">=2.7,<3.0.0 || >=3.4.0" files = [ {file = "ndg_httpsclient-0.5.1-py2-none-any.whl", hash = "sha256:d2c7225f6a1c6cf698af4ebc962da70178a99bcde24ee6d1961c4f3338130d57"}, {file = "ndg_httpsclient-0.5.1-py3-none-any.whl", hash = "sha256:dd174c11d971b6244a891f7be2b32ca9853d3797a72edb34fa5d7b07d8fff7d4"}, @@ -3509,6 +3682,7 @@ PyOpenSSL = "*" name = "netcdf4" version = "1.5.8" description = "Provides an object-oriented python interface to the netCDF version 4 library." +category = "main" optional = false python-versions = "*" files = [ @@ -3555,6 +3729,7 @@ numpy = ">=1.9" name = "nodeenv" version = "1.7.0" description = "Node.js virtual environment builder" +category = "dev" optional = false python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*" files = [ @@ -3569,6 +3744,7 @@ setuptools = "*" name = "numpy" version = "1.22.0" description = "NumPy is the fundamental package for array computing with Python." +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -3600,6 +3776,7 @@ files = [ name = "oauthlib" version = "3.2.1" description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3616,6 +3793,7 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] name = "opencv-python" version = "4.7.0.72" description = "Wrapper package for OpenCV python bindings." 
+category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3642,6 +3820,7 @@ numpy = [ name = "packaging" version = "21.3" description = "Core utilities for Python packages" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3656,6 +3835,7 @@ pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" name = "pandas" version = "1.5.2" description = "Powerful data structures for data analysis, time series, and statistics" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -3703,6 +3883,7 @@ test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] name = "pandas-gbq" version = "0.17.8" description = "Google BigQuery connector for pandas" +category = "main" optional = false python-versions = ">=3.7, <3.11" files = [ @@ -3712,10 +3893,10 @@ files = [ [package.dependencies] db-dtypes = ">=0.3.1,<2.0.0" -google-api-core = ">=1.31.5,<2.0.dev0 || >2.3.0,<3.0.0dev" +google-api-core = ">=1.31.5,<2.0.0 || >2.3.0,<3.0.0dev" google-auth = ">=1.25.0" google-auth-oauthlib = ">=0.0.1" -google-cloud-bigquery = ">=1.27.2,<2.4.dev0 || >=2.5.dev0,<4.0.0dev" +google-cloud-bigquery = ">=1.27.2,<2.4.0 || >=2.5.0,<4.0.0dev" google-cloud-bigquery-storage = ">=1.1.0,<3.0.0dev" numpy = ">=1.16.6" pandas = ">=0.24.2" @@ -3730,6 +3911,7 @@ tqdm = ["tqdm (>=4.23.0)"] name = "pandas-read-xml" version = "0.3.1" description = "A tool to read XML files as pandas dataframes." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3749,6 +3931,7 @@ zipfile36 = "*" name = "pandavro" version = "1.7.1" description = "The interface between Avro and pandas DataFrame" +category = "main" optional = false python-versions = ">=3.6.1" files = [ @@ -3767,6 +3950,7 @@ tests = ["pytest (==7.1.2)"] name = "paramiko" version = "2.11.0" description = "SSH2 protocol library" +category = "main" optional = false python-versions = "*" files = [ @@ -3790,6 +3974,7 @@ invoke = ["invoke (>=1.3)"] name = "partd" version = "1.3.0" description = "Appendable key-value storage" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3808,6 +3993,7 @@ complete = ["blosc", "numpy (>=1.9.0)", "pandas (>=0.19.0)", "pyzmq"] name = "pathspec" version = "0.10.1" description = "Utility library for gitignore style pattern matching of file paths." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3819,6 +4005,7 @@ files = [ name = "patsy" version = "0.5.3" description = "A Python package for describing statistical models and for building design matrices." +category = "main" optional = false python-versions = "*" files = [ @@ -3837,6 +4024,7 @@ test = ["pytest", "pytest-cov", "scipy"] name = "pdoc3" version = "0.10.0" description = "Auto-generate API documentation for Python projects." +category = "dev" optional = false python-versions = ">= 3.6" files = [ @@ -3844,10 +4032,15 @@ files = [ {file = "pdoc3-0.10.0.tar.gz", hash = "sha256:5f22e7bcb969006738e1aa4219c75a32f34c2d62d46dc9d2fb2d3e0b0287e4b7"}, ] +[package.dependencies] +mako = "*" +markdown = ">=3.0" + [[package]] name = "pendulum" version = "2.1.2" description = "Python datetimes made easy" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -3882,6 +4075,7 @@ pytzdata = ">=2020.1" name = "pexpect" version = "4.8.0" description = "Pexpect allows easy control of interactive console applications." 
+category = "main" optional = false python-versions = "*" files = [ @@ -3896,6 +4090,7 @@ ptyprocess = ">=0.5" name = "phonenumbers" version = "8.13.0" description = "Python version of Google's common library for parsing, formatting, storing and validating international phone numbers." +category = "main" optional = false python-versions = "*" files = [ @@ -3907,6 +4102,7 @@ files = [ name = "pillow" version = "9.3.0" description = "Python Imaging Library (Fork)" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3981,6 +4177,7 @@ tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "pa name = "pkginfo" version = "1.8.3" description = "Query metadatdata from sdists / bdists / installed packages." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ @@ -3995,6 +4192,7 @@ testing = ["coverage", "nose"] name = "platformdirs" version = "2.5.2" description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4010,6 +4208,7 @@ test = ["appdirs (==1.4.4)", "pytest (>=6)", "pytest-cov (>=2.7)", "pytest-mock name = "plotly" version = "5.14.0" description = "An open-source, interactive data visualization library for Python" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -4025,6 +4224,7 @@ tenacity = ">=6.2.0" name = "pluggy" version = "0.13.1" description = "plugin and hook calling mechanisms for python" +category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -4039,6 +4239,7 @@ dev = ["pre-commit", "tox"] name = "portalocker" version = "2.5.1" description = "Wraps the portalocker recipe for easy usage" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -4058,6 +4259,7 @@ tests = ["pytest (>=5.4.1)", "pytest-cov (>=2.8.1)", "pytest-mypy (>=0.8.0)", "p name = "pre-commit" version = "2.20.0" description = "A framework for managing and maintaining multi-language pre-commit hooks." +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -4075,88 +4277,78 @@ virtualenv = ">=20.0.8" [[package]] name = "prefect" -version = "1.4.1" +version = "0.15.9" description = "The Prefect Core automation and scheduling engine." 
+category = "main" optional = false -python-versions = ">=3.7" +python-versions = ">=3.6" files = [ - {file = "prefect-1.4.1-py3-none-any.whl", hash = "sha256:a838d427a88845b13279b89b925e2b6acde5ff2bb090c5480617bc6047a808a8"}, - {file = "prefect-1.4.1.tar.gz", hash = "sha256:179f179849286bb8dc0309c8718a7815e6e5fcc016398d2aea45e16fa0e3471b"}, + {file = "prefect-0.15.9-py3-none-any.whl", hash = "sha256:595c9f229349528f7bcd2aa866c9c10dcfbf059a20803526924339d45604ec76"}, + {file = "prefect-0.15.9.tar.gz", hash = "sha256:52d4d28493cd1a90e1acf96b5a92b2902950849b481a49f762998448a41cf127"}, ] [package.dependencies] -click = ">=7.0" +click = ">=7.0,<9.0" cloudpickle = ">=1.3.0" -croniter = ">=0.3.24" -dask = ">=2021.06.0" -distributed = ">=2.17.0" +croniter = ">=0.3.24,<2.0" +dask = {version = ">=2.17.0", markers = "python_version > \"3.6\""} +distributed = {version = ">=2.17.0", markers = "python_version > \"3.6\""} docker = ">=3.4.1" -importlib-resources = ">=3.0.0" marshmallow = ">=3.0.0b19" marshmallow-oneofschema = ">=2.0.0b2" msgpack = ">=0.6.0" mypy-extensions = ">=0.4.0" -packaging = ">=20.0" pendulum = ">=2.0.4" python-box = ">=5.1.0" python-dateutil = ">=2.7.0" python-slugify = ">=1.2.6" pytz = ">=2018.7" pyyaml = ">=3.13" -requests = ">=2.25" +requests = ">=2.20,<2.27" tabulate = ">=0.8.0" toml = ">=0.9.4" -urllib3 = ">=1.26.0" +urllib3 = ">=1.24.3" [package.extras] -airtable = ["airtable-python-wrapper (>=0.11)"] -all-extras = ["PyGithub (>=1.51)", "PyJWT (>=2.3.0)", "Pygments (>=2.2,<3.0)", "airtable-python-wrapper (>=0.11)", "atlassian-python-api (>=2.0.1)", "azure-core (>=1.10.0)", "azure-cosmos (>=3.1.1)", "azure-identity (>=1.7.0)", "azure-mgmt-datafactory (>=2.7.0)", "azure-storage-blob (>=12.1.0)", "azureml-sdk", "black", "boto3 (>=1.9)", "confluent-kafka (>=1.7.0)", "dask-cloudprovider[aws] (>=0.2.0)", "dask-kubernetes (>=0.8.0)", "dropbox (>=9.0,<10.0)", "dulwich (>=0.19.7)", "feedparser (>=5.0.1)", "firebolt-sdk (>=0.2.1)", "flaky (>=3.0)", "freezegun (>=1.0.0)", "google-auth (>=2.0)", "google-cloud-aiplatform (>=1.4.0)", "google-cloud-bigquery (>=1.6.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13)", "graphviz (>=0.8)", "graphviz (>=0.8.3)", "great-expectations (>=0.13.8)", "gspread (>=3.6.0)", "hvac (>=0.10)", "ipykernel (>=6.9.2)", "jinja2 (>=2.0)", "jinja2 (>=2.0,<4.0)", "jira (>=2.0.0)", "kubernetes (>=9.0.0a1)", "nbconvert (>=6.0.7)", "pandas (>=1.0.1)", "papermill (>=2.2.0)", "paramiko (>=2.10.4)", "prometheus-client (>=0.9.0)", "psycopg2-binary (>=2.8.2)", "pushbullet.py (>=0.11.0)", "py2neo (>=2021.2.3)", "pyarrow (>=5.0.0)", "pydantic (>=1.9.0)", "pyexasol (>=0.16.1)", "pymysql (>=0.9.3)", "pyodbc (>=4.0.30)", "pytest (>=6.0)", "pytest-env (>=0.6.0)", "pytest-xdist (>=2.0)", "python-gitlab (>=2.5.0)", "redis (>=3.2.1)", "responses (>=0.14.0)", "sendgrid (>=6.7.0)", "snowflake-connector-python (>=1.8.2)", "soda-spark (>=0.2.1)", "soda-sql (>=2.0.0b25)", "spacy (>=2.0.0)", "sqlalchemy-redshift (>=0.8.11)", "testfixtures (>=6.10.3)", "toloka-kit (>=0.1.25)", "transform (>=1.0.12)", "tweepy (>=3.5)"] -all-orchestration-extras = ["PyGithub (>=1.51)", "atlassian-python-api (>=2.0.1)", "azure-identity (>=1.7.0)", "azure-storage-blob (>=12.1.0)", "boto3 (>=1.9)", "dulwich (>=0.19.7)", "google-auth (>=2.0)", "google-cloud-aiplatform (>=1.4.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13)", "kubernetes (>=9.0.0a1)", "python-gitlab (>=2.5.0)"] -aws = ["boto3 (>=1.9)"] -azure = ["azure-core (>=1.10.0)", "azure-cosmos 
(>=3.1.1)", "azure-identity (>=1.7.0)", "azure-mgmt-datafactory (>=2.7.0)", "azure-storage-blob (>=12.1.0)"] -azureml = ["azureml-sdk"] -base-library-ci = ["PyGithub (>=1.51)", "PyJWT (>=2.3.0)", "Pygments (>=2.2,<3.0)", "atlassian-python-api (>=2.0.1)", "azure-identity (>=1.7.0)", "azure-storage-blob (>=12.1.0)", "black", "boto3 (>=1.9)", "dulwich (>=0.19.7)", "flaky (>=3.0)", "freezegun (>=1.0.0)", "google-auth (>=2.0)", "google-cloud-aiplatform (>=1.4.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13)", "graphviz (>=0.8)", "jinja2 (>=2.0,<4.0)", "jira (>=2.0.0)", "kubernetes (>=9.0.0a1)", "pandas (>=1.0.1)", "pytest (>=6.0)", "pytest-env (>=0.6.0)", "pytest-xdist (>=2.0)", "python-gitlab (>=2.5.0)", "responses (>=0.14.0)", "testfixtures (>=6.10.3)"] +airtable = ["airtable-python-wrapper (>=0.11,<0.12)"] +all-extras = ["PyGithub (>=1.51,<2.0)", "Pygments (>=2.2,<3.0)", "airtable-python-wrapper (>=0.11,<0.12)", "atlassian-python-api (>=2.0.1)", "azure-cosmos (>=3.1.1,<3.2)", "azure-storage-blob (>=12.1.0,<13.0)", "azureml-sdk (>=1.0.65,<1.1)", "black", "boto3 (>=1.9,<2.0)", "confluent-kafka (>=1.7.0)", "dask-cloudprovider[aws] (>=0.2.0)", "dask-kubernetes (>=0.8.0)", "dropbox (>=9.0,<10.0)", "dulwich (>=0.19.7)", "feedparser (>=5.0.1,<6.0)", "flaky (>=3.0)", "google-auth (>=2.0,<3.0)", "google-cloud-aiplatform (>=1.4.0,<2.0)", "google-cloud-bigquery (>=1.6.0,<3.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13,<2.0)", "graphviz (>=0.8)", "graphviz (>=0.8.3)", "great-expectations (>=0.11.1)", "gspread (>=3.6.0)", "hvac (>=0.10)", "jinja2 (>=2.0,<4.0)", "jira (>=2.0.0)", "kubernetes (>=9.0.0a1,<=13.0)", "mypy (>=0.600,<0.813)", "nbconvert (>=6.0.7)", "pandas (>=1.0.1)", "papermill (>=2.2.0)", "prometheus-client (>=0.9.0)", "psycopg2-binary (>=2.8.2)", "pushbullet.py (>=0.11.0)", "pyarrow (>=5.0.0)", "pyexasol (>=0.16.1)", "pymysql (>=0.9.3)", "pyodbc (>=4.0.30)", "pytest (>=6.0)", "pytest-env (>=0.6.0)", "pytest-xdist (>=2.0)", "python-gitlab (>=2.5.0,<3.0)", "redis (>=3.2.1)", "responses (>=0.14.0)", "sendgrid (>=6.7.0)", "snowflake-connector-python (>=1.8.2,<2.5)", "soda-sql (>=2.0.0b25)", "spacy (>=2.0.0,<3.0.0)", "testfixtures (>=6.10.3)", "tweepy (>=3.5,<4.0)"] +all-orchestration-extras = ["PyGithub (>=1.51,<2.0)", "atlassian-python-api (>=2.0.1)", "azure-storage-blob (>=12.1.0,<13.0)", "boto3 (>=1.9,<2.0)", "dulwich (>=0.19.7)", "google-auth (>=2.0,<3.0)", "google-cloud-aiplatform (>=1.4.0,<2.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13,<2.0)", "kubernetes (>=9.0.0a1,<=13.0)", "python-gitlab (>=2.5.0,<3.0)"] +aws = ["boto3 (>=1.9,<2.0)"] +azure = ["azure-cosmos (>=3.1.1,<3.2)", "azure-storage-blob (>=12.1.0,<13.0)", "azureml-sdk (>=1.0.65,<1.1)"] +base-library-ci = ["PyGithub (>=1.51,<2.0)", "Pygments (>=2.2,<3.0)", "atlassian-python-api (>=2.0.1)", "azure-storage-blob (>=12.1.0,<13.0)", "black", "boto3 (>=1.9,<2.0)", "dulwich (>=0.19.7)", "flaky (>=3.0)", "google-auth (>=2.0,<3.0)", "google-cloud-aiplatform (>=1.4.0,<2.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13,<2.0)", "graphviz (>=0.8)", "jinja2 (>=2.0,<4.0)", "jira (>=2.0.0)", "kubernetes (>=9.0.0a1,<=13.0)", "mypy (>=0.600,<0.813)", "pandas (>=1.0.1)", "pytest (>=6.0)", "pytest-env (>=0.6.0)", "pytest-xdist (>=2.0)", "python-gitlab (>=2.5.0,<3.0)", "responses (>=0.14.0)", "testfixtures (>=6.10.3)"] bitbucket = ["atlassian-python-api (>=2.0.1)"] -cubejs = ["PyJWT (>=2.3.0)"] dask-cloudprovider = 
["dask-cloudprovider[aws] (>=0.2.0)"] -databricks = ["pydantic (>=1.9.0)"] -dev = ["PyJWT (>=2.3.0)", "Pygments (>=2.2,<3.0)", "black", "flaky (>=3.0)", "freezegun (>=1.0.0)", "graphviz (>=0.8)", "jinja2 (>=2.0,<4.0)", "pytest (>=6.0)", "pytest-env (>=0.6.0)", "pytest-xdist (>=2.0)", "responses (>=0.14.0)", "testfixtures (>=6.10.3)"] +dev = ["Pygments (>=2.2,<3.0)", "black", "flaky (>=3.0)", "graphviz (>=0.8)", "jinja2 (>=2.0,<4.0)", "mypy (>=0.600,<0.813)", "pytest (>=6.0)", "pytest-env (>=0.6.0)", "pytest-xdist (>=2.0)", "responses (>=0.14.0)", "testfixtures (>=6.10.3)"] dremio = ["pyarrow (>=5.0.0)"] dropbox = ["dropbox (>=9.0,<10.0)"] exasol = ["pyexasol (>=0.16.1)"] -firebolt = ["firebolt-sdk (>=0.2.1)"] -gcp = ["google-auth (>=2.0)", "google-cloud-aiplatform (>=1.4.0)", "google-cloud-bigquery (>=1.6.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13)"] -ge = ["great-expectations (>=0.13.8)", "sqlalchemy-redshift (>=0.8.11)"] +gcp = ["google-auth (>=2.0,<3.0)", "google-cloud-aiplatform (>=1.4.0,<2.0)", "google-cloud-bigquery (>=1.6.0,<3.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13,<2.0)"] +ge = ["great-expectations (>=0.11.1)"] git = ["dulwich (>=0.19.7)"] -github = ["PyGithub (>=1.51)"] -gitlab = ["python-gitlab (>=2.5.0)"] -google = ["google-auth (>=2.0)", "google-cloud-aiplatform (>=1.4.0)", "google-cloud-bigquery (>=1.6.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13)"] +github = ["PyGithub (>=1.51,<2.0)"] +gitlab = ["python-gitlab (>=2.5.0,<3.0)"] +google = ["google-auth (>=2.0,<3.0)", "google-cloud-aiplatform (>=1.4.0,<2.0)", "google-cloud-bigquery (>=1.6.0,<3.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13,<2.0)"] gsheets = ["gspread (>=3.6.0)"] jira = ["jira (>=2.0.0)"] -jupyter = ["ipykernel (>=6.9.2)", "nbconvert (>=6.0.7)", "papermill (>=2.2.0)"] +jupyter = ["nbconvert (>=6.0.7)", "papermill (>=2.2.0)"] kafka = ["confluent-kafka (>=1.7.0)"] -kubernetes = ["dask-kubernetes (>=0.8.0)", "kubernetes (>=9.0.0a1)"] +kubernetes = ["dask-kubernetes (>=0.8.0)", "kubernetes (>=9.0.0a1,<=13.0)"] mysql = ["pymysql (>=0.9.3)"] -neo4j = ["py2neo (>=2021.2.3)"] pandas = ["pandas (>=1.0.1)"] postgres = ["psycopg2-binary (>=2.8.2)"] prometheus = ["prometheus-client (>=0.9.0)"] pushbullet = ["pushbullet.py (>=0.11.0)"] redis = ["redis (>=3.2.1)"] -rss = ["feedparser (>=5.0.1)"] +rss = ["feedparser (>=5.0.1,<6.0)"] sendgrid = ["sendgrid (>=6.7.0)"] -sftp = ["paramiko (>=2.10.4)"] -snowflake = ["snowflake-connector-python (>=1.8.2)"] -sodaspark = ["soda-spark (>=0.2.1)"] +snowflake = ["snowflake-connector-python (>=1.8.2,<2.5)"] sodasql = ["soda-sql (>=2.0.0b25)"] -spacy = ["spacy (>=2.0.0)"] +spacy = ["spacy (>=2.0.0,<3.0.0)"] sql-server = ["pyodbc (>=4.0.30)"] -task-library-ci = ["PyGithub (>=1.51)", "PyJWT (>=2.3.0)", "Pygments (>=2.2,<3.0)", "airtable-python-wrapper (>=0.11)", "atlassian-python-api (>=2.0.1)", "azure-core (>=1.10.0)", "azure-cosmos (>=3.1.1)", "azure-identity (>=1.7.0)", "azure-mgmt-datafactory (>=2.7.0)", "azure-storage-blob (>=12.1.0)", "azureml-sdk", "black", "boto3 (>=1.9)", "confluent-kafka (>=1.7.0)", "dask-kubernetes (>=0.8.0)", "dropbox (>=9.0,<10.0)", "dulwich (>=0.19.7)", "feedparser (>=5.0.1)", "firebolt-sdk (>=0.2.1)", "flaky (>=3.0)", "freezegun (>=1.0.0)", "google-auth (>=2.0)", "google-cloud-aiplatform (>=1.4.0)", "google-cloud-bigquery (>=1.6.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13)", "graphviz (>=0.8)", 
"graphviz (>=0.8.3)", "great-expectations (>=0.13.8)", "gspread (>=3.6.0)", "hvac (>=0.10)", "ipykernel (>=6.9.2)", "jinja2 (>=2.0)", "jinja2 (>=2.0,<4.0)", "jira (>=2.0.0)", "kubernetes (>=9.0.0a1)", "nbconvert (>=6.0.7)", "pandas (>=1.0.1)", "papermill (>=2.2.0)", "paramiko (>=2.10.4)", "prometheus-client (>=0.9.0)", "psycopg2-binary (>=2.8.2)", "pushbullet.py (>=0.11.0)", "py2neo (>=2021.2.3)", "pyarrow (>=5.0.0)", "pydantic (>=1.9.0)", "pyexasol (>=0.16.1)", "pymysql (>=0.9.3)", "pytest (>=6.0)", "pytest-env (>=0.6.0)", "pytest-xdist (>=2.0)", "python-gitlab (>=2.5.0)", "redis (>=3.2.1)", "responses (>=0.14.0)", "sendgrid (>=6.7.0)", "snowflake-connector-python (>=1.8.2)", "spacy (>=2.0.0)", "sqlalchemy-redshift (>=0.8.11)", "testfixtures (>=6.10.3)", "toloka-kit (>=0.1.25)", "transform (>=1.0.12)", "tweepy (>=3.5)"] -templates = ["jinja2 (>=2.0)"] -test = ["PyJWT (>=2.3.0)", "flaky (>=3.0)", "freezegun (>=1.0.0)", "pytest (>=6.0)", "pytest-env (>=0.6.0)", "pytest-xdist (>=2.0)", "responses (>=0.14.0)", "testfixtures (>=6.10.3)"] -toloka = ["toloka-kit (>=0.1.25)"] -transform = ["transform (>=1.0.12)"] -twitter = ["tweepy (>=3.5)"] +task-library-ci = ["PyGithub (>=1.51,<2.0)", "Pygments (>=2.2,<3.0)", "airtable-python-wrapper (>=0.11,<0.12)", "atlassian-python-api (>=2.0.1)", "azure-cosmos (>=3.1.1,<3.2)", "azure-storage-blob (>=12.1.0,<13.0)", "azureml-sdk (>=1.0.65,<1.1)", "black", "boto3 (>=1.9,<2.0)", "confluent-kafka (>=1.7.0)", "dask-kubernetes (>=0.8.0)", "dropbox (>=9.0,<10.0)", "dulwich (>=0.19.7)", "feedparser (>=5.0.1,<6.0)", "flaky (>=3.0)", "google-auth (>=2.0,<3.0)", "google-cloud-aiplatform (>=1.4.0,<2.0)", "google-cloud-bigquery (>=1.6.0,<3.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13,<2.0)", "graphviz (>=0.8)", "graphviz (>=0.8.3)", "great-expectations (>=0.11.1)", "gspread (>=3.6.0)", "hvac (>=0.10)", "jinja2 (>=2.0,<4.0)", "jira (>=2.0.0)", "kubernetes (>=9.0.0a1,<=13.0)", "mypy (>=0.600,<0.813)", "nbconvert (>=6.0.7)", "pandas (>=1.0.1)", "papermill (>=2.2.0)", "prometheus-client (>=0.9.0)", "psycopg2-binary (>=2.8.2)", "pushbullet.py (>=0.11.0)", "pyarrow (>=5.0.0)", "pyexasol (>=0.16.1)", "pymysql (>=0.9.3)", "pytest (>=6.0)", "pytest-env (>=0.6.0)", "pytest-xdist (>=2.0)", "python-gitlab (>=2.5.0,<3.0)", "redis (>=3.2.1)", "responses (>=0.14.0)", "sendgrid (>=6.7.0)", "snowflake-connector-python (>=1.8.2,<2.5)", "soda-sql (>=2.0.0b25)", "spacy (>=2.0.0,<3.0.0)", "testfixtures (>=6.10.3)", "tweepy (>=3.5,<4.0)"] +templates = ["jinja2 (>=2.0,<4.0)"] +test = ["flaky (>=3.0)", "pytest (>=6.0)", "pytest-env (>=0.6.0)", "pytest-xdist (>=2.0)", "responses (>=0.14.0)", "testfixtures (>=6.10.3)"] +twitter = ["tweepy (>=3.5,<4.0)"] vault = ["hvac (>=0.10)"] viz = ["graphviz (>=0.8.3)"] @@ -4164,6 +4356,7 @@ viz = ["graphviz (>=0.8.3)"] name = "prometheus-client" version = "0.14.1" description = "Python client for the Prometheus monitoring system." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -4178,6 +4371,7 @@ twisted = ["twisted"] name = "prometheus-flask-exporter" version = "0.20.3" description = "Prometheus metrics exporter for Flask" +category = "main" optional = false python-versions = "*" files = [ @@ -4193,6 +4387,7 @@ prometheus-client = "*" name = "proto-plus" version = "1.22.1" description = "Beautiful, Pythonic protocol buffers." 
+category = "main" optional = false python-versions = ">=3.6" files = [ @@ -4210,6 +4405,7 @@ testing = ["google-api-core[grpc] (>=1.31.5)"] name = "protobuf" version = "3.20.2" description = "Protocol Buffers" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4241,6 +4437,7 @@ files = [ name = "psutil" version = "5.9.2" description = "Cross-platform lib for process and system monitoring in Python." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -4285,6 +4482,7 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] name = "ptyprocess" version = "0.7.0" description = "Run a subprocess in a pseudo terminal" +category = "main" optional = false python-versions = "*" files = [ @@ -4296,6 +4494,7 @@ files = [ name = "py" version = "1.11.0" description = "library with cross-python path, ini-parsing, io, code, log facilities" +category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -4307,6 +4506,7 @@ files = [ name = "py-grpc-prometheus" version = "0.7.0" description = "Python gRPC Prometheus Interceptors" +category = "main" optional = false python-versions = "*" files = [ @@ -4323,6 +4523,7 @@ setuptools = ">=39.0.1" name = "pyaml" version = "20.4.0" description = "PyYAML-based module to produce pretty and readable YAML-serialized data" +category = "main" optional = false python-versions = "*" files = [ @@ -4337,6 +4538,7 @@ PyYAML = "*" name = "pyarrow" version = "6.0.0" description = "Python library for Apache Arrow" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -4385,6 +4587,7 @@ numpy = ">=1.16.6" name = "pyasn1" version = "0.4.8" description = "ASN.1 types and codecs" +category = "main" optional = false python-versions = "*" files = [ @@ -4396,6 +4599,7 @@ files = [ name = "pyasn1-modules" version = "0.2.8" description = "A collection of ASN.1-based protocols modules." +category = "main" optional = false python-versions = "*" files = [ @@ -4410,6 +4614,7 @@ pyasn1 = ">=0.4.6,<0.5.0" name = "pycodestyle" version = "2.8.0" description = "Python style guide checker" +category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -4421,6 +4626,7 @@ files = [ name = "pycparser" version = "2.21" description = "C parser in Python" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -4432,6 +4638,7 @@ files = [ name = "pydantic" version = "1.10.2" description = "Data validation and settings management using python type hints" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4484,6 +4691,7 @@ email = ["email-validator (>=1.0.3)"] name = "pydata-google-auth" version = "1.4.0" description = "PyData helpers for authenticating to Google APIs" +category = "main" optional = false python-versions = "*" files = [ @@ -4500,6 +4708,7 @@ setuptools = "*" name = "pyflakes" version = "2.4.0" description = "passive checker of Python programs" +category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -4511,6 +4720,7 @@ files = [ name = "pygments" version = "2.13.0" description = "Pygments is a syntax highlighting package written in Python." 
+category = "main" optional = false python-versions = ">=3.6" files = [ @@ -4525,6 +4735,7 @@ plugins = ["importlib-metadata"] name = "pyjwt" version = "2.5.0" description = "JSON Web Token implementation in Python" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4546,6 +4757,7 @@ tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] name = "pylint" version = "2.13.9" description = "python code static checker" +category = "dev" optional = false python-versions = ">=3.6.2" files = [ @@ -4570,6 +4782,7 @@ testutil = ["gitpython (>3)"] name = "pymssql" version = "2.2.5" description = "DB-API interface to Microsoft SQL Server for Python. (new Cython-based version)" +category = "main" optional = false python-versions = "*" files = [ @@ -4630,6 +4843,7 @@ files = [ name = "pymysql" version = "1.0.2" description = "Pure Python MySQL Driver" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -4648,6 +4862,7 @@ rsa = ["cryptography"] name = "pynacl" version = "1.5.0" description = "Python binding to the Networking and Cryptography (NaCl) library" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -4674,6 +4889,7 @@ tests = ["hypothesis (>=3.27.0)", "pytest (>=3.2.1,!=3.3.0)"] name = "pyopenssl" version = "22.0.0" description = "Python wrapper module around the OpenSSL library" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -4692,6 +4908,7 @@ test = ["flaky", "pretend", "pytest (>=3.0.1)"] name = "pyparsing" version = "3.0.9" description = "pyparsing module - Classes and methods to define and execute parsing grammars" +category = "main" optional = false python-versions = ">=3.6.8" files = [ @@ -4706,6 +4923,7 @@ diagrams = ["jinja2", "railroad-diagrams"] name = "pyproj" version = "3.5.0" description = "Python interface to PROJ (cartographic projections and coordinate transformations library)" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -4753,6 +4971,7 @@ certifi = "*" name = "pyreadline3" version = "3.4.1" description = "A python implementation of GNU readline." +category = "main" optional = false python-versions = "*" files = [ @@ -4764,6 +4983,7 @@ files = [ name = "pysftp" version = "0.2.9" description = "A friendly face on SFTP" +category = "main" optional = false python-versions = "*" files = [ @@ -4777,6 +4997,7 @@ paramiko = ">=1.17" name = "pysocks" version = "1.7.1" description = "A Python SOCKS client module. See https://github.com/Anorov/PySocks for more information." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -4789,6 +5010,7 @@ files = [ name = "pytest" version = "6.0.2" description = "pytest: simple powerful testing with Python" +category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -4815,6 +5037,7 @@ testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xm name = "pytest-cov" version = "3.0.0" description = "Pytest plugin for measuring coverage." 
+category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -4833,6 +5056,7 @@ testing = ["fields", "hunter", "process-tests", "pytest-xdist", "six", "virtuale name = "python-box" version = "5.4.1" description = "Advanced Python dictionaries with dot notation access" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -4852,6 +5076,7 @@ yaml = ["ruamel.yaml"] name = "python-dateutil" version = "2.8.2" description = "Extensions to the standard Python datetime module" +category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ @@ -4866,6 +5091,7 @@ six = ">=1.5" name = "python-dotenv" version = "0.21.0" description = "Read key-value pairs from a .env file and set them as environment variables" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4880,6 +5106,7 @@ cli = ["click (>=5.0)"] name = "python-slugify" version = "6.1.2" description = "A Python slugify application that also handles Unicode" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ @@ -4897,6 +5124,7 @@ unidecode = ["Unidecode (>=1.1.1)"] name = "python-string-utils" version = "1.0.0" description = "Utility functions for strings validation and manipulation." +category = "dev" optional = false python-versions = ">=3.5" files = [] @@ -4912,6 +5140,7 @@ resolved_reference = "78929d88d90b1f90cb4837528ed955166bf0f559" name = "python-telegram-bot" version = "13.14" description = "We have made you a wrapper you can't refuse" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4935,6 +5164,7 @@ socks = ["PySocks"] name = "pytz" version = "2021.3" description = "World timezone definitions, modern and historical" +category = "main" optional = false python-versions = "*" files = [ @@ -4946,6 +5176,7 @@ files = [ name = "pytz-deprecation-shim" version = "0.1.0.post0" description = "Shims to make deprecation of pytz easier" +category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ @@ -4961,6 +5192,7 @@ tzdata = {version = "*", markers = "python_version >= \"3.6\""} name = "pytzdata" version = "2020.1" description = "The Olson timezone database for Python." 
+category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -4972,6 +5204,7 @@ files = [ name = "pywin32" version = "227" description = "Python for Window Extensions" +category = "main" optional = false python-versions = "*" files = [ @@ -4993,6 +5226,7 @@ files = [ name = "pyyaml" version = "6.0" description = "YAML parser and emitter for Python" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -5042,6 +5276,7 @@ files = [ name = "querystring-parser" version = "1.2.4" description = "QueryString parser for Python/Django that correctly handles nested dictionaries" +category = "main" optional = false python-versions = "*" files = [ @@ -5056,6 +5291,7 @@ six = "*" name = "rasterio" version = "1.2.10" description = "Fast and direct raster I/O for use with Numpy and SciPy" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -5093,6 +5329,7 @@ test = ["boto3 (>=1.2.4)", "hypothesis", "packaging", "pytest (>=2.8.2)", "pytes name = "redis" version = "4.3.4" description = "Python client for Redis database and key-value store" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -5113,6 +5350,7 @@ ocsp = ["cryptography (>=36.0.1)", "pyopenssl (==20.0.1)", "requests (>=2.26.0)" name = "redis-pal" version = "1.0.0" description = "Store things in Redis without worrying about types or anything, just do it!" +category = "main" optional = false python-versions = ">=3.8,<4.0" files = [ @@ -5128,6 +5366,7 @@ redis = ">=4.0,<5.0" name = "regex" version = "2022.9.13" description = "Alternative regular expression module, to replace re." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -5225,6 +5464,7 @@ files = [ name = "requests" version = "2.26.0" description = "Python HTTP for Humans." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ @@ -5247,6 +5487,7 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<5)"] name = "requests-auth-aws-sigv4" version = "0.7" description = "AWS SigV4 Authentication with the python requests module" +category = "main" optional = false python-versions = ">=2.7, >=3.6" files = [ @@ -5261,6 +5502,7 @@ requests = "*" name = "requests-oauthlib" version = "1.3.1" description = "OAuthlib authentication support for Requests." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -5279,6 +5521,7 @@ rsa = ["oauthlib[signedtoken] (>=3.0.0)"] name = "rioxarray" version = "0.9.0" description = "rasterio xarray extension." 
+category = "main" optional = false python-versions = ">=3.7" files = [ @@ -5301,6 +5544,7 @@ interp = ["scipy"] name = "rsa" version = "4.9" description = "Pure-Python RSA implementation" +category = "main" optional = false python-versions = ">=3.6,<4" files = [ @@ -5315,6 +5559,7 @@ pyasn1 = ">=0.1.3" name = "ruamel.yaml" version = "0.17.10" description = "ruamel.yaml is a YAML parser/emitter that supports roundtrip preservation of comments, seq/map flow style, and map key order" +category = "main" optional = false python-versions = ">=3" files = [ @@ -5333,6 +5578,7 @@ jinja2 = ["ruamel.yaml.jinja2 (>=0.2)"] name = "ruamel.yaml.clib" version = "0.2.6" description = "C version of reader, parser and emitter for ruamel.yaml derived from libyaml" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -5372,6 +5618,7 @@ files = [ name = "s3fs" version = "2022.8.2" description = "Convenient Filesystem interface over S3" +category = "main" optional = false python-versions = ">= 3.7" files = [ @@ -5392,6 +5639,7 @@ boto3 = ["aiobotocore[boto3] (>=2.4.0,<2.5.0)"] name = "s3transfer" version = "0.6.0" description = "An Amazon S3 Transfer Manager" +category = "main" optional = false python-versions = ">= 3.7" files = [ @@ -5409,6 +5657,7 @@ crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"] name = "scikit-learn" version = "1.2.2" description = "A set of python modules for machine learning and data mining" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -5451,6 +5700,7 @@ tests = ["black (>=22.3.0)", "flake8 (>=3.8.2)", "matplotlib (>=3.1.3)", "mypy ( name = "scipy" version = "1.8.1" description = "SciPy: Scientific Library for Python" +category = "main" optional = false python-versions = ">=3.8,<3.11" files = [ @@ -5486,6 +5736,7 @@ numpy = ">=1.17.3,<1.25.0" name = "scipy" version = "1.9.1" description = "SciPy: Scientific Library for Python" +category = "main" optional = false python-versions = ">=3.8,<3.12" files = [ @@ -5521,6 +5772,7 @@ numpy = ">=1.18.5,<1.25.0" name = "scp" version = "0.14.4" description = "scp module for paramiko" +category = "main" optional = false python-versions = "*" files = [ @@ -5535,6 +5787,7 @@ paramiko = "*" name = "secretstorage" version = "3.3.3" description = "Python bindings to FreeDesktop.org Secret Service API" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -5550,6 +5803,7 @@ jeepney = ">=0.6" name = "setuptools" version = "65.6.3" description = "Easily download, build, install, upgrade, and uninstall Python packages" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -5566,6 +5820,7 @@ testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs ( name = "shapely" version = "1.8.4" description = "Geometric objects, predicates, and operations" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -5614,6 +5869,7 @@ vectorized = ["numpy"] name = "simplejson" version = "3.18.0" description = "Simple, fast, extensible JSON encoder/decoder for Python" +category = "main" optional = false python-versions = ">=2.5, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -5684,6 +5940,7 @@ files = [ name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -5695,6 +5952,7 @@ files = [ name = "smmap" version = "5.0.0" description = "A pure Python implementation of a sliding window memory map manager" +category = "main" 
optional = false python-versions = ">=3.6" files = [ @@ -5706,6 +5964,7 @@ files = [ name = "sniffio" version = "1.3.0" description = "Sniff out which async library your code is running under" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -5717,6 +5976,7 @@ files = [ name = "snuggs" version = "1.4.7" description = "Snuggs are s-expressions for Numpy" +category = "main" optional = false python-versions = "*" files = [ @@ -5735,6 +5995,7 @@ test = ["hypothesis", "pytest"] name = "sortedcontainers" version = "2.4.0" description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" +category = "main" optional = false python-versions = "*" files = [ @@ -5746,6 +6007,7 @@ files = [ name = "soupsieve" version = "2.3.2.post1" description = "A modern CSS selector implementation for Beautiful Soup." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -5757,6 +6019,7 @@ files = [ name = "sqlalchemy" version = "1.4.41" description = "Database Abstraction Library" +category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ @@ -5804,7 +6067,7 @@ files = [ ] [package.dependencies] -greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\")"} +greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"} [package.extras] aiomysql = ["aiomysql", "greenlet (!=0.4.17)"] @@ -5831,6 +6094,7 @@ sqlcipher = ["sqlcipher3-binary"] name = "sqlparse" version = "0.4.3" description = "A non-validating SQL parser." +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -5842,6 +6106,7 @@ files = [ name = "starlette" version = "0.20.4" description = "The little ASGI library that shines." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -5860,6 +6125,7 @@ full = ["itsdangerous", "jinja2", "python-multipart", "pyyaml", "requests"] name = "starlette-exporter" version = "0.14.0" description = "Prometheus metrics exporter for Starlette applications." 
+category = "main" optional = false python-versions = "*" files = [ @@ -5875,6 +6141,7 @@ starlette = "*" name = "statsmodels" version = "0.13.2" description = "Statistical computations and models for Python" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -5919,6 +6186,7 @@ docs = ["ipykernel", "jupyter-client", "matplotlib", "nbconvert", "nbformat", "n name = "statsmodels" version = "0.13.5" description = "Statistical computations and models for Python" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -5956,7 +6224,7 @@ pandas = ">=0.25" patsy = ">=0.5.2" scipy = [ {version = ">=1.3", markers = "(python_version > \"3.9\" or platform_system != \"Windows\" or platform_machine != \"x86\") and python_version < \"3.12\""}, - {version = ">=1.3,<1.9", markers = "(python_version == \"3.8\" or python_version == \"3.9\") and platform_system == \"Windows\" and platform_machine == \"x86\""}, + {version = ">=1.3,<1.9", markers = "python_version == \"3.8\" and platform_system == \"Windows\" and platform_machine == \"x86\" or python_version == \"3.9\" and platform_system == \"Windows\" and platform_machine == \"x86\""}, ] [package.extras] @@ -5968,6 +6236,7 @@ docs = ["ipykernel", "jupyter-client", "matplotlib", "nbconvert", "nbformat", "n name = "tabulate" version = "0.8.10" description = "Pretty-print tabular data" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -5982,6 +6251,7 @@ widechars = ["wcwidth"] name = "tblib" version = "1.7.0" description = "Traceback serialization library." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -5993,6 +6263,7 @@ files = [ name = "tenacity" version = "8.2.2" description = "Retry code until it succeeds" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -6007,6 +6278,7 @@ doc = ["reno", "sphinx", "tornado (>=4.5)"] name = "text-unidecode" version = "1.3" description = "The most basic Text::Unidecode port" +category = "main" optional = false python-versions = "*" files = [ @@ -6018,6 +6290,7 @@ files = [ name = "threadpoolctl" version = "3.1.0" description = "threadpoolctl" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -6029,6 +6302,7 @@ files = [ name = "toml" version = "0.10.2" description = "Python Library for Tom's Obvious, Minimal Language" +category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -6040,6 +6314,7 @@ files = [ name = "tomli" version = "2.0.1" description = "A lil' TOML parser" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -6051,6 +6326,7 @@ files = [ name = "tomlkit" version = "0.7.0" description = "Style preserving TOML library" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -6062,6 +6338,7 @@ files = [ name = "toolz" version = "0.12.0" description = "List processing tools and functional utilities" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -6073,6 +6350,7 @@ files = [ name = "tornado" version = "6.1" description = "Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed." 
+category = "main" optional = false python-versions = ">= 3.5" files = [ @@ -6123,6 +6401,7 @@ files = [ name = "tqdm" version = "4.50.2" description = "Fast, Extensible Progress Meter" +category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*" files = [ @@ -6137,6 +6416,7 @@ dev = ["argopt", "py-make (>=0.1.0)", "pydoc-markdown", "twine"] name = "tweepy" version = "4.4.0" description = "Twitter library for Python" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -6158,6 +6438,7 @@ test = ["vcrpy (>=1.10.3)"] name = "typed-ast" version = "1.5.4" description = "a fork of Python 2 and 3 ast modules with type comment support" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -6191,6 +6472,7 @@ files = [ name = "typer" version = "0.4.2" description = "Typer, build great CLIs. Easy to code. Based on Python type hints." +category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -6211,6 +6493,7 @@ test = ["black (>=22.3.0,<23.0.0)", "coverage (>=5.2,<6.0)", "isort (>=5.0.6,<6. name = "types-cryptography" version = "3.3.23" description = "Typing stubs for cryptography" +category = "main" optional = false python-versions = "*" files = [ @@ -6222,6 +6505,7 @@ files = [ name = "typing-extensions" version = "4.3.0" description = "Backported and Experimental Type Hints for Python 3.7+" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -6233,6 +6517,7 @@ files = [ name = "tzdata" version = "2022.4" description = "Provider of IANA time zone data" +category = "main" optional = false python-versions = ">=2" files = [ @@ -6244,6 +6529,7 @@ files = [ name = "tzlocal" version = "4.2" description = "tzinfo object for the local timezone" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -6264,6 +6550,7 @@ test = ["pytest (>=4.3)", "pytest-mock (>=3.3)"] name = "unidecode" version = "1.3.6" description = "ASCII transliterations of Unicode text" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -6275,6 +6562,7 @@ files = [ name = "uritemplate" version = "4.1.1" description = "Implementation of RFC 6570 URI Templates" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -6286,6 +6574,7 @@ files = [ name = "urllib3" version = "1.26.12" description = "HTTP library with thread-safe connection pooling, file post, and more." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4" files = [ @@ -6302,6 +6591,7 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] name = "uvicorn" version = "0.18.3" description = "The lightning-fast ASGI server." 
+category = "main" optional = false python-versions = ">=3.7" files = [ @@ -6320,6 +6610,7 @@ standard = ["colorama (>=0.4)", "httptools (>=0.4.0)", "python-dotenv (>=0.13)", name = "uvloop" version = "0.17.0" description = "Fast implementation of asyncio event loop on top of libuv" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -6364,6 +6655,7 @@ test = ["Cython (>=0.29.32,<0.30.0)", "aiohttp", "flake8 (>=3.9.2,<3.10.0)", "my name = "virtualenv" version = "20.16.5" description = "Virtual Python Environment builder" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -6384,6 +6676,7 @@ testing = ["coverage (>=6.2)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7 name = "waitress" version = "2.1.2" description = "Waitress WSGI server" +category = "main" optional = false python-versions = ">=3.7.0" files = [ @@ -6399,6 +6692,7 @@ testing = ["coverage (>=5.0)", "pytest", "pytest-cover"] name = "webencodings" version = "0.5.1" description = "Character encoding aliases for legacy web content" +category = "main" optional = false python-versions = "*" files = [ @@ -6410,6 +6704,7 @@ files = [ name = "websocket-client" version = "1.4.1" description = "WebSocket client for Python with low level API options" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -6426,6 +6721,7 @@ test = ["websockets"] name = "werkzeug" version = "2.2.2" description = "The comprehensive WSGI web application library." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -6443,6 +6739,7 @@ watchdog = ["watchdog"] name = "win32-setctime" version = "1.1.0" description = "A small Python utility to set file creation time on Windows" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -6457,6 +6754,7 @@ dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"] name = "wrapt" version = "1.14.1" description = "Module for decorators, wrappers and monkey patching." 
+category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" files = [ @@ -6530,6 +6828,7 @@ files = [ name = "xarray" version = "2022.12.0" description = "N-D labeled arrays and datasets in Python" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -6554,6 +6853,7 @@ viz = ["matplotlib", "nc-time-axis", "seaborn"] name = "xgboost" version = "1.7.4" description = "XGBoost Python Package" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -6581,6 +6881,7 @@ scikit-learn = ["scikit-learn"] name = "xmltodict" version = "0.13.0" description = "Makes working with XML feel like you are working with JSON" +category = "main" optional = false python-versions = ">=3.4" files = [ @@ -6592,6 +6893,7 @@ files = [ name = "yarl" version = "1.8.1" description = "Yet another URL library" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -6664,6 +6966,7 @@ multidict = ">=4.0" name = "zict" version = "2.2.0" description = "Mutable mapping tools" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -6678,6 +6981,7 @@ heapdict = "*" name = "zipfile36" version = "0.1.3" description = "Read and write ZIP files - backport of the zipfile module from Python 3.6" +category = "main" optional = false python-versions = "*" files = [ @@ -6689,6 +6993,7 @@ files = [ name = "zipp" version = "3.8.1" description = "Backport of pathlib-compatible object wrapper for zip files" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -6703,4 +7008,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.11" -content-hash = "0a3a2f0a8b8edd2dc52008104299b57b4d924d89b6e4ea9cd85db9f740c6ad12" +content-hash = "d97502ee5e3adfbb7720ca8a746e722b3bbe97b1ed2fcd4122fa9c6f38a67496" diff --git a/pyproject.toml b/pyproject.toml index 865edb4b7..805fd7f8a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ pandas = "1.5.2" pandas-read-xml = "^0.3.1" pendulum = "2.1.2" phonenumbers = "^8.12.57" -prefect = "1.4.1" +prefect = "0.15.9" pymssql = "^2.2.4" python = ">=3.8,<3.11" python-telegram-bot = "^13.11" diff --git a/requirements-deploy.txt b/requirements-deploy.txt index b01918fe5..bf5e18d7d 100644 --- a/requirements-deploy.txt +++ b/requirements-deploy.txt @@ -1,7 +1,7 @@ google-cloud-storage==1.42.3 loguru==0.6.0 poetry==1.1.13 -prefect==1.4.1 +prefect==0.15.9 typer==0.4.0 networkx==2.6.3 pytest-cov==3.0.0 diff --git a/requirements-test.txt b/requirements-test.txt index 339540e45..f6bd03c70 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,4 +1,4 @@ networkx==2.6.3 pytest-cov==3.0.0 pyyaml -prefect==1.4.1 \ No newline at end of file +prefect==0.15.9 \ No newline at end of file From b211c2a6fae95c950f9aa783ddabf9608f11cb18 Mon Sep 17 00:00:00 2001 From: Rodrigo Cunha <66736583+eng-rodrigocunha@users.noreply.github.com> Date: Thu, 6 Jul 2023 16:54:03 -0300 Subject: [PATCH 05/59] add code_owners --- pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py index c72910b1f..fa97cc83d 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py @@ -243,7 +243,7 @@ with Flow( "SMTR: STPL RDO - Captura", - code_owners=["caio", "fernanda"], + 
code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as captura_stpl_rdo: # SETUP transport_mode = Parameter("transport_mode", "STPL") From 871dc7033acfd6b958ce88dbd43bb639b5f68af9 Mon Sep 17 00:00:00 2001 From: Rodrigo Cunha <66736583+eng-rodrigocunha@users.noreply.github.com> Date: Thu, 6 Jul 2023 16:57:27 -0300 Subject: [PATCH 06/59] Add code_owners --- pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py index fa97cc83d..4a7b4aa08 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py @@ -195,7 +195,7 @@ with Flow( "SMTR: STPL RHO - Captura", - code_owners=["caio", "fernanda"], + code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as captura_stpl_rho: # SETUP transport_mode = Parameter("transport_mode", "STPL") From b4abc7e2bb38728812c2735cd0012ef6535be74e Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 25 Sep 2023 17:07:45 -0300 Subject: [PATCH 07/59] change flow names + change agent + remove redis --- .../rj_smtr/br_rj_riodejaneiro_rdo/flows.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py index 4a7b4aa08..b7be66945 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py @@ -97,7 +97,7 @@ ) with Flow( - "SMTR: RHO - Captura", + "SMTR: SPPO RHO - Captura", code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as captura_sppo_rho: # SETUP @@ -145,7 +145,7 @@ captura_sppo_rho.schedule = every_day with Flow( - "SMTR: RDO - Captura", + "SMTR: SPPO RDO - Captura", code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as captura_sppo_rdo: # SETUP @@ -230,14 +230,14 @@ partitions=partitions, status=status, ) - set_redis = update_rdo_redis( - download_files=download_files, table_id=table_id, errors=errors - ) + # set_redis = update_rdo_redis( + # download_files=download_files, table_id=table_id, errors=errors + # ) captura_stpl_rho.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) captura_stpl_rho.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) captura_stpl_rho.schedule = every_day @@ -278,14 +278,14 @@ partitions=partitions, status=status, ) - set_redis = update_rdo_redis( - download_files=download_files, table_id=table_id, errors=errors - ) + # set_redis = update_rdo_redis( + # download_files=download_files, table_id=table_id, errors=errors + # ) captura_stpl_rdo.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) captura_stpl_rdo.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) captura_stpl_rdo.schedule = every_day From 9e2425221cbe2e0108e8718888f382fdb84223a5 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 25 Sep 2023 18:11:19 -0300 Subject: [PATCH 08/59] update file list --- pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py index 987bb8abd..1594e33f9 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py +++ 
b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py @@ -69,6 +69,9 @@ def get_file_paths_from_ftp( files.append(file_info) # except Exception as e: # pylint: disable=W0703 # return [{"error": e}] + + files = files[:10] + log(f"There are {len(files)} files at the FTP") return files From b6089fa2bf7fd75fd5abb1fb8affe74eb63f3ff8 Mon Sep 17 00:00:00 2001 From: fernandascovino Date: Mon, 25 Sep 2023 19:09:32 -0300 Subject: [PATCH 09/59] remove task de particao nao usada --- pipelines/rj_smtr/tasks.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index de52c03df..983f93fbf 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -181,34 +181,6 @@ def parse_timestamp_to_string(timestamp: datetime, pattern="%Y-%m-%d-%H-%M-%S") return timestamp.strftime(pattern) -@task -def create_current_date_hour_partition(capture_time=None): - """Create partitioned directory structure to save data locally based - on capture time. - - Args: - capture_time(pendulum.datetime.DateTime, optional): - if recapturing data, will create partitions based - on the failed timestamps being recaptured - - Returns: - dict: "filename" contains the name which to upload the csv, "partitions" contains - the partitioned directory path - """ - if capture_time is None: - capture_time = datetime.now(tz=constants.TIMEZONE.value).replace( - minute=0, second=0, microsecond=0 - ) - date = capture_time.strftime("%Y-%m-%d") - hour = capture_time.strftime("%H") - - return { - "filename": capture_time.strftime("%Y-%m-%d-%H-%M-%S"), - "partitions": f"data={date}/hora={hour}", - "timestamp": capture_time, - } - - @task def create_local_partition_path( dataset_id: str, table_id: str, filename: str, partitions: str = None From dc197ccac6d2be6af8b6025974cbdd6e8c826041 Mon Sep 17 00:00:00 2001 From: fernandascovino Date: Mon, 25 Sep 2023 19:17:54 -0300 Subject: [PATCH 10/59] unifica tasks de particao de data e hora --- pipelines/rj_smtr/constants.py | 11 +++++------ pipelines/rj_smtr/flows.py | 12 ++---------- pipelines/rj_smtr/tasks.py | 15 +++++---------- pipelines/rj_smtr/veiculo/flows.py | 6 +++--- 4 files changed, 15 insertions(+), 29 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 7133b8abe..b22c4a412 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -180,8 +180,7 @@ class constants(Enum): # pylint: disable=c0103 ORDER BY data_processamento """, - "primary_key": ["id"], # id column to nest data on - "flag_date_partition": False, + "primary_key": ["id"] # id column to nest data on }, ] BILHETAGEM_TABLES_PARAMS = [ @@ -199,7 +198,7 @@ class constants(Enum): # pylint: disable=c0103 DT_INCLUSAO """, "primary_key": ["CD_LINHA"], # id column to nest data on - "flag_date_partition": True, + "partition_date_only": True, }, { "table_id": "grupo", @@ -215,7 +214,7 @@ class constants(Enum): # pylint: disable=c0103 DT_INCLUSAO """, "primary_key": ["CD_GRUPO"], - "flag_date_partition": True, + "partition_date_only": True, }, { "table_id": "grupo_linha", @@ -231,7 +230,7 @@ class constants(Enum): # pylint: disable=c0103 DT_INCLUSAO """, "primary_key": ["CD_GRUPO", "CD_LINHA"], # id column to nest data on - "flag_date_partition": True, + "partition_date_only": True, }, { "table_id": "matriz_integracao", @@ -250,7 +249,7 @@ class constants(Enum): # pylint: disable=c0103 "cd_versao_matriz", "cd_integracao", ], # id column to nest data on - "flag_date_partition": True, + 
"partition_date_only": True, }, ] BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index f1d29ed10..bfe9d86e4 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -5,8 +5,7 @@ from prefect.run_configs import KubernetesRun from prefect.storage import GCS -from prefect import case, Parameter -from prefect.tasks.control_flow import merge +from prefect import Parameter # EMD Imports # @@ -19,7 +18,6 @@ # SMTR Imports # from pipelines.rj_smtr.tasks import ( - create_date_partition, create_date_hour_partition, create_local_partition_path, get_current_timestamp, @@ -66,13 +64,7 @@ dataset_id=dataset_id, ) - with case(table_params["flag_date_partition"], True): - date_partitions = create_date_partition(timestamp) - - with case(table_params["flag_date_partition"], False): - date_hour_partitions = create_date_hour_partition(timestamp) - - partitions = merge(date_partitions, date_hour_partitions) + partitions = create_date_hour_partition(timestamp, partition_date_only=table_params["partition_date_only"]) filename = parse_timestamp_to_string(timestamp) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 983f93fbf..a2a5adddc 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -158,19 +158,14 @@ def get_current_timestamp(timestamp=None, truncate_minute: bool = True) -> datet @task -def create_date_hour_partition(timestamp: datetime) -> str: +def create_date_hour_partition(timestamp: datetime, partition_date_only: bool = False) -> str: """ Get date hour Hive partition structure from timestamp. """ - return f"data={timestamp.strftime('%Y-%m-%d')}/hora={timestamp.strftime('%H')}" - - -@task -def create_date_partition(timestamp: datetime) -> str: - """ - Get date hour Hive partition structure from timestamp. 
- """ - return f"data={timestamp.date()}" + partition = f"data={timestamp.strftime('%Y-%m-%d')}" + if partition_date_only: + parition += f"/hora={timestamp.strftime('%H')}" + return partition @task diff --git a/pipelines/rj_smtr/veiculo/flows.py b/pipelines/rj_smtr/veiculo/flows.py index 28188a129..e1fab515e 100644 --- a/pipelines/rj_smtr/veiculo/flows.py +++ b/pipelines/rj_smtr/veiculo/flows.py @@ -30,7 +30,7 @@ every_day_hour_seven, ) from pipelines.rj_smtr.tasks import ( - create_date_partition, + create_date_hour_partition, create_local_partition_path, get_current_timestamp, get_raw, @@ -71,7 +71,7 @@ ) # SETUP # - partitions = create_date_partition(timestamp) + partitions = create_date_hour_partition(timestamp, partition_date_only=True) filename = parse_timestamp_to_string(timestamp) @@ -140,7 +140,7 @@ ) # SETUP # - partitions = create_date_partition(timestamp) + partitions = create_date_hour_partition(timestamp, partition_date_only=True) filename = parse_timestamp_to_string(timestamp) From 66e84a1e2b2b24ead92842b604c2210238fb037b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 25 Sep 2023 22:22:31 +0000 Subject: [PATCH 11/59] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pipelines/rj_smtr/constants.py | 2 +- pipelines/rj_smtr/flows.py | 4 +++- pipelines/rj_smtr/tasks.py | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index b22c4a412..93303e5b7 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -180,7 +180,7 @@ class constants(Enum): # pylint: disable=c0103 ORDER BY data_processamento """, - "primary_key": ["id"] # id column to nest data on + "primary_key": ["id"], # id column to nest data on }, ] BILHETAGEM_TABLES_PARAMS = [ diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index bfe9d86e4..87d506813 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -64,7 +64,9 @@ dataset_id=dataset_id, ) - partitions = create_date_hour_partition(timestamp, partition_date_only=table_params["partition_date_only"]) + partitions = create_date_hour_partition( + timestamp, partition_date_only=table_params["partition_date_only"] + ) filename = parse_timestamp_to_string(timestamp) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index a2a5adddc..f35a9db72 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -158,7 +158,9 @@ def get_current_timestamp(timestamp=None, truncate_minute: bool = True) -> datet @task -def create_date_hour_partition(timestamp: datetime, partition_date_only: bool = False) -> str: +def create_date_hour_partition( + timestamp: datetime, partition_date_only: bool = False +) -> str: """ Get date hour Hive partition structure from timestamp. """ From 7cb436bc9d0fc7cf045ca56248ef58a63ed634e7 Mon Sep 17 00:00:00 2001 From: fernandascovino Date: Mon, 25 Sep 2023 19:29:50 -0300 Subject: [PATCH 12/59] corrige condicional --- pipelines/rj_smtr/tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index f35a9db72..e1a0d0c7d 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -165,8 +165,8 @@ def create_date_hour_partition( Get date hour Hive partition structure from timestamp. 
""" partition = f"data={timestamp.strftime('%Y-%m-%d')}" - if partition_date_only: - parition += f"/hora={timestamp.strftime('%H')}" + if not partition_date_only: + partition += f"/hora={timestamp.strftime('%H')}" return partition From 588fe7d3f3cc02500930d2bd94996152b51a5bce Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 26 Sep 2023 11:20:28 -0300 Subject: [PATCH 13/59] change capture flow --- pipelines/rj_smtr/constants.py | 1 + pipelines/rj_smtr/flows.py | 44 +++++++++----- pipelines/rj_smtr/tasks.py | 45 +++++++++++++++ pipelines/rj_smtr/utils.py | 101 +++++++++++++++++++++++++++++++++ pipelines/utils/custom.py | 10 ++-- pipelines/utils/utils.py | 15 ++++- 6 files changed, 196 insertions(+), 20 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 7133b8abe..34b63781a 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -182,6 +182,7 @@ class constants(Enum): # pylint: disable=c0103 """, "primary_key": ["id"], # id column to nest data on "flag_date_partition": False, + "source": "api", }, ] BILHETAGEM_TABLES_PARAMS = [ diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index f1d29ed10..e36c8e676 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -23,13 +23,13 @@ create_date_hour_partition, create_local_partition_path, get_current_timestamp, - get_raw, parse_timestamp_to_string, save_raw_local, save_treated_local, upload_logs_to_bq, bq_upload, transform_to_nested_structure, + get_raw, ) from pipelines.rj_smtr.tasks import ( @@ -37,6 +37,14 @@ get_datetime_range, ) +with Flow( + "SMTR: Pre-Treatment", + code_owners=["caio", "fernanda", "boris", "rodrigo"], +) as default_pre_treatment_flow: + # SETUP # + table_params = Parameter("table_params", default=None) + dataset_id = Parameter("dataset_id", default=None) + with Flow( "SMTR: Captura", @@ -59,13 +67,6 @@ now_time=timestamp, ) - request_params, request_url = create_request_params( - datetime_range=datetime_range, - table_params=table_params, - secret_path=secret_path, - dataset_id=dataset_id, - ) - with case(table_params["flag_date_partition"], True): date_partitions = create_date_partition(timestamp) @@ -83,11 +84,28 @@ partitions=partitions, ) - raw_status = get_raw( - url=request_url, - headers=secret_path, - params=request_params, - ) + raw_status_list = [] + + with case(table_params["source"], "api"): + request_params, request_url = create_request_params( + datetime_range=datetime_range, + table_params=table_params, + secret_path=secret_path, + dataset_id=dataset_id, + ) + + api_raw_status = get_raw( + url=request_url, + headers=secret_path, + params=request_params, + ) + + raw_status_list.append(api_raw_status) + + with case(table_params["source"], "gcs"): + pass + + raw_status = merge(*raw_status_list) raw_filepath = save_raw_local(status=raw_status, file_path=filepath) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index de52c03df..49c745076 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -28,6 +28,7 @@ get_last_run_timestamp, log_critical, data_info_str, + get_raw_data_api, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -960,3 +961,47 @@ def create_request_params( } return request_params, request_url + + +# @task(checkpoint=False) +# def get_raw_from_sources( +# source: str, +# url:str, +# dataset_id:str = None, +# table_id:str = None, +# mode:str = None, +# 
headers: str = None, +# filetype: str = "json", +# csv_args: dict = None, +# params: dict = None, +# ): +# if source == "api": +# return get_raw_data_api( +# url=url, +# headers=headers, +# filetype=filetype, +# csv_args=csv_args, +# params=params +# ) +# if source == "gcs": +# file = + + +@task(checkpoint=False) +def save_raw_storage( + dataset_id: str, + table_id: str, + raw_filepath: str, + partitions: str = None, +): + st_obj = Storage(table_id=table_id, dataset_id=dataset_id) + log( + f"""Uploading raw file to bucket {st_obj.bucket_name} at + {st_obj.bucket_name}/{dataset_id}/{table_id}""" + ) + st_obj.upload( + path=raw_filepath, + partitions=partitions, + mode="raw", + if_exists="replace", + ) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 9ddf7d687..3b3c7377d 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -14,6 +14,8 @@ from basedosdados import Table import pandas as pd import pytz +import requests +import zipfile from prefect.schedules.clocks import IntervalClock @@ -27,6 +29,8 @@ get_vault_secret, send_discord_message, get_redis_client, + get_storage_blobs, + get_storage_blob, ) @@ -445,3 +449,100 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l ) ) return clocks + + +def get_raw_data_api( # pylint: disable=R0912 + url: str, + headers: str = None, + filetype: str = "json", + csv_args: dict = None, + params: dict = None, +) -> list[dict]: + """ + Request data from URL API + + Args: + url (str): URL to send request + headers (str, optional): Path to headers guardeded on Vault, if needed. + filetype (str, optional): Filetype to be formatted (supported only: json, csv and txt) + csv_args (dict, optional): Arguments for read_csv, if needed + params (dict, optional): Params to be sent on request + + Returns: + dict: Conatining keys + * `data` (json): data result + * `error` (str): catched error, if any. Otherwise, returns None + """ + data = None + error = None + + try: + if headers is not None: + headers = get_vault_secret(headers)["data"] + + # remove from headers, if present + remove_headers = ["host", "databases"] + for remove_header in remove_headers: + if remove_header in list(headers.keys()): + del headers[remove_header] + + response = requests.get( + url, + headers=headers, + timeout=constants.MAX_TIMEOUT_SECONDS.value, + params=params, + ) + + if response.ok: # status code is less than 400 + if filetype == "json": + data = response.json() + + # todo: move to data check on specfic API # pylint: disable=W0102 + if isinstance(data, dict) and "DescricaoErro" in data.keys(): + error = data["DescricaoErro"] + + elif filetype in ("txt", "csv"): + if csv_args is None: + csv_args = {} + data = pd.read_csv(io.StringIO(response.text), **csv_args).to_dict( + orient="records" + ) + else: + error = ( + "Unsupported raw file extension. 
Supported only: json, csv and txt" + ) + + except Exception as exp: + error = exp + + if error is not None: + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return {"data": data, "error": error} + + +def get_raw_data_gcs( + dataset_id: str, table_id: str, file_name: str, mode: str, zip_file_name: str = None +) -> dict: + error = None + data = None + try: + if zip_file_name: + blob = get_storage_blob( + dataset_id=dataset_id, + table_id=table_id, + file_name=zip_file_name, + mode=mode, + ) + compressed_data = blob.download_as_bytes() + with zipfile.ZipFile(io.BytesIO(compressed_data), "r") as zipped_file: + data = zipped_file.read(file_name).decode(encoding="utf-8") + else: + blob = get_storage_blob( + dataset_id=dataset_id, table_id=table_id, file_name=file_name, mode=mode + ) + data = blob.download_as_string() + except Exception as exp: + error = exp + + return {"data": data, "error": error} diff --git a/pipelines/utils/custom.py b/pipelines/utils/custom.py index 13ae82dd5..d91739817 100644 --- a/pipelines/utils/custom.py +++ b/pipelines/utils/custom.py @@ -68,11 +68,11 @@ def __init__( # pylint: disable=too-many-arguments, too-many-locals edges=edges, reference_tasks=reference_tasks, state_handlers=state_handlers, - on_failure=partial( - notify_discord_on_failure, - secret_path=constants.EMD_DISCORD_WEBHOOK_SECRET_PATH.value, - code_owners=code_owners, - ), + # on_failure=partial( + # notify_discord_on_failure, + # secret_path=constants.EMD_DISCORD_WEBHOOK_SECRET_PATH.value, + # code_owners=code_owners, + # ), validate=validate, result=result, terminal_state_handler=terminal_state_handler, diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index efc21c133..7042709e9 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -711,7 +711,7 @@ def get_credentials_from_env( return cred -def get_storage_blobs(dataset_id: str, table_id: str) -> list: +def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> list: """ Get all blobs from a table in a dataset. 
""" @@ -720,7 +720,18 @@ def get_storage_blobs(dataset_id: str, table_id: str) -> list: return list( bd_storage.client["storage_staging"] .bucket(bd_storage.bucket_name) - .list_blobs(prefix=f"staging/{bd_storage.dataset_id}/{bd_storage.table_id}/") + .list_blobs(prefix=f"{mode}/{bd_storage.dataset_id}/{bd_storage.table_id}/") + ) + + +def get_storage_blob( + dataset_id: str, table_id: str, file_name: str, mode: str = "staging" +): + bd_storage = bd.Storage(dataset_id=dataset_id, table_id=table_id) + return ( + bd_storage.client["storage_staging"] + .bucket(bd_storage.bucket_name) + .get_blob(blob_name=f"{mode}/{dataset_id}/{table_id}/{file_name}") ) From 97746e1c34db7410a78a69e0b5ce4e7df4b12ad7 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 26 Sep 2023 15:04:09 -0300 Subject: [PATCH 14/59] change generic capture flow --- pipelines/rj_smtr/constants.py | 39 +++++++++------ pipelines/rj_smtr/flows.py | 72 +++++++++++++-------------- pipelines/rj_smtr/tasks.py | 89 ++++++++++++++++++---------------- pipelines/rj_smtr/utils.py | 52 ++++++++------------ pipelines/utils/utils.py | 15 +++++- 5 files changed, 135 insertions(+), 132 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 34b63781a..caa4a5e23 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -167,23 +167,30 @@ class constants(Enum): # pylint: disable=c0103 BILHETAGEM_DATASET_ID = "br_rj_riodejaneiro_bilhetagem" BILHETAGEM_TRANSACAO_TABLE_PARAMS = [ { - "table_id": "transacao", - "database": "transacao_db", - "query": """ - SELECT - * - FROM - transacao - WHERE - data_processamento BETWEEN '{start}' - AND '{end}' - ORDER BY - data_processamento - """, - "primary_key": ["id"], # id column to nest data on "flag_date_partition": False, - "source": "api", - }, + "flow_run_name": "transacao", + "extraction": { + "table_id": "transacao", + "database": "transacao_db", + "query": """ + SELECT + * + FROM + transacao + WHERE + data_processamento BETWEEN '{start}' + AND '{end}' + ORDER BY + data_processamento + """, + "source": "api", + }, + "pre-treatment": { + "table_id": "transacao", + "file_type": "json", + "primary_key": ["id"], # id column to nest data on + }, + } ] BILHETAGEM_TABLES_PARAMS = [ { diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index e36c8e676..8076633c8 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -7,6 +7,7 @@ from prefect.storage import GCS from prefect import case, Parameter from prefect.tasks.control_flow import merge +from prefect.utilities.collections import DotDict # EMD Imports # @@ -29,22 +30,12 @@ upload_logs_to_bq, bq_upload, transform_to_nested_structure, - get_raw, -) - -from pipelines.rj_smtr.tasks import ( + get_raw_from_sources, + transform_data_to_json, create_request_params, get_datetime_range, ) -with Flow( - "SMTR: Pre-Treatment", - code_owners=["caio", "fernanda", "boris", "rodrigo"], -) as default_pre_treatment_flow: - # SETUP # - table_params = Parameter("table_params", default=None) - dataset_id = Parameter("dataset_id", default=None) - with Flow( "SMTR: Captura", @@ -63,7 +54,7 @@ datetime_range = get_datetime_range(timestamp, interval=interval) rename_flow_run = rename_current_flow_run_now_time( - prefix=default_capture_flow.name + " " + table_params["table_id"] + ": ", + prefix=default_capture_flow.name + " " + table_params["flow_run_name"] + ": ", now_time=timestamp, ) @@ -79,41 +70,44 @@ filepath = create_local_partition_path( dataset_id=dataset_id, - 
table_id=table_params["table_id"], + table_id=table_params["pre-treatment"]["table_id"], filename=filename, partitions=partitions, ) - raw_status_list = [] - - with case(table_params["source"], "api"): - request_params, request_url = create_request_params( - datetime_range=datetime_range, - table_params=table_params, - secret_path=secret_path, - dataset_id=dataset_id, - ) - - api_raw_status = get_raw( - url=request_url, - headers=secret_path, - params=request_params, - ) - - raw_status_list.append(api_raw_status) - - with case(table_params["source"], "gcs"): - pass + # CAPTURA + request_params, request_url = create_request_params( + datetime_range=datetime_range, + table_params=table_params, + secret_path=secret_path, + dataset_id=dataset_id, + ) - raw_status = merge(*raw_status_list) + raw_status = get_raw_from_sources( + source=table_params["extraction"]["source"], + url=request_url, + dataset_id=dataset_id, + table_id=table_params["extraction"]["table_id"], + file_name=table_params["extraction"]["file_name"], + zip_file_name=table_params["extraction"]["zip_file_name"], + mode=table_params["extraction"]["mode"], + headers=secret_path, + params=request_params, + ) raw_filepath = save_raw_local(status=raw_status, file_path=filepath) # TREAT & CLEAN # - treated_status = transform_to_nested_structure( + json_status = transform_data_to_json( status=raw_status, + file_type=table_params["pre-treatment"]["file_type"], + csv_args=table_params["pre-treatment"]["csv_args"], + ) + + treated_status = transform_to_nested_structure( + status=json_status, timestamp=timestamp, - primary_key=table_params["primary_key"], + primary_key=table_params["pre-treatment"]["primary_key"], ) treated_filepath = save_treated_local(status=treated_status, file_path=filepath) @@ -121,7 +115,7 @@ # LOAD # error = bq_upload( dataset_id=dataset_id, - table_id=table_params["table_id"], + table_id=table_params["pre-treatment"]["table_id"], filepath=treated_filepath, raw_filepath=raw_filepath, partitions=partitions, @@ -130,7 +124,7 @@ upload_logs_to_bq( dataset_id=dataset_id, - parent_table_id=table_params["table_id"], + parent_table_id=table_params["pre-treatment"]["table_id"], error=error, timestamp=timestamp, ) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 49c745076..1b9545ca8 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -29,6 +29,7 @@ log_critical, data_info_str, get_raw_data_api, + get_raw_data_gcs, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -950,58 +951,62 @@ def create_request_params( if dataset_id == constants.BILHETAGEM_DATASET_ID.value: secrets = get_vault_secret(secret_path)["data"] - database_secrets = secrets["databases"][table_params["database"]] + database_secrets = secrets["databases"][table_params["extraction"]["database"]] request_url = secrets["vpn_url"] + database_secrets["engine"] request_params = { "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado - "database": table_params["database"], - "query": table_params["query"].format(**datetime_range), + "database": table_params["extraction"]["database"], + "query": table_params["extraction"]["query"].format(**datetime_range), } return request_params, request_url -# @task(checkpoint=False) -# def get_raw_from_sources( -# source: str, -# url:str, -# dataset_id:str = None, -# table_id:str = None, -# mode:str = None, -# headers: str = None, -# filetype: str = "json", -# csv_args: 
dict = None, -# params: dict = None, -# ): -# if source == "api": -# return get_raw_data_api( -# url=url, -# headers=headers, -# filetype=filetype, -# csv_args=csv_args, -# params=params -# ) -# if source == "gcs": -# file = - - @task(checkpoint=False) -def save_raw_storage( - dataset_id: str, - table_id: str, - raw_filepath: str, +def get_raw_from_sources( + source: str, + url: str, + dataset_id: str = None, + table_id: str = None, + file_name: str = None, partitions: str = None, + zip_file_name: str = None, + mode: str = None, + headers: str = None, + params: dict = None, ): - st_obj = Storage(table_id=table_id, dataset_id=dataset_id) - log( - f"""Uploading raw file to bucket {st_obj.bucket_name} at - {st_obj.bucket_name}/{dataset_id}/{table_id}""" - ) - st_obj.upload( - path=raw_filepath, - partitions=partitions, - mode="raw", - if_exists="replace", - ) + if source == "api": + return get_raw_data_api(url=url, headers=headers, params=params) + if source == "gcs": + return get_raw_data_gcs( + dataset_id=dataset_id, + table_id=table_id, + file_name=file_name, + mode=mode, + partitions=partitions, + zip_file_name=zip_file_name, + ) + + +@task(checkpoint=False) +def transform_data_to_json(status: dict, file_type: str, csv_args: dict): + data = status["data"] + error = status["error"] + + if file_type == "json": + pass + + # todo: move to data check on specfic API # pylint: disable=W0102 + # if isinstance(data, dict) and "DescricaoErro" in data.keys(): + # error = data["DescricaoErro"] + + elif file_type in ("txt", "csv"): + if csv_args is None: + csv_args = {} + data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") + else: + error = "Unsupported raw file extension. Supported only: json, csv and txt" + + return {"data": data, "error": error} diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 3b3c7377d..c7b13bfc3 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -454,8 +454,6 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l def get_raw_data_api( # pylint: disable=R0912 url: str, headers: str = None, - filetype: str = "json", - csv_args: dict = None, params: dict = None, ) -> list[dict]: """ @@ -464,8 +462,6 @@ def get_raw_data_api( # pylint: disable=R0912 Args: url (str): URL to send request headers (str, optional): Path to headers guardeded on Vault, if needed. - filetype (str, optional): Filetype to be formatted (supported only: json, csv and txt) - csv_args (dict, optional): Arguments for read_csv, if needed params (dict, optional): Params to be sent on request Returns: @@ -493,24 +489,9 @@ def get_raw_data_api( # pylint: disable=R0912 params=params, ) - if response.ok: # status code is less than 400 - if filetype == "json": - data = response.json() + response.raise_for_status() - # todo: move to data check on specfic API # pylint: disable=W0102 - if isinstance(data, dict) and "DescricaoErro" in data.keys(): - error = data["DescricaoErro"] - - elif filetype in ("txt", "csv"): - if csv_args is None: - csv_args = {} - data = pd.read_csv(io.StringIO(response.text), **csv_args).to_dict( - orient="records" - ) - else: - error = ( - "Unsupported raw file extension. 
Supported only: json, csv and txt" - ) + data = response.text except Exception as exp: error = exp @@ -522,25 +503,30 @@ def get_raw_data_api( # pylint: disable=R0912 def get_raw_data_gcs( - dataset_id: str, table_id: str, file_name: str, mode: str, zip_file_name: str = None + dataset_id: str, + table_id: str, + file_name: str, + mode: str, + partitions: str = None, + zip_extracted_file: str = None, ) -> dict: error = None data = None try: - if zip_file_name: - blob = get_storage_blob( - dataset_id=dataset_id, - table_id=table_id, - file_name=zip_file_name, - mode=mode, - ) + blob = get_storage_blob( + dataset_id=dataset_id, + table_id=table_id, + file_name=file_name, + partitions=partitions, + mode=mode, + ) + + if zip_extracted_file: compressed_data = blob.download_as_bytes() + with zipfile.ZipFile(io.BytesIO(compressed_data), "r") as zipped_file: - data = zipped_file.read(file_name).decode(encoding="utf-8") + data = zipped_file.read(zip_extracted_file).decode(encoding="utf-8") else: - blob = get_storage_blob( - dataset_id=dataset_id, table_id=table_id, file_name=file_name, mode=mode - ) data = blob.download_as_string() except Exception as exp: error = exp diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index 7042709e9..79a264017 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -725,13 +725,24 @@ def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> def get_storage_blob( - dataset_id: str, table_id: str, file_name: str, mode: str = "staging" + dataset_id: str, + table_id: str, + file_name: str, + partitions: str = None, + mode: str = "staging", ): + path = f"{mode}/{dataset_id}/{table_id}/" + + if partitions: + path += f"{partitions}/" + + path += file_name + bd_storage = bd.Storage(dataset_id=dataset_id, table_id=table_id) return ( bd_storage.client["storage_staging"] .bucket(bd_storage.bucket_name) - .get_blob(blob_name=f"{mode}/{dataset_id}/{table_id}/{file_name}") + .get_blob(blob_name=path) ) From 6f12477d14e45a2bb83c817976a597282625a66b Mon Sep 17 00:00:00 2001 From: fernandascovino Date: Tue, 26 Sep 2023 17:18:56 -0300 Subject: [PATCH 15/59] atualiza esquema do flow padrao --- pipelines/rj_smtr/constants.py | 3 + pipelines/rj_smtr/flows.py | 121 +++++++++---------- pipelines/rj_smtr/tasks.py | 206 ++++++++++++++++++++++----------- pipelines/rj_smtr/utils.py | 163 +++++++++++++++++++++----- pipelines/utils/utils.py | 20 +--- 5 files changed, 337 insertions(+), 176 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 3b1b6dc8d..d402bb6e9 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -262,3 +262,6 @@ class constants(Enum): # pylint: disable=c0103 }, ] BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" + + # GTFS + diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index da802d277..fb763cc5a 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -22,15 +22,17 @@ create_local_partition_path, get_current_timestamp, parse_timestamp_to_string, - save_raw_local, - save_treated_local, - upload_logs_to_bq, - bq_upload, - transform_to_nested_structure, + # save_raw_local, + # save_treated_local, + # upload_logs_to_bq, + # bq_upload, + upload_raw_data_to_gcs, + upload_staging_data_to_gcs, + transform_raw_to_nested_structure, get_raw_from_sources, - transform_data_to_json, + # transform_data_to_json, create_request_params, - get_datetime_range, + # get_datetime_range, ) @@ -38,96 +40,87 @@ "SMTR: Captura", 
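# The reworked get_storage_blob above assembles the blob name from mode, dataset, table,
# optional partitions and file name. A small sketch of that naming convention, kept free
# of any GCS client so it can be checked locally (the example values are illustrative):
def build_blob_name(
    dataset_id: str,
    table_id: str,
    file_name: str,
    partitions: str = None,
    mode: str = "staging",
) -> str:
    """Compose '{mode}/{dataset_id}/{table_id}/[{partitions}/]{file_name}'."""
    path = f"{mode}/{dataset_id}/{table_id}/"
    if partitions:
        path += f"{partitions}/"
    return path + file_name


assert (
    build_blob_name("br_rj_riodejaneiro_gtfs", "quadro", "quadro.csv", "data=2023-09-27", "raw")
    == "raw/br_rj_riodejaneiro_gtfs/quadro/data=2023-09-27/quadro.csv"
)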
code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as default_capture_flow: - # SETUP # + + ### Configuração ### - table_params = Parameter("table_params", default=None) - timestamp_param = Parameter("timestamp", default=None) - interval = Parameter("interval", default=None) + table_id = Parameter("table_id", default=None) + partition_date_only = Parameter("partition_date_only", default=None) + request_params = Parameter("request_params", default=None) dataset_id = Parameter("dataset_id", default=None) secret_path = Parameter("secret_path", default=None) + primary_key = Parameter("primary_key", default=None) + source_type = Parameter("source_type", default=None) - timestamp = get_current_timestamp(timestamp_param) - - datetime_range = get_datetime_range(timestamp, interval=interval) + timestamp = get_current_timestamp() rename_flow_run = rename_current_flow_run_now_time( - prefix=default_capture_flow.name + " " + table_params["flow_run_name"] + ": ", + prefix=default_capture_flow.name + " " + table_id + ": ", now_time=timestamp, ) - request_params, request_url = create_request_params( - datetime_range=datetime_range, - table_params=table_params, - secret_path=secret_path, - dataset_id=dataset_id, - ) - partitions = create_date_hour_partition( - timestamp, partition_date_only=table_params["partition_date_only"] + timestamp, partition_date_only=partition_date_only ) filename = parse_timestamp_to_string(timestamp) filepath = create_local_partition_path( dataset_id=dataset_id, - table_id=table_params["pre-treatment"]["table_id"], + table_id=table_id, filename=filename, partitions=partitions, ) - # CAPTURA - request_params, request_url = create_request_params( - datetime_range=datetime_range, - table_params=table_params, + ### Extração ### + # é necessária task ou função dentro da extract_raw_data? + request_params, request_path = create_request_params( secret_path=secret_path, dataset_id=dataset_id, ) - raw_status = get_raw_from_sources( - source=table_params["extraction"]["source"], - url=request_url, - dataset_id=dataset_id, - table_id=table_params["extraction"]["table_id"], - file_name=table_params["extraction"]["file_name"], - zip_file_name=table_params["extraction"]["zip_file_name"], - mode=table_params["extraction"]["mode"], - headers=secret_path, - params=request_params, + error, raw_filepath = get_raw_from_sources( + source_type=source_type, # parametro de extracao, onde ficar? 
+ source_path=request_path, + zip_filename=table_id, + secret_path=secret_path, + request_params=request_params, ) - raw_filepath = save_raw_local(status=raw_status, file_path=filepath) - - # TREAT & CLEAN # - json_status = transform_data_to_json( - status=raw_status, - file_type=table_params["pre-treatment"]["file_type"], - csv_args=table_params["pre-treatment"]["csv_args"], + RAW_UPLOADED = upload_raw_data_to_gcs( + error=error, + filepath=raw_filepath, + timestamp=timestamp, + partitions=partitions ) - treated_status = transform_to_nested_structure( - status=json_status, + ### Pré-tratamento ### + + error, staging_filepath = transform_raw_to_nested_structure( + raw_filepath=raw_filepath, timestamp=timestamp, - primary_key=table_params["pre-treatment"]["primary_key"], + primary_key=primary_key, ) - treated_filepath = save_treated_local(status=treated_status, file_path=filepath) + STAGING_UPLOADED = upload_staging_data_to_gcs(error=error, filepath=staging_filepath, timestamp=timestamp) - # LOAD # - error = bq_upload( - dataset_id=dataset_id, - table_id=table_params["pre-treatment"]["table_id"], - filepath=treated_filepath, - raw_filepath=raw_filepath, - partitions=partitions, - status=treated_status, - ) + # treated_filepath = save_treated_local(status=treated_status, file_path=filepath) - upload_logs_to_bq( - dataset_id=dataset_id, - parent_table_id=table_params["pre-treatment"]["table_id"], - error=error, - timestamp=timestamp, - ) + # LOAD # + # error = bq_upload( + # dataset_id=dataset_id, + # table_id=table_params["pre-treatment"]["table_id"], + # filepath=treated_filepath, + # raw_filepath=raw_filepath, + # partitions=partitions, + # status=treated_status, + # ) + + # upload_logs_to_bq( + # dataset_id=dataset_id, + # parent_table_id=table_params["pre-treatment"]["table_id"], + # error=error, + # timestamp=timestamp, + # ) default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_capture_flow.run_config = KubernetesRun( diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index bf0aec407..b7f484171 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -30,6 +30,7 @@ data_info_str, get_raw_data_api, get_raw_data_gcs, + upload_run_logs_to_bq ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -601,6 +602,69 @@ def upload_logs_to_bq( # pylint: disable=R0913 raise Exception(f"Pipeline failed with error: {error}") +@task +def upload_raw_data_to_gcs( + error: bool, raw_filepath: str, timestamp: datetime, table_id: str, dataset_id: str, partitions: list +): + if not error: + try: + st_obj = Storage(table_id=table_id, dataset_id=dataset_id) + log( + f"""Uploading raw file to bucket {st_obj.bucket_name} at + {st_obj.bucket_name}/{dataset_id}/{table_id}""" + ) + st_obj.upload( + path=raw_filepath, + partitions=partitions, + mode="raw", + if_exists="replace", + ) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + upload_run_logs_to_bq( + dataset_id=dataset_id, + parent_table_id=table_id, + error=error, + timestamp=timestamp, + mode="raw" + ) + + +@task +def upload_staging_data_to_gcs( + error: bool, staging_filepath: str, timestamp: datetime, table_id: str, dataset_id: str, partitions: list +): + if not error: + try: + # Creates and publish table if it does not exist, append to it otherwise + create_or_append_table( + dataset_id=dataset_id, + table_id=table_id, + 
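# The upload tasks above follow a "carry the error forward" pattern: each step only does
# work if no previous step failed, catches its own exception as a string via
# traceback.format_exc(), and hands the error on so the final logs upload records a
# single outcome. A condensed sketch of that control flow (step names are placeholders):
import traceback


def step_download(error: str = None):
    if error:
        return error, None
    try:
        return None, "raw.json"          # pretend we produced a raw file
    except Exception:
        return traceback.format_exc(), None


def step_upload(error: str, filepath: str):
    if error:                            # skip the work, just propagate the error
        return error
    try:
        print(f"uploading {filepath}")
        return None
    except Exception:
        return traceback.format_exc()


error, raw_path = step_download()
error = step_upload(error, raw_path)
print("success" if error is None else f"failed: {error}")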
path=staging_filepath, + partitions=partitions + ) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + upload_run_logs_to_bq( + dataset_id=dataset_id, + parent_table_id=table_id, + error=error, + timestamp=timestamp, + mode="staging" + ) + + +############### +# +# Daterange tasks +# +############### + + @task( checkpoint=False, max_retries=constants.MAX_RETRIES.value, @@ -791,9 +855,16 @@ def get_previous_date(days): return now.to_date_string() +############### +# +# Pretreat data +# +############### + + @task -def transform_to_nested_structure( - status: dict, timestamp: datetime, primary_key: list = None +def transform_raw_to_nested_structure( + filepath: str, error: bool, timestamp: datetime, primary_key: list = None ): """Transform dataframe to nested structure @@ -810,21 +881,29 @@ def transform_to_nested_structure( * `error` (str): catched error, if any. Otherwise, returns None """ + # ORGANIZAR: + # json_status = transform_data_to_json( + # status=raw_status, + # file_type=table_params["pre-treatment"]["file_type"], + # csv_args=table_params["pre-treatment"]["csv_args"], + # ) + # Check previous error - if status["error"] is not None: - return {"data": pd.DataFrame(), "error": status["error"]} + if error is not None: + return {"data": pd.DataFrame(), "error": error} # Check empty dataframe - if len(status["data"]) == 0: - log("Empty dataframe, skipping transformation...") - return {"data": pd.DataFrame(), "error": status["error"]} + # if len(status["data"]) == 0: + # log("Empty dataframe, skipping transformation...") + # return {"data": pd.DataFrame(), "error": error} try: if primary_key is None: primary_key = [] error = None - data = pd.DataFrame(status["data"]) + # leitura do dado raw + # data = pd.DataFrame(status["data"]) log( f""" @@ -860,40 +939,43 @@ def transform_to_nested_structure( level="info", ) + # save treated local + filepath = _save_trated_local(data=data, filepath=filepath) + except Exception as exp: # pylint: disable=W0703 error = exp if error is not None: log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return {"data": data, "error": error} + return error, filepath -@task(checkpoint=False) -def get_datetime_range( - timestamp: datetime, - interval: int, -) -> dict: - """ - Task to get datetime range in UTC +# @task(checkpoint=False) +# def get_datetime_range( +# timestamp: datetime, +# interval: int, +# ) -> dict: +# """ +# Task to get datetime range in UTC - Args: - timestamp (datetime): timestamp to get datetime range - interval (int): interval in seconds +# Args: +# timestamp (datetime): timestamp to get datetime range +# interval (int): interval in seconds - Returns: - dict: datetime range - """ +# Returns: +# dict: datetime range +# """ - start = ( - (timestamp - timedelta(seconds=interval)) - .astimezone(tz=timezone("UTC")) - .strftime("%Y-%m-%d %H:%M:%S") - ) +# start = ( +# (timestamp - timedelta(seconds=interval)) +# .astimezone(tz=timezone("UTC")) +# .strftime("%Y-%m-%d %H:%M:%S") +# ) - end = timestamp.astimezone(tz=timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") +# end = timestamp.astimezone(tz=timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") - return {"start": start, "end": end} +# return {"start": start, "end": end} @task(checkpoint=False, nout=2) @@ -916,11 +998,8 @@ def create_request_params( if dataset_id == constants.BILHETAGEM_DATASET_ID.value: secrets = get_vault_secret(secret_path)["data"] - database_secrets = 
secrets["databases"][table_params["extraction"]["database"]] - request_url = secrets["vpn_url"] + database_secrets["engine"] - request_params = { "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado "database": table_params["extraction"]["database"], @@ -932,47 +1011,40 @@ def create_request_params( @task(checkpoint=False) def get_raw_from_sources( - source: str, - url: str, - dataset_id: str = None, - table_id: str = None, - file_name: str = None, - partitions: str = None, - zip_file_name: str = None, - mode: str = None, - headers: str = None, - params: dict = None, + source_type: str, + source_path: str = None, + zip_filename: str = None, + secret_path: str = None, + api_params: dict = None, ): - if source == "api": - return get_raw_data_api(url=url, headers=headers, params=params) - if source == "gcs": + if source_type == "api": + return get_raw_data_api(url=source_path, secret_path=secret_path, params=api_params) + if source_type == "gcs": return get_raw_data_gcs( - dataset_id=dataset_id, - table_id=table_id, - file_name=file_name, - mode=mode, - partitions=partitions, - zip_file_name=zip_file_name, + gcs_path=source_path, + mode="raw", + zip_filename=zip_filename, ) -@task(checkpoint=False) -def transform_data_to_json(status: dict, file_type: str, csv_args: dict): - data = status["data"] - error = status["error"] +# TODO: passar para função para dentro da transform_raw_to_nested_structure +# @task(checkpoint=False) +# def transform_data_to_json(status: dict, file_type: str, csv_args: dict): +# data = status["data"] +# error = status["error"] - if file_type == "json": - pass +# if file_type == "json": +# pass - # todo: move to data check on specfic API # pylint: disable=W0102 - # if isinstance(data, dict) and "DescricaoErro" in data.keys(): - # error = data["DescricaoErro"] +# # todo: move to data check on specfic API # pylint: disable=W0102 +# # if isinstance(data, dict) and "DescricaoErro" in data.keys(): +# # error = data["DescricaoErro"] - elif file_type in ("txt", "csv"): - if csv_args is None: - csv_args = {} - data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") - else: - error = "Unsupported raw file extension. Supported only: json, csv and txt" +# elif file_type in ("txt", "csv"): +# if csv_args is None: +# csv_args = {} +# data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") +# else: +# error = "Unsupported raw file extension. Supported only: json, csv and txt" - return {"data": data, "error": error} +# return {"data": data, "error": error} diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index c7b13bfc3..a4376bb88 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -451,17 +451,47 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l return clocks +def _save_raw_local(data: dict, file_path: str, mode: str = "raw", filetype: str = "json") -> str: + """ + Saves json response from API to .json file. + Args: + file_path (str): Path which to save raw file + status (dict): Must contain keys + * data: json returned from API + * error: error catched from API request + mode (str, optional): Folder to save locally, later folder which to upload to GCS. 
+ Returns: + str: Path to the saved file + """ + + # diferentes tipos de arquivos para salvar + _file_path = file_path.format(mode=mode, filetype=filetype) + Path(_file_path).parent.mkdir(parents=True, exist_ok=True) + + if filetype == "json": + json.dump(data, Path(_file_path).open("w", encoding="utf-8")) + + if filetype == "csv": + pass + if filetype == "txt": + pass + + log(f"Raw data saved to: {_file_path}") + return _file_path + + def get_raw_data_api( # pylint: disable=R0912 url: str, - headers: str = None, - params: dict = None, + secret_path: str = None, + api_params: dict = None, + filepath: str = None ) -> list[dict]: """ Request data from URL API Args: url (str): URL to send request - headers (str, optional): Path to headers guardeded on Vault, if needed. + secret_path (str, optional): Path to secrets guardeded on Vault, if needed. params (dict, optional): Params to be sent on request Returns: @@ -469,58 +499,45 @@ def get_raw_data_api( # pylint: disable=R0912 * `data` (json): data result * `error` (str): catched error, if any. Otherwise, returns None """ - data = None error = None - try: - if headers is not None: - headers = get_vault_secret(headers)["data"] - - # remove from headers, if present - remove_headers = ["host", "databases"] - for remove_header in remove_headers: - if remove_header in list(headers.keys()): - del headers[remove_header] + if secret_path is None: + headers = secret_path + else: + headers = get_vault_secret(secret_path)["data"] response = requests.get( url, headers=headers, timeout=constants.MAX_TIMEOUT_SECONDS.value, - params=params, + params=api_params, ) response.raise_for_status() - - data = response.text + filepath = _save_raw_local(data=response.text, filepath=filepath) except Exception as exp: error = exp - - if error is not None: log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return {"data": data, "error": error} + return error, filepath def get_raw_data_gcs( - dataset_id: str, - table_id: str, - file_name: str, - mode: str, - partitions: str = None, + gcs_path: str, zip_extracted_file: str = None, ) -> dict: + error = None - data = None + try: blob = get_storage_blob( - dataset_id=dataset_id, - table_id=table_id, - file_name=file_name, - partitions=partitions, - mode=mode, + gcs_path=gcs_path, + mode="raw", ) + data = blob.download_as_bytes() + if zip_extracted_file: compressed_data = blob.download_as_bytes() @@ -528,7 +545,93 @@ def get_raw_data_gcs( data = zipped_file.read(zip_extracted_file).decode(encoding="utf-8") else: data = blob.download_as_string() + except Exception as exp: error = exp return {"data": data, "error": error} + + +def _save_treated_local(file_path: str, status: dict, mode: str = "staging") -> str: + """ + Save treated file to CSV. + + Args: + file_path (str): Path which to save treated file + status (dict): Must contain keys + * `data`: dataframe returned from treatement + * `error`: error catched from data treatement + mode (str, optional): Folder to save locally, later folder which to upload to GCS. 
+ + Returns: + str: Path to the saved file + """ + _file_path = file_path.format(mode=mode, filetype="csv") + Path(_file_path).parent.mkdir(parents=True, exist_ok=True) + if status["error"] is None: + status["data"].to_csv(_file_path, index=False) + log(f"Treated data saved to: {_file_path}") + return _file_path + + +def upload_run_logs_to_bq( # pylint: disable=R0913 + dataset_id: str, + parent_table_id: str, + timestamp: str, + error: str = None, + previous_error: str = None, + recapture: bool = False, + mode: str = "raw" +): + """ + Upload execution status table to BigQuery. + Table is uploaded to the same dataset, named {parent_table_id}_logs. + If passing status_dict, should not pass timestamp and error. + + Args: + dataset_id (str): dataset_id on BigQuery + parent_table_id (str): Parent table id related to the status table + timestamp (str): ISO formatted timestamp string + error (str, optional): String associated with error caught during execution + Returns: + None + """ + table_id = parent_table_id + "_logs" + # Create partition directory + filename = f"{table_id}_{timestamp.isoformat()}" + partition = f"data={timestamp.date()}" + filepath = Path( + f"""data/{mode}/{dataset_id}/{table_id}/{partition}/{filename}.csv""" + ) + filepath.parent.mkdir(exist_ok=True, parents=True) + # Create dataframe to be uploaded + if not error and recapture is True: + # if the recapture is succeeded, update the column erro + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [True], + "erro": [f"[recapturado]{previous_error}"], + } + ) + log(f"Recapturing {timestamp} with previous error:\n{error}") + else: + # not recapturing or error during flow execution + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [error is None], + "erro": [error], + } + ) + # Save data local + dataframe.to_csv(filepath, index=False) + # Upload to Storage + create_or_append_table( + dataset_id=dataset_id, + table_id=table_id, + path=filepath.as_posix(), + partitions=partition, + ) + if error is not None: + raise Exception(f"Pipeline failed with error: {error}") \ No newline at end of file diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index 79a264017..147e54f4f 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -725,24 +725,14 @@ def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> def get_storage_blob( - dataset_id: str, - table_id: str, - file_name: str, - partitions: str = None, + gcs_path: str, mode: str = "staging", ): - path = f"{mode}/{dataset_id}/{table_id}/" - - if partitions: - path += f"{partitions}/" - - path += file_name - - bd_storage = bd.Storage(dataset_id=dataset_id, table_id=table_id) + bucket = bd.Storage() return ( - bd_storage.client["storage_staging"] - .bucket(bd_storage.bucket_name) - .get_blob(blob_name=path) + bucket.client["storage_staging"] + .bucket(bucket.bucket_name) + .get_blob(blob_name=gcs_path) ) From 0c3df1b05e8a257a20d9367cb282050a1df74cb9 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 26 Sep 2023 22:41:01 -0300 Subject: [PATCH 16/59] change default capture flow structure --- pipelines/rj_smtr/constants.py | 12 ++++- pipelines/rj_smtr/tasks.py | 87 ++++++++++++++++++++++------------ pipelines/rj_smtr/utils.py | 55 ++++++++++++--------- 3 files changed, 102 insertions(+), 52 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index d402bb6e9..00558f9cc 100644 --- a/pipelines/rj_smtr/constants.py +++ 
b/pipelines/rj_smtr/constants.py @@ -264,4 +264,14 @@ class constants(Enum): # pylint: disable=c0103 BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" # GTFS - + GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" + + GTFS_SOURCE_TYPE = "gcs" + + GTFS_AGENCY_REQUEST_PARAMS = { + "filepath": "development/br_rj_riodejaneiro_gtfs/upload/gtfs.zip" + } + + GTFS_AGENCY_TABLE_ID = "agency" + + GTFS_QUADRO_TABLE_ID = "quadro" diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index b7f484171..0a40dae26 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -30,7 +30,7 @@ data_info_str, get_raw_data_api, get_raw_data_gcs, - upload_run_logs_to_bq + upload_run_logs_to_bq, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -604,7 +604,12 @@ def upload_logs_to_bq( # pylint: disable=R0913 @task def upload_raw_data_to_gcs( - error: bool, raw_filepath: str, timestamp: datetime, table_id: str, dataset_id: str, partitions: list + error: bool, + raw_filepath: str, + timestamp: datetime, + table_id: str, + dataset_id: str, + partitions: list, ): if not error: try: @@ -622,19 +627,24 @@ def upload_raw_data_to_gcs( except Exception: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - + upload_run_logs_to_bq( dataset_id=dataset_id, parent_table_id=table_id, error=error, timestamp=timestamp, - mode="raw" + mode="raw", ) @task def upload_staging_data_to_gcs( - error: bool, staging_filepath: str, timestamp: datetime, table_id: str, dataset_id: str, partitions: list + error: bool, + staging_filepath: str, + timestamp: datetime, + table_id: str, + dataset_id: str, + partitions: list, ): if not error: try: @@ -643,20 +653,20 @@ def upload_staging_data_to_gcs( dataset_id=dataset_id, table_id=table_id, path=staging_filepath, - partitions=partitions + partitions=partitions, ) except Exception: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - + upload_run_logs_to_bq( dataset_id=dataset_id, parent_table_id=table_id, error=error, timestamp=timestamp, - mode="staging" + mode="staging", ) - + ############### # @@ -904,7 +914,7 @@ def transform_raw_to_nested_structure( error = None # leitura do dado raw # data = pd.DataFrame(status["data"]) - + data = None log( f""" Received inputs: @@ -940,7 +950,7 @@ def transform_raw_to_nested_structure( ) # save treated local - filepath = _save_trated_local(data=data, filepath=filepath) + # filepath = _save_trated_local(data=data, filepath=filepath) except Exception as exp: # pylint: disable=W0703 error = exp @@ -980,7 +990,11 @@ def transform_raw_to_nested_structure( @task(checkpoint=False, nout=2) def create_request_params( - datetime_range: dict, table_params: dict, secret_path: str, dataset_id: str + # datetime_range: dict, + # table_params: dict, + table_id: str, + secret_path: str, + dataset_id: str, ) -> tuple: """ Task to create request params @@ -995,16 +1009,28 @@ def create_request_params( request_params: host, database and query to request data request_url: url to request data """ - + request_params = None # TODO: retirar essa linha if dataset_id == constants.BILHETAGEM_DATASET_ID.value: secrets = get_vault_secret(secret_path)["data"] - database_secrets = secrets["databases"][table_params["extraction"]["database"]] - request_url = secrets["vpn_url"] + database_secrets["engine"] - request_params = { - "host": database_secrets["host"], # TODO: exibir no log em 
ambiente fechado - "database": table_params["extraction"]["database"], - "query": table_params["extraction"]["query"].format(**datetime_range), - } + + # TODO: RETIRAR ESSA LINHA + request_params = secrets + + # TODO: mudar modo de pegar os parametros + # database_secrets = secrets["databases"][table_params["extraction"]["database"]] + # request_url = secrets["vpn_url"] + database_secrets["engine"] + # request_params = { + # "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado + # "database": table_params["extraction"]["database"], + # "query": table_params["extraction"]["query"].format(**datetime_range), + # } + + elif dataset_id == constants.GTFS_DATASET_ID.value: + gtfs_base_path = "development/br_rj_riodejaneiro_gtfs/upload" + if table_id == constants.GTFS_QUADRO_ID.value: + request_url = f"{gtfs_base_path}/quadro.csv" + else: + request_url = f"{gtfs_base_path}/gtfs.zip" return request_params, request_url @@ -1013,18 +1039,21 @@ def create_request_params( def get_raw_from_sources( source_type: str, source_path: str = None, - zip_filename: str = None, + table_id: str = None, secret_path: str = None, api_params: dict = None, ): - if source_type == "api": - return get_raw_data_api(url=source_path, secret_path=secret_path, params=api_params) - if source_type == "gcs": - return get_raw_data_gcs( - gcs_path=source_path, - mode="raw", - zip_filename=zip_filename, - ) + pass + # TODO: descomentar linhas abaixo, passando argumentos corretos + # if source_type == "api": + # return get_raw_data_api( + # url=source_path, secret_path=secret_path, params=api_params + # ) + # if source_type == "gcs": + # return get_raw_data_gcs( + # gcs_path=source_path, + # filename_to_unzip=table_id, + # ) # TODO: passar para função para dentro da transform_raw_to_nested_structure diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index a4376bb88..68774c17d 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -10,12 +10,14 @@ from datetime import timedelta, datetime from typing import List import io -import basedosdados as bd -from basedosdados import Table -import pandas as pd +import json import pytz import requests import zipfile +import basedosdados as bd +from basedosdados import Table +import pandas as pd + from prefect.schedules.clocks import IntervalClock @@ -451,7 +453,9 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l return clocks -def _save_raw_local(data: dict, file_path: str, mode: str = "raw", filetype: str = "json") -> str: +def _save_raw_local( + data: dict, file_path: str, mode: str = "raw", filetype: str = "json" +) -> str: """ Saves json response from API to .json file. 
Args: @@ -471,20 +475,18 @@ def _save_raw_local(data: dict, file_path: str, mode: str = "raw", filetype: str if filetype == "json": json.dump(data, Path(_file_path).open("w", encoding="utf-8")) - if filetype == "csv": - pass + # if filetype == "csv": + # pass if filetype == "txt": - pass + with open(_file_path, "w", encoding="utf-8") as file: + file.write(data) log(f"Raw data saved to: {_file_path}") return _file_path def get_raw_data_api( # pylint: disable=R0912 - url: str, - secret_path: str = None, - api_params: dict = None, - filepath: str = None + url: str, secret_path: str = None, api_params: dict = None, filepath: str = None ) -> list[dict]: """ Request data from URL API @@ -525,9 +527,9 @@ def get_raw_data_api( # pylint: disable=R0912 def get_raw_data_gcs( gcs_path: str, - zip_extracted_file: str = None, + local_filepath: str, + filename_to_unzip: str = None, ) -> dict: - error = None try: @@ -538,18 +540,27 @@ def get_raw_data_gcs( data = blob.download_as_bytes() - if zip_extracted_file: - compressed_data = blob.download_as_bytes() - - with zipfile.ZipFile(io.BytesIO(compressed_data), "r") as zipped_file: - data = zipped_file.read(zip_extracted_file).decode(encoding="utf-8") + if filename_to_unzip: + with zipfile.ZipFile(io.BytesIO(data), "r") as zipped_file: + filenames = zipped_file.namelist() + filename = list( + filter(lambda x: x.split(".")[0] == filename_to_unzip, filenames) + )[0] + data = zipped_file.read(filename) else: - data = blob.download_as_string() + filename = blob.name + + raw_filepath = _save_raw_local( + data=data.decode(encoding="utf-8"), + file_path=local_filepath, + filetype=filename.split(".")[-1], + ) except Exception as exp: error = exp + log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return {"data": data, "error": error} + return error, raw_filepath def _save_treated_local(file_path: str, status: dict, mode: str = "staging") -> str: @@ -581,7 +592,7 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 error: str = None, previous_error: str = None, recapture: bool = False, - mode: str = "raw" + mode: str = "raw", ): """ Upload execution status table to BigQuery. @@ -634,4 +645,4 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 partitions=partition, ) if error is not None: - raise Exception(f"Pipeline failed with error: {error}") \ No newline at end of file + raise Exception(f"Pipeline failed with error: {error}") From f6ca7ab8c23ad720e30b00c1862837848ad1fad3 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 10:36:00 -0300 Subject: [PATCH 17/59] change generic capture flow --- pipelines/rj_smtr/flows.py | 53 ++++++++++------------ pipelines/rj_smtr/tasks.py | 80 +++++++++++++++++++-------------- pipelines/rj_smtr/utils.py | 91 +++++++++++++++++++++++++++++--------- 3 files changed, 141 insertions(+), 83 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index fb763cc5a..3dd834a75 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -40,8 +40,7 @@ "SMTR: Captura", code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as default_capture_flow: - - ### Configuração ### + # Configuração # table_id = Parameter("table_id", default=None) partition_date_only = Parameter("partition_date_only", default=None) @@ -71,15 +70,19 @@ partitions=partitions, ) - ### Extração ### + # Extração # # é necessária task ou função dentro da extract_raw_data? 
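# get_raw_data_api above boils down to: optional headers pulled from a secret, a GET with
# a timeout, raise_for_status(), save the text body, and return (error, result). A
# stripped-down sketch without Vault or the project logger (the URL is a placeholder):
import requests


def fetch_text(url: str, headers: dict = None, params: dict = None, timeout: int = 60):
    error = None
    body = None
    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions
        body = response.text
    except Exception as exp:  # keep the error instead of raising, as the task does
        error = exp
    return error, body


error, body = fetch_text("https://example.com", params={"q": "example"})
print("ok" if error is None else f"failed: {error}")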
request_params, request_path = create_request_params( secret_path=secret_path, dataset_id=dataset_id, + request_params=request_params, + table_id=table_id, + timestamp=timestamp, ) error, raw_filepath = get_raw_from_sources( - source_type=source_type, # parametro de extracao, onde ficar? + source_type=source_type, # parametro de extracao, onde ficar? + local_filepath=filepath, source_path=request_path, zip_filename=table_id, secret_path=secret_path, @@ -87,40 +90,32 @@ ) RAW_UPLOADED = upload_raw_data_to_gcs( - error=error, - filepath=raw_filepath, - timestamp=timestamp, - partitions=partitions + error=error, + raw_filepath=raw_filepath, + timestamp=timestamp, + table_id=table_id, + dataset_id=dataset_id, + partitions=partitions, ) - ### Pré-tratamento ### + # Pré-tratamento # error, staging_filepath = transform_raw_to_nested_structure( raw_filepath=raw_filepath, + filepath=filepath, + error=error, timestamp=timestamp, primary_key=primary_key, ) - STAGING_UPLOADED = upload_staging_data_to_gcs(error=error, filepath=staging_filepath, timestamp=timestamp) - - # treated_filepath = save_treated_local(status=treated_status, file_path=filepath) - - # LOAD # - # error = bq_upload( - # dataset_id=dataset_id, - # table_id=table_params["pre-treatment"]["table_id"], - # filepath=treated_filepath, - # raw_filepath=raw_filepath, - # partitions=partitions, - # status=treated_status, - # ) - - # upload_logs_to_bq( - # dataset_id=dataset_id, - # parent_table_id=table_params["pre-treatment"]["table_id"], - # error=error, - # timestamp=timestamp, - # ) + STAGING_UPLOADED = upload_staging_data_to_gcs( + error=error, + staging_filepath=staging_filepath, + timestamp=timestamp, + table_id=table_id, + dataset_id=dataset_id, + partitions=partitions, + ) default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_capture_flow.run_config = KubernetesRun( diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 0a40dae26..89beae6f2 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -31,6 +31,9 @@ get_raw_data_api, get_raw_data_gcs, upload_run_logs_to_bq, + get_datetime_range, + transform_data_to_json, + save_treated_local_func, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -874,7 +877,11 @@ def get_previous_date(days): @task def transform_raw_to_nested_structure( - filepath: str, error: bool, timestamp: datetime, primary_key: list = None + raw_filepath: str, + filepath: str, + error: bool, + timestamp: datetime, + primary_key: list = None, ): """Transform dataframe to nested structure @@ -891,16 +898,18 @@ def transform_raw_to_nested_structure( * `error` (str): catched error, if any. 
Otherwise, returns None """ + with open(raw_filepath, "r", encoding="utf-8") as file: + data = file.read() + # ORGANIZAR: - # json_status = transform_data_to_json( - # status=raw_status, - # file_type=table_params["pre-treatment"]["file_type"], - # csv_args=table_params["pre-treatment"]["csv_args"], - # ) + error, data = transform_data_to_json( + data=data, + file_type=raw_filepath.split(".")[-1], + ) # Check previous error if error is not None: - return {"data": pd.DataFrame(), "error": error} + return error, None # Check empty dataframe # if len(status["data"]) == 0: @@ -913,8 +922,8 @@ def transform_raw_to_nested_structure( error = None # leitura do dado raw - # data = pd.DataFrame(status["data"]) - data = None + data = pd.DataFrame(data) + log( f""" Received inputs: @@ -950,7 +959,7 @@ def transform_raw_to_nested_structure( ) # save treated local - # filepath = _save_trated_local(data=data, filepath=filepath) + filepath = save_treated_local_func(data=data, error=error, filepath=filepath) except Exception as exp: # pylint: disable=W0703 error = exp @@ -992,9 +1001,11 @@ def transform_raw_to_nested_structure( def create_request_params( # datetime_range: dict, # table_params: dict, + request_params: dict, table_id: str, secret_path: str, dataset_id: str, + timestamp: datetime, ) -> tuple: """ Task to create request params @@ -1009,25 +1020,25 @@ def create_request_params( request_params: host, database and query to request data request_url: url to request data """ - request_params = None # TODO: retirar essa linha + if dataset_id == constants.BILHETAGEM_DATASET_ID.value: secrets = get_vault_secret(secret_path)["data"] - # TODO: RETIRAR ESSA LINHA - request_params = secrets + database_secrets = secrets["databases"][request_params["database"]] + request_url = secrets["vpn_url"] + database_secrets["engine"] - # TODO: mudar modo de pegar os parametros - # database_secrets = secrets["databases"][table_params["extraction"]["database"]] - # request_url = secrets["vpn_url"] + database_secrets["engine"] - # request_params = { - # "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado - # "database": table_params["extraction"]["database"], - # "query": table_params["extraction"]["query"].format(**datetime_range), - # } + datetime_range = get_datetime_range( + timestamp=timestamp, interval=request_params["run_interval"] + ) + request_params = { + "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado + "database": request_params["database"], + "query": request_params["query"].format(**datetime_range), + } elif dataset_id == constants.GTFS_DATASET_ID.value: gtfs_base_path = "development/br_rj_riodejaneiro_gtfs/upload" - if table_id == constants.GTFS_QUADRO_ID.value: + if table_id == constants.GTFS_QUADRO_TABLE_ID.value: request_url = f"{gtfs_base_path}/quadro.csv" else: request_url = f"{gtfs_base_path}/gtfs.zip" @@ -1038,22 +1049,25 @@ def create_request_params( @task(checkpoint=False) def get_raw_from_sources( source_type: str, + local_filepath: str, source_path: str = None, table_id: str = None, secret_path: str = None, api_params: dict = None, ): - pass - # TODO: descomentar linhas abaixo, passando argumentos corretos - # if source_type == "api": - # return get_raw_data_api( - # url=source_path, secret_path=secret_path, params=api_params - # ) - # if source_type == "gcs": - # return get_raw_data_gcs( - # gcs_path=source_path, - # filename_to_unzip=table_id, - # ) + if source_type == "api": + return get_raw_data_api( + url=source_path, + 
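# transform_raw_to_nested_structure keeps the primary-key columns flat and nests the
# remaining columns into a single JSON "content" column next to the capture timestamp.
# The nesting itself sits in context lines not shown in this hunk, so the pandas sketch
# below is an assumption about that step, not a copy of the real code:
import json
from datetime import datetime

import pandas as pd

data = pd.DataFrame(
    [
        {"id": 1, "valor": 4.05, "linha": "100"},
        {"id": 2, "valor": 4.05, "linha": "232"},
    ]
)
primary_key = ["id"]
timestamp = datetime(2023, 9, 27, 13, 0)

content_columns = data.columns.difference(primary_key)
data["content"] = data[content_columns].apply(
    lambda row: json.dumps(row.to_dict()), axis=1
)
data["timestamp_captura"] = timestamp
data = data[primary_key + ["content", "timestamp_captura"]]
print(data)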
secret_path=secret_path, + api_params=api_params, + filepath=local_filepath, + ) + if source_type == "gcs": + return get_raw_data_gcs( + gcs_path=source_path, + filename_to_unzip=table_id, + local_filepath=local_filepath, + ) # TODO: passar para função para dentro da transform_raw_to_nested_structure diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 68774c17d..184a93df7 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -11,9 +11,9 @@ from typing import List import io import json +import zipfile import pytz import requests -import zipfile import basedosdados as bd from basedosdados import Table import pandas as pd @@ -453,13 +453,13 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l return clocks -def _save_raw_local( - data: dict, file_path: str, mode: str = "raw", filetype: str = "json" +def save_raw_local_func( + data: dict, filepath: str, mode: str = "raw", filetype: str = "json" ) -> str: """ Saves json response from API to .json file. Args: - file_path (str): Path which to save raw file + filepath (str): Path which to save raw file status (dict): Must contain keys * data: json returned from API * error: error catched from API request @@ -469,20 +469,20 @@ def _save_raw_local( """ # diferentes tipos de arquivos para salvar - _file_path = file_path.format(mode=mode, filetype=filetype) - Path(_file_path).parent.mkdir(parents=True, exist_ok=True) + _filepath = filepath.format(mode=mode, filetype=filetype) + Path(_filepath).parent.mkdir(parents=True, exist_ok=True) if filetype == "json": - json.dump(data, Path(_file_path).open("w", encoding="utf-8")) + json.dump(data, Path(_filepath).open("w", encoding="utf-8")) # if filetype == "csv": # pass if filetype == "txt": - with open(_file_path, "w", encoding="utf-8") as file: + with open(_filepath, "w", encoding="utf-8") as file: file.write(data) - log(f"Raw data saved to: {_file_path}") - return _file_path + log(f"Raw data saved to: {_filepath}") + return _filepath def get_raw_data_api( # pylint: disable=R0912 @@ -516,7 +516,9 @@ def get_raw_data_api( # pylint: disable=R0912 ) response.raise_for_status() - filepath = _save_raw_local(data=response.text, filepath=filepath) + filepath = save_raw_local_func( + data=response.text, filepath=filepath + ) # TODO: mudar filetype except Exception as exp: error = exp @@ -550,9 +552,9 @@ def get_raw_data_gcs( else: filename = blob.name - raw_filepath = _save_raw_local( + raw_filepath = save_raw_local_func( data=data.decode(encoding="utf-8"), - file_path=local_filepath, + filepath=local_filepath, filetype=filename.split(".")[-1], ) @@ -563,12 +565,14 @@ def get_raw_data_gcs( return error, raw_filepath -def _save_treated_local(file_path: str, status: dict, mode: str = "staging") -> str: +def save_treated_local_func( + filepath: str, data: pd.DataFrame, error: str, mode: str = "staging" +) -> str: """ Save treated file to CSV. 
Args: - file_path (str): Path which to save treated file + filepath (str): Path which to save treated file status (dict): Must contain keys * `data`: dataframe returned from treatement * `error`: error catched from data treatement @@ -577,12 +581,12 @@ def _save_treated_local(file_path: str, status: dict, mode: str = "staging") -> Returns: str: Path to the saved file """ - _file_path = file_path.format(mode=mode, filetype="csv") - Path(_file_path).parent.mkdir(parents=True, exist_ok=True) - if status["error"] is None: - status["data"].to_csv(_file_path, index=False) - log(f"Treated data saved to: {_file_path}") - return _file_path + _filepath = filepath.format(mode=mode, filetype="csv") + Path(_filepath).parent.mkdir(parents=True, exist_ok=True) + if error is None: + data.to_csv(_filepath, index=False) + log(f"Treated data saved to: {_filepath}") + return _filepath def upload_run_logs_to_bq( # pylint: disable=R0913 @@ -646,3 +650,48 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 ) if error is not None: raise Exception(f"Pipeline failed with error: {error}") + + +def get_datetime_range( + timestamp: datetime, + interval: int, +) -> dict: + """ + Task to get datetime range in UTC + + Args: + timestamp (datetime): timestamp to get datetime range + interval (int): interval in seconds + + Returns: + dict: datetime range + """ + + start = ( + (timestamp - timedelta(seconds=interval)) + .astimezone(tz=pytz.timezone("UTC")) + .strftime("%Y-%m-%d %H:%M:%S") + ) + + end = timestamp.astimezone(tz=pytz.timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") + + return {"start": start, "end": end} + + +def transform_data_to_json(data: str, file_type: str, csv_args: dict = {}): + try: + if file_type == "json": + data = json.loads(data) + + elif file_type in ("txt", "csv"): + if csv_args is None: + csv_args = {} + data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") + else: + error = "Unsupported raw file extension. 
Supported only: json, csv and txt" + + except Exception as exp: + error = exp + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data From fa17be21b41769895fb4154b78d86d373652d368 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 11:20:15 -0300 Subject: [PATCH 18/59] adjust constant structure --- pipelines/rj_smtr/constants.py | 36 +++++++++++++++++++++++++------- pipelines/rj_smtr/flows.py | 6 ++---- pipelines/rj_smtr/tasks.py | 38 +++++++++++++++------------------- 3 files changed, 48 insertions(+), 32 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 00558f9cc..7eb18ef85 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -165,6 +165,18 @@ class constants(Enum): # pylint: disable=c0103 # BILHETAGEM BILHETAGEM_DATASET_ID = "br_rj_riodejaneiro_bilhetagem" + BILHETAGEM_DATABASES = { + "principal_db": { + "engine": "mysql", + "host": "principal-database-replica.internal", + }, + "tarifa_db": {"engine": "postgres", "host": "tarifa-database-replica.internal"}, + "transacao_db": { + "engine": "postgres", + "host": "transacao-database-replica.internal", + }, + } + BILHETAGEM_VPN_URL = "http://vpn-jae.mobilidade.rio/" BILHETAGEM_TRANSACAO_TABLE_PARAMS = [ { "partition_date_only": False, @@ -264,14 +276,24 @@ class constants(Enum): # pylint: disable=c0103 BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" # GTFS - GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" + GTFS_CAPTURE_PARAMS = [ + {"table_id": "agency", "primary_key": ["agency_id"]}, + {"table_id": "calendar_dates", "primary_key": ["service_id"]}, + {"table_id": "calendar", "primary_key": ["service_id"]}, + {"table_id": "feed_info", "primary_key": ["feed_publisher_name"]}, + {"table_id": "frequencies", "primary_key": ["trip_id"]}, + {"table_id": "routes", "primary_key": ["route_id"]}, + {"table_id": "shapes", "primary_key": ["shape_id"]}, + {"table_id": "stops", "primary_key": ["stop_id"]}, + {"table_id": "trips", "primary_key": ["trip_id"]}, + {"table_id": "fare_attributes", "primary_key": ["fare_id"]}, + {"table_id": "fare_rules", "primary_key": ["fare_id"]}, + ] - GTFS_SOURCE_TYPE = "gcs" + GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} - GTFS_AGENCY_REQUEST_PARAMS = { - "filepath": "development/br_rj_riodejaneiro_gtfs/upload/gtfs.zip" - } + GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} - GTFS_AGENCY_TABLE_ID = "agency" + GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" - GTFS_QUADRO_TABLE_ID = "quadro" + GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 3dd834a75..94a3ffb93 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -44,7 +44,7 @@ table_id = Parameter("table_id", default=None) partition_date_only = Parameter("partition_date_only", default=None) - request_params = Parameter("request_params", default=None) + extract_params = Parameter("extract_params", default=None) dataset_id = Parameter("dataset_id", default=None) secret_path = Parameter("secret_path", default=None) primary_key = Parameter("primary_key", default=None) @@ -71,11 +71,9 @@ ) # Extração # - # é necessária task ou função dentro da extract_raw_data? 
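# transform_data_to_json above dispatches on the raw file extension: json payloads go
# through json.loads, csv/txt payloads through pandas over a StringIO buffer, and
# anything else is rejected. A compact, runnable sketch of that dispatch (the sample
# payloads are invented):
import io
import json

import pandas as pd


def parse_raw(data: str, file_type: str, csv_args: dict = None):
    if file_type == "json":
        return json.loads(data)
    if file_type in ("txt", "csv"):
        return pd.read_csv(io.StringIO(data), **(csv_args or {})).to_dict(orient="records")
    raise ValueError("Unsupported raw file extension. Supported only: json, csv and txt")


print(parse_raw('[{"id": 1}]', "json"))
print(parse_raw("id,servico\n1,100\n", "csv"))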
request_params, request_path = create_request_params( - secret_path=secret_path, dataset_id=dataset_id, - request_params=request_params, + extract_params=extract_params, table_id=table_id, timestamp=timestamp, ) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 89beae6f2..a134dd966 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -879,7 +879,7 @@ def get_previous_date(days): def transform_raw_to_nested_structure( raw_filepath: str, filepath: str, - error: bool, + error: str, timestamp: datetime, primary_key: list = None, ): @@ -898,6 +898,10 @@ def transform_raw_to_nested_structure( * `error` (str): catched error, if any. Otherwise, returns None """ + # Check previous error + if error is not None: + return error, None + with open(raw_filepath, "r", encoding="utf-8") as file: data = file.read() @@ -907,10 +911,6 @@ def transform_raw_to_nested_structure( file_type=raw_filepath.split(".")[-1], ) - # Check previous error - if error is not None: - return error, None - # Check empty dataframe # if len(status["data"]) == 0: # log("Empty dataframe, skipping transformation...") @@ -999,11 +999,8 @@ def transform_raw_to_nested_structure( @task(checkpoint=False, nout=2) def create_request_params( - # datetime_range: dict, - # table_params: dict, - request_params: dict, + extract_params: dict, table_id: str, - secret_path: str, dataset_id: str, timestamp: datetime, ) -> tuple: @@ -1020,28 +1017,27 @@ def create_request_params( request_params: host, database and query to request data request_url: url to request data """ + request_params = None if dataset_id == constants.BILHETAGEM_DATASET_ID.value: - secrets = get_vault_secret(secret_path)["data"] - - database_secrets = secrets["databases"][request_params["database"]] - request_url = secrets["vpn_url"] + database_secrets["engine"] + database = constants.BILHETAGEM_DATABASES.value[extract_params["database"]] + request_url = constants.BILHETAGEM_VPN_URL.value + database["engine"] datetime_range = get_datetime_range( - timestamp=timestamp, interval=request_params["run_interval"] + timestamp=timestamp, interval=extract_params["run_interval"] ) + request_params = { - "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado - "database": request_params["database"], - "query": request_params["query"].format(**datetime_range), + "host": database["host"], # TODO: exibir no log em ambiente fechado + "database": extract_params["database"], + "query": extract_params["query"].format(**datetime_range), } elif dataset_id == constants.GTFS_DATASET_ID.value: - gtfs_base_path = "development/br_rj_riodejaneiro_gtfs/upload" - if table_id == constants.GTFS_QUADRO_TABLE_ID.value: - request_url = f"{gtfs_base_path}/quadro.csv" + if table_id == constants.GTFS_QUADRO_CAPTURE_PARAMS.value["table_id"]: + request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/{table_id}.csv" else: - request_url = f"{gtfs_base_path}/gtfs.zip" + request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/gtfs.zip" return request_params, request_url From bdc3881cde88840b62175e1ce8ac66a596e37feb Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 13:27:11 -0300 Subject: [PATCH 19/59] change bilhetagem to new capture flow structure --- .../schedules.py | 18 +- pipelines/rj_smtr/constants.py | 186 ++++++++++-------- pipelines/rj_smtr/tasks.py | 14 +- pipelines/rj_smtr/utils.py | 40 ++-- 4 files changed, 145 insertions(+), 113 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py 
b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index 38fca85a9..538e5b816 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -16,26 +16,32 @@ ) bilhetagem_principal_clocks = generate_execute_schedules( - interval=timedelta(days=1), + clock_interval=timedelta( + **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["principal_run_interval"] + ), labels=[ - emd_constants.RJ_SMTR_AGENT_LABEL.value, + emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, ], - table_parameters=constants.BILHETAGEM_TABLES_PARAMS.value, + table_parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, + source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], runs_interval_minutes=15, ) bilhetagem_principal_schedule = Schedule(clocks=untuple(bilhetagem_principal_clocks)) bilhetagem_transacao_clocks = generate_execute_schedules( - interval=timedelta(minutes=1), + clock_interval=timedelta( + constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["transacao_run_interval"] + ), labels=[ - emd_constants.RJ_SMTR_AGENT_LABEL.value, + emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, ], - table_parameters=constants.BILHETAGEM_TRANSACAO_TABLE_PARAMS.value, + table_parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, + source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], runs_interval_minutes=0, ) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 7eb18ef85..969ccd871 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -165,117 +165,142 @@ class constants(Enum): # pylint: disable=c0103 # BILHETAGEM BILHETAGEM_DATASET_ID = "br_rj_riodejaneiro_bilhetagem" - BILHETAGEM_DATABASES = { - "principal_db": { - "engine": "mysql", - "host": "principal-database-replica.internal", + + BILHETAGEM_GENERAL_CAPTURE_PARAMS = { + "databases": { + "principal_db": { + "engine": "mysql", + "host": "principal-database-replica.internal", + }, + "tarifa_db": { + "engine": "postgres", + "host": "tarifa-database-replica.internal", + }, + "transacao_db": { + "engine": "postgres", + "host": "transacao-database-replica.internal", + }, }, - "tarifa_db": {"engine": "postgres", "host": "tarifa-database-replica.internal"}, - "transacao_db": { - "engine": "postgres", - "host": "transacao-database-replica.internal", + "vpn_url": "http://vpn-jae.mobilidade.rio/", + "source_type": "api-json", + "transacao_run_interval": {"minutes": 1}, + "principal_run_interval": {"days": 1}, + } + + BILHETAGEM_TRANSACAO_CAPTURE_PARAMS = { + "table_id": "transacao", + "partition_date_only": False, + "extract_params": { + "database": "transacao_db", + "query": """ + SELECT + * + FROM + transacao + WHERE + data_processamento BETWEEN '{start}' + AND '{end}' + ORDER BY + data_processamento + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS["transacao_run_interval"], }, + "primary_key": ["id"], } - BILHETAGEM_VPN_URL = "http://vpn-jae.mobilidade.rio/" - BILHETAGEM_TRANSACAO_TABLE_PARAMS = [ + + BILHETAGEM_CAPTURE_PARAMS = [ { - "partition_date_only": False, - "flow_run_name": "transacao", - "extraction": { - "table_id": "transacao", - "database": "transacao_db", + "table_id": "linha", + "partition_date_only": True, + "extract_params": { + "database": "principal_db", 
"query": """ SELECT * FROM - transacao + LINHA WHERE - data_processamento BETWEEN '{start}' - AND '{end}' + DT_INCLUSAO >= '{start}' ORDER BY - data_processamento + DT_INCLUSAO """, - "source": "api", - }, - "pre-treatment": { - "table_id": "transacao", - "file_type": "json", - "primary_key": ["id"], # id column to nest data on + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], }, - } - ] - BILHETAGEM_TABLES_PARAMS = [ - { - "table_id": "linha", - "database": "principal_db", - "query": """ - SELECT - * - FROM - LINHA - WHERE - DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO - """, "primary_key": ["CD_LINHA"], # id column to nest data on - "partition_date_only": True, }, { "table_id": "grupo", - "database": "principal_db", - "query": """ - SELECT - * - FROM - GRUPO - WHERE - DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO - """, - "primary_key": ["CD_GRUPO"], "partition_date_only": True, + "extract_params": { + "database": "principal_db", + "query": """ + SELECT + * + FROM + GRUPO + WHERE + DT_INCLUSAO >= '{start}' + ORDER BY + DT_INCLUSAO + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], + }, + "primary_key": ["CD_GRUPO"], }, { "table_id": "grupo_linha", - "database": "principal_db", - "query": """ - SELECT - * - FROM - GRUPO_LINHA - WHERE - DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO - """, - "primary_key": ["CD_GRUPO", "CD_LINHA"], # id column to nest data on "partition_date_only": True, + "extract_params": { + "database": "principal_db", + "query": """ + SELECT + * + FROM + GRUPO_LINHA + WHERE + DT_INCLUSAO >= '{start}' + ORDER BY + DT_INCLUSAO + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], + }, + "primary_key": ["CD_GRUPO", "CD_LINHA"], # id column to nest data on }, { "table_id": "matriz_integracao", - "database": "tarifa_db", - "query": """ - SELECT - * - FROM - matriz_integracao - WHERE - dt_inclusao >= '{start}' - ORDER BY - dt_inclusao - """, + "partition_date_only": True, + "extract_params": { + "database": "tarifa_db", + "query": """ + SELECT + * + FROM + matriz_integracao + WHERE + dt_inclusao >= '{start}' + ORDER BY + dt_inclusao + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], + }, "primary_key": [ "cd_versao_matriz", "cd_integracao", ], # id column to nest data on - "partition_date_only": True, }, ] BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" # GTFS + GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" GTFS_CAPTURE_PARAMS = [ {"table_id": "agency", "primary_key": ["agency_id"]}, {"table_id": "calendar_dates", "primary_key": ["service_id"]}, @@ -289,11 +314,6 @@ class constants(Enum): # pylint: disable=c0103 {"table_id": "fare_attributes", "primary_key": ["fare_id"]}, {"table_id": "fare_rules", "primary_key": ["fare_id"]}, ] - GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} - GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} - - GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" - GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index a134dd966..e414f1c70 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1020,11 +1020,16 @@ def create_request_params( request_params = None if dataset_id == constants.BILHETAGEM_DATASET_ID.value: - database = constants.BILHETAGEM_DATABASES.value[extract_params["database"]] - request_url = 
constants.BILHETAGEM_VPN_URL.value + database["engine"] + database = constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["databases"][ + extract_params["database"] + ] + request_url = ( + constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["vpn_url"] + + database["engine"] + ) datetime_range = get_datetime_range( - timestamp=timestamp, interval=extract_params["run_interval"] + timestamp=timestamp, interval=timedelta(**extract_params["run_interval"]) ) request_params = { @@ -1051,12 +1056,15 @@ def get_raw_from_sources( secret_path: str = None, api_params: dict = None, ): + source_type, filetype = source_type.split("-", maxsplit=1) + if source_type == "api": return get_raw_data_api( url=source_path, secret_path=secret_path, api_params=api_params, filepath=local_filepath, + filetype=filetype, ) if source_type == "gcs": return get_raw_data_gcs( diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 184a93df7..d354ae6ab 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -8,7 +8,7 @@ from pathlib import Path from datetime import timedelta, datetime -from typing import List +from typing import List, Union import io import json import zipfile @@ -31,7 +31,6 @@ get_vault_secret, send_discord_message, get_redis_client, - get_storage_blobs, get_storage_blob, ) @@ -404,46 +403,41 @@ def data_info_str(data: pd.DataFrame): def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-locals - interval: timedelta, + clock_interval: timedelta, labels: List[str], - table_parameters: list, - dataset_id: str, - secret_path: str, + table_parameters: Union[list[dict], dict], runs_interval_minutes: int = 15, start_date: datetime = datetime( 2020, 1, 1, tzinfo=pytz.timezone(emd_constants.DEFAULT_TIMEZONE.value) ), + **general_flow_params, ) -> List[IntervalClock]: """ Generates multiple schedules Args: - interval (timedelta): The interval to run the schedule + clock_interval (timedelta): The interval to run the schedule labels (List[str]): The labels to be added to the schedule - table_parameters (list): The table parameters - dataset_id (str): The dataset_id to be used in the schedule - secret_path (str): The secret path to be used in the schedule + table_parameters (list): The table parameters to iterate over runs_interval_minutes (int, optional): The interval between each schedule. Defaults to 15. start_date (datetime, optional): The start date of the schedule. Defaults to datetime(2020, 1, 1, tzinfo=pytz.timezone(emd_constants.DEFAULT_TIMEZONE.value)). 
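A small worked example of the source_type convention that get_raw_from_sources starts relying on here — a single "<source>-<filetype>" string coming from the schedule (illustrative snippet, not taken from the patch):

source_type, filetype = "api-json".split("-", maxsplit=1)
print(source_type, filetype)  # api json

# A bare value such as "gcs" contains no "-", so this two-name unpacking would raise
# a ValueError; later commits in this series rework the split to guard that case.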
- + general_flow_params: Any param that you want to pass to the flow Returns: List[IntervalClock]: The list of schedules """ + if isinstance(table_parameters, dict): + table_parameters = [table_parameters] clocks = [] for count, parameters in enumerate(table_parameters): - parameter_defaults = { - "table_params": parameters, - "dataset_id": dataset_id, - "secret_path": secret_path, - "interval": interval.total_seconds(), - } + parameter_defaults = parameters | general_flow_params + log(f"parameter_defaults: {parameter_defaults}") clocks.append( IntervalClock( - interval=interval, + interval=clock_interval, start_date=start_date + timedelta(minutes=runs_interval_minutes * count), labels=labels, @@ -486,7 +480,11 @@ def save_raw_local_func( def get_raw_data_api( # pylint: disable=R0912 - url: str, secret_path: str = None, api_params: dict = None, filepath: str = None + url: str, + secret_path: str = None, + api_params: dict = None, + filepath: str = None, + filetype: str = None, ) -> list[dict]: """ Request data from URL API @@ -517,8 +515,8 @@ def get_raw_data_api( # pylint: disable=R0912 response.raise_for_status() filepath = save_raw_local_func( - data=response.text, filepath=filepath - ) # TODO: mudar filetype + data=response.text, filepath=filepath, filetype=filetype + ) except Exception as exp: error = exp From fc61c4762c7a416872ba6fbbfa5a064a43e846a4 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:24:48 -0300 Subject: [PATCH 20/59] fix get_storage_blob function --- pipelines/rj_smtr/constants.py | 2 +- pipelines/utils/utils.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 969ccd871..2faeccb25 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -316,4 +316,4 @@ class constants(Enum): # pylint: disable=c0103 ] GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} - GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" + GTFS_BASE_GCS_PATH = "raw/development/br_rj_riodejaneiro_gtfs/upload" diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index 147e54f4f..57384f8f4 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -726,9 +726,8 @@ def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> def get_storage_blob( gcs_path: str, - mode: str = "staging", ): - bucket = bd.Storage() + bucket = bd.Storage(dataset_id="", table_id="") return ( bucket.client["storage_staging"] .bucket(bucket.bucket_name) From 0fc26cbc9d786fd28b369ab35784b636c3ecdc12 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:25:27 -0300 Subject: [PATCH 21/59] fix get_storage_blob call --- pipelines/rj_smtr/utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index d354ae6ab..55abfc9cf 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -533,10 +533,7 @@ def get_raw_data_gcs( error = None try: - blob = get_storage_blob( - gcs_path=gcs_path, - mode="raw", - ) + blob = get_storage_blob(gcs_path=gcs_path) data = blob.download_as_bytes() From 634df851e41bff549fe5f9daab4801f0eb6e0858 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:45:26 -0300 Subject: [PATCH 22/59] organize constants order --- pipelines/rj_smtr/constants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 2faeccb25..722d7e9e1 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -301,6 +301,7 @@ class constants(Enum): # pylint: disable=c0103 # GTFS GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" + GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} GTFS_CAPTURE_PARAMS = [ {"table_id": "agency", "primary_key": ["agency_id"]}, {"table_id": "calendar_dates", "primary_key": ["service_id"]}, @@ -314,6 +315,5 @@ class constants(Enum): # pylint: disable=c0103 {"table_id": "fare_attributes", "primary_key": ["fare_id"]}, {"table_id": "fare_rules", "primary_key": ["fare_id"]}, ] - GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} - GTFS_BASE_GCS_PATH = "raw/development/br_rj_riodejaneiro_gtfs/upload" + GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" From bda52aa6eedb6eedec2c6334f0843e2a80edcd4a Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:46:45 -0300 Subject: [PATCH 23/59] fix get_raw_from_sources function call --- pipelines/rj_smtr/flows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 94a3ffb93..19ac776b7 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -82,9 +82,9 @@ source_type=source_type, # parametro de extracao, onde ficar? local_filepath=filepath, source_path=request_path, - zip_filename=table_id, + table_id=table_id, secret_path=secret_path, - request_params=request_params, + api_params=request_params, ) RAW_UPLOADED = upload_raw_data_to_gcs( From b2548d6b8cd1f56bf9dbd4676e52011ce5fdfa16 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:47:35 -0300 Subject: [PATCH 24/59] change transform_raw_to_json to read_raw_data --- pipelines/rj_smtr/tasks.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index e414f1c70..ee99ff654 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -32,7 +32,7 @@ get_raw_data_gcs, upload_run_logs_to_bq, get_datetime_range, - transform_data_to_json, + read_raw_data, save_treated_local_func, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client @@ -899,17 +899,11 @@ def transform_raw_to_nested_structure( """ # Check previous error + if error is not None: return error, None - with open(raw_filepath, "r", encoding="utf-8") as file: - data = file.read() - # ORGANIZAR: - error, data = transform_data_to_json( - data=data, - file_type=raw_filepath.split(".")[-1], - ) # Check empty dataframe # if len(status["data"]) == 0: @@ -917,13 +911,12 @@ def transform_raw_to_nested_structure( # return {"data": pd.DataFrame(), "error": error} try: + # leitura do dado raw + error, data = read_raw_data(filepath=raw_filepath) + if primary_key is None: primary_key = [] - error = None - # leitura do dado raw - data = pd.DataFrame(data) - log( f""" Received inputs: From 307863a1d381cefeeb5a9001fb8f4ef235923cbb Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:48:30 -0300 Subject: [PATCH 25/59] transform transform_raw_data_to_json to read_raw_data --- pipelines/rj_smtr/utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 55abfc9cf..3f4281a2c 100644 --- 
a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -673,15 +673,18 @@ def get_datetime_range( return {"start": start, "end": end} -def transform_data_to_json(data: str, file_type: str, csv_args: dict = {}): +def read_raw_data(filepath: str, csv_args: dict = {}) -> tuple[str, pd.DataFrame]: try: + file_type = filepath.split(".")[-1] + if file_type == "json": - data = json.loads(data) + data = pd.read_json(filepath) + # data = json.loads(data) elif file_type in ("txt", "csv"): if csv_args is None: csv_args = {} - data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") + data = pd.read_csv(filepath, **csv_args) else: error = "Unsupported raw file extension. Supported only: json, csv and txt" From 7f2c1e3fe3db535868943404e945b5b44eefad74 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:59:43 -0300 Subject: [PATCH 26/59] fix nout task parameter --- pipelines/rj_smtr/tasks.py | 4 ++-- pipelines/rj_smtr/utils.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index ee99ff654..9beb5a87e 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -875,7 +875,7 @@ def get_previous_date(days): ############### -@task +@task(nout=2) def transform_raw_to_nested_structure( raw_filepath: str, filepath: str, @@ -1040,7 +1040,7 @@ def create_request_params( return request_params, request_url -@task(checkpoint=False) +@task(checkpoint=False, nout=2) def get_raw_from_sources( source_type: str, local_filepath: str, diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 3f4281a2c..8a8804474 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -485,7 +485,7 @@ def get_raw_data_api( # pylint: disable=R0912 api_params: dict = None, filepath: str = None, filetype: str = None, -) -> list[dict]: +) -> tuple[str, str]: """ Request data from URL API @@ -529,7 +529,7 @@ def get_raw_data_gcs( gcs_path: str, local_filepath: str, filename_to_unzip: str = None, -) -> dict: +) -> tuple[str, str]: error = None try: @@ -673,7 +673,7 @@ def get_datetime_range( return {"start": start, "end": end} -def read_raw_data(filepath: str, csv_args: dict = {}) -> tuple[str, pd.DataFrame]: +def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataFrame]: try: file_type = filepath.split(".")[-1] From 51977c10621d34ea3643004cba5bc4f990d249db Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 15:16:38 -0300 Subject: [PATCH 27/59] fix timedelta instantiation --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index 538e5b816..f19f0d8ad 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -33,7 +33,7 @@ bilhetagem_transacao_clocks = generate_execute_schedules( clock_interval=timedelta( - constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["transacao_run_interval"] + **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["transacao_run_interval"] ), labels=[ emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, From 8ef0b5df7c31ebb7f59ff719c338e029e34cf031 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 15:58:05 -0300 Subject: [PATCH 28/59] set upstream tasks --- pipelines/rj_smtr/flows.py | 1 + pipelines/rj_smtr/tasks.py | 10 +++++++--- 2 
files changed, 8 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 19ac776b7..a4044933a 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -104,6 +104,7 @@ error=error, timestamp=timestamp, primary_key=primary_key, + upstream_tasks=[RAW_UPLOADED], ) STAGING_UPLOADED = upload_staging_data_to_gcs( diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 9beb5a87e..269ee73eb 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1052,19 +1052,23 @@ def get_raw_from_sources( source_type, filetype = source_type.split("-", maxsplit=1) if source_type == "api": - return get_raw_data_api( + error, filepath = get_raw_data_api( url=source_path, secret_path=secret_path, api_params=api_params, filepath=local_filepath, filetype=filetype, ) - if source_type == "gcs": - return get_raw_data_gcs( + elif source_type == "gcs": + error, filepath = get_raw_data_gcs( gcs_path=source_path, filename_to_unzip=table_id, local_filepath=local_filepath, ) + else: + raise NotImplementedError(f"{source_type} not supported") + + return error, filepath # TODO: passar para função para dentro da transform_raw_to_nested_structure From 4f21f0af7fff375354538c868e7b4cedd7943f4d Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 16:02:09 -0300 Subject: [PATCH 29/59] declare raw_filepath --- pipelines/rj_smtr/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 8a8804474..0fd5c7d6c 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -531,6 +531,7 @@ def get_raw_data_gcs( filename_to_unzip: str = None, ) -> tuple[str, str]: error = None + raw_filepath = None try: blob = get_storage_blob(gcs_path=gcs_path) From 11b973581c7ccc103d16bccc09dccd41f86f68da Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Wed, 27 Sep 2023 16:19:43 -0300 Subject: [PATCH 30/59] update docstrings --- pipelines/rj_smtr/tasks.py | 76 +++++++++++++++++++++++++++++++------- pipelines/rj_smtr/utils.py | 50 ++++++++++++++++++------- pipelines/utils/utils.py | 17 +++++++++ 3 files changed, 116 insertions(+), 27 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 269ee73eb..b12f0604c 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -168,7 +168,14 @@ def create_date_hour_partition( timestamp: datetime, partition_date_only: bool = False ) -> str: """ - Get date hour Hive partition structure from timestamp. + Generate partition string for date and hour. + + Args: + timestamp (datetime): timestamp to be used as reference + partition_date_only (bool, optional): whether to add hour partition or not + + Returns: + str: partition string """ partition = f"data={timestamp.strftime('%Y-%m-%d')}" if not partition_date_only: @@ -614,6 +621,20 @@ def upload_raw_data_to_gcs( dataset_id: str, partitions: list, ): + """ + Upload raw data to GCS. + + Args: + error (bool): whether the upstream tasks failed or not + raw_filepath (str): Path to the saved raw .json file + timestamp (datetime): timestamp for flow run + table_id (str): table_id on BigQuery + dataset_id (str): dataset_id on BigQuery + partitions (list): list of partition strings + + Returns: + None + """ if not error: try: st_obj = Storage(table_id=table_id, dataset_id=dataset_id) @@ -649,6 +670,20 @@ def upload_staging_data_to_gcs( dataset_id: str, partitions: list, ): + """ + Upload staging data to GCS. 
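For reference, a stand-alone approximation of the partition helper documented above. The "data=" part is taken from the hunk; the "hora=" suffix is an assumption based on the Hive-style partitions used elsewhere in these pipelines, since that branch is not visible in this excerpt.

from datetime import datetime

def create_date_hour_partition_sketch(timestamp: datetime, partition_date_only: bool = False) -> str:
    # data=YYYY-MM-DD, optionally followed by the (assumed) hora=HH level
    partition = f"data={timestamp.strftime('%Y-%m-%d')}"
    if not partition_date_only:
        partition += f"/hora={timestamp.strftime('%H')}"
    return partition

print(create_date_hour_partition_sketch(datetime(2023, 9, 27, 14)))  # data=2023-09-27/hora=14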
+ + Args: + error (bool): whether the upstream tasks failed or not + staging_filepath (str): Path to the saved treated .csv file + timestamp (datetime): timestamp for flow run + table_id (str): table_id on BigQuery + dataset_id (str): dataset_id on BigQuery + partitions (list): list of partition strings + + Returns: + None + """ if not error: try: # Creates and publish table if it does not exist, append to it otherwise @@ -883,19 +918,18 @@ def transform_raw_to_nested_structure( timestamp: datetime, primary_key: list = None, ): - """Transform dataframe to nested structure + """ + Task to transform raw data to nested structure Args: - status (dict): Must contain keys - * `data`: dataframe returned from treatement - * `error`: error catched from data treatement - timestamp (datetime): timestamp of the capture - primary_key (list, optional): List of primary keys to be used for nesting. + raw_filepath (str): Path to the saved raw .json file + filepath (str): Path to the saved treated .csv file + error (str): Error catched from upstream tasks + timestamp (datetime): timestamp for flow run + primary_key (list, optional): Primary key to be used on nested structure Returns: - dict: Conatining keys - * `data` (json): nested data - * `error` (str): catched error, if any. Otherwise, returns None + str: Path to the saved treated .csv file """ # Check previous error @@ -1001,10 +1035,10 @@ def create_request_params( Task to create request params Args: - datetime_range (dict): datetime range to get params - table_params (dict): table params to get params - secret_path (str): secret path to get params - dataset_id (str): dataset id to get params + extract_params (dict): extract parameters + table_id (str): table_id on BigQuery + dataset_id (str): dataset_id on BigQuery + timestamp (datetime): timestamp for flow run Returns: request_params: host, database and query to request data @@ -1049,6 +1083,20 @@ def get_raw_from_sources( secret_path: str = None, api_params: dict = None, ): + """ + Task to get raw data from sources + + Args: + source_type (str): source type + local_filepath (str): local filepath + source_path (str, optional): source path. Defaults to None. + table_id (str, optional): table_id on BigQuery. Defaults to None. + secret_path (str, optional): secret path. Defaults to None. + api_params (dict, optional): api parameters. Defaults to None. + + Returns: + error: error + """ source_type, filetype = source_type.split("-", maxsplit=1) if source_type == "api": diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 0fd5c7d6c..801c8d336 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -490,14 +490,14 @@ def get_raw_data_api( # pylint: disable=R0912 Request data from URL API Args: - url (str): URL to send request - secret_path (str, optional): Path to secrets guardeded on Vault, if needed. - params (dict, optional): Params to be sent on request + url (str): URL to request data + secret_path (str, optional): Secret path to get headers. Defaults to None. + api_params (dict, optional): Parameters to pass to API. Defaults to None. + filepath (str, optional): Path to save raw file. Defaults to None. + filetype (str, optional): Filetype to save raw file. Defaults to None. Returns: - dict: Conatining keys - * `data` (json): data result - * `error` (str): catched error, if any. 
Otherwise, returns None + tuple[str, str]: Error and filepath """ error = None try: @@ -530,6 +530,17 @@ def get_raw_data_gcs( local_filepath: str, filename_to_unzip: str = None, ) -> tuple[str, str]: + """ + Get raw data from GCS + + Args: + gcs_path (str): GCS path to get data + local_filepath (str): Local filepath to save raw data + filename_to_unzip (str, optional): Filename to unzip. Defaults to None. + + Returns: + tuple[str, str]: Error and filepath + """ error = None raw_filepath = None @@ -568,10 +579,9 @@ def save_treated_local_func( Save treated file to CSV. Args: - filepath (str): Path which to save treated file - status (dict): Must contain keys - * `data`: dataframe returned from treatement - * `error`: error catched from data treatement + filepath (str): Path to save file + data (pd.DataFrame): Dataframe to save + error (str): Error catched during execution mode (str, optional): Folder to save locally, later folder which to upload to GCS. Returns: @@ -601,9 +611,13 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 Args: dataset_id (str): dataset_id on BigQuery - parent_table_id (str): Parent table id related to the status table - timestamp (str): ISO formatted timestamp string - error (str, optional): String associated with error caught during execution + parent_table_id (str): table_id on BigQuery + timestamp (str): timestamp to get datetime range + error (str): error catched during execution + previous_error (str): previous error catched during execution + recapture (bool): if the execution was a recapture + mode (str): folder to save locally, later folder which to upload to GCS + Returns: None """ @@ -675,6 +689,16 @@ def get_datetime_range( def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataFrame]: + """ + Read raw data from file + + Args: + filepath (str): filepath to read + csv_args (dict): arguments to pass to pandas.read_csv + + Returns: + tuple[str, pd.DataFrame]: error and data + """ try: file_type = filepath.split(".")[-1] diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index 57384f8f4..e37a88d8b 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -714,6 +714,14 @@ def get_credentials_from_env( def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> list: """ Get all blobs from a table in a dataset. + + Args: + dataset_id (str): dataset id + table_id (str): table id + mode (str, optional): mode to use. Defaults to "staging". + + Returns: + list: list of blobs """ bd_storage = bd.Storage(dataset_id=dataset_id, table_id=table_id) @@ -727,6 +735,15 @@ def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> def get_storage_blob( gcs_path: str, ): + """ + Get a blob from a path. 
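Because read_raw_data is touched by several commits in this series, a consolidated sketch of roughly where the helper ends up may help; this is an approximation for illustration, not the verbatim final code.

import traceback
import pandas as pd

def read_raw_data_sketch(filepath: str, csv_args: dict = None) -> tuple:
    # Returns (error, data): dispatches on the file extension and catches any failure
    # as a traceback string instead of raising.
    error, data = None, None
    try:
        file_type = filepath.split(".")[-1]
        if file_type == "json":
            data = pd.read_json(filepath)
        elif file_type in ("txt", "csv"):
            data = pd.read_csv(filepath, **(csv_args or {}))
        else:
            error = "Unsupported raw file extension. Supported only: json, csv and txt"
    except Exception:  # pylint: disable=broad-except
        error = traceback.format_exc()
    return error, data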
+ + Args: + gcs_path (str): path to blob + + Returns: + Blob: blob object + """ bucket = bd.Storage(dataset_id="", table_id="") return ( bucket.client["storage_staging"] From f484b880d54367e375a2ce72b02d9835f20fe4d1 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 16:20:42 -0300 Subject: [PATCH 31/59] adjust get_raw_from_sources return --- pipelines/rj_smtr/tasks.py | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 269ee73eb..023ea2796 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1051,22 +1051,28 @@ def get_raw_from_sources( ): source_type, filetype = source_type.split("-", maxsplit=1) - if source_type == "api": - error, filepath = get_raw_data_api( - url=source_path, - secret_path=secret_path, - api_params=api_params, - filepath=local_filepath, - filetype=filetype, - ) - elif source_type == "gcs": - error, filepath = get_raw_data_gcs( - gcs_path=source_path, - filename_to_unzip=table_id, - local_filepath=local_filepath, - ) - else: - raise NotImplementedError(f"{source_type} not supported") + log(f"Source type: {source_type}") + + try: + if source_type == "api": + error, filepath = get_raw_data_api( + url=source_path, + secret_path=secret_path, + api_params=api_params, + filepath=local_filepath, + filetype=filetype, + ) + elif source_type == "gcs": + error, filepath = get_raw_data_gcs( + gcs_path=source_path, + filename_to_unzip=table_id, + local_filepath=local_filepath, + ) + else: + raise NotImplementedError(f"{source_type} not supported") + except NotImplementedError as exp: + error = exp + filepath = None return error, filepath From 2df4318dc407b58ca6a6c4bf5a3bfad8db7fab37 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 16:41:00 -0300 Subject: [PATCH 32/59] fix errors --- pipelines/rj_smtr/tasks.py | 13 +++++++++++-- pipelines/rj_smtr/utils.py | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 7ff9ee637..9c2ae3be0 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1097,7 +1097,15 @@ def get_raw_from_sources( Returns: error: error """ - source_type, filetype = source_type.split("-", maxsplit=1) + error = None + filepath = None + + source_values = source_type.split("-", maxsplit=1) + source_type = source_values[0] + try: + filetype = source_values[1] + except IndexError: + filetype = None log(f"Source type: {source_type}") @@ -1120,8 +1128,9 @@ def get_raw_from_sources( raise NotImplementedError(f"{source_type} not supported") except NotImplementedError as exp: error = exp - filepath = None + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log(f"Raw extraction ended returned values: {error}, {filepath}") return error, filepath diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 801c8d336..743e955e1 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -699,6 +699,7 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF Returns: tuple[str, pd.DataFrame]: error and data """ + error = None try: file_type = filepath.split(".")[-1] From df6525ac9e946f5a3d3709b768e02f2c26aae1c8 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 16:45:37 -0300 Subject: [PATCH 33/59] change agent label to dev --- pipelines/rj_smtr/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/flows.py 
b/pipelines/rj_smtr/flows.py index a4044933a..27eaa76a4 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -119,5 +119,5 @@ default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_capture_flow.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) From 2983b687fb1910cc1086cb875367493706ed905e Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 10:54:51 -0300 Subject: [PATCH 34/59] refactore source values --- pipelines/rj_smtr/tasks.py | 36 ++++++------------------------------ 1 file changed, 6 insertions(+), 30 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 9c2ae3be0..4a7182daf 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1100,14 +1100,13 @@ def get_raw_from_sources( error = None filepath = None - source_values = source_type.split("-", maxsplit=1) - source_type = source_values[0] - try: - filetype = source_values[1] - except IndexError: - filetype = None + source_values = source_type.split("-", 1) + + source_type, filetype = ( + source_values if len(source_values) == 2 else (source_values[0], None) + ) - log(f"Source type: {source_type}") + log(f"Getting raw data from source type: {source_type}") try: if source_type == "api": @@ -1132,26 +1131,3 @@ def get_raw_from_sources( log(f"Raw extraction ended returned values: {error}, {filepath}") return error, filepath - - -# TODO: passar para função para dentro da transform_raw_to_nested_structure -# @task(checkpoint=False) -# def transform_data_to_json(status: dict, file_type: str, csv_args: dict): -# data = status["data"] -# error = status["error"] - -# if file_type == "json": -# pass - -# # todo: move to data check on specfic API # pylint: disable=W0102 -# # if isinstance(data, dict) and "DescricaoErro" in data.keys(): -# # error = data["DescricaoErro"] - -# elif file_type in ("txt", "csv"): -# if csv_args is None: -# csv_args = {} -# data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") -# else: -# error = "Unsupported raw file extension. 
Supported only: json, csv and txt" - -# return {"data": data, "error": error} From 2c78b09404680d561a5afe5096428cb44a3b8032 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 11:27:23 -0300 Subject: [PATCH 35/59] update constants --- pipelines/rj_smtr/constants.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 722d7e9e1..3afb0b1cd 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -185,6 +185,8 @@ class constants(Enum): # pylint: disable=c0103 "source_type": "api-json", "transacao_run_interval": {"minutes": 1}, "principal_run_interval": {"days": 1}, + "transacao_runs_interval_minutes": 0, + "principal_runs_interval_minutes": 15, } BILHETAGEM_TRANSACAO_CAPTURE_PARAMS = { @@ -205,7 +207,7 @@ class constants(Enum): # pylint: disable=c0103 """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS["transacao_run_interval"], }, - "primary_key": ["id"], + "primary_key": ["id"], # id column to nest data on } BILHETAGEM_CAPTURE_PARAMS = [ @@ -249,7 +251,7 @@ class constants(Enum): # pylint: disable=c0103 "principal_run_interval" ], }, - "primary_key": ["CD_GRUPO"], + "primary_key": ["CD_GRUPO"], # id column to nest data on }, { "table_id": "grupo_linha", From 1f3c2fc307e21e77de206f5ded612a690e8108cf Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 11:28:23 -0300 Subject: [PATCH 36/59] update agent --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index d7f44e3b9..793d37c0d 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -30,7 +30,7 @@ bilhetagem_transacao_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_transacao_captura.schedule = bilhetagem_transacao_schedule @@ -41,6 +41,6 @@ bilhetagem_principal_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_principal_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_principal_captura.schedule = bilhetagem_principal_schedule From 702e70d6ae1341889e333e2d07fc0fec70dd6cef Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 11:30:21 -0300 Subject: [PATCH 37/59] update schedule params --- .../rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index f19f0d8ad..e897286b0 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -26,7 +26,9 @@ dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], - runs_interval_minutes=15, + runs_interval_minutes=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value[ + 
"principal_runs_interval_minutes" + ], ) bilhetagem_principal_schedule = Schedule(clocks=untuple(bilhetagem_principal_clocks)) @@ -42,7 +44,9 @@ dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], - runs_interval_minutes=0, + runs_interval_minutes=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value[ + "transacao_runs_interval_minutes" + ], ) bilhetagem_transacao_schedule = Schedule(clocks=untuple(bilhetagem_transacao_clocks)) From b5712d2746675c4925231382f2cf436da339be94 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 11:42:25 -0300 Subject: [PATCH 38/59] update interval --- pipelines/rj_smtr/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 743e955e1..0972a22c8 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -664,21 +664,21 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 def get_datetime_range( timestamp: datetime, - interval: int, + interval: timedelta, ) -> dict: """ Task to get datetime range in UTC Args: timestamp (datetime): timestamp to get datetime range - interval (int): interval in seconds + interval (timedelta): interval to get datetime range Returns: dict: datetime range """ start = ( - (timestamp - timedelta(seconds=interval)) + (timestamp - timedelta(interval)) .astimezone(tz=pytz.timezone("UTC")) .strftime("%Y-%m-%d %H:%M:%S") ) From e3df22cc2cec64b6fcc7e0258caafdf542c8ab86 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 11:44:39 -0300 Subject: [PATCH 39/59] fix get_datetime_range interval --- pipelines/rj_smtr/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 0972a22c8..7b32e2831 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -678,7 +678,7 @@ def get_datetime_range( """ start = ( - (timestamp - timedelta(interval)) + (timestamp - interval) .astimezone(tz=pytz.timezone("UTC")) .strftime("%Y-%m-%d %H:%M:%S") ) From 6ed06dad2772cb2d4ff32e6a19393d2e24cfe47f Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 12:21:35 -0300 Subject: [PATCH 40/59] remove order by from queries --- pipelines/rj_smtr/constants.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 3afb0b1cd..4f2b1c95a 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -202,8 +202,6 @@ class constants(Enum): # pylint: disable=c0103 WHERE data_processamento BETWEEN '{start}' AND '{end}' - ORDER BY - data_processamento """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS["transacao_run_interval"], }, @@ -223,8 +221,6 @@ class constants(Enum): # pylint: disable=c0103 LINHA WHERE DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ "principal_run_interval" @@ -244,8 +240,6 @@ class constants(Enum): # pylint: disable=c0103 GRUPO WHERE DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ "principal_run_interval" @@ -265,8 +259,6 @@ class constants(Enum): # pylint: disable=c0103 GRUPO_LINHA WHERE DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ "principal_run_interval" @@ -286,8 +278,6 @@ class constants(Enum): # pylint: 
disable=c0103 matriz_integracao WHERE dt_inclusao >= '{start}' - ORDER BY - dt_inclusao """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ "principal_run_interval" From 822c59f256d4e4ff900486a6472145bcbea4b08a Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 12:22:30 -0300 Subject: [PATCH 41/59] fix get_raw_data_api --- pipelines/rj_smtr/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 7b32e2831..445389340 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -506,6 +506,13 @@ def get_raw_data_api( # pylint: disable=R0912 else: headers = get_vault_secret(secret_path)["data"] + # remove from headers, if present + # TODO: remove this before merge to master + remove_headers = ["host", "databases"] + for remove_header in remove_headers: + if remove_header in list(headers.keys()): + del headers[remove_header] + response = requests.get( url, headers=headers, From c58ea9639bcb2812484dd899de6bfd33a776aec9 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 15:41:42 -0300 Subject: [PATCH 42/59] change json read function --- pipelines/rj_smtr/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 445389340..be8ed7bbd 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -707,11 +707,14 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF tuple[str, pd.DataFrame]: error and data """ error = None + data = None try: file_type = filepath.split(".")[-1] if file_type == "json": - data = pd.read_json(filepath) + with open(filepath, "r") as file: + data = json.load(file) + data = pd.DataFrame(data) # data = json.loads(data) elif file_type in ("txt", "csv"): From 045a42368562263938b90a25feffaaed4c83318d Mon Sep 17 00:00:00 2001 From: Carolina Gomes Date: Thu, 28 Sep 2023 16:01:10 -0300 Subject: [PATCH 43/59] update read_raw_data --- pipelines/rj_smtr/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index be8ed7bbd..c0c203dcd 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -713,8 +713,8 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF if file_type == "json": with open(filepath, "r") as file: - data = json.load(file) - data = pd.DataFrame(data) + data = pd.DataFrame.from_dict(json.load(file), orient="records") + # data = json.loads(data) elif file_type in ("txt", "csv"): From d2d188f7491de19ac2554eb465e46829d04e572c Mon Sep 17 00:00:00 2001 From: Carolina Gomes Date: Thu, 28 Sep 2023 16:09:27 -0300 Subject: [PATCH 44/59] update save_raw_local_func --- pipelines/rj_smtr/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index c0c203dcd..20168b039 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -448,7 +448,7 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l def save_raw_local_func( - data: dict, filepath: str, mode: str = "raw", filetype: str = "json" + data: Union[dict, str], filepath: str, mode: str = "raw", filetype: str = "json" ) -> str: """ Saves json response from API to .json file. 
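Putting the two interval fixes from a few commits above together, the datetime-range helper behaves roughly as sketched below. The "end" side is not visible in this excerpt and is assumed to be the capture timestamp formatted the same way, since the queries use the pair in a BETWEEN clause.

from datetime import datetime, timedelta
import pytz

def get_datetime_range_sketch(timestamp: datetime, interval: timedelta) -> dict:
    fmt = "%Y-%m-%d %H:%M:%S"
    start = (timestamp - interval).astimezone(tz=pytz.timezone("UTC")).strftime(fmt)
    end = timestamp.astimezone(tz=pytz.timezone("UTC")).strftime(fmt)  # assumed
    return {"start": start, "end": end}

ts = datetime(2023, 9, 28, 12, 0, tzinfo=pytz.utc)
print(get_datetime_range_sketch(ts, timedelta(minutes=1)))
# {'start': '2023-09-28 11:59:00', 'end': '2023-09-28 12:00:00'}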
@@ -467,6 +467,8 @@ def save_raw_local_func( Path(_filepath).parent.mkdir(parents=True, exist_ok=True) if filetype == "json": + if isinstance(data, dict): + data = json.loads(data) json.dump(data, Path(_filepath).open("w", encoding="utf-8")) # if filetype == "csv": From b7c4e2fe39b2e0d3a613a68ecab8a155787f2292 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 16:18:03 -0300 Subject: [PATCH 45/59] log error --- pipelines/rj_smtr/utils.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 20168b039..6219aaa78 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -9,6 +9,8 @@ from datetime import timedelta, datetime from typing import List, Union +import traceback +import sys import io import json import zipfile @@ -52,6 +54,19 @@ def log_critical(message: str, secret_path: str = constants.CRITICAL_SECRET_PATH return send_discord_message(message=message, webhook_url=url) +def log_error(error: str): + tb = sys.exc_info()[-1] + frame = traceback.extract_tb(tb, 1)[0] + file_name = frame[0] + function_name = frame[2] + line_no = frame[1] + + log( + f"[CATCHED] Task failed in file {file_name} - ({function_name}) line: {line_no} with error: \n{error}", + level="error", + ) + + def create_or_append_table( dataset_id: str, table_id: str, path: str, partitions: str = None ): @@ -728,6 +743,7 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF except Exception as exp: error = exp - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) + # log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, data From 2bedf890ee42187088bfa645d61a0af08598f4f7 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 16:44:41 -0300 Subject: [PATCH 46/59] change raw api extraction for json --- pipelines/rj_smtr/tasks.py | 7 ++++--- pipelines/rj_smtr/utils.py | 14 +++++++++----- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 4a7182daf..be878db21 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -34,6 +34,7 @@ get_datetime_range, read_raw_data, save_treated_local_func, + log_error, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -434,7 +435,7 @@ def get_raw( # pylint: disable=R0912 error = exp if error is not None: - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) return {"data": data, "error": error} @@ -992,7 +993,7 @@ def transform_raw_to_nested_structure( error = exp if error is not None: - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) return error, filepath @@ -1127,7 +1128,7 @@ def get_raw_from_sources( raise NotImplementedError(f"{source_type} not supported") except NotImplementedError as exp: error = exp - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) log(f"Raw extraction ended returned values: {error}, {filepath}") return error, filepath diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 6219aaa78..41b29d41e 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -538,13 +538,17 @@ def get_raw_data_api( # pylint: disable=R0912 ) response.raise_for_status() - filepath = save_raw_local_func( - data=response.text, 
filepath=filepath, filetype=filetype - ) + + if filetype == "json": + data = response.json() + else: + data = response.text + + filepath = save_raw_local_func(data=data, filepath=filepath, filetype=filetype) except Exception as exp: error = exp - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) return error, filepath @@ -591,7 +595,7 @@ def get_raw_data_gcs( except Exception as exp: error = exp - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) return error, raw_filepath From 20b48dfb2950ba513c049e922b8768da9ab03e57 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 16:53:26 -0300 Subject: [PATCH 47/59] change read json function --- pipelines/rj_smtr/utils.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 41b29d41e..9c04ed701 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -55,14 +55,9 @@ def log_critical(message: str, secret_path: str = constants.CRITICAL_SECRET_PATH def log_error(error: str): - tb = sys.exc_info()[-1] - frame = traceback.extract_tb(tb, 1)[0] - file_name = frame[0] - function_name = frame[2] - line_no = frame[1] - + error = traceback.format_exc() log( - f"[CATCHED] Task failed in file {file_name} - ({function_name}) line: {line_no} with error: \n{error}", + f"[CATCHED] Task failed with error: \n{error}", level="error", ) @@ -733,11 +728,9 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF file_type = filepath.split(".")[-1] if file_type == "json": - with open(filepath, "r") as file: - data = pd.DataFrame.from_dict(json.load(file), orient="records") + data = pd.read_json(filepath) # data = json.loads(data) - elif file_type in ("txt", "csv"): if csv_args is None: csv_args = {} From 42c6ac008e6e8f569993c9b0a40958941c0750a0 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 17:45:44 -0300 Subject: [PATCH 48/59] print log traceback --- pipelines/rj_smtr/tasks.py | 23 +++++++++-------------- pipelines/rj_smtr/utils.py | 21 ++++++--------------- 2 files changed, 15 insertions(+), 29 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index be878db21..dd48d2c64 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -34,7 +34,6 @@ get_datetime_range, read_raw_data, save_treated_local_func, - log_error, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -431,11 +430,9 @@ def get_raw( # pylint: disable=R0912 "Unsupported raw file extension. 
Supported only: json, csv and txt" ) - except Exception as exp: - error = exp - - if error is not None: - log_error(error=error) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return {"data": data, "error": error} @@ -989,11 +986,9 @@ def transform_raw_to_nested_structure( # save treated local filepath = save_treated_local_func(data=data, error=error, filepath=filepath) - except Exception as exp: # pylint: disable=W0703 - error = exp - - if error is not None: - log_error(error=error) + except Exception: # pylint: disable=W0703 + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, filepath @@ -1126,9 +1121,9 @@ def get_raw_from_sources( ) else: raise NotImplementedError(f"{source_type} not supported") - except NotImplementedError as exp: - error = exp - log_error(error=error) + except NotImplementedError: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") log(f"Raw extraction ended returned values: {error}, {filepath}") return error, filepath diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 9c04ed701..553bd860a 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -54,14 +54,6 @@ def log_critical(message: str, secret_path: str = constants.CRITICAL_SECRET_PATH return send_discord_message(message=message, webhook_url=url) -def log_error(error: str): - error = traceback.format_exc() - log( - f"[CATCHED] Task failed with error: \n{error}", - level="error", - ) - - def create_or_append_table( dataset_id: str, table_id: str, path: str, partitions: str = None ): @@ -542,8 +534,8 @@ def get_raw_data_api( # pylint: disable=R0912 filepath = save_raw_local_func(data=data, filepath=filepath, filetype=filetype) except Exception as exp: - error = exp - log_error(error=error) + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, filepath @@ -589,8 +581,8 @@ def get_raw_data_gcs( ) except Exception as exp: - error = exp - log_error(error=error) + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, raw_filepath @@ -739,8 +731,7 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF error = "Unsupported raw file extension. 
Supported only: json, csv and txt" except Exception as exp: - error = exp - log_error(error=error) - # log(f"[CATCHED] Task failed with error: \n{error}", level="error") + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, data From 2a1a25d41f18e45db740921fd40ce848e184605c Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 18:52:18 -0300 Subject: [PATCH 49/59] enrich logs --- .../rj_smtr/br_rj_riodejaneiro_rdo/tasks.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py index 1594e33f9..aeca6fb75 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py @@ -172,7 +172,18 @@ def pre_treatment_br_rj_riodejaneiro_rdo( log(f"Received {len(files)} to treat") for file_info in files: log(f"Processing file {files.index(file_info)}") + + log( + f"""rdo_constants.RDO_PRE_TREATMENT_CONFIG is:\n + {rdo_constants.RDO_PRE_TREATMENT_CONFIG.value}""" + ) + log(f"File info is:\n{file_info}") + try: + with open(file_info["raw_path"], "r") as raw_file: + log(f"Opened raw file {file_info['raw_path']}") + log(f"raw_file is:\n{raw_file}") + config = rdo_constants.RDO_PRE_TREATMENT_CONFIG.value[ file_info["transport_mode"] ][file_info["report_type"]] @@ -234,6 +245,12 @@ def pre_treatment_br_rj_riodejaneiro_rdo( raw_paths.append(None) partitions.append(None) status.append({"error": e}) + + log(f"Returning treated paths:\n {treated_paths}") + log(f"Returning raw paths:\n {raw_paths}") + log(f"Returning partitions:\n {partitions}") + log(f"Returning status:\n {status}") + return treated_paths, raw_paths, partitions, status From 0cf71887e47d6415490e9e83631182748a1b3f6d Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Fri, 29 Sep 2023 16:10:51 -0300 Subject: [PATCH 50/59] treat error --- pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py index aeca6fb75..89687f6d2 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py @@ -177,12 +177,21 @@ def pre_treatment_br_rj_riodejaneiro_rdo( f"""rdo_constants.RDO_PRE_TREATMENT_CONFIG is:\n {rdo_constants.RDO_PRE_TREATMENT_CONFIG.value}""" ) + log(f"File info is:\n{file_info}") try: + if file_info["error"] is not None: + log(f"Pre Treatment failed with error: {file_info['error']}") + treated_paths.append(None) + raw_paths.append(None) + partitions.append(None) + status.append({"error": file_info["error"]}) + continue + with open(file_info["raw_path"], "r") as raw_file: log(f"Opened raw file {file_info['raw_path']}") - log(f"raw_file is:\n{raw_file}") + log(f"raw_file is:\n{raw_file.read()}") config = rdo_constants.RDO_PRE_TREATMENT_CONFIG.value[ file_info["transport_mode"] From dbdbffeb772f5011c8b5416d571256b024700cd5 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Fri, 29 Sep 2023 17:24:04 -0300 Subject: [PATCH 51/59] update to connect just once --- .../rj_smtr/br_rj_riodejaneiro_rdo/flows.py | 37 ++++++++++++++----- .../rj_smtr/br_rj_riodejaneiro_rdo/tasks.py | 20 +++++++--- pipelines/rj_smtr/tasks.py | 16 ++++++++ 3 files changed, 59 insertions(+), 14 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py 
b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py index b7be66945..d301f2aab 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py @@ -23,6 +23,7 @@ bq_upload, get_current_timestamp, set_last_run_timestamp, + connect_ftp_task, ) from pipelines.rj_smtr.schedules import every_day @@ -210,13 +211,24 @@ wait=None, ) # EXTRACT + ftp_client = connect_ftp_task( + secret_path=constants.RDO_FTPS_SECRET_PATH.value, connect_flag=True + ) + files = get_file_paths_from_ftp( - transport_mode=transport_mode, report_type=report_type, dump=dump + transport_mode=transport_mode, + report_type=report_type, + dump=dump, + ftp_client=ftp_client, ) download_files = check_files_for_download( files=files, dataset_id=constants.RDO_DATASET_ID.value, table_id=table_id ) - updated_info = download_and_save_local_from_ftp.map(file_info=download_files) + updated_info = download_and_save_local_from_ftp.map( + file_info=download_files, ftp_client=ftp_client + ) + + connect_ftp_task(ftp_client=ftp_client, disconnect_flag=True) # TRANSFORM treated_path, raw_path, partitions, status = pre_treatment_br_rj_riodejaneiro_rdo( files=updated_info @@ -258,13 +270,25 @@ wait=None, ) # EXTRACT + ftp_client = connect_ftp_task( + secret_path=constants.RDO_FTPS_SECRET_PATH.value, connect_flag=True + ) + files = get_file_paths_from_ftp( - transport_mode=transport_mode, report_type=report_type, dump=dump + transport_mode=transport_mode, + report_type=report_type, + dump=dump, + ftp_client=ftp_client, ) download_files = check_files_for_download( files=files, dataset_id=constants.RDO_DATASET_ID.value, table_id=table_id ) - updated_info = download_and_save_local_from_ftp.map(file_info=download_files) + updated_info = download_and_save_local_from_ftp.map( + file_info=download_files, ftp_client=ftp_client + ) + + connect_ftp_task(ftp_client=ftp_client, disconnect_flag=True) + # TRANSFORM treated_path, raw_path, partitions, status = pre_treatment_br_rj_riodejaneiro_rdo( files=updated_info @@ -288,8 +312,3 @@ labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) captura_stpl_rdo.schedule = every_day - - -# captura_sppo_rho = deepcopy(captura_sppo_rdo) -# captura_sppo_rho.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) -# captura_sppo_rho.run_config = KubernetesRun(image=emd_constants.DOCKER_IMAGE.value) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py index 89687f6d2..d23103953 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py @@ -34,7 +34,7 @@ @task def get_file_paths_from_ftp( - transport_mode: str, report_type: str, wait=None, dump=False + transport_mode: str, report_type: str, wait=None, dump=False, ftp_client=None ): # pylint: disable=W0613 """ Search for files inside previous interval (days) from current date, @@ -44,7 +44,9 @@ def get_file_paths_from_ftp( min_timestamp = datetime(2022, 1, 1).timestamp() # set min timestamp for search # Connect to FTP & search files # try: - ftp_client = connect_ftp(constants.RDO_FTPS_SECRET_PATH.value) + if ftp_client is None: + ftp_client = connect_ftp(constants.RDO_FTPS_SECRET_PATH.value) + files_updated_times = { file: datetime.timestamp(parser.parse(info["modify"])) for file, info in ftp_client.mlsd(transport_mode) @@ -105,7 +107,7 @@ def check_files_for_download(files: list, dataset_id: str, table_id: str): @task -def download_and_save_local_from_ftp(file_info: dict): +def 
download_and_save_local_from_ftp(file_info: dict, ftp_client=None): """ Downloads file from FTP and saves to data/raw//. """ @@ -122,6 +124,8 @@ def download_and_save_local_from_ftp(file_info: dict): mode=file_info["transport_mode"], report_type=file_info["report_type"] ) + ftp_client_quit_flag = False + # Set general local path to save file (bucket_modes: raw or staging) file_info[ "local_path" @@ -133,14 +137,20 @@ def download_and_save_local_from_ftp(file_info: dict): Path(file_info["raw_path"]).parent.mkdir(parents=True, exist_ok=True) try: # Get data from FTP - TODO: create get_raw() error alike - ftp_client = connect_ftp(constants.RDO_FTPS_SECRET_PATH.value) + if ftp_client is None: + ftp_client_quit_flag = True + ftp_client = connect_ftp(constants.RDO_FTPS_SECRET_PATH.value) + if not Path(file_info["raw_path"]).is_file(): with open(file_info["raw_path"], "wb") as raw_file: ftp_client.retrbinary( "RETR " + file_info["ftp_path"], raw_file.write, ) - ftp_client.quit() + + if ftp_client_quit_flag: + ftp_client.quit() + # Get timestamp of download time file_info["timestamp_captura"] = pendulum.now( constants.TIMEZONE.value diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index dd48d2c64..e9d360ea7 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -34,6 +34,7 @@ get_datetime_range, read_raw_data, save_treated_local_func, + connect_ftp, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -1127,3 +1128,18 @@ def get_raw_from_sources( log(f"Raw extraction ended returned values: {error}, {filepath}") return error, filepath + + +@task(checkpoint=False) +def connect_ftp_task( + secret_path: str = None, + secure: bool = True, + connect_flag: bool = False, + ftp_client=None, + disconnect_flag: bool = False, +): + if connect_flag: + return connect_ftp(secret_path, secure) + + if disconnect_flag: + ftp_client.quit() From c68d376608135f85951d0a32afdb8eb4bd1a69a1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Oct 2023 18:04:18 +0000 Subject: [PATCH 52/59] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pipelines/rj_smtr/tasks.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index efed417b0..8e8462086 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -744,7 +744,6 @@ def upload_raw_data_to_gcs( Union[str, None]: if there is an error returns it traceback, otherwise returns None """ if error is None: - try: st_obj = Storage(table_id=table_id, dataset_id=dataset_id) log( @@ -773,7 +772,6 @@ def upload_staging_data_to_gcs( dataset_id: str, partitions: list, ) -> Union[str, None]: - """ Upload staging data to GCS. 
@@ -789,7 +787,6 @@ def upload_staging_data_to_gcs( Union[str, None]: if there is an error returns it traceback, otherwise returns None """ if error is None: - try: # Creates and publish table if it does not exist, append to it otherwise create_or_append_table( @@ -812,6 +809,7 @@ def upload_staging_data_to_gcs( return error + ############### # # Daterange tasks @@ -1099,6 +1097,7 @@ def transform_raw_to_nested_structure( return error, filepath + @task(checkpoint=False) def connect_ftp_task( secret_path: str = None, From 8da9b9d3e6d1543f2ac16abe72658520afca0a18 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 2 Oct 2023 15:11:04 -0300 Subject: [PATCH 53/59] update utils --- pipelines/rj_smtr/utils.py | 338 +++++++++++++++++++++++++++++++++++-- 1 file changed, 320 insertions(+), 18 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 9ddf7d687..1d71dd3dd 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -8,12 +8,18 @@ from pathlib import Path from datetime import timedelta, datetime -from typing import List +from typing import List, Union +import traceback import io +import json +import zipfile +import pytz +import requests import basedosdados as bd from basedosdados import Table import pandas as pd -import pytz +from google.cloud.storage.blob import Blob + from prefect.schedules.clocks import IntervalClock @@ -398,46 +404,41 @@ def data_info_str(data: pd.DataFrame): def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-locals - interval: timedelta, + clock_interval: timedelta, labels: List[str], - table_parameters: list, - dataset_id: str, - secret_path: str, + table_parameters: Union[list[dict], dict], runs_interval_minutes: int = 15, start_date: datetime = datetime( 2020, 1, 1, tzinfo=pytz.timezone(emd_constants.DEFAULT_TIMEZONE.value) ), + **general_flow_params, ) -> List[IntervalClock]: """ Generates multiple schedules Args: - interval (timedelta): The interval to run the schedule + clock_interval (timedelta): The interval to run the schedule labels (List[str]): The labels to be added to the schedule - table_parameters (list): The table parameters - dataset_id (str): The dataset_id to be used in the schedule - secret_path (str): The secret path to be used in the schedule + table_parameters (list): The table parameters to iterate over runs_interval_minutes (int, optional): The interval between each schedule. Defaults to 15. start_date (datetime, optional): The start date of the schedule. Defaults to datetime(2020, 1, 1, tzinfo=pytz.timezone(emd_constants.DEFAULT_TIMEZONE.value)). 
- + general_flow_params: Any param that you want to pass to the flow Returns: List[IntervalClock]: The list of schedules """ + if isinstance(table_parameters, dict): + table_parameters = [table_parameters] clocks = [] for count, parameters in enumerate(table_parameters): - parameter_defaults = { - "table_params": parameters, - "dataset_id": dataset_id, - "secret_path": secret_path, - "interval": interval.total_seconds(), - } + parameter_defaults = parameters | general_flow_params + log(f"parameter_defaults: {parameter_defaults}") clocks.append( IntervalClock( - interval=interval, + interval=clock_interval, start_date=start_date + timedelta(minutes=runs_interval_minutes * count), labels=labels, @@ -445,3 +446,304 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l ) ) return clocks + + +def save_raw_local_func( + data: Union[dict, str], filepath: str, mode: str = "raw", filetype: str = "json" +) -> str: + """ + Saves json response from API to .json file. + Args: + filepath (str): Path which to save raw file + status (dict): Must contain keys + * data: json returned from API + * error: error catched from API request + mode (str, optional): Folder to save locally, later folder which to upload to GCS. + Returns: + str: Path to the saved file + """ + + # diferentes tipos de arquivos para salvar + _filepath = filepath.format(mode=mode, filetype=filetype) + Path(_filepath).parent.mkdir(parents=True, exist_ok=True) + + if filetype == "json": + if isinstance(data, dict): + data = json.loads(data) + json.dump(data, Path(_filepath).open("w", encoding="utf-8")) + + # if filetype == "csv": + # pass + if filetype in ("txt", "csv"): + with open(_filepath, "w", encoding="utf-8") as file: + file.write(data) + + log(f"Raw data saved to: {_filepath}") + return _filepath + + +def get_raw_data_api( # pylint: disable=R0912 + url: str, + secret_path: str = None, + api_params: dict = None, + filetype: str = None, +) -> tuple[str, str, str]: + """ + Request data from URL API + + Args: + url (str): URL to request data + secret_path (str, optional): Secret path to get headers. Defaults to None. + api_params (dict, optional): Parameters to pass to API. Defaults to None. + filetype (str, optional): Filetype to save raw file. Defaults to None. + + Returns: + tuple[str, str, str]: Error, data and filetype + """ + error = None + data = None + try: + if secret_path is None: + headers = secret_path + else: + headers = get_vault_secret(secret_path)["data"] + + response = requests.get( + url, + headers=headers, + timeout=constants.MAX_TIMEOUT_SECONDS.value, + params=api_params, + ) + + response.raise_for_status() + + if filetype == "json": + data = response.json() + else: + data = response.text + + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data, filetype + + +def get_upload_storage_blob( + dataset_id: str, + filename: str, +) -> Blob: + """ + Get a blob from upload zone in storage + + Args: + dataset_id (str): The dataset id on BigQuery. + filename (str): The filename in GCS. 
+ + Returns: + Blob: blob object + """ + bucket = bd.Storage(dataset_id="", table_id="") + blob_list = list( + bucket.client["storage_staging"] + .bucket(bucket.bucket_name) + .list_blobs(prefix=f"upload/{dataset_id}/{filename}.") + ) + return blob_list[0] + + +def get_raw_data_gcs( + dataset_id: str, + table_id: str, + zip_filename: str = None, +) -> tuple[str, str, str]: + """ + Get raw data from GCS + + Args: + dataset_id (str): The dataset id on BigQuery. + table_id (str): The table id on BigQuery. + zip_filename (str, optional): The zip file name. Defaults to None. + + Returns: + tuple[str, str, str]: Error, data and filetype + """ + error = None + data = None + filetype = None + + try: + blob_search_name = zip_filename or table_id + blob = get_upload_storage_blob(dataset_id=dataset_id, filename=blob_search_name) + + filename = blob.name + filetype = filename.split(".")[-1] + + data = blob.download_as_bytes() + + if filetype == "zip": + with zipfile.ZipFile(io.BytesIO(data), "r") as zipped_file: + filenames = zipped_file.namelist() + filename = list( + filter(lambda x: x.split(".")[0] == table_id, filenames) + )[0] + filetype = filename.split(".")[-1] + data = zipped_file.read(filename) + + data = data.decode(encoding="utf-8") + + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data, filetype + + +def save_treated_local_func( + filepath: str, data: pd.DataFrame, error: str, mode: str = "staging" +) -> str: + """ + Save treated file to CSV. + + Args: + filepath (str): Path to save file + data (pd.DataFrame): Dataframe to save + error (str): Error catched during execution + mode (str, optional): Folder to save locally, later folder which to upload to GCS. + + Returns: + str: Path to the saved file + """ + _filepath = filepath.format(mode=mode, filetype="csv") + Path(_filepath).parent.mkdir(parents=True, exist_ok=True) + if error is None: + data.to_csv(_filepath, index=False) + log(f"Treated data saved to: {_filepath}") + return _filepath + + +def upload_run_logs_to_bq( # pylint: disable=R0913 + dataset_id: str, + parent_table_id: str, + timestamp: str, + error: str = None, + previous_error: str = None, + recapture: bool = False, + mode: str = "raw", +): + """ + Upload execution status table to BigQuery. + Table is uploaded to the same dataset, named {parent_table_id}_logs. + If passing status_dict, should not pass timestamp and error. 
+ + Args: + dataset_id (str): dataset_id on BigQuery + parent_table_id (str): table_id on BigQuery + timestamp (str): timestamp to get datetime range + error (str): error catched during execution + previous_error (str): previous error catched during execution + recapture (bool): if the execution was a recapture + mode (str): folder to save locally, later folder which to upload to GCS + + Returns: + None + """ + table_id = parent_table_id + "_logs" + # Create partition directory + filename = f"{table_id}_{timestamp.isoformat()}" + partition = f"data={timestamp.date()}" + filepath = Path( + f"""data/{mode}/{dataset_id}/{table_id}/{partition}/{filename}.csv""" + ) + filepath.parent.mkdir(exist_ok=True, parents=True) + # Create dataframe to be uploaded + if not error and recapture is True: + # if the recapture is succeeded, update the column erro + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [True], + "erro": [f"[recapturado]{previous_error}"], + } + ) + log(f"Recapturing {timestamp} with previous error:\n{error}") + else: + # not recapturing or error during flow execution + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [error is None], + "erro": [error], + } + ) + # Save data local + dataframe.to_csv(filepath, index=False) + # Upload to Storage + create_or_append_table( + dataset_id=dataset_id, + table_id=table_id, + path=filepath.as_posix(), + partitions=partition, + ) + if error is not None: + raise Exception(f"Pipeline failed with error: {error}") + + +def get_datetime_range( + timestamp: datetime, + interval: timedelta, +) -> dict: + """ + Task to get datetime range in UTC + + Args: + timestamp (datetime): timestamp to get datetime range + interval (timedelta): interval to get datetime range + + Returns: + dict: datetime range + """ + + start = ( + (timestamp - interval) + .astimezone(tz=pytz.timezone("UTC")) + .strftime("%Y-%m-%d %H:%M:%S") + ) + + end = timestamp.astimezone(tz=pytz.timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") + + return {"start": start, "end": end} + + +def read_raw_data(filepath: str, csv_args: dict = None) -> tuple[str, pd.DataFrame]: + """ + Read raw data from file + + Args: + filepath (str): filepath to read + csv_args (dict): arguments to pass to pandas.read_csv + + Returns: + tuple[str, pd.DataFrame]: error and data + """ + error = None + data = None + try: + file_type = filepath.split(".")[-1] + + if file_type == "json": + data = pd.read_json(filepath) + + # data = json.loads(data) + elif file_type in ("txt", "csv"): + if csv_args is None: + csv_args = {} + data = pd.read_csv(filepath, **csv_args) + else: + error = "Unsupported raw file extension. 
Supported only: json, csv and txt" + + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data From 2732aa93af6afaa545a3beb77808c6e5aaa62e49 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 2 Oct 2023 15:16:31 -0300 Subject: [PATCH 54/59] update utils --- pipelines/utils/utils.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index efc21c133..adf89bc94 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -711,16 +711,24 @@ def get_credentials_from_env( return cred -def get_storage_blobs(dataset_id: str, table_id: str) -> list: +def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> list: """ Get all blobs from a table in a dataset. + + Args: + dataset_id (str): dataset id + table_id (str): table id + mode (str, optional): mode to use. Defaults to "staging". + + Returns: + list: list of blobs """ bd_storage = bd.Storage(dataset_id=dataset_id, table_id=table_id) return list( bd_storage.client["storage_staging"] .bucket(bd_storage.bucket_name) - .list_blobs(prefix=f"staging/{bd_storage.dataset_id}/{bd_storage.table_id}/") + .list_blobs(prefix=f"{mode}/{bd_storage.dataset_id}/{bd_storage.table_id}/") ) From fec51a299f0c0fb0a9f955aab855e29890a45668 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 2 Oct 2023 15:20:05 -0300 Subject: [PATCH 55/59] update constants --- pipelines/rj_smtr/constants.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index d30989743..52e30d9f8 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -290,22 +290,3 @@ class constants(Enum): # pylint: disable=c0103 }, ] BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" - - # GTFS - GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" - GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} - GTFS_CAPTURE_PARAMS = [ - {"table_id": "agency", "primary_key": ["agency_id"]}, - {"table_id": "calendar_dates", "primary_key": ["service_id"]}, - {"table_id": "calendar", "primary_key": ["service_id"]}, - {"table_id": "feed_info", "primary_key": ["feed_publisher_name"]}, - {"table_id": "frequencies", "primary_key": ["trip_id"]}, - {"table_id": "routes", "primary_key": ["route_id"]}, - {"table_id": "shapes", "primary_key": ["shape_id"]}, - {"table_id": "stops", "primary_key": ["stop_id"]}, - {"table_id": "trips", "primary_key": ["trip_id"]}, - {"table_id": "fare_attributes", "primary_key": ["fare_id"]}, - {"table_id": "fare_rules", "primary_key": ["fare_id"]}, - ] - GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} - GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" From a7e47e763f39464d7ee0f4b3104fa6edc530e77e Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 2 Oct 2023 16:44:17 -0300 Subject: [PATCH 56/59] atualiza estrutura de ftp_client --- .../rj_smtr/br_rj_riodejaneiro_rdo/flows.py | 18 ++++++++---------- .../rj_smtr/br_rj_riodejaneiro_rdo/tasks.py | 9 ++++----- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py index d301f2aab..781bcc181 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py @@ -211,15 +211,14 @@ wait=None, ) # EXTRACT - 
ftp_client = connect_ftp_task( - secret_path=constants.RDO_FTPS_SECRET_PATH.value, connect_flag=True - ) + # ftp_client = connect_ftp_task( + # secret_path=constants.RDO_FTPS_SECRET_PATH.value, connect_flag=True + # ) - files = get_file_paths_from_ftp( + files, ftp_client = get_file_paths_from_ftp( transport_mode=transport_mode, report_type=report_type, dump=dump, - ftp_client=ftp_client, ) download_files = check_files_for_download( files=files, dataset_id=constants.RDO_DATASET_ID.value, table_id=table_id @@ -270,15 +269,14 @@ wait=None, ) # EXTRACT - ftp_client = connect_ftp_task( - secret_path=constants.RDO_FTPS_SECRET_PATH.value, connect_flag=True - ) + # ftp_client = connect_ftp_task( + # secret_path=constants.RDO_FTPS_SECRET_PATH.value, connect_flag=True + # ) - files = get_file_paths_from_ftp( + files, ftp_client = get_file_paths_from_ftp( transport_mode=transport_mode, report_type=report_type, dump=dump, - ftp_client=ftp_client, ) download_files = check_files_for_download( files=files, dataset_id=constants.RDO_DATASET_ID.value, table_id=table_id diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py index d23103953..c126d8e1f 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py @@ -32,9 +32,9 @@ from pipelines.utils.utils import log, get_redis_client -@task +@task(nout=2) def get_file_paths_from_ftp( - transport_mode: str, report_type: str, wait=None, dump=False, ftp_client=None + transport_mode: str, report_type: str, wait=None, dump=False ): # pylint: disable=W0613 """ Search for files inside previous interval (days) from current date, @@ -44,8 +44,7 @@ def get_file_paths_from_ftp( min_timestamp = datetime(2022, 1, 1).timestamp() # set min timestamp for search # Connect to FTP & search files # try: - if ftp_client is None: - ftp_client = connect_ftp(constants.RDO_FTPS_SECRET_PATH.value) + ftp_client = connect_ftp(constants.RDO_FTPS_SECRET_PATH.value) files_updated_times = { file: datetime.timestamp(parser.parse(info["modify"])) @@ -75,7 +74,7 @@ def get_file_paths_from_ftp( files = files[:10] log(f"There are {len(files)} files at the FTP") - return files + return files, ftp_client @task From fa6dfbe639a96f03ad2a25112584cd0057a15b89 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 2 Oct 2023 20:32:34 -0300 Subject: [PATCH 57/59] altera para o agent anterior --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 4 ++-- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 793d37c0d..d7f44e3b9 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -30,7 +30,7 @@ bilhetagem_transacao_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_transacao_captura.schedule = bilhetagem_transacao_schedule @@ -41,6 +41,6 @@ bilhetagem_principal_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_principal_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + 
labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_principal_captura.schedule = bilhetagem_principal_schedule diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index e897286b0..2f7804811 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -20,7 +20,7 @@ **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["principal_run_interval"] ), labels=[ - emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, + emd_constants.RJ_SMTR_AGENT_LABEL.value, ], table_parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, @@ -38,7 +38,7 @@ **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["transacao_run_interval"] ), labels=[ - emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, + emd_constants.RJ_SMTR_AGENT_LABEL.value, ], table_parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, From 8be8f0c53712edbe613145ce9e51d588833143e8 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 2 Oct 2023 20:49:27 -0300 Subject: [PATCH 58/59] remove task connect_ftp_task --- pipelines/rj_smtr/tasks.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 8e8462086..a846851b5 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -34,7 +34,6 @@ get_datetime_range, read_raw_data, save_treated_local_func, - connect_ftp, save_raw_local_func, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client @@ -1096,18 +1095,3 @@ def transform_raw_to_nested_structure( log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, filepath - - -@task(checkpoint=False) -def connect_ftp_task( - secret_path: str = None, - secure: bool = True, - connect_flag: bool = False, - ftp_client=None, - disconnect_flag: bool = False, -): - if connect_flag: - return connect_ftp(secret_path, secure) - - if disconnect_flag: - ftp_client.quit() From cb38e921a1813b9235f96c60e8350daae1a343ff Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 2 Oct 2023 20:56:54 -0300 Subject: [PATCH 59/59] cria task download_and_save_list_local_from_ftp --- .../rj_smtr/br_rj_riodejaneiro_rdo/flows.py | 31 ++++----- .../rj_smtr/br_rj_riodejaneiro_rdo/tasks.py | 65 ++++++++++++++++++- 2 files changed, 74 insertions(+), 22 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py index 781bcc181..612b21a96 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py @@ -17,13 +17,13 @@ pre_treatment_br_rj_riodejaneiro_rdo, get_rdo_date_range, update_rdo_redis, + download_and_save_list_local_from_ftp, ) from pipelines.rj_smtr.constants import constants from pipelines.rj_smtr.tasks import ( bq_upload, get_current_timestamp, set_last_run_timestamp, - connect_ftp_task, ) from pipelines.rj_smtr.schedules import every_day @@ -210,12 +210,8 @@ now_time=get_current_timestamp(), wait=None, ) - # EXTRACT - # ftp_client = connect_ftp_task( - # secret_path=constants.RDO_FTPS_SECRET_PATH.value, connect_flag=True - # ) - files, ftp_client = get_file_paths_from_ftp( + files = get_file_paths_from_ftp( transport_mode=transport_mode, report_type=report_type, dump=dump, @@ -223,11 +219,11 @@ download_files = check_files_for_download( files=files, 
dataset_id=constants.RDO_DATASET_ID.value, table_id=table_id ) - updated_info = download_and_save_local_from_ftp.map( - file_info=download_files, ftp_client=ftp_client - ) + # updated_info = download_and_save_local_from_ftp.map( + # file_info=download_files + # ) + updated_info = download_and_save_list_local_from_ftp(files_info=download_files) - connect_ftp_task(ftp_client=ftp_client, disconnect_flag=True) # TRANSFORM treated_path, raw_path, partitions, status = pre_treatment_br_rj_riodejaneiro_rdo( files=updated_info @@ -268,12 +264,8 @@ now_time=get_current_timestamp(), wait=None, ) - # EXTRACT - # ftp_client = connect_ftp_task( - # secret_path=constants.RDO_FTPS_SECRET_PATH.value, connect_flag=True - # ) - files, ftp_client = get_file_paths_from_ftp( + files = get_file_paths_from_ftp( transport_mode=transport_mode, report_type=report_type, dump=dump, @@ -281,11 +273,10 @@ download_files = check_files_for_download( files=files, dataset_id=constants.RDO_DATASET_ID.value, table_id=table_id ) - updated_info = download_and_save_local_from_ftp.map( - file_info=download_files, ftp_client=ftp_client - ) - - connect_ftp_task(ftp_client=ftp_client, disconnect_flag=True) + # updated_info = download_and_save_local_from_ftp.map( + # file_info=download_files + # ) + updated_info = download_and_save_list_local_from_ftp(files_info=download_files) # TRANSFORM treated_path, raw_path, partitions, status = pre_treatment_br_rj_riodejaneiro_rdo( diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py index c126d8e1f..c0704e418 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +# flake8: noqa: E501 """ Tasks for br_rj_riodejaneiro_rdo """ @@ -32,7 +33,7 @@ from pipelines.utils.utils import log, get_redis_client -@task(nout=2) +@task def get_file_paths_from_ftp( transport_mode: str, report_type: str, wait=None, dump=False ): # pylint: disable=W0613 @@ -74,7 +75,9 @@ def get_file_paths_from_ftp( files = files[:10] log(f"There are {len(files)} files at the FTP") - return files, ftp_client + ftp_client.quit() + + return files @task @@ -337,3 +340,61 @@ def get_rdo_date_range(dataset_id: str, table_id: str, mode: str = "prod"): "date_range_start": last_run_date, "date_range_end": pendulum.now(constants.TIMEZONE.value).date().isoformat(), } + + +@task +def download_and_save_list_local_from_ftp(files_info: list) -> list: + """ + Downloads files from FTP and saves to data/raw//. 
+ """ + + file_info_list = [] + ftp_client = connect_ftp(constants.RDO_FTPS_SECRET_PATH.value) + + try: + for file_info in files_info: + if file_info["error"] is not None: + file_info_list.append(file_info) + continue + + dataset_id = constants.RDO_DATASET_ID.value + base_path = f'{os.getcwd()}/{os.getenv("DATA_FOLDER", "data")}/{{bucket_mode}}/{dataset_id}' + + table_id = build_table_id( # mudar pra task + mode=file_info["transport_mode"], report_type=file_info["report_type"] + ) + + # Set general local path to save file (bucket_modes: raw or staging) + file_info[ + "local_path" + ] = f"{base_path}/{table_id}/{file_info['partitions']}/{file_info['filename']}.{{file_ext}}" + # Get raw data + file_info["raw_path"] = file_info["local_path"].format( + bucket_mode="raw", file_ext="txt" + ) + Path(file_info["raw_path"]).parent.mkdir(parents=True, exist_ok=True) + + if not Path(file_info["raw_path"]).is_file(): + with open(file_info["raw_path"], "wb") as raw_file: + ftp_client.retrbinary( + "RETR " + file_info["ftp_path"], + raw_file.write, + ) + + # Get timestamp of download time + file_info["timestamp_captura"] = pendulum.now( + constants.TIMEZONE.value + ).isoformat() + + log(f"Timestamp captura is {file_info['timestamp_captura']}") + log(f"Update file info: {file_info}") + + file_info_list.append(file_info) + + ftp_client.quit() + + except Exception as error: # pylint: disable=W0703 + file_info["error"] = error + file_info_list.append(file_info) + + return file_info_list
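
Note on the refactored generate_execute_schedules (PATCH 53): the helper now takes a list of per-table parameter dicts plus arbitrary **general_flow_params, which are merged into each clock's parameter_defaults and staggered by runs_interval_minutes. The snippet below is a rough usage sketch only, assuming the repository context and Prefect 1.x; the agent label, table ids, dataset id and secret path are placeholders invented for illustration, not values taken from these patches.

from datetime import timedelta

from prefect.schedules import Schedule

from pipelines.rj_smtr.utils import generate_execute_schedules

# Two hypothetical capture tables on an hourly clock, offset 15 minutes apart.
clocks = generate_execute_schedules(
    clock_interval=timedelta(hours=1),
    labels=["example-agent-label"],  # placeholder agent label
    table_parameters=[
        {"table_id": "tabela_a", "primary_key": ["id"]},  # hypothetical tables
        {"table_id": "tabela_b", "primary_key": ["id"]},
    ],
    runs_interval_minutes=15,
    # Any extra keyword argument is forwarded into the parameter_defaults of every clock:
    dataset_id="exemplo_dataset",
    secret_path="exemplo_secret_path",
)

example_schedule = Schedule(clocks=clocks)

Because the extra keyword arguments are merged with the dict union operator (parameters | general_flow_params), this pattern needs Python 3.9+ and lets each flow carry its own fixed parameters without further changes to the helper's signature.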