From a521afc7e3aca0b6ca389052d7551cbe6fcec3c6 Mon Sep 17 00:00:00 2001 From: hellcassius Date: Tue, 27 Jun 2023 12:14:13 -0300 Subject: [PATCH 01/59] add stpl rdo/rho capture --- .../rj_smtr/br_rj_riodejaneiro_rdo/flows.py | 101 +++++++++++++++++- 1 file changed, 99 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py index 55132f4c3..6bc8c3804 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py @@ -97,7 +97,7 @@ ) with Flow( - "SMTR: RHO - Captura", + "SMTR: SPPO RHO - Captura", code_owners=["caio", "fernanda"], ) as captura_sppo_rho: # SETUP @@ -145,7 +145,7 @@ captura_sppo_rho.schedule = every_day with Flow( - "SMTR: RDO - Captura", + "SMTR: SPPO RDO - Captura", code_owners=["caio", "fernanda"], ) as captura_sppo_rdo: # SETUP @@ -193,6 +193,103 @@ captura_sppo_rdo.schedule = every_day +with Flow( + "SMTR: STPL RHO - Captura", + code_owners=["caio", "fernanda"], +) as captura_stpl_rho: + # SETUP + transport_mode = Parameter("transport_mode", "STPL") + report_type = Parameter("report_type", "RHO") + dump = Parameter("dump", False) + table_id = Parameter("table_id", constants.STPL_RHO_TABLE_ID.value) + materialize = Parameter("materialize", False) + + rename_run = rename_current_flow_run_now_time( + prefix=f"Captura FTP - {transport_mode.run()}-{report_type.run()} ", + now_time=get_current_timestamp(), + wait=None, + ) + # EXTRACT + files = get_file_paths_from_ftp( + transport_mode=transport_mode, report_type=report_type, dump=dump + ) + download_files = check_files_for_download( + files=files, dataset_id=constants.RDO_DATASET_ID.value, table_id=table_id + ) + updated_info = download_and_save_local_from_ftp.map(file_info=download_files) + # TRANSFORM + treated_path, raw_path, partitions, status = pre_treatment_br_rj_riodejaneiro_rdo( + files=updated_info + ) + # LOAD + errors = bq_upload.map( + dataset_id=unmapped(constants.RDO_DATASET_ID.value), + table_id=unmapped(table_id), + filepath=treated_path, + raw_filepath=raw_path, + partitions=partitions, + status=status, + ) + set_redis = update_rdo_redis( + download_files=download_files, table_id=table_id, errors=errors + ) + +captura_stpl_rho.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) +captura_stpl_rho.run_config = KubernetesRun( + image=emd_constants.DOCKER_IMAGE.value, + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], +) +captura_stpl_rho.schedule = every_day + +with Flow( + "SMTR: STPL RDO - Captura", + code_owners=["caio", "fernanda"], +) as captura_stpl_rdo: + # SETUP + transport_mode = Parameter("transport_mode", "STPL") + report_type = Parameter("report_type", "RDO") + dump = Parameter("dump", False) + table_id = Parameter("table_id", constants.STPL_RDO_TABLE_ID.value) + materialize = Parameter("materialize", False) + + rename_run = rename_current_flow_run_now_time( + prefix=f"Captura FTP - {transport_mode.run()}-{report_type.run()} ", + now_time=get_current_timestamp(), + wait=None, + ) + # EXTRACT + files = get_file_paths_from_ftp( + transport_mode=transport_mode, report_type=report_type, dump=dump + ) + download_files = check_files_for_download( + files=files, dataset_id=constants.RDO_DATASET_ID.value, table_id=table_id + ) + updated_info = download_and_save_local_from_ftp.map(file_info=download_files) + # TRANSFORM + treated_path, raw_path, partitions, status = pre_treatment_br_rj_riodejaneiro_rdo( + files=updated_info + ) + # LOAD + errors = bq_upload.map( + 
dataset_id=unmapped(constants.RDO_DATASET_ID.value), + table_id=unmapped(table_id), + filepath=treated_path, + raw_filepath=raw_path, + partitions=partitions, + status=status, + ) + set_redis = update_rdo_redis( + download_files=download_files, table_id=table_id, errors=errors + ) + +captura_stpl_rdo.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) +captura_stpl_rdo.run_config = KubernetesRun( + image=emd_constants.DOCKER_IMAGE.value, + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], +) +captura_stpl_rdo.schedule = every_day + + # captura_sppo_rho = deepcopy(captura_sppo_rdo) # captura_sppo_rho.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) # captura_sppo_rho.run_config = KubernetesRun(image=emd_constants.DOCKER_IMAGE.value) From df294ea8062452caab11b4d078fde26fb38e1271 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Tue, 27 Jun 2023 12:28:33 -0300 Subject: [PATCH 02/59] fix pylint set_redis_rdo_files --- pipelines/rj_smtr/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 907496863..a2ce4651b 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -210,8 +210,8 @@ def set_redis_rdo_files(redis_client, dataset_id: str, table_id: str): """ try: content = redis_client.get(f"{dataset_id}.{table_id}")["files"] - except (TypeError) as e: - log(f"Caught error {e}. Will set unexisting key") + except (TypeError) as error: + log(f"Caught error {error}. Will set unexisting key") # set key to empty dict for filling later redis_client.set(f"{dataset_id}.{table_id}", {"files": []}) content = redis_client.get(f"{dataset_id}.{table_id}") From 87fc63906cf4f6276ed335fb6c90680880c6e253 Mon Sep 17 00:00:00 2001 From: hellcassius Date: Tue, 27 Jun 2023 13:30:18 -0300 Subject: [PATCH 03/59] minor fix on error logging --- pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py index af27523c0..987bb8abd 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py @@ -239,7 +239,7 @@ def update_rdo_redis( download_files: list, table_id: str, dataset_id: str = constants.RDO_DATASET_ID.value, - errors=None, + errors: list = None, wait=None, # pylint: disable=W0613 ): """ @@ -260,7 +260,7 @@ def update_rdo_redis( redis_client = get_redis_client() content = redis_client.get(key) # get current redis state if errors: - log(f"Received errors:\n {errors}") + log(f"Received {len(errors)} errors:\n {errors[:10]}\n...") merge_file_info_and_errors(download_files, errors) log(f"content is:\n{content['files'][:5]}") insert_content = [ From cf0067f5e50e70498bd3b1a429937646f9f4d8df Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 6 Jul 2023 16:48:17 -0300 Subject: [PATCH 04/59] Revert "chore: upgrade prefect to 1.4" This reverts commit 857d23f3c24bbd6b8dc5da424eddfaf62249ccff. --- poetry.lock | 415 ++++++++++++++++++++++++++++++++++------ pyproject.toml | 2 +- requirements-deploy.txt | 2 +- requirements-test.txt | 2 +- 4 files changed, 363 insertions(+), 58 deletions(-) diff --git a/poetry.lock b/poetry.lock index 44c88b88d..30de91281 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,9 +1,10 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. 
[[package]] name = "adal" version = "1.2.7" description = "Note: This library is already replaced by MSAL Python, available here: https://pypi.org/project/msal/ .ADAL Python remains available here as a legacy. The ADAL for Python library makes it easy for python application to authenticate to Azure Active Directory (AAD) in order to access AAD protected web resources." +category = "main" optional = false python-versions = "*" files = [ @@ -21,6 +22,7 @@ requests = ">=2.0.0,<3" name = "affine" version = "2.3.1" description = "Matrices describing affine transformation of the plane." +category = "main" optional = false python-versions = "*" files = [ @@ -35,6 +37,7 @@ test = ["coveralls", "flake8", "pydocstyle", "pytest (>=4.6)", "pytest-cov"] name = "aiobotocore" version = "2.4.0" description = "Async client for aws services using botocore and aiohttp" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -56,6 +59,7 @@ boto3 = ["boto3 (>=1.24.59,<1.24.60)"] name = "aiohttp" version = "3.8.3" description = "Async http client/server framework (asyncio)" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -164,6 +168,7 @@ speedups = ["Brotli", "aiodns", "cchardet"] name = "aioitertools" version = "0.11.0" description = "itertools and builtins for AsyncIO and mixed iterables" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -178,6 +183,7 @@ typing_extensions = {version = ">=4.0", markers = "python_version < \"3.10\""} name = "aiokafka" version = "0.7.2" description = "Kafka integration with asyncio." +category = "main" optional = false python-versions = "*" files = [ @@ -214,6 +220,7 @@ snappy = ["python-snappy (>=0.5)"] name = "aiosignal" version = "1.2.0" description = "aiosignal: a list of registered asynchronous callbacks" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -228,6 +235,7 @@ frozenlist = ">=1.1.0" name = "alembic" version = "1.8.1" description = "A database migration tool for SQLAlchemy." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -248,6 +256,7 @@ tz = ["python-dateutil"] name = "anyio" version = "3.6.1" description = "High level compatibility layer for multiple asynchronous event loop implementations" +category = "main" optional = false python-versions = ">=3.6.2" files = [ @@ -268,6 +277,7 @@ trio = ["trio (>=0.16)"] name = "appdirs" version = "1.4.4" description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +category = "main" optional = false python-versions = "*" files = [ @@ -279,6 +289,7 @@ files = [ name = "apscheduler" version = "3.6.3" description = "In-process task scheduler with Cron-like capabilities" +category = "main" optional = false python-versions = "*" files = [ @@ -309,6 +320,7 @@ zookeeper = ["kazoo"] name = "argcomplete" version = "2.0.0" description = "Bash tab completion for argparse" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -323,6 +335,7 @@ test = ["coverage", "flake8", "pexpect", "wheel"] name = "astroid" version = "2.11.7" description = "An abstract syntax tree for Python with inference support." 
+category = "dev" optional = false python-versions = ">=3.6.2" files = [ @@ -340,6 +353,7 @@ wrapt = ">=1.11,<2" name = "async-timeout" version = "4.0.2" description = "Timeout context manager for asyncio programs" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -351,6 +365,7 @@ files = [ name = "atomicwrites" version = "1.4.1" description = "Atomic file writes." +category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -361,6 +376,7 @@ files = [ name = "attrs" version = "22.1.0" description = "Classes Without Boilerplate" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -378,6 +394,7 @@ tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy name = "azure-common" version = "1.1.28" description = "Microsoft Azure Client Library for Python (Common)" +category = "main" optional = false python-versions = "*" files = [ @@ -389,6 +406,7 @@ files = [ name = "azure-core" version = "1.25.1" description = "Microsoft Azure Core Library for Python" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -405,6 +423,7 @@ typing-extensions = ">=4.0.1" name = "azure-graphrbac" version = "0.61.1" description = "Microsoft Azure Graph RBAC Client Library for Python" +category = "main" optional = false python-versions = "*" files = [ @@ -421,6 +440,7 @@ msrestazure = ">=0.4.32,<2.0.0" name = "azure-mgmt-authorization" version = "2.0.0" description = "Microsoft Azure Authorization Management Client Library for Python" +category = "main" optional = false python-versions = "*" files = [ @@ -437,6 +457,7 @@ msrest = ">=0.6.21" name = "azure-mgmt-containerregistry" version = "10.0.0" description = "Microsoft Azure Container Registry Client Library for Python" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -453,6 +474,7 @@ msrest = ">=0.6.21" name = "azure-mgmt-core" version = "1.3.2" description = "Microsoft Azure Management Core Library for Python" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -467,6 +489,7 @@ azure-core = ">=1.24.0,<2.0.0" name = "azure-mgmt-keyvault" version = "10.1.0" description = "Microsoft Azure Key Vault Management Client Library for Python" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -483,6 +506,7 @@ msrest = ">=0.6.21" name = "azure-mgmt-resource" version = "21.1.0" description = "Microsoft Azure Resource Management Client Library for Python" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -499,6 +523,7 @@ msrest = ">=0.6.21" name = "azure-mgmt-storage" version = "20.0.0" description = "Microsoft Azure Storage Management Client Library for Python" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -515,6 +540,7 @@ msrest = ">=0.6.21" name = "azureml-core" version = "1.45.0.post2" description = "Azure Machine Learning core packages, modules, and classes" +category = "main" optional = false python-versions = ">=3.6,< 4.0" files = [ @@ -534,7 +560,7 @@ azure-mgmt-resource = ">=15.0.0,<22.0.0" azure-mgmt-storage = ">=16.0.0,<=20.0.0" "backports.tempfile" = "*" contextlib2 = "<22.0.0" -cryptography = "<1.9 || >1.9,<2.0.dev0 || >=2.3.dev0,<38.0.0" +cryptography = "<1.9 || >1.9,<2.0.0 || >=2.3.0,<38.0.0" docker = "<6.0.0" humanfriendly = ">=4.7,<11.0" jmespath = "<2.0.0" @@ -561,6 +587,7 @@ urllib3 = ">=1.23,<2.0.0" name = "backports.tempfile" version = "1.0" description = "Backport of new features in Python's 
tempfile module" +category = "main" optional = false python-versions = "*" files = [ @@ -575,6 +602,7 @@ files = [ name = "backports.weakref" version = "1.0.post1" description = "Backport of new features in Python's weakref module" +category = "main" optional = false python-versions = "*" files = [ @@ -586,6 +614,7 @@ files = [ name = "backports.zoneinfo" version = "0.2.1" description = "Backport of the standard library zoneinfo module" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -614,6 +643,7 @@ tzdata = ["tzdata"] name = "basedosdados" version = "1.7.0b5" description = "Organizar e facilitar o acesso a dados brasileiros através de tabelas públicas no BigQuery." +category = "main" optional = false python-versions = ">=3.7.1,<3.11" files = [ @@ -647,6 +677,7 @@ tqdm = "4.50.2" name = "bcrypt" version = "4.0.0" description = "Modern password hashing for your software and your servers" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -672,6 +703,7 @@ typecheck = ["mypy"] name = "beautifulsoup4" version = "4.11.1" description = "Screen-scraping library" +category = "main" optional = false python-versions = ">=3.6.0" files = [ @@ -690,6 +722,7 @@ lxml = ["lxml"] name = "black" version = "20.8b1" description = "The uncompromising code formatter." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -714,6 +747,7 @@ d = ["aiohttp (>=3.3.2)", "aiohttp-cors"] name = "boto3" version = "1.24.59" description = "The AWS SDK for Python" +category = "main" optional = false python-versions = ">= 3.7" files = [ @@ -733,6 +767,7 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] name = "botocore" version = "1.27.59" description = "Low-level, data-driven core of boto 3." +category = "main" optional = false python-versions = ">= 3.7" files = [ @@ -752,6 +787,7 @@ crt = ["awscrt (==0.14.0)"] name = "bs4" version = "0.0.1" description = "Dummy package for Beautiful Soup" +category = "main" optional = false python-versions = "*" files = [ @@ -765,6 +801,7 @@ beautifulsoup4 = "*" name = "cachetools" version = "4.2.2" description = "Extensible memoizing collections and decorators" +category = "main" optional = false python-versions = "~=3.5" files = [ @@ -776,6 +813,7 @@ files = [ name = "certifi" version = "2022.9.24" description = "Python package for providing Mozilla's CA Bundle." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -787,6 +825,7 @@ files = [ name = "cffi" version = "1.15.1" description = "Foreign Function Interface for Python calling C code." +category = "main" optional = false python-versions = "*" files = [ @@ -863,6 +902,7 @@ pycparser = "*" name = "cfgv" version = "3.3.1" description = "Validate configuration and produce human readable error messages." +category = "dev" optional = false python-versions = ">=3.6.1" files = [ @@ -874,6 +914,7 @@ files = [ name = "cftime" version = "1.6.2" description = "Time-handling functionality from netcdf4-python" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -907,6 +948,7 @@ numpy = ">1.13.3" name = "charset-normalizer" version = "2.0.12" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
+category = "main" optional = false python-versions = ">=3.5.0" files = [ @@ -921,6 +963,7 @@ unicode-backport = ["unicodedata2"] name = "ckanapi" version = "4.6" description = "A command line interface and Python module for accessing the CKAN Action API" +category = "main" optional = false python-versions = "*" files = [ @@ -938,6 +981,7 @@ six = ">=1.9,<2.0" name = "click" version = "8.0.3" description = "Composable command line interface toolkit" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -952,6 +996,7 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "click-plugins" version = "1.1.1" description = "An extension module for click to enable registering CLI commands via setuptools entry-points." +category = "main" optional = false python-versions = "*" files = [ @@ -969,6 +1014,7 @@ dev = ["coveralls", "pytest (>=3.6)", "pytest-cov", "wheel"] name = "cligj" version = "0.7.2" description = "Click params for commmand line interfaces to GeoJSON" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, <4" files = [ @@ -986,6 +1032,7 @@ test = ["pytest-cov"] name = "cloudpickle" version = "2.2.0" description = "Extended pickling support for Python objects" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -997,6 +1044,7 @@ files = [ name = "colorama" version = "0.4.5" description = "Cross-platform colored terminal text." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -1008,6 +1056,7 @@ files = [ name = "contextlib2" version = "21.6.0" description = "Backports and enhancements for the contextlib module" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1019,6 +1068,7 @@ files = [ name = "coverage" version = "7.0.0" description = "Code coverage measurement for Python" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1085,6 +1135,7 @@ toml = ["tomli"] name = "croniter" version = "1.3.7" description = "croniter provides iteration for datetime object with cron like format" +category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -1099,6 +1150,7 @@ python-dateutil = "*" name = "cryptography" version = "37.0.4" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
+category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1141,6 +1193,7 @@ test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0 name = "cx-oracle" version = "8.3.0" description = "Python interface to Oracle" +category = "main" optional = false python-versions = "*" files = [ @@ -1166,6 +1219,7 @@ files = [ name = "cycler" version = "0.11.0" description = "Composable style cycles" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1177,6 +1231,7 @@ files = [ name = "dask" version = "2022.9.1" description = "Parallel PyData with Task Scheduling" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1204,6 +1259,7 @@ test = ["pandas[test]", "pre-commit", "pytest", "pytest-rerunfailures", "pytest- name = "databricks-cli" version = "0.17.3" description = "A command line interface for Databricks" +category = "main" optional = false python-versions = "*" files = [ @@ -1223,6 +1279,7 @@ tabulate = ">=0.7.7" name = "db-dtypes" version = "1.0.4" description = "Pandas Data Types for SQL systems (BigQuery, Spanner)" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1240,6 +1297,7 @@ pyarrow = ">=3.0.0,<10.0dev" name = "dbt-client" version = "0.1.3" description = "A simple client for DBT RPC instances" +category = "main" optional = false python-versions = ">=3.8,<4.0" files = [ @@ -1254,6 +1312,7 @@ requests = ">=2.26.0,<3.0.0" name = "deprecated" version = "1.2.13" description = "Python @deprecated decorator to deprecate old python classes, functions or methods." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -1271,6 +1330,7 @@ dev = ["PyTest", "PyTest (<5)", "PyTest-Cov", "PyTest-Cov (<2.6)", "bump2version name = "dill" version = "0.3.5.1" description = "serialize all of python" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" files = [ @@ -1285,6 +1345,7 @@ graph = ["objgraph (>=1.7.2)"] name = "distlib" version = "0.3.6" description = "Distribution utilities" +category = "main" optional = false python-versions = "*" files = [ @@ -1296,6 +1357,7 @@ files = [ name = "distributed" version = "2022.9.1" description = "Distributed scheduler for Dask" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1324,6 +1386,7 @@ zict = ">=0.1.3" name = "docker" version = "5.0.3" description = "A Python library for the Docker Engine API." 
+category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1344,6 +1407,7 @@ tls = ["cryptography (>=3.4.7)", "idna (>=2.0.0)", "pyOpenSSL (>=17.5.0)"] name = "docopt" version = "0.6.2" description = "Pythonic argument parser, that will make you smile" +category = "main" optional = false python-versions = "*" files = [ @@ -1354,6 +1418,7 @@ files = [ name = "earthengine-api" version = "0.1.334" description = "Earth Engine Python API" +category = "main" optional = false python-versions = "*" files = [ @@ -1374,6 +1439,7 @@ six = "*" name = "elastic-transport" version = "8.4.0" description = "Transport classes and utilities shared among Python Elastic client libraries" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1392,6 +1458,7 @@ develop = ["aiohttp", "mock", "pytest", "pytest-asyncio", "pytest-cov", "pytest- name = "elasticsearch" version = "8.4.2" description = "Python client for Elasticsearch" +category = "main" optional = false python-versions = ">=3.6, <4" files = [ @@ -1410,6 +1477,7 @@ requests = ["requests (>=2.4.0,<3.0.0)"] name = "entrypoints" version = "0.4" description = "Discover and load entry points from installed packages." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1421,6 +1489,7 @@ files = [ name = "fastapi" version = "0.85.0" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1442,6 +1511,7 @@ test = ["anyio[trio] (>=3.2.1,<4.0.0)", "black (==22.8.0)", "databases[sqlite] ( name = "fastavro" version = "1.5.1" description = "Fast read/write of AVRO files" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1470,6 +1540,7 @@ zstandard = ["zstandard"] name = "filelock" version = "3.8.0" description = "A platform independent file lock." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1485,6 +1556,7 @@ testing = ["covdefaults (>=2.2)", "coverage (>=6.4.2)", "pytest (>=7.1.2)", "pyt name = "fiona" version = "1.8.21" description = "Fiona reads and writes spatial data files" +category = "main" optional = false python-versions = "*" files = [ @@ -1521,6 +1593,7 @@ test = ["boto3 (>=1.2.4)", "mock", "pytest (>=3)", "pytest-cov"] name = "flake8" version = "4.0.1" description = "the modular source code checker: pep8 pyflakes and co" +category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1537,6 +1610,7 @@ pyflakes = ">=2.4.0,<2.5.0" name = "flask" version = "2.2.2" description = "A simple framework for building complex web applications." 
+category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1559,6 +1633,7 @@ dotenv = ["python-dotenv"] name = "fonttools" version = "4.37.3" description = "Tools to manipulate font files" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1584,6 +1659,7 @@ woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"] name = "frozenlist" version = "1.3.1" description = "A list-like structure which implements collections.abc.MutableSequence" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1652,6 +1728,7 @@ files = [ name = "fsspec" version = "2022.8.2" description = "File-system specification" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1686,6 +1763,7 @@ tqdm = ["tqdm"] name = "future" version = "0.18.2" description = "Clean single-source support for Python 3 and 2" +category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -1696,6 +1774,7 @@ files = [ name = "geobr" version = "0.1.10" description = "geobr: Download Official Spatial Data Sets of Brazil" +category = "main" optional = false python-versions = "*" files = [ @@ -1711,6 +1790,7 @@ shapely = ">=1.7.0,<2.0.0" name = "geographiclib" version = "2.0" description = "The geodesic routines from GeographicLib" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1722,6 +1802,7 @@ files = [ name = "geojson" version = "2.5.0" description = "Python bindings and utilities for GeoJSON" +category = "main" optional = false python-versions = "*" files = [ @@ -1733,6 +1814,7 @@ files = [ name = "geojsplit" version = "0.1.2" description = "A python implementation of the npm package geojsplit. Used to split GeoJSON files into smaller pieces." 
+category = "main" optional = false python-versions = ">=3.6,<4.0" files = [ @@ -1752,6 +1834,7 @@ docs = ["sphinx (>=2.2,<3.0)", "sphinx_rtd_theme (>=0.4.3,<0.5.0)"] name = "geopandas" version = "0.7.0" description = "Geographic pandas extensions" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -1769,6 +1852,7 @@ shapely = "*" name = "geopy" version = "2.3.0" description = "Python Geocoding Toolbox" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1792,6 +1876,7 @@ timezone = ["pytz"] name = "gitdb" version = "4.0.9" description = "Git Object Database" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1806,6 +1891,7 @@ smmap = ">=3.0.1,<6" name = "gitpython" version = "3.1.27" description = "GitPython is a python library used to interact with Git repositories" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1820,6 +1906,7 @@ gitdb = ">=4.0.1,<5" name = "google-api-core" version = "1.34.0" description = "Google API client core library" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1844,6 +1931,7 @@ grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0dev)"] name = "google-api-python-client" version = "2.69.0" description = "Google API Client Library for Python" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1852,7 +1940,7 @@ files = [ ] [package.dependencies] -google-api-core = ">=1.31.5,<2.0.dev0 || >2.3.0,<3.0.0dev" +google-api-core = ">=1.31.5,<2.0.0 || >2.3.0,<3.0.0dev" google-auth = ">=1.19.0,<3.0.0dev" google-auth-httplib2 = ">=0.1.0" httplib2 = ">=0.15.0,<1dev" @@ -1862,6 +1950,7 @@ uritemplate = ">=3.0.1,<5" name = "google-auth" version = "2.11.1" description = "Google Authentication Library" +category = "main" optional = false python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*" files = [ @@ -1885,6 +1974,7 @@ reauth = ["pyu2f (>=0.1.5)"] name = "google-auth-httplib2" version = "0.1.0" description = "Google Authentication Library: httplib2 transport" +category = "main" optional = false python-versions = "*" files = [ @@ -1901,6 +1991,7 @@ six = "*" name = "google-auth-oauthlib" version = "0.5.3" description = "Google Authentication Library" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1919,6 +2010,7 @@ tool = ["click (>=6.0.0)"] name = "google-cloud-bigquery" version = "2.30.1" description = "Google BigQuery API client library" +category = "main" optional = false python-versions = ">=3.6, <3.11" files = [ @@ -1950,6 +2042,7 @@ tqdm = ["tqdm (>=4.7.4,<5.0.0dev)"] name = "google-cloud-bigquery-connection" version = "1.7.3" description = "" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1958,7 +2051,7 @@ files = [ ] [package.dependencies] -google-api-core = {version = ">=1.32.0,<2.0.dev0 || >=2.8.dev0,<3.0.0dev", extras = ["grpc"]} +google-api-core = {version = ">=1.32.0,<2.0.0 || >=2.8.0,<3.0.0dev", extras = ["grpc"]} grpc-google-iam-v1 = ">=0.12.4,<1.0.0dev" proto-plus = ">=1.22.0,<2.0.0dev" protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<5.0.0dev" @@ -1967,6 +2060,7 @@ protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4 name = "google-cloud-bigquery-storage" version = "1.1.0" description = "BigQuery Storage API API client library" +category = "main" optional = false python-versions = 
">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*" files = [ @@ -1986,6 +2080,7 @@ pyarrow = ["pyarrow (>=0.15.0)"] name = "google-cloud-core" version = "2.3.2" description = "Google Cloud API client core library" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1994,7 +2089,7 @@ files = [ ] [package.dependencies] -google-api-core = ">=1.31.6,<2.0.dev0 || >2.3.0,<3.0.0dev" +google-api-core = ">=1.31.6,<2.0.0 || >2.3.0,<3.0.0dev" google-auth = ">=1.25.0,<3.0dev" [package.extras] @@ -2004,6 +2099,7 @@ grpc = ["grpcio (>=1.38.0,<2.0dev)"] name = "google-cloud-storage" version = "1.42.3" description = "Google Cloud Storage API client library" +category = "main" optional = false python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*" files = [ @@ -2024,6 +2120,7 @@ six = "*" name = "google-crc32c" version = "1.5.0" description = "A python wrapper of the C library 'Google CRC32C'" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2104,6 +2201,7 @@ testing = ["pytest"] name = "google-resumable-media" version = "2.3.3" description = "Utilities for Google Media Downloads and Resumable Uploads" +category = "main" optional = false python-versions = ">= 3.6" files = [ @@ -2122,6 +2220,7 @@ requests = ["requests (>=2.18.0,<3.0.0dev)"] name = "googleapis-common-protos" version = "1.56.4" description = "Common protobufs used in Google APIs" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2140,6 +2239,7 @@ grpc = ["grpcio (>=1.0.0,<2.0.0dev)"] name = "greenlet" version = "1.1.3" description = "Lightweight in-process concurrent programming" +category = "main" optional = false python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*" files = [ @@ -2206,6 +2306,7 @@ docs = ["Sphinx"] name = "grpc-google-iam-v1" version = "0.12.4" description = "IAM API client library" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -2221,6 +2322,7 @@ grpcio = ">=1.0.0,<2.0.0dev" name = "grpcio" version = "1.49.1" description = "HTTP/2-based RPC framework" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2281,6 +2383,7 @@ protobuf = ["grpcio-tools (>=1.49.1)"] name = "grpcio-status" version = "1.48.2" description = "Status proto mapping for gRPC" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -2297,6 +2400,7 @@ protobuf = ">=3.12.0" name = "gspread" version = "5.5.0" description = "Google Spreadsheets Python API" +category = "main" optional = false python-versions = ">=3.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -2312,6 +2416,7 @@ google-auth-oauthlib = ">=0.4.1" name = "gunicorn" version = "20.1.0" description = "WSGI HTTP Server for UNIX" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -2332,6 +2437,7 @@ tornado = ["tornado (>=0.2)"] name = "h11" version = "0.14.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2343,6 +2449,7 @@ files = [ name = "h5py" version = "3.8.0" description = "Read and write HDF5 files from Python" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2380,6 +2487,7 @@ numpy = ">=1.14.5" name = "haversine" version = "2.8.0" description = "Calculate the distance between 2 points on Earth." 
+category = "main" optional = false python-versions = ">=3.5" files = [ @@ -2391,6 +2499,7 @@ files = [ name = "heapdict" version = "1.0.1" description = "a heap with decrease-key and increase-key operations" +category = "main" optional = false python-versions = "*" files = [ @@ -2402,6 +2511,7 @@ files = [ name = "html5lib" version = "1.1" description = "HTML parser based on the WHATWG HTML specification" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -2423,6 +2533,7 @@ lxml = ["lxml"] name = "httplib2" version = "0.20.4" description = "A comprehensive HTTP client library." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -2437,6 +2548,7 @@ pyparsing = {version = ">=2.4.2,<3.0.0 || >3.0.0,<3.0.1 || >3.0.1,<3.0.2 || >3.0 name = "humanfriendly" version = "10.0" description = "Human friendly output for text interfaces using Python" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -2451,6 +2563,7 @@ pyreadline3 = {version = "*", markers = "sys_platform == \"win32\" and python_ve name = "hvac" version = "0.11.2" description = "HashiCorp Vault API client" +category = "main" optional = false python-versions = ">=2.7" files = [ @@ -2469,6 +2582,7 @@ parser = ["pyhcl (>=0.3.10)"] name = "identify" version = "2.5.5" description = "File identification library for Python" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -2483,6 +2597,7 @@ license = ["ukkonen"] name = "idna" version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -2494,6 +2609,7 @@ files = [ name = "ijson" version = "2.6.1" description = "Iterative JSON parser with a standard Python iterator interface" +category = "main" optional = false python-versions = "*" files = [ @@ -2515,6 +2631,7 @@ files = [ name = "importlib-metadata" version = "4.12.0" description = "Read metadata from Python packages" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2534,6 +2651,7 @@ testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs name = "importlib-resources" version = "5.9.0" description = "Read resources from Python packages" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2552,6 +2670,7 @@ testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", name = "iniconfig" version = "1.1.1" description = "iniconfig: brain-dead simple config-ini parsing" +category = "dev" optional = false python-versions = "*" files = [ @@ -2563,6 +2682,7 @@ files = [ name = "isodate" version = "0.6.1" description = "An ISO 8601 date/time/duration parser and formatter" +category = "main" optional = false python-versions = "*" files = [ @@ -2577,6 +2697,7 @@ six = "*" name = "isort" version = "5.10.1" description = "A Python utility / library to sort Python imports." +category = "dev" optional = false python-versions = ">=3.6.1,<4.0" files = [ @@ -2594,6 +2715,7 @@ requirements-deprecated-finder = ["pip-api", "pipreqs"] name = "itsdangerous" version = "2.1.2" description = "Safely pass data to untrusted environments and back." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2605,6 +2727,7 @@ files = [ name = "jeepney" version = "0.8.0" description = "Low-level, pure Python DBus protocol wrapper." 
+category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2620,6 +2743,7 @@ trio = ["async_generator", "trio"] name = "jinja2" version = "3.0.3" description = "A very fast and expressive template engine." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -2637,6 +2761,7 @@ i18n = ["Babel (>=2.7)"] name = "jmespath" version = "1.0.1" description = "JSON Matching Expressions" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2648,6 +2773,7 @@ files = [ name = "joblib" version = "1.2.0" description = "Lightweight pipelining with Python functions" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2659,6 +2785,7 @@ files = [ name = "jsonpickle" version = "2.2.0" description = "Python library for serializing any arbitrary object graph into JSON" +category = "main" optional = false python-versions = ">=2.7" files = [ @@ -2675,6 +2802,7 @@ testing-libs = ["simplejson", "ujson", "yajl"] name = "kafka-python" version = "2.0.2" description = "Pure Python client for Apache Kafka" +category = "main" optional = false python-versions = "*" files = [ @@ -2689,6 +2817,7 @@ crc32c = ["crc32c"] name = "kaleido" version = "0.2.1" description = "Static image export for web-based visualization libraries with zero dependencies" +category = "main" optional = false python-versions = "*" files = [ @@ -2704,6 +2833,7 @@ files = [ name = "kiwisolver" version = "1.4.4" description = "A fast implementation of the Cassowary constraint solver" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2781,6 +2911,7 @@ files = [ name = "knack" version = "0.9.0" description = "A Command-Line Interface framework" +category = "main" optional = false python-versions = "*" files = [ @@ -2799,6 +2930,7 @@ tabulate = "*" name = "kubernetes" version = "24.2.0" description = "Kubernetes python client" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -2816,7 +2948,7 @@ requests-oauthlib = "*" setuptools = ">=21.0.0" six = ">=1.9.0" urllib3 = ">=1.24.2" -websocket-client = ">=0.32.0,<0.40.0 || >0.40.0,<0.41.dev0 || >=0.43.dev0" +websocket-client = ">=0.32.0,<0.40.0 || >0.40.0,<0.41.0 || >=0.43.0" [package.extras] adal = ["adal (>=1.0.2)"] @@ -2825,6 +2957,7 @@ adal = ["adal (>=1.0.2)"] name = "lazy-object-proxy" version = "1.7.1" description = "A fast and thorough lazy object proxy." +category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -2871,6 +3004,7 @@ files = [ name = "locket" version = "1.0.0" description = "File-based locks for Python on Linux and Windows" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -2882,6 +3016,7 @@ files = [ name = "loguru" version = "0.6.0" description = "Python logging made (stupidly) simple" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -2900,6 +3035,7 @@ dev = ["Sphinx (>=4.1.1)", "black (>=19.10b0)", "colorama (>=0.3.4)", "docutils name = "lxml" version = "4.9.1" description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" files = [ @@ -2985,6 +3121,7 @@ source = ["Cython (>=0.29.7)"] name = "mako" version = "1.2.3" description = "A super-fast templating language that borrows the best ideas from the existing templating languages." 
+category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3000,10 +3137,29 @@ babel = ["Babel"] lingua = ["lingua"] testing = ["pytest"] +[[package]] +name = "markdown" +version = "3.3.7" +description = "Python implementation of Markdown." +category = "dev" +optional = false +python-versions = ">=3.6" +files = [ + {file = "Markdown-3.3.7-py3-none-any.whl", hash = "sha256:f5da449a6e1c989a4cea2631aa8ee67caa5a2ef855d551c88f9e309f4634c621"}, + {file = "Markdown-3.3.7.tar.gz", hash = "sha256:cbb516f16218e643d8e0a95b309f77eb118cb138d39a4f27851e6a63581db874"}, +] + +[package.dependencies] +importlib-metadata = {version = ">=4.4", markers = "python_version < \"3.10\""} + +[package.extras] +testing = ["coverage", "pyyaml"] + [[package]] name = "markupsafe" version = "2.1.1" description = "Safely add untrusted strings to HTML/XML markup." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3053,6 +3209,7 @@ files = [ name = "marshmallow" version = "3.18.0" description = "A lightweight library for converting complex datatypes to and from native Python datatypes." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3073,6 +3230,7 @@ tests = ["pytest", "pytz", "simplejson"] name = "marshmallow-oneofschema" version = "3.0.1" description = "marshmallow multiplexing schema" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3092,6 +3250,7 @@ tests = ["mock", "pytest"] name = "matplotlib" version = "3.5.1" description = "Python plotting package" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3146,6 +3305,7 @@ python-dateutil = ">=2.7" name = "mccabe" version = "0.6.1" description = "McCabe checker, plugin for flake8" +category = "dev" optional = false python-versions = "*" files = [ @@ -3157,6 +3317,7 @@ files = [ name = "mlflow" version = "1.30.0" description = "MLflow: A Platform for ML Development and Productionization" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3210,6 +3371,7 @@ sqlserver = ["mlflow-dbstore"] name = "mlserver" version = "1.1.0" description = "ML server" +category = "main" optional = false python-versions = "*" files = [ @@ -3229,7 +3391,7 @@ py-grpc-prometheus = "*" python-dotenv = "*" starlette-exporter = "*" uvicorn = "*" -uvloop = {version = "*", markers = "(sys_platform != \"win32\" and sys_platform != \"cygwin\") and platform_python_implementation != \"PyPy\""} +uvloop = {version = "*", markers = "sys_platform != \"win32\" and sys_platform != \"cygwin\" and platform_python_implementation != \"PyPy\""} [package.extras] all = ["orjson"] @@ -3238,6 +3400,7 @@ all = ["orjson"] name = "mlserver-mlflow" version = "1.1.0" description = "MLflow runtime for MLServer" +category = "main" optional = false python-versions = "*" files = [ @@ -3253,6 +3416,7 @@ mlserver = "*" name = "more-itertools" version = "8.14.0" description = "More routines for operating on iterables, beyond itertools" +category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -3264,6 +3428,7 @@ files = [ name = "msal" version = "1.19.0" description = "The Microsoft Authentication Library (MSAL) for Python library enables your app to access the Microsoft Cloud by supporting authentication of users with Microsoft Azure Active Directory accounts (AAD) and Microsoft Accounts (MSA) using industry standard OAuth2 and OpenID Connect." 
+category = "main" optional = false python-versions = "*" files = [ @@ -3280,6 +3445,7 @@ requests = ">=2.0.0,<3" name = "msal-extensions" version = "1.0.0" description = "Microsoft Authentication Library extensions (MSAL EX) provides a persistence API that can save your data on disk, encrypted on Windows, macOS and Linux. Concurrent data access will be coordinated by a file lock mechanism." +category = "main" optional = false python-versions = "*" files = [ @@ -3298,6 +3464,7 @@ portalocker = [ name = "msgpack" version = "1.0.4" description = "MessagePack serializer" +category = "main" optional = false python-versions = "*" files = [ @@ -3359,6 +3526,7 @@ files = [ name = "msrest" version = "0.7.1" description = "AutoRest swagger generator Python client runtime." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3380,6 +3548,7 @@ async = ["aiodns", "aiohttp (>=3.0)"] name = "msrestazure" version = "0.6.4" description = "AutoRest swagger generator Python client runtime. Azure-specific module." +category = "main" optional = false python-versions = "*" files = [ @@ -3396,6 +3565,7 @@ six = "*" name = "multidict" version = "6.0.2" description = "multidict implementation" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3464,6 +3634,7 @@ files = [ name = "munch" version = "2.5.0" description = "A dot-accessible dictionary (a la JavaScript objects)" +category = "main" optional = false python-versions = "*" files = [ @@ -3482,6 +3653,7 @@ yaml = ["PyYAML (>=5.1.0)"] name = "mypy-extensions" version = "0.4.3" description = "Experimental type system extensions for programs checked with the mypy typechecker." +category = "main" optional = false python-versions = "*" files = [ @@ -3493,8 +3665,9 @@ files = [ name = "ndg-httpsclient" version = "0.5.1" description = "Provides enhanced HTTPS support for httplib and urllib2 using PyOpenSSL" +category = "main" optional = false -python-versions = ">=2.7,<3.0.dev0 || >=3.4.dev0" +python-versions = ">=2.7,<3.0.0 || >=3.4.0" files = [ {file = "ndg_httpsclient-0.5.1-py2-none-any.whl", hash = "sha256:d2c7225f6a1c6cf698af4ebc962da70178a99bcde24ee6d1961c4f3338130d57"}, {file = "ndg_httpsclient-0.5.1-py3-none-any.whl", hash = "sha256:dd174c11d971b6244a891f7be2b32ca9853d3797a72edb34fa5d7b07d8fff7d4"}, @@ -3509,6 +3682,7 @@ PyOpenSSL = "*" name = "netcdf4" version = "1.5.8" description = "Provides an object-oriented python interface to the netCDF version 4 library." +category = "main" optional = false python-versions = "*" files = [ @@ -3555,6 +3729,7 @@ numpy = ">=1.9" name = "nodeenv" version = "1.7.0" description = "Node.js virtual environment builder" +category = "dev" optional = false python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*" files = [ @@ -3569,6 +3744,7 @@ setuptools = "*" name = "numpy" version = "1.22.0" description = "NumPy is the fundamental package for array computing with Python." +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -3600,6 +3776,7 @@ files = [ name = "oauthlib" version = "3.2.1" description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3616,6 +3793,7 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] name = "opencv-python" version = "4.7.0.72" description = "Wrapper package for OpenCV python bindings." 
+category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3642,6 +3820,7 @@ numpy = [ name = "packaging" version = "21.3" description = "Core utilities for Python packages" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3656,6 +3835,7 @@ pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" name = "pandas" version = "1.5.2" description = "Powerful data structures for data analysis, time series, and statistics" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -3703,6 +3883,7 @@ test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] name = "pandas-gbq" version = "0.17.8" description = "Google BigQuery connector for pandas" +category = "main" optional = false python-versions = ">=3.7, <3.11" files = [ @@ -3712,10 +3893,10 @@ files = [ [package.dependencies] db-dtypes = ">=0.3.1,<2.0.0" -google-api-core = ">=1.31.5,<2.0.dev0 || >2.3.0,<3.0.0dev" +google-api-core = ">=1.31.5,<2.0.0 || >2.3.0,<3.0.0dev" google-auth = ">=1.25.0" google-auth-oauthlib = ">=0.0.1" -google-cloud-bigquery = ">=1.27.2,<2.4.dev0 || >=2.5.dev0,<4.0.0dev" +google-cloud-bigquery = ">=1.27.2,<2.4.0 || >=2.5.0,<4.0.0dev" google-cloud-bigquery-storage = ">=1.1.0,<3.0.0dev" numpy = ">=1.16.6" pandas = ">=0.24.2" @@ -3730,6 +3911,7 @@ tqdm = ["tqdm (>=4.23.0)"] name = "pandas-read-xml" version = "0.3.1" description = "A tool to read XML files as pandas dataframes." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3749,6 +3931,7 @@ zipfile36 = "*" name = "pandavro" version = "1.7.1" description = "The interface between Avro and pandas DataFrame" +category = "main" optional = false python-versions = ">=3.6.1" files = [ @@ -3767,6 +3950,7 @@ tests = ["pytest (==7.1.2)"] name = "paramiko" version = "2.11.0" description = "SSH2 protocol library" +category = "main" optional = false python-versions = "*" files = [ @@ -3790,6 +3974,7 @@ invoke = ["invoke (>=1.3)"] name = "partd" version = "1.3.0" description = "Appendable key-value storage" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3808,6 +3993,7 @@ complete = ["blosc", "numpy (>=1.9.0)", "pandas (>=0.19.0)", "pyzmq"] name = "pathspec" version = "0.10.1" description = "Utility library for gitignore style pattern matching of file paths." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3819,6 +4005,7 @@ files = [ name = "patsy" version = "0.5.3" description = "A Python package for describing statistical models and for building design matrices." +category = "main" optional = false python-versions = "*" files = [ @@ -3837,6 +4024,7 @@ test = ["pytest", "pytest-cov", "scipy"] name = "pdoc3" version = "0.10.0" description = "Auto-generate API documentation for Python projects." +category = "dev" optional = false python-versions = ">= 3.6" files = [ @@ -3844,10 +4032,15 @@ files = [ {file = "pdoc3-0.10.0.tar.gz", hash = "sha256:5f22e7bcb969006738e1aa4219c75a32f34c2d62d46dc9d2fb2d3e0b0287e4b7"}, ] +[package.dependencies] +mako = "*" +markdown = ">=3.0" + [[package]] name = "pendulum" version = "2.1.2" description = "Python datetimes made easy" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -3882,6 +4075,7 @@ pytzdata = ">=2020.1" name = "pexpect" version = "4.8.0" description = "Pexpect allows easy control of interactive console applications." 
+category = "main" optional = false python-versions = "*" files = [ @@ -3896,6 +4090,7 @@ ptyprocess = ">=0.5" name = "phonenumbers" version = "8.13.0" description = "Python version of Google's common library for parsing, formatting, storing and validating international phone numbers." +category = "main" optional = false python-versions = "*" files = [ @@ -3907,6 +4102,7 @@ files = [ name = "pillow" version = "9.3.0" description = "Python Imaging Library (Fork)" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3981,6 +4177,7 @@ tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "pa name = "pkginfo" version = "1.8.3" description = "Query metadatdata from sdists / bdists / installed packages." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ @@ -3995,6 +4192,7 @@ testing = ["coverage", "nose"] name = "platformdirs" version = "2.5.2" description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4010,6 +4208,7 @@ test = ["appdirs (==1.4.4)", "pytest (>=6)", "pytest-cov (>=2.7)", "pytest-mock name = "plotly" version = "5.14.0" description = "An open-source, interactive data visualization library for Python" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -4025,6 +4224,7 @@ tenacity = ">=6.2.0" name = "pluggy" version = "0.13.1" description = "plugin and hook calling mechanisms for python" +category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -4039,6 +4239,7 @@ dev = ["pre-commit", "tox"] name = "portalocker" version = "2.5.1" description = "Wraps the portalocker recipe for easy usage" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -4058,6 +4259,7 @@ tests = ["pytest (>=5.4.1)", "pytest-cov (>=2.8.1)", "pytest-mypy (>=0.8.0)", "p name = "pre-commit" version = "2.20.0" description = "A framework for managing and maintaining multi-language pre-commit hooks." +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -4075,88 +4277,78 @@ virtualenv = ">=20.0.8" [[package]] name = "prefect" -version = "1.4.1" +version = "0.15.9" description = "The Prefect Core automation and scheduling engine." 
+category = "main" optional = false -python-versions = ">=3.7" +python-versions = ">=3.6" files = [ - {file = "prefect-1.4.1-py3-none-any.whl", hash = "sha256:a838d427a88845b13279b89b925e2b6acde5ff2bb090c5480617bc6047a808a8"}, - {file = "prefect-1.4.1.tar.gz", hash = "sha256:179f179849286bb8dc0309c8718a7815e6e5fcc016398d2aea45e16fa0e3471b"}, + {file = "prefect-0.15.9-py3-none-any.whl", hash = "sha256:595c9f229349528f7bcd2aa866c9c10dcfbf059a20803526924339d45604ec76"}, + {file = "prefect-0.15.9.tar.gz", hash = "sha256:52d4d28493cd1a90e1acf96b5a92b2902950849b481a49f762998448a41cf127"}, ] [package.dependencies] -click = ">=7.0" +click = ">=7.0,<9.0" cloudpickle = ">=1.3.0" -croniter = ">=0.3.24" -dask = ">=2021.06.0" -distributed = ">=2.17.0" +croniter = ">=0.3.24,<2.0" +dask = {version = ">=2.17.0", markers = "python_version > \"3.6\""} +distributed = {version = ">=2.17.0", markers = "python_version > \"3.6\""} docker = ">=3.4.1" -importlib-resources = ">=3.0.0" marshmallow = ">=3.0.0b19" marshmallow-oneofschema = ">=2.0.0b2" msgpack = ">=0.6.0" mypy-extensions = ">=0.4.0" -packaging = ">=20.0" pendulum = ">=2.0.4" python-box = ">=5.1.0" python-dateutil = ">=2.7.0" python-slugify = ">=1.2.6" pytz = ">=2018.7" pyyaml = ">=3.13" -requests = ">=2.25" +requests = ">=2.20,<2.27" tabulate = ">=0.8.0" toml = ">=0.9.4" -urllib3 = ">=1.26.0" +urllib3 = ">=1.24.3" [package.extras] -airtable = ["airtable-python-wrapper (>=0.11)"] -all-extras = ["PyGithub (>=1.51)", "PyJWT (>=2.3.0)", "Pygments (>=2.2,<3.0)", "airtable-python-wrapper (>=0.11)", "atlassian-python-api (>=2.0.1)", "azure-core (>=1.10.0)", "azure-cosmos (>=3.1.1)", "azure-identity (>=1.7.0)", "azure-mgmt-datafactory (>=2.7.0)", "azure-storage-blob (>=12.1.0)", "azureml-sdk", "black", "boto3 (>=1.9)", "confluent-kafka (>=1.7.0)", "dask-cloudprovider[aws] (>=0.2.0)", "dask-kubernetes (>=0.8.0)", "dropbox (>=9.0,<10.0)", "dulwich (>=0.19.7)", "feedparser (>=5.0.1)", "firebolt-sdk (>=0.2.1)", "flaky (>=3.0)", "freezegun (>=1.0.0)", "google-auth (>=2.0)", "google-cloud-aiplatform (>=1.4.0)", "google-cloud-bigquery (>=1.6.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13)", "graphviz (>=0.8)", "graphviz (>=0.8.3)", "great-expectations (>=0.13.8)", "gspread (>=3.6.0)", "hvac (>=0.10)", "ipykernel (>=6.9.2)", "jinja2 (>=2.0)", "jinja2 (>=2.0,<4.0)", "jira (>=2.0.0)", "kubernetes (>=9.0.0a1)", "nbconvert (>=6.0.7)", "pandas (>=1.0.1)", "papermill (>=2.2.0)", "paramiko (>=2.10.4)", "prometheus-client (>=0.9.0)", "psycopg2-binary (>=2.8.2)", "pushbullet.py (>=0.11.0)", "py2neo (>=2021.2.3)", "pyarrow (>=5.0.0)", "pydantic (>=1.9.0)", "pyexasol (>=0.16.1)", "pymysql (>=0.9.3)", "pyodbc (>=4.0.30)", "pytest (>=6.0)", "pytest-env (>=0.6.0)", "pytest-xdist (>=2.0)", "python-gitlab (>=2.5.0)", "redis (>=3.2.1)", "responses (>=0.14.0)", "sendgrid (>=6.7.0)", "snowflake-connector-python (>=1.8.2)", "soda-spark (>=0.2.1)", "soda-sql (>=2.0.0b25)", "spacy (>=2.0.0)", "sqlalchemy-redshift (>=0.8.11)", "testfixtures (>=6.10.3)", "toloka-kit (>=0.1.25)", "transform (>=1.0.12)", "tweepy (>=3.5)"] -all-orchestration-extras = ["PyGithub (>=1.51)", "atlassian-python-api (>=2.0.1)", "azure-identity (>=1.7.0)", "azure-storage-blob (>=12.1.0)", "boto3 (>=1.9)", "dulwich (>=0.19.7)", "google-auth (>=2.0)", "google-cloud-aiplatform (>=1.4.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13)", "kubernetes (>=9.0.0a1)", "python-gitlab (>=2.5.0)"] -aws = ["boto3 (>=1.9)"] -azure = ["azure-core (>=1.10.0)", "azure-cosmos 
(>=3.1.1)", "azure-identity (>=1.7.0)", "azure-mgmt-datafactory (>=2.7.0)", "azure-storage-blob (>=12.1.0)"] -azureml = ["azureml-sdk"] -base-library-ci = ["PyGithub (>=1.51)", "PyJWT (>=2.3.0)", "Pygments (>=2.2,<3.0)", "atlassian-python-api (>=2.0.1)", "azure-identity (>=1.7.0)", "azure-storage-blob (>=12.1.0)", "black", "boto3 (>=1.9)", "dulwich (>=0.19.7)", "flaky (>=3.0)", "freezegun (>=1.0.0)", "google-auth (>=2.0)", "google-cloud-aiplatform (>=1.4.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13)", "graphviz (>=0.8)", "jinja2 (>=2.0,<4.0)", "jira (>=2.0.0)", "kubernetes (>=9.0.0a1)", "pandas (>=1.0.1)", "pytest (>=6.0)", "pytest-env (>=0.6.0)", "pytest-xdist (>=2.0)", "python-gitlab (>=2.5.0)", "responses (>=0.14.0)", "testfixtures (>=6.10.3)"] +airtable = ["airtable-python-wrapper (>=0.11,<0.12)"] +all-extras = ["PyGithub (>=1.51,<2.0)", "Pygments (>=2.2,<3.0)", "airtable-python-wrapper (>=0.11,<0.12)", "atlassian-python-api (>=2.0.1)", "azure-cosmos (>=3.1.1,<3.2)", "azure-storage-blob (>=12.1.0,<13.0)", "azureml-sdk (>=1.0.65,<1.1)", "black", "boto3 (>=1.9,<2.0)", "confluent-kafka (>=1.7.0)", "dask-cloudprovider[aws] (>=0.2.0)", "dask-kubernetes (>=0.8.0)", "dropbox (>=9.0,<10.0)", "dulwich (>=0.19.7)", "feedparser (>=5.0.1,<6.0)", "flaky (>=3.0)", "google-auth (>=2.0,<3.0)", "google-cloud-aiplatform (>=1.4.0,<2.0)", "google-cloud-bigquery (>=1.6.0,<3.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13,<2.0)", "graphviz (>=0.8)", "graphviz (>=0.8.3)", "great-expectations (>=0.11.1)", "gspread (>=3.6.0)", "hvac (>=0.10)", "jinja2 (>=2.0,<4.0)", "jira (>=2.0.0)", "kubernetes (>=9.0.0a1,<=13.0)", "mypy (>=0.600,<0.813)", "nbconvert (>=6.0.7)", "pandas (>=1.0.1)", "papermill (>=2.2.0)", "prometheus-client (>=0.9.0)", "psycopg2-binary (>=2.8.2)", "pushbullet.py (>=0.11.0)", "pyarrow (>=5.0.0)", "pyexasol (>=0.16.1)", "pymysql (>=0.9.3)", "pyodbc (>=4.0.30)", "pytest (>=6.0)", "pytest-env (>=0.6.0)", "pytest-xdist (>=2.0)", "python-gitlab (>=2.5.0,<3.0)", "redis (>=3.2.1)", "responses (>=0.14.0)", "sendgrid (>=6.7.0)", "snowflake-connector-python (>=1.8.2,<2.5)", "soda-sql (>=2.0.0b25)", "spacy (>=2.0.0,<3.0.0)", "testfixtures (>=6.10.3)", "tweepy (>=3.5,<4.0)"] +all-orchestration-extras = ["PyGithub (>=1.51,<2.0)", "atlassian-python-api (>=2.0.1)", "azure-storage-blob (>=12.1.0,<13.0)", "boto3 (>=1.9,<2.0)", "dulwich (>=0.19.7)", "google-auth (>=2.0,<3.0)", "google-cloud-aiplatform (>=1.4.0,<2.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13,<2.0)", "kubernetes (>=9.0.0a1,<=13.0)", "python-gitlab (>=2.5.0,<3.0)"] +aws = ["boto3 (>=1.9,<2.0)"] +azure = ["azure-cosmos (>=3.1.1,<3.2)", "azure-storage-blob (>=12.1.0,<13.0)", "azureml-sdk (>=1.0.65,<1.1)"] +base-library-ci = ["PyGithub (>=1.51,<2.0)", "Pygments (>=2.2,<3.0)", "atlassian-python-api (>=2.0.1)", "azure-storage-blob (>=12.1.0,<13.0)", "black", "boto3 (>=1.9,<2.0)", "dulwich (>=0.19.7)", "flaky (>=3.0)", "google-auth (>=2.0,<3.0)", "google-cloud-aiplatform (>=1.4.0,<2.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13,<2.0)", "graphviz (>=0.8)", "jinja2 (>=2.0,<4.0)", "jira (>=2.0.0)", "kubernetes (>=9.0.0a1,<=13.0)", "mypy (>=0.600,<0.813)", "pandas (>=1.0.1)", "pytest (>=6.0)", "pytest-env (>=0.6.0)", "pytest-xdist (>=2.0)", "python-gitlab (>=2.5.0,<3.0)", "responses (>=0.14.0)", "testfixtures (>=6.10.3)"] bitbucket = ["atlassian-python-api (>=2.0.1)"] -cubejs = ["PyJWT (>=2.3.0)"] dask-cloudprovider = 
["dask-cloudprovider[aws] (>=0.2.0)"] -databricks = ["pydantic (>=1.9.0)"] -dev = ["PyJWT (>=2.3.0)", "Pygments (>=2.2,<3.0)", "black", "flaky (>=3.0)", "freezegun (>=1.0.0)", "graphviz (>=0.8)", "jinja2 (>=2.0,<4.0)", "pytest (>=6.0)", "pytest-env (>=0.6.0)", "pytest-xdist (>=2.0)", "responses (>=0.14.0)", "testfixtures (>=6.10.3)"] +dev = ["Pygments (>=2.2,<3.0)", "black", "flaky (>=3.0)", "graphviz (>=0.8)", "jinja2 (>=2.0,<4.0)", "mypy (>=0.600,<0.813)", "pytest (>=6.0)", "pytest-env (>=0.6.0)", "pytest-xdist (>=2.0)", "responses (>=0.14.0)", "testfixtures (>=6.10.3)"] dremio = ["pyarrow (>=5.0.0)"] dropbox = ["dropbox (>=9.0,<10.0)"] exasol = ["pyexasol (>=0.16.1)"] -firebolt = ["firebolt-sdk (>=0.2.1)"] -gcp = ["google-auth (>=2.0)", "google-cloud-aiplatform (>=1.4.0)", "google-cloud-bigquery (>=1.6.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13)"] -ge = ["great-expectations (>=0.13.8)", "sqlalchemy-redshift (>=0.8.11)"] +gcp = ["google-auth (>=2.0,<3.0)", "google-cloud-aiplatform (>=1.4.0,<2.0)", "google-cloud-bigquery (>=1.6.0,<3.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13,<2.0)"] +ge = ["great-expectations (>=0.11.1)"] git = ["dulwich (>=0.19.7)"] -github = ["PyGithub (>=1.51)"] -gitlab = ["python-gitlab (>=2.5.0)"] -google = ["google-auth (>=2.0)", "google-cloud-aiplatform (>=1.4.0)", "google-cloud-bigquery (>=1.6.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13)"] +github = ["PyGithub (>=1.51,<2.0)"] +gitlab = ["python-gitlab (>=2.5.0,<3.0)"] +google = ["google-auth (>=2.0,<3.0)", "google-cloud-aiplatform (>=1.4.0,<2.0)", "google-cloud-bigquery (>=1.6.0,<3.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13,<2.0)"] gsheets = ["gspread (>=3.6.0)"] jira = ["jira (>=2.0.0)"] -jupyter = ["ipykernel (>=6.9.2)", "nbconvert (>=6.0.7)", "papermill (>=2.2.0)"] +jupyter = ["nbconvert (>=6.0.7)", "papermill (>=2.2.0)"] kafka = ["confluent-kafka (>=1.7.0)"] -kubernetes = ["dask-kubernetes (>=0.8.0)", "kubernetes (>=9.0.0a1)"] +kubernetes = ["dask-kubernetes (>=0.8.0)", "kubernetes (>=9.0.0a1,<=13.0)"] mysql = ["pymysql (>=0.9.3)"] -neo4j = ["py2neo (>=2021.2.3)"] pandas = ["pandas (>=1.0.1)"] postgres = ["psycopg2-binary (>=2.8.2)"] prometheus = ["prometheus-client (>=0.9.0)"] pushbullet = ["pushbullet.py (>=0.11.0)"] redis = ["redis (>=3.2.1)"] -rss = ["feedparser (>=5.0.1)"] +rss = ["feedparser (>=5.0.1,<6.0)"] sendgrid = ["sendgrid (>=6.7.0)"] -sftp = ["paramiko (>=2.10.4)"] -snowflake = ["snowflake-connector-python (>=1.8.2)"] -sodaspark = ["soda-spark (>=0.2.1)"] +snowflake = ["snowflake-connector-python (>=1.8.2,<2.5)"] sodasql = ["soda-sql (>=2.0.0b25)"] -spacy = ["spacy (>=2.0.0)"] +spacy = ["spacy (>=2.0.0,<3.0.0)"] sql-server = ["pyodbc (>=4.0.30)"] -task-library-ci = ["PyGithub (>=1.51)", "PyJWT (>=2.3.0)", "Pygments (>=2.2,<3.0)", "airtable-python-wrapper (>=0.11)", "atlassian-python-api (>=2.0.1)", "azure-core (>=1.10.0)", "azure-cosmos (>=3.1.1)", "azure-identity (>=1.7.0)", "azure-mgmt-datafactory (>=2.7.0)", "azure-storage-blob (>=12.1.0)", "azureml-sdk", "black", "boto3 (>=1.9)", "confluent-kafka (>=1.7.0)", "dask-kubernetes (>=0.8.0)", "dropbox (>=9.0,<10.0)", "dulwich (>=0.19.7)", "feedparser (>=5.0.1)", "firebolt-sdk (>=0.2.1)", "flaky (>=3.0)", "freezegun (>=1.0.0)", "google-auth (>=2.0)", "google-cloud-aiplatform (>=1.4.0)", "google-cloud-bigquery (>=1.6.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13)", "graphviz (>=0.8)", 
"graphviz (>=0.8.3)", "great-expectations (>=0.13.8)", "gspread (>=3.6.0)", "hvac (>=0.10)", "ipykernel (>=6.9.2)", "jinja2 (>=2.0)", "jinja2 (>=2.0,<4.0)", "jira (>=2.0.0)", "kubernetes (>=9.0.0a1)", "nbconvert (>=6.0.7)", "pandas (>=1.0.1)", "papermill (>=2.2.0)", "paramiko (>=2.10.4)", "prometheus-client (>=0.9.0)", "psycopg2-binary (>=2.8.2)", "pushbullet.py (>=0.11.0)", "py2neo (>=2021.2.3)", "pyarrow (>=5.0.0)", "pydantic (>=1.9.0)", "pyexasol (>=0.16.1)", "pymysql (>=0.9.3)", "pytest (>=6.0)", "pytest-env (>=0.6.0)", "pytest-xdist (>=2.0)", "python-gitlab (>=2.5.0)", "redis (>=3.2.1)", "responses (>=0.14.0)", "sendgrid (>=6.7.0)", "snowflake-connector-python (>=1.8.2)", "spacy (>=2.0.0)", "sqlalchemy-redshift (>=0.8.11)", "testfixtures (>=6.10.3)", "toloka-kit (>=0.1.25)", "transform (>=1.0.12)", "tweepy (>=3.5)"] -templates = ["jinja2 (>=2.0)"] -test = ["PyJWT (>=2.3.0)", "flaky (>=3.0)", "freezegun (>=1.0.0)", "pytest (>=6.0)", "pytest-env (>=0.6.0)", "pytest-xdist (>=2.0)", "responses (>=0.14.0)", "testfixtures (>=6.10.3)"] -toloka = ["toloka-kit (>=0.1.25)"] -transform = ["transform (>=1.0.12)"] -twitter = ["tweepy (>=3.5)"] +task-library-ci = ["PyGithub (>=1.51,<2.0)", "Pygments (>=2.2,<3.0)", "airtable-python-wrapper (>=0.11,<0.12)", "atlassian-python-api (>=2.0.1)", "azure-cosmos (>=3.1.1,<3.2)", "azure-storage-blob (>=12.1.0,<13.0)", "azureml-sdk (>=1.0.65,<1.1)", "black", "boto3 (>=1.9,<2.0)", "confluent-kafka (>=1.7.0)", "dask-kubernetes (>=0.8.0)", "dropbox (>=9.0,<10.0)", "dulwich (>=0.19.7)", "feedparser (>=5.0.1,<6.0)", "flaky (>=3.0)", "google-auth (>=2.0,<3.0)", "google-cloud-aiplatform (>=1.4.0,<2.0)", "google-cloud-bigquery (>=1.6.0,<3.0)", "google-cloud-secret-manager (>=2.4.0)", "google-cloud-storage (>=1.13,<2.0)", "graphviz (>=0.8)", "graphviz (>=0.8.3)", "great-expectations (>=0.11.1)", "gspread (>=3.6.0)", "hvac (>=0.10)", "jinja2 (>=2.0,<4.0)", "jira (>=2.0.0)", "kubernetes (>=9.0.0a1,<=13.0)", "mypy (>=0.600,<0.813)", "nbconvert (>=6.0.7)", "pandas (>=1.0.1)", "papermill (>=2.2.0)", "prometheus-client (>=0.9.0)", "psycopg2-binary (>=2.8.2)", "pushbullet.py (>=0.11.0)", "pyarrow (>=5.0.0)", "pyexasol (>=0.16.1)", "pymysql (>=0.9.3)", "pytest (>=6.0)", "pytest-env (>=0.6.0)", "pytest-xdist (>=2.0)", "python-gitlab (>=2.5.0,<3.0)", "redis (>=3.2.1)", "responses (>=0.14.0)", "sendgrid (>=6.7.0)", "snowflake-connector-python (>=1.8.2,<2.5)", "soda-sql (>=2.0.0b25)", "spacy (>=2.0.0,<3.0.0)", "testfixtures (>=6.10.3)", "tweepy (>=3.5,<4.0)"] +templates = ["jinja2 (>=2.0,<4.0)"] +test = ["flaky (>=3.0)", "pytest (>=6.0)", "pytest-env (>=0.6.0)", "pytest-xdist (>=2.0)", "responses (>=0.14.0)", "testfixtures (>=6.10.3)"] +twitter = ["tweepy (>=3.5,<4.0)"] vault = ["hvac (>=0.10)"] viz = ["graphviz (>=0.8.3)"] @@ -4164,6 +4356,7 @@ viz = ["graphviz (>=0.8.3)"] name = "prometheus-client" version = "0.14.1" description = "Python client for the Prometheus monitoring system." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -4178,6 +4371,7 @@ twisted = ["twisted"] name = "prometheus-flask-exporter" version = "0.20.3" description = "Prometheus metrics exporter for Flask" +category = "main" optional = false python-versions = "*" files = [ @@ -4193,6 +4387,7 @@ prometheus-client = "*" name = "proto-plus" version = "1.22.1" description = "Beautiful, Pythonic protocol buffers." 
+category = "main" optional = false python-versions = ">=3.6" files = [ @@ -4210,6 +4405,7 @@ testing = ["google-api-core[grpc] (>=1.31.5)"] name = "protobuf" version = "3.20.2" description = "Protocol Buffers" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4241,6 +4437,7 @@ files = [ name = "psutil" version = "5.9.2" description = "Cross-platform lib for process and system monitoring in Python." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -4285,6 +4482,7 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] name = "ptyprocess" version = "0.7.0" description = "Run a subprocess in a pseudo terminal" +category = "main" optional = false python-versions = "*" files = [ @@ -4296,6 +4494,7 @@ files = [ name = "py" version = "1.11.0" description = "library with cross-python path, ini-parsing, io, code, log facilities" +category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -4307,6 +4506,7 @@ files = [ name = "py-grpc-prometheus" version = "0.7.0" description = "Python gRPC Prometheus Interceptors" +category = "main" optional = false python-versions = "*" files = [ @@ -4323,6 +4523,7 @@ setuptools = ">=39.0.1" name = "pyaml" version = "20.4.0" description = "PyYAML-based module to produce pretty and readable YAML-serialized data" +category = "main" optional = false python-versions = "*" files = [ @@ -4337,6 +4538,7 @@ PyYAML = "*" name = "pyarrow" version = "6.0.0" description = "Python library for Apache Arrow" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -4385,6 +4587,7 @@ numpy = ">=1.16.6" name = "pyasn1" version = "0.4.8" description = "ASN.1 types and codecs" +category = "main" optional = false python-versions = "*" files = [ @@ -4396,6 +4599,7 @@ files = [ name = "pyasn1-modules" version = "0.2.8" description = "A collection of ASN.1-based protocols modules." +category = "main" optional = false python-versions = "*" files = [ @@ -4410,6 +4614,7 @@ pyasn1 = ">=0.4.6,<0.5.0" name = "pycodestyle" version = "2.8.0" description = "Python style guide checker" +category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -4421,6 +4626,7 @@ files = [ name = "pycparser" version = "2.21" description = "C parser in Python" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -4432,6 +4638,7 @@ files = [ name = "pydantic" version = "1.10.2" description = "Data validation and settings management using python type hints" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4484,6 +4691,7 @@ email = ["email-validator (>=1.0.3)"] name = "pydata-google-auth" version = "1.4.0" description = "PyData helpers for authenticating to Google APIs" +category = "main" optional = false python-versions = "*" files = [ @@ -4500,6 +4708,7 @@ setuptools = "*" name = "pyflakes" version = "2.4.0" description = "passive checker of Python programs" +category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -4511,6 +4720,7 @@ files = [ name = "pygments" version = "2.13.0" description = "Pygments is a syntax highlighting package written in Python." 
+category = "main" optional = false python-versions = ">=3.6" files = [ @@ -4525,6 +4735,7 @@ plugins = ["importlib-metadata"] name = "pyjwt" version = "2.5.0" description = "JSON Web Token implementation in Python" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4546,6 +4757,7 @@ tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] name = "pylint" version = "2.13.9" description = "python code static checker" +category = "dev" optional = false python-versions = ">=3.6.2" files = [ @@ -4570,6 +4782,7 @@ testutil = ["gitpython (>3)"] name = "pymssql" version = "2.2.5" description = "DB-API interface to Microsoft SQL Server for Python. (new Cython-based version)" +category = "main" optional = false python-versions = "*" files = [ @@ -4630,6 +4843,7 @@ files = [ name = "pymysql" version = "1.0.2" description = "Pure Python MySQL Driver" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -4648,6 +4862,7 @@ rsa = ["cryptography"] name = "pynacl" version = "1.5.0" description = "Python binding to the Networking and Cryptography (NaCl) library" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -4674,6 +4889,7 @@ tests = ["hypothesis (>=3.27.0)", "pytest (>=3.2.1,!=3.3.0)"] name = "pyopenssl" version = "22.0.0" description = "Python wrapper module around the OpenSSL library" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -4692,6 +4908,7 @@ test = ["flaky", "pretend", "pytest (>=3.0.1)"] name = "pyparsing" version = "3.0.9" description = "pyparsing module - Classes and methods to define and execute parsing grammars" +category = "main" optional = false python-versions = ">=3.6.8" files = [ @@ -4706,6 +4923,7 @@ diagrams = ["jinja2", "railroad-diagrams"] name = "pyproj" version = "3.5.0" description = "Python interface to PROJ (cartographic projections and coordinate transformations library)" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -4753,6 +4971,7 @@ certifi = "*" name = "pyreadline3" version = "3.4.1" description = "A python implementation of GNU readline." +category = "main" optional = false python-versions = "*" files = [ @@ -4764,6 +4983,7 @@ files = [ name = "pysftp" version = "0.2.9" description = "A friendly face on SFTP" +category = "main" optional = false python-versions = "*" files = [ @@ -4777,6 +4997,7 @@ paramiko = ">=1.17" name = "pysocks" version = "1.7.1" description = "A Python SOCKS client module. See https://github.com/Anorov/PySocks for more information." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -4789,6 +5010,7 @@ files = [ name = "pytest" version = "6.0.2" description = "pytest: simple powerful testing with Python" +category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -4815,6 +5037,7 @@ testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xm name = "pytest-cov" version = "3.0.0" description = "Pytest plugin for measuring coverage." 
+category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -4833,6 +5056,7 @@ testing = ["fields", "hunter", "process-tests", "pytest-xdist", "six", "virtuale name = "python-box" version = "5.4.1" description = "Advanced Python dictionaries with dot notation access" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -4852,6 +5076,7 @@ yaml = ["ruamel.yaml"] name = "python-dateutil" version = "2.8.2" description = "Extensions to the standard Python datetime module" +category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ @@ -4866,6 +5091,7 @@ six = ">=1.5" name = "python-dotenv" version = "0.21.0" description = "Read key-value pairs from a .env file and set them as environment variables" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4880,6 +5106,7 @@ cli = ["click (>=5.0)"] name = "python-slugify" version = "6.1.2" description = "A Python slugify application that also handles Unicode" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ @@ -4897,6 +5124,7 @@ unidecode = ["Unidecode (>=1.1.1)"] name = "python-string-utils" version = "1.0.0" description = "Utility functions for strings validation and manipulation." +category = "dev" optional = false python-versions = ">=3.5" files = [] @@ -4912,6 +5140,7 @@ resolved_reference = "78929d88d90b1f90cb4837528ed955166bf0f559" name = "python-telegram-bot" version = "13.14" description = "We have made you a wrapper you can't refuse" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4935,6 +5164,7 @@ socks = ["PySocks"] name = "pytz" version = "2021.3" description = "World timezone definitions, modern and historical" +category = "main" optional = false python-versions = "*" files = [ @@ -4946,6 +5176,7 @@ files = [ name = "pytz-deprecation-shim" version = "0.1.0.post0" description = "Shims to make deprecation of pytz easier" +category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ @@ -4961,6 +5192,7 @@ tzdata = {version = "*", markers = "python_version >= \"3.6\""} name = "pytzdata" version = "2020.1" description = "The Olson timezone database for Python." 
+category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -4972,6 +5204,7 @@ files = [ name = "pywin32" version = "227" description = "Python for Window Extensions" +category = "main" optional = false python-versions = "*" files = [ @@ -4993,6 +5226,7 @@ files = [ name = "pyyaml" version = "6.0" description = "YAML parser and emitter for Python" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -5042,6 +5276,7 @@ files = [ name = "querystring-parser" version = "1.2.4" description = "QueryString parser for Python/Django that correctly handles nested dictionaries" +category = "main" optional = false python-versions = "*" files = [ @@ -5056,6 +5291,7 @@ six = "*" name = "rasterio" version = "1.2.10" description = "Fast and direct raster I/O for use with Numpy and SciPy" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -5093,6 +5329,7 @@ test = ["boto3 (>=1.2.4)", "hypothesis", "packaging", "pytest (>=2.8.2)", "pytes name = "redis" version = "4.3.4" description = "Python client for Redis database and key-value store" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -5113,6 +5350,7 @@ ocsp = ["cryptography (>=36.0.1)", "pyopenssl (==20.0.1)", "requests (>=2.26.0)" name = "redis-pal" version = "1.0.0" description = "Store things in Redis without worrying about types or anything, just do it!" +category = "main" optional = false python-versions = ">=3.8,<4.0" files = [ @@ -5128,6 +5366,7 @@ redis = ">=4.0,<5.0" name = "regex" version = "2022.9.13" description = "Alternative regular expression module, to replace re." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -5225,6 +5464,7 @@ files = [ name = "requests" version = "2.26.0" description = "Python HTTP for Humans." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ @@ -5247,6 +5487,7 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<5)"] name = "requests-auth-aws-sigv4" version = "0.7" description = "AWS SigV4 Authentication with the python requests module" +category = "main" optional = false python-versions = ">=2.7, >=3.6" files = [ @@ -5261,6 +5502,7 @@ requests = "*" name = "requests-oauthlib" version = "1.3.1" description = "OAuthlib authentication support for Requests." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -5279,6 +5521,7 @@ rsa = ["oauthlib[signedtoken] (>=3.0.0)"] name = "rioxarray" version = "0.9.0" description = "rasterio xarray extension." 
+category = "main" optional = false python-versions = ">=3.7" files = [ @@ -5301,6 +5544,7 @@ interp = ["scipy"] name = "rsa" version = "4.9" description = "Pure-Python RSA implementation" +category = "main" optional = false python-versions = ">=3.6,<4" files = [ @@ -5315,6 +5559,7 @@ pyasn1 = ">=0.1.3" name = "ruamel.yaml" version = "0.17.10" description = "ruamel.yaml is a YAML parser/emitter that supports roundtrip preservation of comments, seq/map flow style, and map key order" +category = "main" optional = false python-versions = ">=3" files = [ @@ -5333,6 +5578,7 @@ jinja2 = ["ruamel.yaml.jinja2 (>=0.2)"] name = "ruamel.yaml.clib" version = "0.2.6" description = "C version of reader, parser and emitter for ruamel.yaml derived from libyaml" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -5372,6 +5618,7 @@ files = [ name = "s3fs" version = "2022.8.2" description = "Convenient Filesystem interface over S3" +category = "main" optional = false python-versions = ">= 3.7" files = [ @@ -5392,6 +5639,7 @@ boto3 = ["aiobotocore[boto3] (>=2.4.0,<2.5.0)"] name = "s3transfer" version = "0.6.0" description = "An Amazon S3 Transfer Manager" +category = "main" optional = false python-versions = ">= 3.7" files = [ @@ -5409,6 +5657,7 @@ crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"] name = "scikit-learn" version = "1.2.2" description = "A set of python modules for machine learning and data mining" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -5451,6 +5700,7 @@ tests = ["black (>=22.3.0)", "flake8 (>=3.8.2)", "matplotlib (>=3.1.3)", "mypy ( name = "scipy" version = "1.8.1" description = "SciPy: Scientific Library for Python" +category = "main" optional = false python-versions = ">=3.8,<3.11" files = [ @@ -5486,6 +5736,7 @@ numpy = ">=1.17.3,<1.25.0" name = "scipy" version = "1.9.1" description = "SciPy: Scientific Library for Python" +category = "main" optional = false python-versions = ">=3.8,<3.12" files = [ @@ -5521,6 +5772,7 @@ numpy = ">=1.18.5,<1.25.0" name = "scp" version = "0.14.4" description = "scp module for paramiko" +category = "main" optional = false python-versions = "*" files = [ @@ -5535,6 +5787,7 @@ paramiko = "*" name = "secretstorage" version = "3.3.3" description = "Python bindings to FreeDesktop.org Secret Service API" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -5550,6 +5803,7 @@ jeepney = ">=0.6" name = "setuptools" version = "65.6.3" description = "Easily download, build, install, upgrade, and uninstall Python packages" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -5566,6 +5820,7 @@ testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs ( name = "shapely" version = "1.8.4" description = "Geometric objects, predicates, and operations" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -5614,6 +5869,7 @@ vectorized = ["numpy"] name = "simplejson" version = "3.18.0" description = "Simple, fast, extensible JSON encoder/decoder for Python" +category = "main" optional = false python-versions = ">=2.5, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -5684,6 +5940,7 @@ files = [ name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -5695,6 +5952,7 @@ files = [ name = "smmap" version = "5.0.0" description = "A pure Python implementation of a sliding window memory map manager" +category = "main" 
optional = false python-versions = ">=3.6" files = [ @@ -5706,6 +5964,7 @@ files = [ name = "sniffio" version = "1.3.0" description = "Sniff out which async library your code is running under" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -5717,6 +5976,7 @@ files = [ name = "snuggs" version = "1.4.7" description = "Snuggs are s-expressions for Numpy" +category = "main" optional = false python-versions = "*" files = [ @@ -5735,6 +5995,7 @@ test = ["hypothesis", "pytest"] name = "sortedcontainers" version = "2.4.0" description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" +category = "main" optional = false python-versions = "*" files = [ @@ -5746,6 +6007,7 @@ files = [ name = "soupsieve" version = "2.3.2.post1" description = "A modern CSS selector implementation for Beautiful Soup." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -5757,6 +6019,7 @@ files = [ name = "sqlalchemy" version = "1.4.41" description = "Database Abstraction Library" +category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ @@ -5804,7 +6067,7 @@ files = [ ] [package.dependencies] -greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\")"} +greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"} [package.extras] aiomysql = ["aiomysql", "greenlet (!=0.4.17)"] @@ -5831,6 +6094,7 @@ sqlcipher = ["sqlcipher3-binary"] name = "sqlparse" version = "0.4.3" description = "A non-validating SQL parser." +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -5842,6 +6106,7 @@ files = [ name = "starlette" version = "0.20.4" description = "The little ASGI library that shines." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -5860,6 +6125,7 @@ full = ["itsdangerous", "jinja2", "python-multipart", "pyyaml", "requests"] name = "starlette-exporter" version = "0.14.0" description = "Prometheus metrics exporter for Starlette applications." 
+category = "main" optional = false python-versions = "*" files = [ @@ -5875,6 +6141,7 @@ starlette = "*" name = "statsmodels" version = "0.13.2" description = "Statistical computations and models for Python" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -5919,6 +6186,7 @@ docs = ["ipykernel", "jupyter-client", "matplotlib", "nbconvert", "nbformat", "n name = "statsmodels" version = "0.13.5" description = "Statistical computations and models for Python" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -5956,7 +6224,7 @@ pandas = ">=0.25" patsy = ">=0.5.2" scipy = [ {version = ">=1.3", markers = "(python_version > \"3.9\" or platform_system != \"Windows\" or platform_machine != \"x86\") and python_version < \"3.12\""}, - {version = ">=1.3,<1.9", markers = "(python_version == \"3.8\" or python_version == \"3.9\") and platform_system == \"Windows\" and platform_machine == \"x86\""}, + {version = ">=1.3,<1.9", markers = "python_version == \"3.8\" and platform_system == \"Windows\" and platform_machine == \"x86\" or python_version == \"3.9\" and platform_system == \"Windows\" and platform_machine == \"x86\""}, ] [package.extras] @@ -5968,6 +6236,7 @@ docs = ["ipykernel", "jupyter-client", "matplotlib", "nbconvert", "nbformat", "n name = "tabulate" version = "0.8.10" description = "Pretty-print tabular data" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -5982,6 +6251,7 @@ widechars = ["wcwidth"] name = "tblib" version = "1.7.0" description = "Traceback serialization library." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -5993,6 +6263,7 @@ files = [ name = "tenacity" version = "8.2.2" description = "Retry code until it succeeds" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -6007,6 +6278,7 @@ doc = ["reno", "sphinx", "tornado (>=4.5)"] name = "text-unidecode" version = "1.3" description = "The most basic Text::Unidecode port" +category = "main" optional = false python-versions = "*" files = [ @@ -6018,6 +6290,7 @@ files = [ name = "threadpoolctl" version = "3.1.0" description = "threadpoolctl" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -6029,6 +6302,7 @@ files = [ name = "toml" version = "0.10.2" description = "Python Library for Tom's Obvious, Minimal Language" +category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -6040,6 +6314,7 @@ files = [ name = "tomli" version = "2.0.1" description = "A lil' TOML parser" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -6051,6 +6326,7 @@ files = [ name = "tomlkit" version = "0.7.0" description = "Style preserving TOML library" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -6062,6 +6338,7 @@ files = [ name = "toolz" version = "0.12.0" description = "List processing tools and functional utilities" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -6073,6 +6350,7 @@ files = [ name = "tornado" version = "6.1" description = "Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed." 
+category = "main" optional = false python-versions = ">= 3.5" files = [ @@ -6123,6 +6401,7 @@ files = [ name = "tqdm" version = "4.50.2" description = "Fast, Extensible Progress Meter" +category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*" files = [ @@ -6137,6 +6416,7 @@ dev = ["argopt", "py-make (>=0.1.0)", "pydoc-markdown", "twine"] name = "tweepy" version = "4.4.0" description = "Twitter library for Python" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -6158,6 +6438,7 @@ test = ["vcrpy (>=1.10.3)"] name = "typed-ast" version = "1.5.4" description = "a fork of Python 2 and 3 ast modules with type comment support" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -6191,6 +6472,7 @@ files = [ name = "typer" version = "0.4.2" description = "Typer, build great CLIs. Easy to code. Based on Python type hints." +category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -6211,6 +6493,7 @@ test = ["black (>=22.3.0,<23.0.0)", "coverage (>=5.2,<6.0)", "isort (>=5.0.6,<6. name = "types-cryptography" version = "3.3.23" description = "Typing stubs for cryptography" +category = "main" optional = false python-versions = "*" files = [ @@ -6222,6 +6505,7 @@ files = [ name = "typing-extensions" version = "4.3.0" description = "Backported and Experimental Type Hints for Python 3.7+" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -6233,6 +6517,7 @@ files = [ name = "tzdata" version = "2022.4" description = "Provider of IANA time zone data" +category = "main" optional = false python-versions = ">=2" files = [ @@ -6244,6 +6529,7 @@ files = [ name = "tzlocal" version = "4.2" description = "tzinfo object for the local timezone" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -6264,6 +6550,7 @@ test = ["pytest (>=4.3)", "pytest-mock (>=3.3)"] name = "unidecode" version = "1.3.6" description = "ASCII transliterations of Unicode text" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -6275,6 +6562,7 @@ files = [ name = "uritemplate" version = "4.1.1" description = "Implementation of RFC 6570 URI Templates" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -6286,6 +6574,7 @@ files = [ name = "urllib3" version = "1.26.12" description = "HTTP library with thread-safe connection pooling, file post, and more." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4" files = [ @@ -6302,6 +6591,7 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] name = "uvicorn" version = "0.18.3" description = "The lightning-fast ASGI server." 
+category = "main" optional = false python-versions = ">=3.7" files = [ @@ -6320,6 +6610,7 @@ standard = ["colorama (>=0.4)", "httptools (>=0.4.0)", "python-dotenv (>=0.13)", name = "uvloop" version = "0.17.0" description = "Fast implementation of asyncio event loop on top of libuv" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -6364,6 +6655,7 @@ test = ["Cython (>=0.29.32,<0.30.0)", "aiohttp", "flake8 (>=3.9.2,<3.10.0)", "my name = "virtualenv" version = "20.16.5" description = "Virtual Python Environment builder" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -6384,6 +6676,7 @@ testing = ["coverage (>=6.2)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7 name = "waitress" version = "2.1.2" description = "Waitress WSGI server" +category = "main" optional = false python-versions = ">=3.7.0" files = [ @@ -6399,6 +6692,7 @@ testing = ["coverage (>=5.0)", "pytest", "pytest-cover"] name = "webencodings" version = "0.5.1" description = "Character encoding aliases for legacy web content" +category = "main" optional = false python-versions = "*" files = [ @@ -6410,6 +6704,7 @@ files = [ name = "websocket-client" version = "1.4.1" description = "WebSocket client for Python with low level API options" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -6426,6 +6721,7 @@ test = ["websockets"] name = "werkzeug" version = "2.2.2" description = "The comprehensive WSGI web application library." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -6443,6 +6739,7 @@ watchdog = ["watchdog"] name = "win32-setctime" version = "1.1.0" description = "A small Python utility to set file creation time on Windows" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -6457,6 +6754,7 @@ dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"] name = "wrapt" version = "1.14.1" description = "Module for decorators, wrappers and monkey patching." 
+category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" files = [ @@ -6530,6 +6828,7 @@ files = [ name = "xarray" version = "2022.12.0" description = "N-D labeled arrays and datasets in Python" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -6554,6 +6853,7 @@ viz = ["matplotlib", "nc-time-axis", "seaborn"] name = "xgboost" version = "1.7.4" description = "XGBoost Python Package" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -6581,6 +6881,7 @@ scikit-learn = ["scikit-learn"] name = "xmltodict" version = "0.13.0" description = "Makes working with XML feel like you are working with JSON" +category = "main" optional = false python-versions = ">=3.4" files = [ @@ -6592,6 +6893,7 @@ files = [ name = "yarl" version = "1.8.1" description = "Yet another URL library" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -6664,6 +6966,7 @@ multidict = ">=4.0" name = "zict" version = "2.2.0" description = "Mutable mapping tools" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -6678,6 +6981,7 @@ heapdict = "*" name = "zipfile36" version = "0.1.3" description = "Read and write ZIP files - backport of the zipfile module from Python 3.6" +category = "main" optional = false python-versions = "*" files = [ @@ -6689,6 +6993,7 @@ files = [ name = "zipp" version = "3.8.1" description = "Backport of pathlib-compatible object wrapper for zip files" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -6703,4 +7008,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.11" -content-hash = "0a3a2f0a8b8edd2dc52008104299b57b4d924d89b6e4ea9cd85db9f740c6ad12" +content-hash = "d97502ee5e3adfbb7720ca8a746e722b3bbe97b1ed2fcd4122fa9c6f38a67496" diff --git a/pyproject.toml b/pyproject.toml index 865edb4b7..805fd7f8a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ pandas = "1.5.2" pandas-read-xml = "^0.3.1" pendulum = "2.1.2" phonenumbers = "^8.12.57" -prefect = "1.4.1" +prefect = "0.15.9" pymssql = "^2.2.4" python = ">=3.8,<3.11" python-telegram-bot = "^13.11" diff --git a/requirements-deploy.txt b/requirements-deploy.txt index b01918fe5..bf5e18d7d 100644 --- a/requirements-deploy.txt +++ b/requirements-deploy.txt @@ -1,7 +1,7 @@ google-cloud-storage==1.42.3 loguru==0.6.0 poetry==1.1.13 -prefect==1.4.1 +prefect==0.15.9 typer==0.4.0 networkx==2.6.3 pytest-cov==3.0.0 diff --git a/requirements-test.txt b/requirements-test.txt index 339540e45..f6bd03c70 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,4 +1,4 @@ networkx==2.6.3 pytest-cov==3.0.0 pyyaml -prefect==1.4.1 \ No newline at end of file +prefect==0.15.9 \ No newline at end of file From b211c2a6fae95c950f9aa783ddabf9608f11cb18 Mon Sep 17 00:00:00 2001 From: Rodrigo Cunha <66736583+eng-rodrigocunha@users.noreply.github.com> Date: Thu, 6 Jul 2023 16:54:03 -0300 Subject: [PATCH 05/59] add code_owners --- pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py index c72910b1f..fa97cc83d 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py @@ -243,7 +243,7 @@ with Flow( "SMTR: STPL RDO - Captura", - code_owners=["caio", "fernanda"], + 
code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as captura_stpl_rdo: # SETUP transport_mode = Parameter("transport_mode", "STPL") From 871dc7033acfd6b958ce88dbd43bb639b5f68af9 Mon Sep 17 00:00:00 2001 From: Rodrigo Cunha <66736583+eng-rodrigocunha@users.noreply.github.com> Date: Thu, 6 Jul 2023 16:57:27 -0300 Subject: [PATCH 06/59] Add code_owners --- pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py index fa97cc83d..4a7b4aa08 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py @@ -195,7 +195,7 @@ with Flow( "SMTR: STPL RHO - Captura", - code_owners=["caio", "fernanda"], + code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as captura_stpl_rho: # SETUP transport_mode = Parameter("transport_mode", "STPL") From b4abc7e2bb38728812c2735cd0012ef6535be74e Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 25 Sep 2023 17:07:45 -0300 Subject: [PATCH 07/59] change flow names + change agent + remove redis --- .../rj_smtr/br_rj_riodejaneiro_rdo/flows.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py index 4a7b4aa08..b7be66945 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py @@ -97,7 +97,7 @@ ) with Flow( - "SMTR: RHO - Captura", + "SMTR: SPPO RHO - Captura", code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as captura_sppo_rho: # SETUP @@ -145,7 +145,7 @@ captura_sppo_rho.schedule = every_day with Flow( - "SMTR: RDO - Captura", + "SMTR: SPPO RDO - Captura", code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as captura_sppo_rdo: # SETUP @@ -230,14 +230,14 @@ partitions=partitions, status=status, ) - set_redis = update_rdo_redis( - download_files=download_files, table_id=table_id, errors=errors - ) + # set_redis = update_rdo_redis( + # download_files=download_files, table_id=table_id, errors=errors + # ) captura_stpl_rho.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) captura_stpl_rho.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) captura_stpl_rho.schedule = every_day @@ -278,14 +278,14 @@ partitions=partitions, status=status, ) - set_redis = update_rdo_redis( - download_files=download_files, table_id=table_id, errors=errors - ) + # set_redis = update_rdo_redis( + # download_files=download_files, table_id=table_id, errors=errors + # ) captura_stpl_rdo.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) captura_stpl_rdo.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) captura_stpl_rdo.schedule = every_day From 9e2425221cbe2e0108e8718888f382fdb84223a5 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 25 Sep 2023 18:11:19 -0300 Subject: [PATCH 08/59] update file list --- pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py index 987bb8abd..1594e33f9 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py +++ 
b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py @@ -69,6 +69,9 @@ def get_file_paths_from_ftp( files.append(file_info) # except Exception as e: # pylint: disable=W0703 # return [{"error": e}] + + files = files[:10] + log(f"There are {len(files)} files at the FTP") return files From b6089fa2bf7fd75fd5abb1fb8affe74eb63f3ff8 Mon Sep 17 00:00:00 2001 From: fernandascovino Date: Mon, 25 Sep 2023 19:09:32 -0300 Subject: [PATCH 09/59] remove task de particao nao usada --- pipelines/rj_smtr/tasks.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index de52c03df..983f93fbf 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -181,34 +181,6 @@ def parse_timestamp_to_string(timestamp: datetime, pattern="%Y-%m-%d-%H-%M-%S") return timestamp.strftime(pattern) -@task -def create_current_date_hour_partition(capture_time=None): - """Create partitioned directory structure to save data locally based - on capture time. - - Args: - capture_time(pendulum.datetime.DateTime, optional): - if recapturing data, will create partitions based - on the failed timestamps being recaptured - - Returns: - dict: "filename" contains the name which to upload the csv, "partitions" contains - the partitioned directory path - """ - if capture_time is None: - capture_time = datetime.now(tz=constants.TIMEZONE.value).replace( - minute=0, second=0, microsecond=0 - ) - date = capture_time.strftime("%Y-%m-%d") - hour = capture_time.strftime("%H") - - return { - "filename": capture_time.strftime("%Y-%m-%d-%H-%M-%S"), - "partitions": f"data={date}/hora={hour}", - "timestamp": capture_time, - } - - @task def create_local_partition_path( dataset_id: str, table_id: str, filename: str, partitions: str = None From dc197ccac6d2be6af8b6025974cbdd6e8c826041 Mon Sep 17 00:00:00 2001 From: fernandascovino Date: Mon, 25 Sep 2023 19:17:54 -0300 Subject: [PATCH 10/59] unifica tasks de particao de data e hora --- pipelines/rj_smtr/constants.py | 11 +++++------ pipelines/rj_smtr/flows.py | 12 ++---------- pipelines/rj_smtr/tasks.py | 15 +++++---------- pipelines/rj_smtr/veiculo/flows.py | 6 +++--- 4 files changed, 15 insertions(+), 29 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 7133b8abe..b22c4a412 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -180,8 +180,7 @@ class constants(Enum): # pylint: disable=c0103 ORDER BY data_processamento """, - "primary_key": ["id"], # id column to nest data on - "flag_date_partition": False, + "primary_key": ["id"] # id column to nest data on }, ] BILHETAGEM_TABLES_PARAMS = [ @@ -199,7 +198,7 @@ class constants(Enum): # pylint: disable=c0103 DT_INCLUSAO """, "primary_key": ["CD_LINHA"], # id column to nest data on - "flag_date_partition": True, + "partition_date_only": True, }, { "table_id": "grupo", @@ -215,7 +214,7 @@ class constants(Enum): # pylint: disable=c0103 DT_INCLUSAO """, "primary_key": ["CD_GRUPO"], - "flag_date_partition": True, + "partition_date_only": True, }, { "table_id": "grupo_linha", @@ -231,7 +230,7 @@ class constants(Enum): # pylint: disable=c0103 DT_INCLUSAO """, "primary_key": ["CD_GRUPO", "CD_LINHA"], # id column to nest data on - "flag_date_partition": True, + "partition_date_only": True, }, { "table_id": "matriz_integracao", @@ -250,7 +249,7 @@ class constants(Enum): # pylint: disable=c0103 "cd_versao_matriz", "cd_integracao", ], # id column to nest data on - "flag_date_partition": True, + 
"partition_date_only": True, }, ] BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index f1d29ed10..bfe9d86e4 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -5,8 +5,7 @@ from prefect.run_configs import KubernetesRun from prefect.storage import GCS -from prefect import case, Parameter -from prefect.tasks.control_flow import merge +from prefect import Parameter # EMD Imports # @@ -19,7 +18,6 @@ # SMTR Imports # from pipelines.rj_smtr.tasks import ( - create_date_partition, create_date_hour_partition, create_local_partition_path, get_current_timestamp, @@ -66,13 +64,7 @@ dataset_id=dataset_id, ) - with case(table_params["flag_date_partition"], True): - date_partitions = create_date_partition(timestamp) - - with case(table_params["flag_date_partition"], False): - date_hour_partitions = create_date_hour_partition(timestamp) - - partitions = merge(date_partitions, date_hour_partitions) + partitions = create_date_hour_partition(timestamp, partition_date_only=table_params["partition_date_only"]) filename = parse_timestamp_to_string(timestamp) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 983f93fbf..a2a5adddc 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -158,19 +158,14 @@ def get_current_timestamp(timestamp=None, truncate_minute: bool = True) -> datet @task -def create_date_hour_partition(timestamp: datetime) -> str: +def create_date_hour_partition(timestamp: datetime, partition_date_only: bool = False) -> str: """ Get date hour Hive partition structure from timestamp. """ - return f"data={timestamp.strftime('%Y-%m-%d')}/hora={timestamp.strftime('%H')}" - - -@task -def create_date_partition(timestamp: datetime) -> str: - """ - Get date hour Hive partition structure from timestamp. 
- """ - return f"data={timestamp.date()}" + partition = f"data={timestamp.strftime('%Y-%m-%d')}" + if partition_date_only: + parition += f"/hora={timestamp.strftime('%H')}" + return partition @task diff --git a/pipelines/rj_smtr/veiculo/flows.py b/pipelines/rj_smtr/veiculo/flows.py index 28188a129..e1fab515e 100644 --- a/pipelines/rj_smtr/veiculo/flows.py +++ b/pipelines/rj_smtr/veiculo/flows.py @@ -30,7 +30,7 @@ every_day_hour_seven, ) from pipelines.rj_smtr.tasks import ( - create_date_partition, + create_date_hour_partition, create_local_partition_path, get_current_timestamp, get_raw, @@ -71,7 +71,7 @@ ) # SETUP # - partitions = create_date_partition(timestamp) + partitions = create_date_hour_partition(timestamp, partition_date_only=True) filename = parse_timestamp_to_string(timestamp) @@ -140,7 +140,7 @@ ) # SETUP # - partitions = create_date_partition(timestamp) + partitions = create_date_hour_partition(timestamp, partition_date_only=True) filename = parse_timestamp_to_string(timestamp) From 66e84a1e2b2b24ead92842b604c2210238fb037b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 25 Sep 2023 22:22:31 +0000 Subject: [PATCH 11/59] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pipelines/rj_smtr/constants.py | 2 +- pipelines/rj_smtr/flows.py | 4 +++- pipelines/rj_smtr/tasks.py | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index b22c4a412..93303e5b7 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -180,7 +180,7 @@ class constants(Enum): # pylint: disable=c0103 ORDER BY data_processamento """, - "primary_key": ["id"] # id column to nest data on + "primary_key": ["id"], # id column to nest data on }, ] BILHETAGEM_TABLES_PARAMS = [ diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index bfe9d86e4..87d506813 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -64,7 +64,9 @@ dataset_id=dataset_id, ) - partitions = create_date_hour_partition(timestamp, partition_date_only=table_params["partition_date_only"]) + partitions = create_date_hour_partition( + timestamp, partition_date_only=table_params["partition_date_only"] + ) filename = parse_timestamp_to_string(timestamp) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index a2a5adddc..f35a9db72 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -158,7 +158,9 @@ def get_current_timestamp(timestamp=None, truncate_minute: bool = True) -> datet @task -def create_date_hour_partition(timestamp: datetime, partition_date_only: bool = False) -> str: +def create_date_hour_partition( + timestamp: datetime, partition_date_only: bool = False +) -> str: """ Get date hour Hive partition structure from timestamp. """ From 7cb436bc9d0fc7cf045ca56248ef58a63ed634e7 Mon Sep 17 00:00:00 2001 From: fernandascovino Date: Mon, 25 Sep 2023 19:29:50 -0300 Subject: [PATCH 12/59] corrige condicional --- pipelines/rj_smtr/tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index f35a9db72..e1a0d0c7d 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -165,8 +165,8 @@ def create_date_hour_partition( Get date hour Hive partition structure from timestamp. 
""" partition = f"data={timestamp.strftime('%Y-%m-%d')}" - if partition_date_only: - parition += f"/hora={timestamp.strftime('%H')}" + if not partition_date_only: + partition += f"/hora={timestamp.strftime('%H')}" return partition From 588fe7d3f3cc02500930d2bd94996152b51a5bce Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 26 Sep 2023 11:20:28 -0300 Subject: [PATCH 13/59] change capture flow --- pipelines/rj_smtr/constants.py | 1 + pipelines/rj_smtr/flows.py | 44 +++++++++----- pipelines/rj_smtr/tasks.py | 45 +++++++++++++++ pipelines/rj_smtr/utils.py | 101 +++++++++++++++++++++++++++++++++ pipelines/utils/custom.py | 10 ++-- pipelines/utils/utils.py | 15 ++++- 6 files changed, 196 insertions(+), 20 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 7133b8abe..34b63781a 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -182,6 +182,7 @@ class constants(Enum): # pylint: disable=c0103 """, "primary_key": ["id"], # id column to nest data on "flag_date_partition": False, + "source": "api", }, ] BILHETAGEM_TABLES_PARAMS = [ diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index f1d29ed10..e36c8e676 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -23,13 +23,13 @@ create_date_hour_partition, create_local_partition_path, get_current_timestamp, - get_raw, parse_timestamp_to_string, save_raw_local, save_treated_local, upload_logs_to_bq, bq_upload, transform_to_nested_structure, + get_raw, ) from pipelines.rj_smtr.tasks import ( @@ -37,6 +37,14 @@ get_datetime_range, ) +with Flow( + "SMTR: Pre-Treatment", + code_owners=["caio", "fernanda", "boris", "rodrigo"], +) as default_pre_treatment_flow: + # SETUP # + table_params = Parameter("table_params", default=None) + dataset_id = Parameter("dataset_id", default=None) + with Flow( "SMTR: Captura", @@ -59,13 +67,6 @@ now_time=timestamp, ) - request_params, request_url = create_request_params( - datetime_range=datetime_range, - table_params=table_params, - secret_path=secret_path, - dataset_id=dataset_id, - ) - with case(table_params["flag_date_partition"], True): date_partitions = create_date_partition(timestamp) @@ -83,11 +84,28 @@ partitions=partitions, ) - raw_status = get_raw( - url=request_url, - headers=secret_path, - params=request_params, - ) + raw_status_list = [] + + with case(table_params["source"], "api"): + request_params, request_url = create_request_params( + datetime_range=datetime_range, + table_params=table_params, + secret_path=secret_path, + dataset_id=dataset_id, + ) + + api_raw_status = get_raw( + url=request_url, + headers=secret_path, + params=request_params, + ) + + raw_status_list.append(api_raw_status) + + with case(table_params["source"], "gcs"): + pass + + raw_status = merge(*raw_status_list) raw_filepath = save_raw_local(status=raw_status, file_path=filepath) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index de52c03df..49c745076 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -28,6 +28,7 @@ get_last_run_timestamp, log_critical, data_info_str, + get_raw_data_api, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -960,3 +961,47 @@ def create_request_params( } return request_params, request_url + + +# @task(checkpoint=False) +# def get_raw_from_sources( +# source: str, +# url:str, +# dataset_id:str = None, +# table_id:str = None, +# mode:str = None, +# 
headers: str = None, +# filetype: str = "json", +# csv_args: dict = None, +# params: dict = None, +# ): +# if source == "api": +# return get_raw_data_api( +# url=url, +# headers=headers, +# filetype=filetype, +# csv_args=csv_args, +# params=params +# ) +# if source == "gcs": +# file = + + +@task(checkpoint=False) +def save_raw_storage( + dataset_id: str, + table_id: str, + raw_filepath: str, + partitions: str = None, +): + st_obj = Storage(table_id=table_id, dataset_id=dataset_id) + log( + f"""Uploading raw file to bucket {st_obj.bucket_name} at + {st_obj.bucket_name}/{dataset_id}/{table_id}""" + ) + st_obj.upload( + path=raw_filepath, + partitions=partitions, + mode="raw", + if_exists="replace", + ) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 9ddf7d687..3b3c7377d 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -14,6 +14,8 @@ from basedosdados import Table import pandas as pd import pytz +import requests +import zipfile from prefect.schedules.clocks import IntervalClock @@ -27,6 +29,8 @@ get_vault_secret, send_discord_message, get_redis_client, + get_storage_blobs, + get_storage_blob, ) @@ -445,3 +449,100 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l ) ) return clocks + + +def get_raw_data_api( # pylint: disable=R0912 + url: str, + headers: str = None, + filetype: str = "json", + csv_args: dict = None, + params: dict = None, +) -> list[dict]: + """ + Request data from URL API + + Args: + url (str): URL to send request + headers (str, optional): Path to headers guardeded on Vault, if needed. + filetype (str, optional): Filetype to be formatted (supported only: json, csv and txt) + csv_args (dict, optional): Arguments for read_csv, if needed + params (dict, optional): Params to be sent on request + + Returns: + dict: Conatining keys + * `data` (json): data result + * `error` (str): catched error, if any. Otherwise, returns None + """ + data = None + error = None + + try: + if headers is not None: + headers = get_vault_secret(headers)["data"] + + # remove from headers, if present + remove_headers = ["host", "databases"] + for remove_header in remove_headers: + if remove_header in list(headers.keys()): + del headers[remove_header] + + response = requests.get( + url, + headers=headers, + timeout=constants.MAX_TIMEOUT_SECONDS.value, + params=params, + ) + + if response.ok: # status code is less than 400 + if filetype == "json": + data = response.json() + + # todo: move to data check on specfic API # pylint: disable=W0102 + if isinstance(data, dict) and "DescricaoErro" in data.keys(): + error = data["DescricaoErro"] + + elif filetype in ("txt", "csv"): + if csv_args is None: + csv_args = {} + data = pd.read_csv(io.StringIO(response.text), **csv_args).to_dict( + orient="records" + ) + else: + error = ( + "Unsupported raw file extension. 
Supported only: json, csv and txt" + ) + + except Exception as exp: + error = exp + + if error is not None: + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return {"data": data, "error": error} + + +def get_raw_data_gcs( + dataset_id: str, table_id: str, file_name: str, mode: str, zip_file_name: str = None +) -> dict: + error = None + data = None + try: + if zip_file_name: + blob = get_storage_blob( + dataset_id=dataset_id, + table_id=table_id, + file_name=zip_file_name, + mode=mode, + ) + compressed_data = blob.download_as_bytes() + with zipfile.ZipFile(io.BytesIO(compressed_data), "r") as zipped_file: + data = zipped_file.read(file_name).decode(encoding="utf-8") + else: + blob = get_storage_blob( + dataset_id=dataset_id, table_id=table_id, file_name=file_name, mode=mode + ) + data = blob.download_as_string() + except Exception as exp: + error = exp + + return {"data": data, "error": error} diff --git a/pipelines/utils/custom.py b/pipelines/utils/custom.py index 13ae82dd5..d91739817 100644 --- a/pipelines/utils/custom.py +++ b/pipelines/utils/custom.py @@ -68,11 +68,11 @@ def __init__( # pylint: disable=too-many-arguments, too-many-locals edges=edges, reference_tasks=reference_tasks, state_handlers=state_handlers, - on_failure=partial( - notify_discord_on_failure, - secret_path=constants.EMD_DISCORD_WEBHOOK_SECRET_PATH.value, - code_owners=code_owners, - ), + # on_failure=partial( + # notify_discord_on_failure, + # secret_path=constants.EMD_DISCORD_WEBHOOK_SECRET_PATH.value, + # code_owners=code_owners, + # ), validate=validate, result=result, terminal_state_handler=terminal_state_handler, diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index efc21c133..7042709e9 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -711,7 +711,7 @@ def get_credentials_from_env( return cred -def get_storage_blobs(dataset_id: str, table_id: str) -> list: +def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> list: """ Get all blobs from a table in a dataset. 
""" @@ -720,7 +720,18 @@ def get_storage_blobs(dataset_id: str, table_id: str) -> list: return list( bd_storage.client["storage_staging"] .bucket(bd_storage.bucket_name) - .list_blobs(prefix=f"staging/{bd_storage.dataset_id}/{bd_storage.table_id}/") + .list_blobs(prefix=f"{mode}/{bd_storage.dataset_id}/{bd_storage.table_id}/") + ) + + +def get_storage_blob( + dataset_id: str, table_id: str, file_name: str, mode: str = "staging" +): + bd_storage = bd.Storage(dataset_id=dataset_id, table_id=table_id) + return ( + bd_storage.client["storage_staging"] + .bucket(bd_storage.bucket_name) + .get_blob(blob_name=f"{mode}/{dataset_id}/{table_id}/{file_name}") ) From 97746e1c34db7410a78a69e0b5ce4e7df4b12ad7 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 26 Sep 2023 15:04:09 -0300 Subject: [PATCH 14/59] change generic capture flow --- pipelines/rj_smtr/constants.py | 39 +++++++++------ pipelines/rj_smtr/flows.py | 72 +++++++++++++-------------- pipelines/rj_smtr/tasks.py | 89 ++++++++++++++++++---------------- pipelines/rj_smtr/utils.py | 52 ++++++++------------ pipelines/utils/utils.py | 15 +++++- 5 files changed, 135 insertions(+), 132 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 34b63781a..caa4a5e23 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -167,23 +167,30 @@ class constants(Enum): # pylint: disable=c0103 BILHETAGEM_DATASET_ID = "br_rj_riodejaneiro_bilhetagem" BILHETAGEM_TRANSACAO_TABLE_PARAMS = [ { - "table_id": "transacao", - "database": "transacao_db", - "query": """ - SELECT - * - FROM - transacao - WHERE - data_processamento BETWEEN '{start}' - AND '{end}' - ORDER BY - data_processamento - """, - "primary_key": ["id"], # id column to nest data on "flag_date_partition": False, - "source": "api", - }, + "flow_run_name": "transacao", + "extraction": { + "table_id": "transacao", + "database": "transacao_db", + "query": """ + SELECT + * + FROM + transacao + WHERE + data_processamento BETWEEN '{start}' + AND '{end}' + ORDER BY + data_processamento + """, + "source": "api", + }, + "pre-treatment": { + "table_id": "transacao", + "file_type": "json", + "primary_key": ["id"], # id column to nest data on + }, + } ] BILHETAGEM_TABLES_PARAMS = [ { diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index e36c8e676..8076633c8 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -7,6 +7,7 @@ from prefect.storage import GCS from prefect import case, Parameter from prefect.tasks.control_flow import merge +from prefect.utilities.collections import DotDict # EMD Imports # @@ -29,22 +30,12 @@ upload_logs_to_bq, bq_upload, transform_to_nested_structure, - get_raw, -) - -from pipelines.rj_smtr.tasks import ( + get_raw_from_sources, + transform_data_to_json, create_request_params, get_datetime_range, ) -with Flow( - "SMTR: Pre-Treatment", - code_owners=["caio", "fernanda", "boris", "rodrigo"], -) as default_pre_treatment_flow: - # SETUP # - table_params = Parameter("table_params", default=None) - dataset_id = Parameter("dataset_id", default=None) - with Flow( "SMTR: Captura", @@ -63,7 +54,7 @@ datetime_range = get_datetime_range(timestamp, interval=interval) rename_flow_run = rename_current_flow_run_now_time( - prefix=default_capture_flow.name + " " + table_params["table_id"] + ": ", + prefix=default_capture_flow.name + " " + table_params["flow_run_name"] + ": ", now_time=timestamp, ) @@ -79,41 +70,44 @@ filepath = create_local_partition_path( dataset_id=dataset_id, - 
table_id=table_params["table_id"], + table_id=table_params["pre-treatment"]["table_id"], filename=filename, partitions=partitions, ) - raw_status_list = [] - - with case(table_params["source"], "api"): - request_params, request_url = create_request_params( - datetime_range=datetime_range, - table_params=table_params, - secret_path=secret_path, - dataset_id=dataset_id, - ) - - api_raw_status = get_raw( - url=request_url, - headers=secret_path, - params=request_params, - ) - - raw_status_list.append(api_raw_status) - - with case(table_params["source"], "gcs"): - pass + # CAPTURA + request_params, request_url = create_request_params( + datetime_range=datetime_range, + table_params=table_params, + secret_path=secret_path, + dataset_id=dataset_id, + ) - raw_status = merge(*raw_status_list) + raw_status = get_raw_from_sources( + source=table_params["extraction"]["source"], + url=request_url, + dataset_id=dataset_id, + table_id=table_params["extraction"]["table_id"], + file_name=table_params["extraction"]["file_name"], + zip_file_name=table_params["extraction"]["zip_file_name"], + mode=table_params["extraction"]["mode"], + headers=secret_path, + params=request_params, + ) raw_filepath = save_raw_local(status=raw_status, file_path=filepath) # TREAT & CLEAN # - treated_status = transform_to_nested_structure( + json_status = transform_data_to_json( status=raw_status, + file_type=table_params["pre-treatment"]["file_type"], + csv_args=table_params["pre-treatment"]["csv_args"], + ) + + treated_status = transform_to_nested_structure( + status=json_status, timestamp=timestamp, - primary_key=table_params["primary_key"], + primary_key=table_params["pre-treatment"]["primary_key"], ) treated_filepath = save_treated_local(status=treated_status, file_path=filepath) @@ -121,7 +115,7 @@ # LOAD # error = bq_upload( dataset_id=dataset_id, - table_id=table_params["table_id"], + table_id=table_params["pre-treatment"]["table_id"], filepath=treated_filepath, raw_filepath=raw_filepath, partitions=partitions, @@ -130,7 +124,7 @@ upload_logs_to_bq( dataset_id=dataset_id, - parent_table_id=table_params["table_id"], + parent_table_id=table_params["pre-treatment"]["table_id"], error=error, timestamp=timestamp, ) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 49c745076..1b9545ca8 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -29,6 +29,7 @@ log_critical, data_info_str, get_raw_data_api, + get_raw_data_gcs, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -950,58 +951,62 @@ def create_request_params( if dataset_id == constants.BILHETAGEM_DATASET_ID.value: secrets = get_vault_secret(secret_path)["data"] - database_secrets = secrets["databases"][table_params["database"]] + database_secrets = secrets["databases"][table_params["extraction"]["database"]] request_url = secrets["vpn_url"] + database_secrets["engine"] request_params = { "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado - "database": table_params["database"], - "query": table_params["query"].format(**datetime_range), + "database": table_params["extraction"]["database"], + "query": table_params["extraction"]["query"].format(**datetime_range), } return request_params, request_url -# @task(checkpoint=False) -# def get_raw_from_sources( -# source: str, -# url:str, -# dataset_id:str = None, -# table_id:str = None, -# mode:str = None, -# headers: str = None, -# filetype: str = "json", -# csv_args: 
dict = None, -# params: dict = None, -# ): -# if source == "api": -# return get_raw_data_api( -# url=url, -# headers=headers, -# filetype=filetype, -# csv_args=csv_args, -# params=params -# ) -# if source == "gcs": -# file = - - @task(checkpoint=False) -def save_raw_storage( - dataset_id: str, - table_id: str, - raw_filepath: str, +def get_raw_from_sources( + source: str, + url: str, + dataset_id: str = None, + table_id: str = None, + file_name: str = None, partitions: str = None, + zip_file_name: str = None, + mode: str = None, + headers: str = None, + params: dict = None, ): - st_obj = Storage(table_id=table_id, dataset_id=dataset_id) - log( - f"""Uploading raw file to bucket {st_obj.bucket_name} at - {st_obj.bucket_name}/{dataset_id}/{table_id}""" - ) - st_obj.upload( - path=raw_filepath, - partitions=partitions, - mode="raw", - if_exists="replace", - ) + if source == "api": + return get_raw_data_api(url=url, headers=headers, params=params) + if source == "gcs": + return get_raw_data_gcs( + dataset_id=dataset_id, + table_id=table_id, + file_name=file_name, + mode=mode, + partitions=partitions, + zip_file_name=zip_file_name, + ) + + +@task(checkpoint=False) +def transform_data_to_json(status: dict, file_type: str, csv_args: dict): + data = status["data"] + error = status["error"] + + if file_type == "json": + pass + + # todo: move to data check on specfic API # pylint: disable=W0102 + # if isinstance(data, dict) and "DescricaoErro" in data.keys(): + # error = data["DescricaoErro"] + + elif file_type in ("txt", "csv"): + if csv_args is None: + csv_args = {} + data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") + else: + error = "Unsupported raw file extension. Supported only: json, csv and txt" + + return {"data": data, "error": error} diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 3b3c7377d..c7b13bfc3 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -454,8 +454,6 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l def get_raw_data_api( # pylint: disable=R0912 url: str, headers: str = None, - filetype: str = "json", - csv_args: dict = None, params: dict = None, ) -> list[dict]: """ @@ -464,8 +462,6 @@ def get_raw_data_api( # pylint: disable=R0912 Args: url (str): URL to send request headers (str, optional): Path to headers guardeded on Vault, if needed. - filetype (str, optional): Filetype to be formatted (supported only: json, csv and txt) - csv_args (dict, optional): Arguments for read_csv, if needed params (dict, optional): Params to be sent on request Returns: @@ -493,24 +489,9 @@ def get_raw_data_api( # pylint: disable=R0912 params=params, ) - if response.ok: # status code is less than 400 - if filetype == "json": - data = response.json() + response.raise_for_status() - # todo: move to data check on specfic API # pylint: disable=W0102 - if isinstance(data, dict) and "DescricaoErro" in data.keys(): - error = data["DescricaoErro"] - - elif filetype in ("txt", "csv"): - if csv_args is None: - csv_args = {} - data = pd.read_csv(io.StringIO(response.text), **csv_args).to_dict( - orient="records" - ) - else: - error = ( - "Unsupported raw file extension. 
Supported only: json, csv and txt" - ) + data = response.text except Exception as exp: error = exp @@ -522,25 +503,30 @@ def get_raw_data_api( # pylint: disable=R0912 def get_raw_data_gcs( - dataset_id: str, table_id: str, file_name: str, mode: str, zip_file_name: str = None + dataset_id: str, + table_id: str, + file_name: str, + mode: str, + partitions: str = None, + zip_extracted_file: str = None, ) -> dict: error = None data = None try: - if zip_file_name: - blob = get_storage_blob( - dataset_id=dataset_id, - table_id=table_id, - file_name=zip_file_name, - mode=mode, - ) + blob = get_storage_blob( + dataset_id=dataset_id, + table_id=table_id, + file_name=file_name, + partitions=partitions, + mode=mode, + ) + + if zip_extracted_file: compressed_data = blob.download_as_bytes() + with zipfile.ZipFile(io.BytesIO(compressed_data), "r") as zipped_file: - data = zipped_file.read(file_name).decode(encoding="utf-8") + data = zipped_file.read(zip_extracted_file).decode(encoding="utf-8") else: - blob = get_storage_blob( - dataset_id=dataset_id, table_id=table_id, file_name=file_name, mode=mode - ) data = blob.download_as_string() except Exception as exp: error = exp diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index 7042709e9..79a264017 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -725,13 +725,24 @@ def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> def get_storage_blob( - dataset_id: str, table_id: str, file_name: str, mode: str = "staging" + dataset_id: str, + table_id: str, + file_name: str, + partitions: str = None, + mode: str = "staging", ): + path = f"{mode}/{dataset_id}/{table_id}/" + + if partitions: + path += f"{partitions}/" + + path += file_name + bd_storage = bd.Storage(dataset_id=dataset_id, table_id=table_id) return ( bd_storage.client["storage_staging"] .bucket(bd_storage.bucket_name) - .get_blob(blob_name=f"{mode}/{dataset_id}/{table_id}/{file_name}") + .get_blob(blob_name=path) ) From 6f12477d14e45a2bb83c817976a597282625a66b Mon Sep 17 00:00:00 2001 From: fernandascovino Date: Tue, 26 Sep 2023 17:18:56 -0300 Subject: [PATCH 15/59] atualiza esquema do flow padrao --- pipelines/rj_smtr/constants.py | 3 + pipelines/rj_smtr/flows.py | 121 +++++++++---------- pipelines/rj_smtr/tasks.py | 206 ++++++++++++++++++++++----------- pipelines/rj_smtr/utils.py | 163 +++++++++++++++++++++----- pipelines/utils/utils.py | 20 +--- 5 files changed, 337 insertions(+), 176 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 3b1b6dc8d..d402bb6e9 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -262,3 +262,6 @@ class constants(Enum): # pylint: disable=c0103 }, ] BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" + + # GTFS + diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index da802d277..fb763cc5a 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -22,15 +22,17 @@ create_local_partition_path, get_current_timestamp, parse_timestamp_to_string, - save_raw_local, - save_treated_local, - upload_logs_to_bq, - bq_upload, - transform_to_nested_structure, + # save_raw_local, + # save_treated_local, + # upload_logs_to_bq, + # bq_upload, + upload_raw_data_to_gcs, + upload_staging_data_to_gcs, + transform_raw_to_nested_structure, get_raw_from_sources, - transform_data_to_json, + # transform_data_to_json, create_request_params, - get_datetime_range, + # get_datetime_range, ) @@ -38,96 +40,87 @@ "SMTR: Captura", 
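# The reworked get_storage_blob above assembles the blob name from mode, dataset, table,
# optional partitions and file name. A small sketch of that naming convention, kept free
# of any GCS client so it can be checked locally (the example values are illustrative):
def build_blob_name(
    dataset_id: str,
    table_id: str,
    file_name: str,
    partitions: str = None,
    mode: str = "staging",
) -> str:
    """Compose '{mode}/{dataset_id}/{table_id}/[{partitions}/]{file_name}'."""
    path = f"{mode}/{dataset_id}/{table_id}/"
    if partitions:
        path += f"{partitions}/"
    return path + file_name


assert (
    build_blob_name("br_rj_riodejaneiro_gtfs", "quadro", "quadro.csv", "data=2023-09-27", "raw")
    == "raw/br_rj_riodejaneiro_gtfs/quadro/data=2023-09-27/quadro.csv"
)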
code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as default_capture_flow: - # SETUP # + + ### Configuração ### - table_params = Parameter("table_params", default=None) - timestamp_param = Parameter("timestamp", default=None) - interval = Parameter("interval", default=None) + table_id = Parameter("table_id", default=None) + partition_date_only = Parameter("partition_date_only", default=None) + request_params = Parameter("request_params", default=None) dataset_id = Parameter("dataset_id", default=None) secret_path = Parameter("secret_path", default=None) + primary_key = Parameter("primary_key", default=None) + source_type = Parameter("source_type", default=None) - timestamp = get_current_timestamp(timestamp_param) - - datetime_range = get_datetime_range(timestamp, interval=interval) + timestamp = get_current_timestamp() rename_flow_run = rename_current_flow_run_now_time( - prefix=default_capture_flow.name + " " + table_params["flow_run_name"] + ": ", + prefix=default_capture_flow.name + " " + table_id + ": ", now_time=timestamp, ) - request_params, request_url = create_request_params( - datetime_range=datetime_range, - table_params=table_params, - secret_path=secret_path, - dataset_id=dataset_id, - ) - partitions = create_date_hour_partition( - timestamp, partition_date_only=table_params["partition_date_only"] + timestamp, partition_date_only=partition_date_only ) filename = parse_timestamp_to_string(timestamp) filepath = create_local_partition_path( dataset_id=dataset_id, - table_id=table_params["pre-treatment"]["table_id"], + table_id=table_id, filename=filename, partitions=partitions, ) - # CAPTURA - request_params, request_url = create_request_params( - datetime_range=datetime_range, - table_params=table_params, + ### Extração ### + # é necessária task ou função dentro da extract_raw_data? + request_params, request_path = create_request_params( secret_path=secret_path, dataset_id=dataset_id, ) - raw_status = get_raw_from_sources( - source=table_params["extraction"]["source"], - url=request_url, - dataset_id=dataset_id, - table_id=table_params["extraction"]["table_id"], - file_name=table_params["extraction"]["file_name"], - zip_file_name=table_params["extraction"]["zip_file_name"], - mode=table_params["extraction"]["mode"], - headers=secret_path, - params=request_params, + error, raw_filepath = get_raw_from_sources( + source_type=source_type, # parametro de extracao, onde ficar? 
+ source_path=request_path, + zip_filename=table_id, + secret_path=secret_path, + request_params=request_params, ) - raw_filepath = save_raw_local(status=raw_status, file_path=filepath) - - # TREAT & CLEAN # - json_status = transform_data_to_json( - status=raw_status, - file_type=table_params["pre-treatment"]["file_type"], - csv_args=table_params["pre-treatment"]["csv_args"], + RAW_UPLOADED = upload_raw_data_to_gcs( + error=error, + filepath=raw_filepath, + timestamp=timestamp, + partitions=partitions ) - treated_status = transform_to_nested_structure( - status=json_status, + ### Pré-tratamento ### + + error, staging_filepath = transform_raw_to_nested_structure( + raw_filepath=raw_filepath, timestamp=timestamp, - primary_key=table_params["pre-treatment"]["primary_key"], + primary_key=primary_key, ) - treated_filepath = save_treated_local(status=treated_status, file_path=filepath) + STAGING_UPLOADED = upload_staging_data_to_gcs(error=error, filepath=staging_filepath, timestamp=timestamp) - # LOAD # - error = bq_upload( - dataset_id=dataset_id, - table_id=table_params["pre-treatment"]["table_id"], - filepath=treated_filepath, - raw_filepath=raw_filepath, - partitions=partitions, - status=treated_status, - ) + # treated_filepath = save_treated_local(status=treated_status, file_path=filepath) - upload_logs_to_bq( - dataset_id=dataset_id, - parent_table_id=table_params["pre-treatment"]["table_id"], - error=error, - timestamp=timestamp, - ) + # LOAD # + # error = bq_upload( + # dataset_id=dataset_id, + # table_id=table_params["pre-treatment"]["table_id"], + # filepath=treated_filepath, + # raw_filepath=raw_filepath, + # partitions=partitions, + # status=treated_status, + # ) + + # upload_logs_to_bq( + # dataset_id=dataset_id, + # parent_table_id=table_params["pre-treatment"]["table_id"], + # error=error, + # timestamp=timestamp, + # ) default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_capture_flow.run_config = KubernetesRun( diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index bf0aec407..b7f484171 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -30,6 +30,7 @@ data_info_str, get_raw_data_api, get_raw_data_gcs, + upload_run_logs_to_bq ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -601,6 +602,69 @@ def upload_logs_to_bq( # pylint: disable=R0913 raise Exception(f"Pipeline failed with error: {error}") +@task +def upload_raw_data_to_gcs( + error: bool, raw_filepath: str, timestamp: datetime, table_id: str, dataset_id: str, partitions: list +): + if not error: + try: + st_obj = Storage(table_id=table_id, dataset_id=dataset_id) + log( + f"""Uploading raw file to bucket {st_obj.bucket_name} at + {st_obj.bucket_name}/{dataset_id}/{table_id}""" + ) + st_obj.upload( + path=raw_filepath, + partitions=partitions, + mode="raw", + if_exists="replace", + ) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + upload_run_logs_to_bq( + dataset_id=dataset_id, + parent_table_id=table_id, + error=error, + timestamp=timestamp, + mode="raw" + ) + + +@task +def upload_staging_data_to_gcs( + error: bool, staging_filepath: str, timestamp: datetime, table_id: str, dataset_id: str, partitions: list +): + if not error: + try: + # Creates and publish table if it does not exist, append to it otherwise + create_or_append_table( + dataset_id=dataset_id, + table_id=table_id, + 
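# The upload tasks above follow a "carry the error forward" pattern: each step only does
# work if no previous step failed, catches its own exception as a string via
# traceback.format_exc(), and hands the error on so the final logs upload records a
# single outcome. A condensed sketch of that control flow (step names are placeholders):
import traceback


def step_download(error: str = None):
    if error:
        return error, None
    try:
        return None, "raw.json"          # pretend we produced a raw file
    except Exception:
        return traceback.format_exc(), None


def step_upload(error: str, filepath: str):
    if error:                            # skip the work, just propagate the error
        return error
    try:
        print(f"uploading {filepath}")
        return None
    except Exception:
        return traceback.format_exc()


error, raw_path = step_download()
error = step_upload(error, raw_path)
print("success" if error is None else f"failed: {error}")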
path=staging_filepath, + partitions=partitions + ) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + upload_run_logs_to_bq( + dataset_id=dataset_id, + parent_table_id=table_id, + error=error, + timestamp=timestamp, + mode="staging" + ) + + +############### +# +# Daterange tasks +# +############### + + @task( checkpoint=False, max_retries=constants.MAX_RETRIES.value, @@ -791,9 +855,16 @@ def get_previous_date(days): return now.to_date_string() +############### +# +# Pretreat data +# +############### + + @task -def transform_to_nested_structure( - status: dict, timestamp: datetime, primary_key: list = None +def transform_raw_to_nested_structure( + filepath: str, error: bool, timestamp: datetime, primary_key: list = None ): """Transform dataframe to nested structure @@ -810,21 +881,29 @@ def transform_to_nested_structure( * `error` (str): catched error, if any. Otherwise, returns None """ + # ORGANIZAR: + # json_status = transform_data_to_json( + # status=raw_status, + # file_type=table_params["pre-treatment"]["file_type"], + # csv_args=table_params["pre-treatment"]["csv_args"], + # ) + # Check previous error - if status["error"] is not None: - return {"data": pd.DataFrame(), "error": status["error"]} + if error is not None: + return {"data": pd.DataFrame(), "error": error} # Check empty dataframe - if len(status["data"]) == 0: - log("Empty dataframe, skipping transformation...") - return {"data": pd.DataFrame(), "error": status["error"]} + # if len(status["data"]) == 0: + # log("Empty dataframe, skipping transformation...") + # return {"data": pd.DataFrame(), "error": error} try: if primary_key is None: primary_key = [] error = None - data = pd.DataFrame(status["data"]) + # leitura do dado raw + # data = pd.DataFrame(status["data"]) log( f""" @@ -860,40 +939,43 @@ def transform_to_nested_structure( level="info", ) + # save treated local + filepath = _save_trated_local(data=data, filepath=filepath) + except Exception as exp: # pylint: disable=W0703 error = exp if error is not None: log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return {"data": data, "error": error} + return error, filepath -@task(checkpoint=False) -def get_datetime_range( - timestamp: datetime, - interval: int, -) -> dict: - """ - Task to get datetime range in UTC +# @task(checkpoint=False) +# def get_datetime_range( +# timestamp: datetime, +# interval: int, +# ) -> dict: +# """ +# Task to get datetime range in UTC - Args: - timestamp (datetime): timestamp to get datetime range - interval (int): interval in seconds +# Args: +# timestamp (datetime): timestamp to get datetime range +# interval (int): interval in seconds - Returns: - dict: datetime range - """ +# Returns: +# dict: datetime range +# """ - start = ( - (timestamp - timedelta(seconds=interval)) - .astimezone(tz=timezone("UTC")) - .strftime("%Y-%m-%d %H:%M:%S") - ) +# start = ( +# (timestamp - timedelta(seconds=interval)) +# .astimezone(tz=timezone("UTC")) +# .strftime("%Y-%m-%d %H:%M:%S") +# ) - end = timestamp.astimezone(tz=timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") +# end = timestamp.astimezone(tz=timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") - return {"start": start, "end": end} +# return {"start": start, "end": end} @task(checkpoint=False, nout=2) @@ -916,11 +998,8 @@ def create_request_params( if dataset_id == constants.BILHETAGEM_DATASET_ID.value: secrets = get_vault_secret(secret_path)["data"] - database_secrets = 
secrets["databases"][table_params["extraction"]["database"]] - request_url = secrets["vpn_url"] + database_secrets["engine"] - request_params = { "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado "database": table_params["extraction"]["database"], @@ -932,47 +1011,40 @@ def create_request_params( @task(checkpoint=False) def get_raw_from_sources( - source: str, - url: str, - dataset_id: str = None, - table_id: str = None, - file_name: str = None, - partitions: str = None, - zip_file_name: str = None, - mode: str = None, - headers: str = None, - params: dict = None, + source_type: str, + source_path: str = None, + zip_filename: str = None, + secret_path: str = None, + api_params: dict = None, ): - if source == "api": - return get_raw_data_api(url=url, headers=headers, params=params) - if source == "gcs": + if source_type == "api": + return get_raw_data_api(url=source_path, secret_path=secret_path, params=api_params) + if source_type == "gcs": return get_raw_data_gcs( - dataset_id=dataset_id, - table_id=table_id, - file_name=file_name, - mode=mode, - partitions=partitions, - zip_file_name=zip_file_name, + gcs_path=source_path, + mode="raw", + zip_filename=zip_filename, ) -@task(checkpoint=False) -def transform_data_to_json(status: dict, file_type: str, csv_args: dict): - data = status["data"] - error = status["error"] +# TODO: passar para função para dentro da transform_raw_to_nested_structure +# @task(checkpoint=False) +# def transform_data_to_json(status: dict, file_type: str, csv_args: dict): +# data = status["data"] +# error = status["error"] - if file_type == "json": - pass +# if file_type == "json": +# pass - # todo: move to data check on specfic API # pylint: disable=W0102 - # if isinstance(data, dict) and "DescricaoErro" in data.keys(): - # error = data["DescricaoErro"] +# # todo: move to data check on specfic API # pylint: disable=W0102 +# # if isinstance(data, dict) and "DescricaoErro" in data.keys(): +# # error = data["DescricaoErro"] - elif file_type in ("txt", "csv"): - if csv_args is None: - csv_args = {} - data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") - else: - error = "Unsupported raw file extension. Supported only: json, csv and txt" +# elif file_type in ("txt", "csv"): +# if csv_args is None: +# csv_args = {} +# data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") +# else: +# error = "Unsupported raw file extension. Supported only: json, csv and txt" - return {"data": data, "error": error} +# return {"data": data, "error": error} diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index c7b13bfc3..a4376bb88 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -451,17 +451,47 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l return clocks +def _save_raw_local(data: dict, file_path: str, mode: str = "raw", filetype: str = "json") -> str: + """ + Saves json response from API to .json file. + Args: + file_path (str): Path which to save raw file + status (dict): Must contain keys + * data: json returned from API + * error: error catched from API request + mode (str, optional): Folder to save locally, later folder which to upload to GCS. 
+ Returns: + str: Path to the saved file + """ + + # diferentes tipos de arquivos para salvar + _file_path = file_path.format(mode=mode, filetype=filetype) + Path(_file_path).parent.mkdir(parents=True, exist_ok=True) + + if filetype == "json": + json.dump(data, Path(_file_path).open("w", encoding="utf-8")) + + if filetype == "csv": + pass + if filetype == "txt": + pass + + log(f"Raw data saved to: {_file_path}") + return _file_path + + def get_raw_data_api( # pylint: disable=R0912 url: str, - headers: str = None, - params: dict = None, + secret_path: str = None, + api_params: dict = None, + filepath: str = None ) -> list[dict]: """ Request data from URL API Args: url (str): URL to send request - headers (str, optional): Path to headers guardeded on Vault, if needed. + secret_path (str, optional): Path to secrets guardeded on Vault, if needed. params (dict, optional): Params to be sent on request Returns: @@ -469,58 +499,45 @@ def get_raw_data_api( # pylint: disable=R0912 * `data` (json): data result * `error` (str): catched error, if any. Otherwise, returns None """ - data = None error = None - try: - if headers is not None: - headers = get_vault_secret(headers)["data"] - - # remove from headers, if present - remove_headers = ["host", "databases"] - for remove_header in remove_headers: - if remove_header in list(headers.keys()): - del headers[remove_header] + if secret_path is None: + headers = secret_path + else: + headers = get_vault_secret(secret_path)["data"] response = requests.get( url, headers=headers, timeout=constants.MAX_TIMEOUT_SECONDS.value, - params=params, + params=api_params, ) response.raise_for_status() - - data = response.text + filepath = _save_raw_local(data=response.text, filepath=filepath) except Exception as exp: error = exp - - if error is not None: log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return {"data": data, "error": error} + return error, filepath def get_raw_data_gcs( - dataset_id: str, - table_id: str, - file_name: str, - mode: str, - partitions: str = None, + gcs_path: str, zip_extracted_file: str = None, ) -> dict: + error = None - data = None + try: blob = get_storage_blob( - dataset_id=dataset_id, - table_id=table_id, - file_name=file_name, - partitions=partitions, - mode=mode, + gcs_path=gcs_path, + mode="raw", ) + data = blob.download_as_bytes() + if zip_extracted_file: compressed_data = blob.download_as_bytes() @@ -528,7 +545,93 @@ def get_raw_data_gcs( data = zipped_file.read(zip_extracted_file).decode(encoding="utf-8") else: data = blob.download_as_string() + except Exception as exp: error = exp return {"data": data, "error": error} + + +def _save_treated_local(file_path: str, status: dict, mode: str = "staging") -> str: + """ + Save treated file to CSV. + + Args: + file_path (str): Path which to save treated file + status (dict): Must contain keys + * `data`: dataframe returned from treatement + * `error`: error catched from data treatement + mode (str, optional): Folder to save locally, later folder which to upload to GCS. 
+ + Returns: + str: Path to the saved file + """ + _file_path = file_path.format(mode=mode, filetype="csv") + Path(_file_path).parent.mkdir(parents=True, exist_ok=True) + if status["error"] is None: + status["data"].to_csv(_file_path, index=False) + log(f"Treated data saved to: {_file_path}") + return _file_path + + +def upload_run_logs_to_bq( # pylint: disable=R0913 + dataset_id: str, + parent_table_id: str, + timestamp: str, + error: str = None, + previous_error: str = None, + recapture: bool = False, + mode: str = "raw" +): + """ + Upload execution status table to BigQuery. + Table is uploaded to the same dataset, named {parent_table_id}_logs. + If passing status_dict, should not pass timestamp and error. + + Args: + dataset_id (str): dataset_id on BigQuery + parent_table_id (str): Parent table id related to the status table + timestamp (str): ISO formatted timestamp string + error (str, optional): String associated with error caught during execution + Returns: + None + """ + table_id = parent_table_id + "_logs" + # Create partition directory + filename = f"{table_id}_{timestamp.isoformat()}" + partition = f"data={timestamp.date()}" + filepath = Path( + f"""data/{mode}/{dataset_id}/{table_id}/{partition}/{filename}.csv""" + ) + filepath.parent.mkdir(exist_ok=True, parents=True) + # Create dataframe to be uploaded + if not error and recapture is True: + # if the recapture is succeeded, update the column erro + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [True], + "erro": [f"[recapturado]{previous_error}"], + } + ) + log(f"Recapturing {timestamp} with previous error:\n{error}") + else: + # not recapturing or error during flow execution + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [error is None], + "erro": [error], + } + ) + # Save data local + dataframe.to_csv(filepath, index=False) + # Upload to Storage + create_or_append_table( + dataset_id=dataset_id, + table_id=table_id, + path=filepath.as_posix(), + partitions=partition, + ) + if error is not None: + raise Exception(f"Pipeline failed with error: {error}") \ No newline at end of file diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index 79a264017..147e54f4f 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -725,24 +725,14 @@ def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> def get_storage_blob( - dataset_id: str, - table_id: str, - file_name: str, - partitions: str = None, + gcs_path: str, mode: str = "staging", ): - path = f"{mode}/{dataset_id}/{table_id}/" - - if partitions: - path += f"{partitions}/" - - path += file_name - - bd_storage = bd.Storage(dataset_id=dataset_id, table_id=table_id) + bucket = bd.Storage() return ( - bd_storage.client["storage_staging"] - .bucket(bd_storage.bucket_name) - .get_blob(blob_name=path) + bucket.client["storage_staging"] + .bucket(bucket.bucket_name) + .get_blob(blob_name=gcs_path) ) From 0c3df1b05e8a257a20d9367cb282050a1df74cb9 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 26 Sep 2023 22:41:01 -0300 Subject: [PATCH 16/59] change default capture flow structure --- pipelines/rj_smtr/constants.py | 12 ++++- pipelines/rj_smtr/tasks.py | 87 ++++++++++++++++++++++------------ pipelines/rj_smtr/utils.py | 55 ++++++++++++--------- 3 files changed, 102 insertions(+), 52 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index d402bb6e9..00558f9cc 100644 --- a/pipelines/rj_smtr/constants.py +++ 
b/pipelines/rj_smtr/constants.py @@ -264,4 +264,14 @@ class constants(Enum): # pylint: disable=c0103 BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" # GTFS - + GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" + + GTFS_SOURCE_TYPE = "gcs" + + GTFS_AGENCY_REQUEST_PARAMS = { + "filepath": "development/br_rj_riodejaneiro_gtfs/upload/gtfs.zip" + } + + GTFS_AGENCY_TABLE_ID = "agency" + + GTFS_QUADRO_TABLE_ID = "quadro" diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index b7f484171..0a40dae26 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -30,7 +30,7 @@ data_info_str, get_raw_data_api, get_raw_data_gcs, - upload_run_logs_to_bq + upload_run_logs_to_bq, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -604,7 +604,12 @@ def upload_logs_to_bq( # pylint: disable=R0913 @task def upload_raw_data_to_gcs( - error: bool, raw_filepath: str, timestamp: datetime, table_id: str, dataset_id: str, partitions: list + error: bool, + raw_filepath: str, + timestamp: datetime, + table_id: str, + dataset_id: str, + partitions: list, ): if not error: try: @@ -622,19 +627,24 @@ def upload_raw_data_to_gcs( except Exception: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - + upload_run_logs_to_bq( dataset_id=dataset_id, parent_table_id=table_id, error=error, timestamp=timestamp, - mode="raw" + mode="raw", ) @task def upload_staging_data_to_gcs( - error: bool, staging_filepath: str, timestamp: datetime, table_id: str, dataset_id: str, partitions: list + error: bool, + staging_filepath: str, + timestamp: datetime, + table_id: str, + dataset_id: str, + partitions: list, ): if not error: try: @@ -643,20 +653,20 @@ def upload_staging_data_to_gcs( dataset_id=dataset_id, table_id=table_id, path=staging_filepath, - partitions=partitions + partitions=partitions, ) except Exception: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - + upload_run_logs_to_bq( dataset_id=dataset_id, parent_table_id=table_id, error=error, timestamp=timestamp, - mode="staging" + mode="staging", ) - + ############### # @@ -904,7 +914,7 @@ def transform_raw_to_nested_structure( error = None # leitura do dado raw # data = pd.DataFrame(status["data"]) - + data = None log( f""" Received inputs: @@ -940,7 +950,7 @@ def transform_raw_to_nested_structure( ) # save treated local - filepath = _save_trated_local(data=data, filepath=filepath) + # filepath = _save_trated_local(data=data, filepath=filepath) except Exception as exp: # pylint: disable=W0703 error = exp @@ -980,7 +990,11 @@ def transform_raw_to_nested_structure( @task(checkpoint=False, nout=2) def create_request_params( - datetime_range: dict, table_params: dict, secret_path: str, dataset_id: str + # datetime_range: dict, + # table_params: dict, + table_id: str, + secret_path: str, + dataset_id: str, ) -> tuple: """ Task to create request params @@ -995,16 +1009,28 @@ def create_request_params( request_params: host, database and query to request data request_url: url to request data """ - + request_params = None # TODO: retirar essa linha if dataset_id == constants.BILHETAGEM_DATASET_ID.value: secrets = get_vault_secret(secret_path)["data"] - database_secrets = secrets["databases"][table_params["extraction"]["database"]] - request_url = secrets["vpn_url"] + database_secrets["engine"] - request_params = { - "host": database_secrets["host"], # TODO: exibir no log em 
ambiente fechado - "database": table_params["extraction"]["database"], - "query": table_params["extraction"]["query"].format(**datetime_range), - } + + # TODO: RETIRAR ESSA LINHA + request_params = secrets + + # TODO: mudar modo de pegar os parametros + # database_secrets = secrets["databases"][table_params["extraction"]["database"]] + # request_url = secrets["vpn_url"] + database_secrets["engine"] + # request_params = { + # "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado + # "database": table_params["extraction"]["database"], + # "query": table_params["extraction"]["query"].format(**datetime_range), + # } + + elif dataset_id == constants.GTFS_DATASET_ID.value: + gtfs_base_path = "development/br_rj_riodejaneiro_gtfs/upload" + if table_id == constants.GTFS_QUADRO_ID.value: + request_url = f"{gtfs_base_path}/quadro.csv" + else: + request_url = f"{gtfs_base_path}/gtfs.zip" return request_params, request_url @@ -1013,18 +1039,21 @@ def create_request_params( def get_raw_from_sources( source_type: str, source_path: str = None, - zip_filename: str = None, + table_id: str = None, secret_path: str = None, api_params: dict = None, ): - if source_type == "api": - return get_raw_data_api(url=source_path, secret_path=secret_path, params=api_params) - if source_type == "gcs": - return get_raw_data_gcs( - gcs_path=source_path, - mode="raw", - zip_filename=zip_filename, - ) + pass + # TODO: descomentar linhas abaixo, passando argumentos corretos + # if source_type == "api": + # return get_raw_data_api( + # url=source_path, secret_path=secret_path, params=api_params + # ) + # if source_type == "gcs": + # return get_raw_data_gcs( + # gcs_path=source_path, + # filename_to_unzip=table_id, + # ) # TODO: passar para função para dentro da transform_raw_to_nested_structure diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index a4376bb88..68774c17d 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -10,12 +10,14 @@ from datetime import timedelta, datetime from typing import List import io -import basedosdados as bd -from basedosdados import Table -import pandas as pd +import json import pytz import requests import zipfile +import basedosdados as bd +from basedosdados import Table +import pandas as pd + from prefect.schedules.clocks import IntervalClock @@ -451,7 +453,9 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l return clocks -def _save_raw_local(data: dict, file_path: str, mode: str = "raw", filetype: str = "json") -> str: +def _save_raw_local( + data: dict, file_path: str, mode: str = "raw", filetype: str = "json" +) -> str: """ Saves json response from API to .json file. 
Args: @@ -471,20 +475,18 @@ def _save_raw_local(data: dict, file_path: str, mode: str = "raw", filetype: str if filetype == "json": json.dump(data, Path(_file_path).open("w", encoding="utf-8")) - if filetype == "csv": - pass + # if filetype == "csv": + # pass if filetype == "txt": - pass + with open(_file_path, "w", encoding="utf-8") as file: + file.write(data) log(f"Raw data saved to: {_file_path}") return _file_path def get_raw_data_api( # pylint: disable=R0912 - url: str, - secret_path: str = None, - api_params: dict = None, - filepath: str = None + url: str, secret_path: str = None, api_params: dict = None, filepath: str = None ) -> list[dict]: """ Request data from URL API @@ -525,9 +527,9 @@ def get_raw_data_api( # pylint: disable=R0912 def get_raw_data_gcs( gcs_path: str, - zip_extracted_file: str = None, + local_filepath: str, + filename_to_unzip: str = None, ) -> dict: - error = None try: @@ -538,18 +540,27 @@ def get_raw_data_gcs( data = blob.download_as_bytes() - if zip_extracted_file: - compressed_data = blob.download_as_bytes() - - with zipfile.ZipFile(io.BytesIO(compressed_data), "r") as zipped_file: - data = zipped_file.read(zip_extracted_file).decode(encoding="utf-8") + if filename_to_unzip: + with zipfile.ZipFile(io.BytesIO(data), "r") as zipped_file: + filenames = zipped_file.namelist() + filename = list( + filter(lambda x: x.split(".")[0] == filename_to_unzip, filenames) + )[0] + data = zipped_file.read(filename) else: - data = blob.download_as_string() + filename = blob.name + + raw_filepath = _save_raw_local( + data=data.decode(encoding="utf-8"), + file_path=local_filepath, + filetype=filename.split(".")[-1], + ) except Exception as exp: error = exp + log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return {"data": data, "error": error} + return error, raw_filepath def _save_treated_local(file_path: str, status: dict, mode: str = "staging") -> str: @@ -581,7 +592,7 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 error: str = None, previous_error: str = None, recapture: bool = False, - mode: str = "raw" + mode: str = "raw", ): """ Upload execution status table to BigQuery. @@ -634,4 +645,4 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 partitions=partition, ) if error is not None: - raise Exception(f"Pipeline failed with error: {error}") \ No newline at end of file + raise Exception(f"Pipeline failed with error: {error}") From f6ca7ab8c23ad720e30b00c1862837848ad1fad3 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 10:36:00 -0300 Subject: [PATCH 17/59] change generic capture flow --- pipelines/rj_smtr/flows.py | 53 ++++++++++------------ pipelines/rj_smtr/tasks.py | 80 +++++++++++++++++++-------------- pipelines/rj_smtr/utils.py | 91 +++++++++++++++++++++++++++++--------- 3 files changed, 141 insertions(+), 83 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index fb763cc5a..3dd834a75 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -40,8 +40,7 @@ "SMTR: Captura", code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as default_capture_flow: - - ### Configuração ### + # Configuração # table_id = Parameter("table_id", default=None) partition_date_only = Parameter("partition_date_only", default=None) @@ -71,15 +70,19 @@ partitions=partitions, ) - ### Extração ### + # Extração # # é necessária task ou função dentro da extract_raw_data? 
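# get_raw_data_api above boils down to: optional headers pulled from a secret, a GET with
# a timeout, raise_for_status(), save the text body, and return (error, result). A
# stripped-down sketch without Vault or the project logger (the URL is a placeholder):
import requests


def fetch_text(url: str, headers: dict = None, params: dict = None, timeout: int = 60):
    error = None
    body = None
    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions
        body = response.text
    except Exception as exp:  # keep the error instead of raising, as the task does
        error = exp
    return error, body


error, body = fetch_text("https://example.com", params={"q": "example"})
print("ok" if error is None else f"failed: {error}")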
request_params, request_path = create_request_params( secret_path=secret_path, dataset_id=dataset_id, + request_params=request_params, + table_id=table_id, + timestamp=timestamp, ) error, raw_filepath = get_raw_from_sources( - source_type=source_type, # parametro de extracao, onde ficar? + source_type=source_type, # parametro de extracao, onde ficar? + local_filepath=filepath, source_path=request_path, zip_filename=table_id, secret_path=secret_path, @@ -87,40 +90,32 @@ ) RAW_UPLOADED = upload_raw_data_to_gcs( - error=error, - filepath=raw_filepath, - timestamp=timestamp, - partitions=partitions + error=error, + raw_filepath=raw_filepath, + timestamp=timestamp, + table_id=table_id, + dataset_id=dataset_id, + partitions=partitions, ) - ### Pré-tratamento ### + # Pré-tratamento # error, staging_filepath = transform_raw_to_nested_structure( raw_filepath=raw_filepath, + filepath=filepath, + error=error, timestamp=timestamp, primary_key=primary_key, ) - STAGING_UPLOADED = upload_staging_data_to_gcs(error=error, filepath=staging_filepath, timestamp=timestamp) - - # treated_filepath = save_treated_local(status=treated_status, file_path=filepath) - - # LOAD # - # error = bq_upload( - # dataset_id=dataset_id, - # table_id=table_params["pre-treatment"]["table_id"], - # filepath=treated_filepath, - # raw_filepath=raw_filepath, - # partitions=partitions, - # status=treated_status, - # ) - - # upload_logs_to_bq( - # dataset_id=dataset_id, - # parent_table_id=table_params["pre-treatment"]["table_id"], - # error=error, - # timestamp=timestamp, - # ) + STAGING_UPLOADED = upload_staging_data_to_gcs( + error=error, + staging_filepath=staging_filepath, + timestamp=timestamp, + table_id=table_id, + dataset_id=dataset_id, + partitions=partitions, + ) default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_capture_flow.run_config = KubernetesRun( diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 0a40dae26..89beae6f2 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -31,6 +31,9 @@ get_raw_data_api, get_raw_data_gcs, upload_run_logs_to_bq, + get_datetime_range, + transform_data_to_json, + save_treated_local_func, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -874,7 +877,11 @@ def get_previous_date(days): @task def transform_raw_to_nested_structure( - filepath: str, error: bool, timestamp: datetime, primary_key: list = None + raw_filepath: str, + filepath: str, + error: bool, + timestamp: datetime, + primary_key: list = None, ): """Transform dataframe to nested structure @@ -891,16 +898,18 @@ def transform_raw_to_nested_structure( * `error` (str): catched error, if any. 
Otherwise, returns None """ + with open(raw_filepath, "r", encoding="utf-8") as file: + data = file.read() + # ORGANIZAR: - # json_status = transform_data_to_json( - # status=raw_status, - # file_type=table_params["pre-treatment"]["file_type"], - # csv_args=table_params["pre-treatment"]["csv_args"], - # ) + error, data = transform_data_to_json( + data=data, + file_type=raw_filepath.split(".")[-1], + ) # Check previous error if error is not None: - return {"data": pd.DataFrame(), "error": error} + return error, None # Check empty dataframe # if len(status["data"]) == 0: @@ -913,8 +922,8 @@ def transform_raw_to_nested_structure( error = None # leitura do dado raw - # data = pd.DataFrame(status["data"]) - data = None + data = pd.DataFrame(data) + log( f""" Received inputs: @@ -950,7 +959,7 @@ def transform_raw_to_nested_structure( ) # save treated local - # filepath = _save_trated_local(data=data, filepath=filepath) + filepath = save_treated_local_func(data=data, error=error, filepath=filepath) except Exception as exp: # pylint: disable=W0703 error = exp @@ -992,9 +1001,11 @@ def transform_raw_to_nested_structure( def create_request_params( # datetime_range: dict, # table_params: dict, + request_params: dict, table_id: str, secret_path: str, dataset_id: str, + timestamp: datetime, ) -> tuple: """ Task to create request params @@ -1009,25 +1020,25 @@ def create_request_params( request_params: host, database and query to request data request_url: url to request data """ - request_params = None # TODO: retirar essa linha + if dataset_id == constants.BILHETAGEM_DATASET_ID.value: secrets = get_vault_secret(secret_path)["data"] - # TODO: RETIRAR ESSA LINHA - request_params = secrets + database_secrets = secrets["databases"][request_params["database"]] + request_url = secrets["vpn_url"] + database_secrets["engine"] - # TODO: mudar modo de pegar os parametros - # database_secrets = secrets["databases"][table_params["extraction"]["database"]] - # request_url = secrets["vpn_url"] + database_secrets["engine"] - # request_params = { - # "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado - # "database": table_params["extraction"]["database"], - # "query": table_params["extraction"]["query"].format(**datetime_range), - # } + datetime_range = get_datetime_range( + timestamp=timestamp, interval=request_params["run_interval"] + ) + request_params = { + "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado + "database": request_params["database"], + "query": request_params["query"].format(**datetime_range), + } elif dataset_id == constants.GTFS_DATASET_ID.value: gtfs_base_path = "development/br_rj_riodejaneiro_gtfs/upload" - if table_id == constants.GTFS_QUADRO_ID.value: + if table_id == constants.GTFS_QUADRO_TABLE_ID.value: request_url = f"{gtfs_base_path}/quadro.csv" else: request_url = f"{gtfs_base_path}/gtfs.zip" @@ -1038,22 +1049,25 @@ def create_request_params( @task(checkpoint=False) def get_raw_from_sources( source_type: str, + local_filepath: str, source_path: str = None, table_id: str = None, secret_path: str = None, api_params: dict = None, ): - pass - # TODO: descomentar linhas abaixo, passando argumentos corretos - # if source_type == "api": - # return get_raw_data_api( - # url=source_path, secret_path=secret_path, params=api_params - # ) - # if source_type == "gcs": - # return get_raw_data_gcs( - # gcs_path=source_path, - # filename_to_unzip=table_id, - # ) + if source_type == "api": + return get_raw_data_api( + url=source_path, + 
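# transform_raw_to_nested_structure keeps the primary-key columns flat and nests the
# remaining columns into a single JSON "content" column next to the capture timestamp.
# The nesting itself sits in context lines not shown in this hunk, so the pandas sketch
# below is an assumption about that step, not a copy of the real code:
import json
from datetime import datetime

import pandas as pd

data = pd.DataFrame(
    [
        {"id": 1, "valor": 4.05, "linha": "100"},
        {"id": 2, "valor": 4.05, "linha": "232"},
    ]
)
primary_key = ["id"]
timestamp = datetime(2023, 9, 27, 13, 0)

content_columns = data.columns.difference(primary_key)
data["content"] = data[content_columns].apply(
    lambda row: json.dumps(row.to_dict()), axis=1
)
data["timestamp_captura"] = timestamp
data = data[primary_key + ["content", "timestamp_captura"]]
print(data)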
secret_path=secret_path, + api_params=api_params, + filepath=local_filepath, + ) + if source_type == "gcs": + return get_raw_data_gcs( + gcs_path=source_path, + filename_to_unzip=table_id, + local_filepath=local_filepath, + ) # TODO: passar para função para dentro da transform_raw_to_nested_structure diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 68774c17d..184a93df7 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -11,9 +11,9 @@ from typing import List import io import json +import zipfile import pytz import requests -import zipfile import basedosdados as bd from basedosdados import Table import pandas as pd @@ -453,13 +453,13 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l return clocks -def _save_raw_local( - data: dict, file_path: str, mode: str = "raw", filetype: str = "json" +def save_raw_local_func( + data: dict, filepath: str, mode: str = "raw", filetype: str = "json" ) -> str: """ Saves json response from API to .json file. Args: - file_path (str): Path which to save raw file + filepath (str): Path which to save raw file status (dict): Must contain keys * data: json returned from API * error: error catched from API request @@ -469,20 +469,20 @@ def _save_raw_local( """ # diferentes tipos de arquivos para salvar - _file_path = file_path.format(mode=mode, filetype=filetype) - Path(_file_path).parent.mkdir(parents=True, exist_ok=True) + _filepath = filepath.format(mode=mode, filetype=filetype) + Path(_filepath).parent.mkdir(parents=True, exist_ok=True) if filetype == "json": - json.dump(data, Path(_file_path).open("w", encoding="utf-8")) + json.dump(data, Path(_filepath).open("w", encoding="utf-8")) # if filetype == "csv": # pass if filetype == "txt": - with open(_file_path, "w", encoding="utf-8") as file: + with open(_filepath, "w", encoding="utf-8") as file: file.write(data) - log(f"Raw data saved to: {_file_path}") - return _file_path + log(f"Raw data saved to: {_filepath}") + return _filepath def get_raw_data_api( # pylint: disable=R0912 @@ -516,7 +516,9 @@ def get_raw_data_api( # pylint: disable=R0912 ) response.raise_for_status() - filepath = _save_raw_local(data=response.text, filepath=filepath) + filepath = save_raw_local_func( + data=response.text, filepath=filepath + ) # TODO: mudar filetype except Exception as exp: error = exp @@ -550,9 +552,9 @@ def get_raw_data_gcs( else: filename = blob.name - raw_filepath = _save_raw_local( + raw_filepath = save_raw_local_func( data=data.decode(encoding="utf-8"), - file_path=local_filepath, + filepath=local_filepath, filetype=filename.split(".")[-1], ) @@ -563,12 +565,14 @@ def get_raw_data_gcs( return error, raw_filepath -def _save_treated_local(file_path: str, status: dict, mode: str = "staging") -> str: +def save_treated_local_func( + filepath: str, data: pd.DataFrame, error: str, mode: str = "staging" +) -> str: """ Save treated file to CSV. 
Args: - file_path (str): Path which to save treated file + filepath (str): Path which to save treated file status (dict): Must contain keys * `data`: dataframe returned from treatement * `error`: error catched from data treatement @@ -577,12 +581,12 @@ def _save_treated_local(file_path: str, status: dict, mode: str = "staging") -> Returns: str: Path to the saved file """ - _file_path = file_path.format(mode=mode, filetype="csv") - Path(_file_path).parent.mkdir(parents=True, exist_ok=True) - if status["error"] is None: - status["data"].to_csv(_file_path, index=False) - log(f"Treated data saved to: {_file_path}") - return _file_path + _filepath = filepath.format(mode=mode, filetype="csv") + Path(_filepath).parent.mkdir(parents=True, exist_ok=True) + if error is None: + data.to_csv(_filepath, index=False) + log(f"Treated data saved to: {_filepath}") + return _filepath def upload_run_logs_to_bq( # pylint: disable=R0913 @@ -646,3 +650,48 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 ) if error is not None: raise Exception(f"Pipeline failed with error: {error}") + + +def get_datetime_range( + timestamp: datetime, + interval: int, +) -> dict: + """ + Task to get datetime range in UTC + + Args: + timestamp (datetime): timestamp to get datetime range + interval (int): interval in seconds + + Returns: + dict: datetime range + """ + + start = ( + (timestamp - timedelta(seconds=interval)) + .astimezone(tz=pytz.timezone("UTC")) + .strftime("%Y-%m-%d %H:%M:%S") + ) + + end = timestamp.astimezone(tz=pytz.timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") + + return {"start": start, "end": end} + + +def transform_data_to_json(data: str, file_type: str, csv_args: dict = {}): + try: + if file_type == "json": + data = json.loads(data) + + elif file_type in ("txt", "csv"): + if csv_args is None: + csv_args = {} + data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") + else: + error = "Unsupported raw file extension. 
Supported only: json, csv and txt" + + except Exception as exp: + error = exp + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data From fa17be21b41769895fb4154b78d86d373652d368 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 11:20:15 -0300 Subject: [PATCH 18/59] adjust constant structure --- pipelines/rj_smtr/constants.py | 36 +++++++++++++++++++++++++------- pipelines/rj_smtr/flows.py | 6 ++---- pipelines/rj_smtr/tasks.py | 38 +++++++++++++++------------------- 3 files changed, 48 insertions(+), 32 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 00558f9cc..7eb18ef85 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -165,6 +165,18 @@ class constants(Enum): # pylint: disable=c0103 # BILHETAGEM BILHETAGEM_DATASET_ID = "br_rj_riodejaneiro_bilhetagem" + BILHETAGEM_DATABASES = { + "principal_db": { + "engine": "mysql", + "host": "principal-database-replica.internal", + }, + "tarifa_db": {"engine": "postgres", "host": "tarifa-database-replica.internal"}, + "transacao_db": { + "engine": "postgres", + "host": "transacao-database-replica.internal", + }, + } + BILHETAGEM_VPN_URL = "http://vpn-jae.mobilidade.rio/" BILHETAGEM_TRANSACAO_TABLE_PARAMS = [ { "partition_date_only": False, @@ -264,14 +276,24 @@ class constants(Enum): # pylint: disable=c0103 BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" # GTFS - GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" + GTFS_CAPTURE_PARAMS = [ + {"table_id": "agency", "primary_key": ["agency_id"]}, + {"table_id": "calendar_dates", "primary_key": ["service_id"]}, + {"table_id": "calendar", "primary_key": ["service_id"]}, + {"table_id": "feed_info", "primary_key": ["feed_publisher_name"]}, + {"table_id": "frequencies", "primary_key": ["trip_id"]}, + {"table_id": "routes", "primary_key": ["route_id"]}, + {"table_id": "shapes", "primary_key": ["shape_id"]}, + {"table_id": "stops", "primary_key": ["stop_id"]}, + {"table_id": "trips", "primary_key": ["trip_id"]}, + {"table_id": "fare_attributes", "primary_key": ["fare_id"]}, + {"table_id": "fare_rules", "primary_key": ["fare_id"]}, + ] - GTFS_SOURCE_TYPE = "gcs" + GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} - GTFS_AGENCY_REQUEST_PARAMS = { - "filepath": "development/br_rj_riodejaneiro_gtfs/upload/gtfs.zip" - } + GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} - GTFS_AGENCY_TABLE_ID = "agency" + GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" - GTFS_QUADRO_TABLE_ID = "quadro" + GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 3dd834a75..94a3ffb93 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -44,7 +44,7 @@ table_id = Parameter("table_id", default=None) partition_date_only = Parameter("partition_date_only", default=None) - request_params = Parameter("request_params", default=None) + extract_params = Parameter("extract_params", default=None) dataset_id = Parameter("dataset_id", default=None) secret_path = Parameter("secret_path", default=None) primary_key = Parameter("primary_key", default=None) @@ -71,11 +71,9 @@ ) # Extração # - # é necessária task ou função dentro da extract_raw_data? 
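# transform_data_to_json above dispatches on the raw file extension: json payloads go
# through json.loads, csv/txt payloads through pandas over a StringIO buffer, and
# anything else is rejected. A compact, runnable sketch of that dispatch (the sample
# payloads are invented):
import io
import json

import pandas as pd


def parse_raw(data: str, file_type: str, csv_args: dict = None):
    if file_type == "json":
        return json.loads(data)
    if file_type in ("txt", "csv"):
        return pd.read_csv(io.StringIO(data), **(csv_args or {})).to_dict(orient="records")
    raise ValueError("Unsupported raw file extension. Supported only: json, csv and txt")


print(parse_raw('[{"id": 1}]', "json"))
print(parse_raw("id,servico\n1,100\n", "csv"))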
request_params, request_path = create_request_params( - secret_path=secret_path, dataset_id=dataset_id, - request_params=request_params, + extract_params=extract_params, table_id=table_id, timestamp=timestamp, ) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 89beae6f2..a134dd966 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -879,7 +879,7 @@ def get_previous_date(days): def transform_raw_to_nested_structure( raw_filepath: str, filepath: str, - error: bool, + error: str, timestamp: datetime, primary_key: list = None, ): @@ -898,6 +898,10 @@ def transform_raw_to_nested_structure( * `error` (str): catched error, if any. Otherwise, returns None """ + # Check previous error + if error is not None: + return error, None + with open(raw_filepath, "r", encoding="utf-8") as file: data = file.read() @@ -907,10 +911,6 @@ def transform_raw_to_nested_structure( file_type=raw_filepath.split(".")[-1], ) - # Check previous error - if error is not None: - return error, None - # Check empty dataframe # if len(status["data"]) == 0: # log("Empty dataframe, skipping transformation...") @@ -999,11 +999,8 @@ def transform_raw_to_nested_structure( @task(checkpoint=False, nout=2) def create_request_params( - # datetime_range: dict, - # table_params: dict, - request_params: dict, + extract_params: dict, table_id: str, - secret_path: str, dataset_id: str, timestamp: datetime, ) -> tuple: @@ -1020,28 +1017,27 @@ def create_request_params( request_params: host, database and query to request data request_url: url to request data """ + request_params = None if dataset_id == constants.BILHETAGEM_DATASET_ID.value: - secrets = get_vault_secret(secret_path)["data"] - - database_secrets = secrets["databases"][request_params["database"]] - request_url = secrets["vpn_url"] + database_secrets["engine"] + database = constants.BILHETAGEM_DATABASES.value[extract_params["database"]] + request_url = constants.BILHETAGEM_VPN_URL.value + database["engine"] datetime_range = get_datetime_range( - timestamp=timestamp, interval=request_params["run_interval"] + timestamp=timestamp, interval=extract_params["run_interval"] ) + request_params = { - "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado - "database": request_params["database"], - "query": request_params["query"].format(**datetime_range), + "host": database["host"], # TODO: exibir no log em ambiente fechado + "database": extract_params["database"], + "query": extract_params["query"].format(**datetime_range), } elif dataset_id == constants.GTFS_DATASET_ID.value: - gtfs_base_path = "development/br_rj_riodejaneiro_gtfs/upload" - if table_id == constants.GTFS_QUADRO_TABLE_ID.value: - request_url = f"{gtfs_base_path}/quadro.csv" + if table_id == constants.GTFS_QUADRO_CAPTURE_PARAMS.value["table_id"]: + request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/{table_id}.csv" else: - request_url = f"{gtfs_base_path}/gtfs.zip" + request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/gtfs.zip" return request_params, request_url From bdc3881cde88840b62175e1ce8ac66a596e37feb Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 13:27:11 -0300 Subject: [PATCH 19/59] change bilhetagem to new capture flow structure --- .../schedules.py | 18 +- pipelines/rj_smtr/constants.py | 186 ++++++++++-------- pipelines/rj_smtr/tasks.py | 14 +- pipelines/rj_smtr/utils.py | 40 ++-- 4 files changed, 145 insertions(+), 113 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py 
b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index 38fca85a9..538e5b816 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -16,26 +16,32 @@ ) bilhetagem_principal_clocks = generate_execute_schedules( - interval=timedelta(days=1), + clock_interval=timedelta( + **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["principal_run_interval"] + ), labels=[ - emd_constants.RJ_SMTR_AGENT_LABEL.value, + emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, ], - table_parameters=constants.BILHETAGEM_TABLES_PARAMS.value, + table_parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, + source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], runs_interval_minutes=15, ) bilhetagem_principal_schedule = Schedule(clocks=untuple(bilhetagem_principal_clocks)) bilhetagem_transacao_clocks = generate_execute_schedules( - interval=timedelta(minutes=1), + clock_interval=timedelta( + constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["transacao_run_interval"] + ), labels=[ - emd_constants.RJ_SMTR_AGENT_LABEL.value, + emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, ], - table_parameters=constants.BILHETAGEM_TRANSACAO_TABLE_PARAMS.value, + table_parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, + source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], runs_interval_minutes=0, ) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 7eb18ef85..969ccd871 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -165,117 +165,142 @@ class constants(Enum): # pylint: disable=c0103 # BILHETAGEM BILHETAGEM_DATASET_ID = "br_rj_riodejaneiro_bilhetagem" - BILHETAGEM_DATABASES = { - "principal_db": { - "engine": "mysql", - "host": "principal-database-replica.internal", + + BILHETAGEM_GENERAL_CAPTURE_PARAMS = { + "databases": { + "principal_db": { + "engine": "mysql", + "host": "principal-database-replica.internal", + }, + "tarifa_db": { + "engine": "postgres", + "host": "tarifa-database-replica.internal", + }, + "transacao_db": { + "engine": "postgres", + "host": "transacao-database-replica.internal", + }, }, - "tarifa_db": {"engine": "postgres", "host": "tarifa-database-replica.internal"}, - "transacao_db": { - "engine": "postgres", - "host": "transacao-database-replica.internal", + "vpn_url": "http://vpn-jae.mobilidade.rio/", + "source_type": "api-json", + "transacao_run_interval": {"minutes": 1}, + "principal_run_interval": {"days": 1}, + } + + BILHETAGEM_TRANSACAO_CAPTURE_PARAMS = { + "table_id": "transacao", + "partition_date_only": False, + "extract_params": { + "database": "transacao_db", + "query": """ + SELECT + * + FROM + transacao + WHERE + data_processamento BETWEEN '{start}' + AND '{end}' + ORDER BY + data_processamento + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS["transacao_run_interval"], }, + "primary_key": ["id"], } - BILHETAGEM_VPN_URL = "http://vpn-jae.mobilidade.rio/" - BILHETAGEM_TRANSACAO_TABLE_PARAMS = [ + + BILHETAGEM_CAPTURE_PARAMS = [ { - "partition_date_only": False, - "flow_run_name": "transacao", - "extraction": { - "table_id": "transacao", - "database": "transacao_db", + "table_id": "linha", + "partition_date_only": True, + "extract_params": { + "database": "principal_db", 
"query": """ SELECT * FROM - transacao + LINHA WHERE - data_processamento BETWEEN '{start}' - AND '{end}' + DT_INCLUSAO >= '{start}' ORDER BY - data_processamento + DT_INCLUSAO """, - "source": "api", - }, - "pre-treatment": { - "table_id": "transacao", - "file_type": "json", - "primary_key": ["id"], # id column to nest data on + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], }, - } - ] - BILHETAGEM_TABLES_PARAMS = [ - { - "table_id": "linha", - "database": "principal_db", - "query": """ - SELECT - * - FROM - LINHA - WHERE - DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO - """, "primary_key": ["CD_LINHA"], # id column to nest data on - "partition_date_only": True, }, { "table_id": "grupo", - "database": "principal_db", - "query": """ - SELECT - * - FROM - GRUPO - WHERE - DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO - """, - "primary_key": ["CD_GRUPO"], "partition_date_only": True, + "extract_params": { + "database": "principal_db", + "query": """ + SELECT + * + FROM + GRUPO + WHERE + DT_INCLUSAO >= '{start}' + ORDER BY + DT_INCLUSAO + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], + }, + "primary_key": ["CD_GRUPO"], }, { "table_id": "grupo_linha", - "database": "principal_db", - "query": """ - SELECT - * - FROM - GRUPO_LINHA - WHERE - DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO - """, - "primary_key": ["CD_GRUPO", "CD_LINHA"], # id column to nest data on "partition_date_only": True, + "extract_params": { + "database": "principal_db", + "query": """ + SELECT + * + FROM + GRUPO_LINHA + WHERE + DT_INCLUSAO >= '{start}' + ORDER BY + DT_INCLUSAO + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], + }, + "primary_key": ["CD_GRUPO", "CD_LINHA"], # id column to nest data on }, { "table_id": "matriz_integracao", - "database": "tarifa_db", - "query": """ - SELECT - * - FROM - matriz_integracao - WHERE - dt_inclusao >= '{start}' - ORDER BY - dt_inclusao - """, + "partition_date_only": True, + "extract_params": { + "database": "tarifa_db", + "query": """ + SELECT + * + FROM + matriz_integracao + WHERE + dt_inclusao >= '{start}' + ORDER BY + dt_inclusao + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], + }, "primary_key": [ "cd_versao_matriz", "cd_integracao", ], # id column to nest data on - "partition_date_only": True, }, ] BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" # GTFS + GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" GTFS_CAPTURE_PARAMS = [ {"table_id": "agency", "primary_key": ["agency_id"]}, {"table_id": "calendar_dates", "primary_key": ["service_id"]}, @@ -289,11 +314,6 @@ class constants(Enum): # pylint: disable=c0103 {"table_id": "fare_attributes", "primary_key": ["fare_id"]}, {"table_id": "fare_rules", "primary_key": ["fare_id"]}, ] - GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} - GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} - - GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" - GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index a134dd966..e414f1c70 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1020,11 +1020,16 @@ def create_request_params( request_params = None if dataset_id == constants.BILHETAGEM_DATASET_ID.value: - database = constants.BILHETAGEM_DATABASES.value[extract_params["database"]] - request_url = 
constants.BILHETAGEM_VPN_URL.value + database["engine"] + database = constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["databases"][ + extract_params["database"] + ] + request_url = ( + constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["vpn_url"] + + database["engine"] + ) datetime_range = get_datetime_range( - timestamp=timestamp, interval=extract_params["run_interval"] + timestamp=timestamp, interval=timedelta(**extract_params["run_interval"]) ) request_params = { @@ -1051,12 +1056,15 @@ def get_raw_from_sources( secret_path: str = None, api_params: dict = None, ): + source_type, filetype = source_type.split("-", maxsplit=1) + if source_type == "api": return get_raw_data_api( url=source_path, secret_path=secret_path, api_params=api_params, filepath=local_filepath, + filetype=filetype, ) if source_type == "gcs": return get_raw_data_gcs( diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 184a93df7..d354ae6ab 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -8,7 +8,7 @@ from pathlib import Path from datetime import timedelta, datetime -from typing import List +from typing import List, Union import io import json import zipfile @@ -31,7 +31,6 @@ get_vault_secret, send_discord_message, get_redis_client, - get_storage_blobs, get_storage_blob, ) @@ -404,46 +403,41 @@ def data_info_str(data: pd.DataFrame): def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-locals - interval: timedelta, + clock_interval: timedelta, labels: List[str], - table_parameters: list, - dataset_id: str, - secret_path: str, + table_parameters: Union[list[dict], dict], runs_interval_minutes: int = 15, start_date: datetime = datetime( 2020, 1, 1, tzinfo=pytz.timezone(emd_constants.DEFAULT_TIMEZONE.value) ), + **general_flow_params, ) -> List[IntervalClock]: """ Generates multiple schedules Args: - interval (timedelta): The interval to run the schedule + clock_interval (timedelta): The interval to run the schedule labels (List[str]): The labels to be added to the schedule - table_parameters (list): The table parameters - dataset_id (str): The dataset_id to be used in the schedule - secret_path (str): The secret path to be used in the schedule + table_parameters (list): The table parameters to iterate over runs_interval_minutes (int, optional): The interval between each schedule. Defaults to 15. start_date (datetime, optional): The start date of the schedule. Defaults to datetime(2020, 1, 1, tzinfo=pytz.timezone(emd_constants.DEFAULT_TIMEZONE.value)). 
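A small worked example of the source_type convention that get_raw_from_sources starts relying on here — a single "<source>-<filetype>" string coming from the schedule (illustrative snippet, not taken from the patch):

source_type, filetype = "api-json".split("-", maxsplit=1)
print(source_type, filetype)  # api json

# A bare value such as "gcs" contains no "-", so this two-name unpacking would raise
# a ValueError; later commits in this series rework the split to guard that case.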
- + general_flow_params: Any param that you want to pass to the flow Returns: List[IntervalClock]: The list of schedules """ + if isinstance(table_parameters, dict): + table_parameters = [table_parameters] clocks = [] for count, parameters in enumerate(table_parameters): - parameter_defaults = { - "table_params": parameters, - "dataset_id": dataset_id, - "secret_path": secret_path, - "interval": interval.total_seconds(), - } + parameter_defaults = parameters | general_flow_params + log(f"parameter_defaults: {parameter_defaults}") clocks.append( IntervalClock( - interval=interval, + interval=clock_interval, start_date=start_date + timedelta(minutes=runs_interval_minutes * count), labels=labels, @@ -486,7 +480,11 @@ def save_raw_local_func( def get_raw_data_api( # pylint: disable=R0912 - url: str, secret_path: str = None, api_params: dict = None, filepath: str = None + url: str, + secret_path: str = None, + api_params: dict = None, + filepath: str = None, + filetype: str = None, ) -> list[dict]: """ Request data from URL API @@ -517,8 +515,8 @@ def get_raw_data_api( # pylint: disable=R0912 response.raise_for_status() filepath = save_raw_local_func( - data=response.text, filepath=filepath - ) # TODO: mudar filetype + data=response.text, filepath=filepath, filetype=filetype + ) except Exception as exp: error = exp From fc61c4762c7a416872ba6fbbfa5a064a43e846a4 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:24:48 -0300 Subject: [PATCH 20/59] fix get_storage_blob function --- pipelines/rj_smtr/constants.py | 2 +- pipelines/utils/utils.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 969ccd871..2faeccb25 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -316,4 +316,4 @@ class constants(Enum): # pylint: disable=c0103 ] GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} - GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" + GTFS_BASE_GCS_PATH = "raw/development/br_rj_riodejaneiro_gtfs/upload" diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index 147e54f4f..57384f8f4 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -726,9 +726,8 @@ def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> def get_storage_blob( gcs_path: str, - mode: str = "staging", ): - bucket = bd.Storage() + bucket = bd.Storage(dataset_id="", table_id="") return ( bucket.client["storage_staging"] .bucket(bucket.bucket_name) From 0fc26cbc9d786fd28b369ab35784b636c3ecdc12 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:25:27 -0300 Subject: [PATCH 21/59] fix get_storage_blob call --- pipelines/rj_smtr/utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index d354ae6ab..55abfc9cf 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -533,10 +533,7 @@ def get_raw_data_gcs( error = None try: - blob = get_storage_blob( - gcs_path=gcs_path, - mode="raw", - ) + blob = get_storage_blob(gcs_path=gcs_path) data = blob.download_as_bytes() From 634df851e41bff549fe5f9daab4801f0eb6e0858 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:45:26 -0300 Subject: [PATCH 22/59] organize constants order --- pipelines/rj_smtr/constants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 2faeccb25..722d7e9e1 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -301,6 +301,7 @@ class constants(Enum): # pylint: disable=c0103 # GTFS GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" + GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} GTFS_CAPTURE_PARAMS = [ {"table_id": "agency", "primary_key": ["agency_id"]}, {"table_id": "calendar_dates", "primary_key": ["service_id"]}, @@ -314,6 +315,5 @@ class constants(Enum): # pylint: disable=c0103 {"table_id": "fare_attributes", "primary_key": ["fare_id"]}, {"table_id": "fare_rules", "primary_key": ["fare_id"]}, ] - GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} - GTFS_BASE_GCS_PATH = "raw/development/br_rj_riodejaneiro_gtfs/upload" + GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" From bda52aa6eedb6eedec2c6334f0843e2a80edcd4a Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:46:45 -0300 Subject: [PATCH 23/59] fix get_raw_from_sources function call --- pipelines/rj_smtr/flows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 94a3ffb93..19ac776b7 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -82,9 +82,9 @@ source_type=source_type, # parametro de extracao, onde ficar? local_filepath=filepath, source_path=request_path, - zip_filename=table_id, + table_id=table_id, secret_path=secret_path, - request_params=request_params, + api_params=request_params, ) RAW_UPLOADED = upload_raw_data_to_gcs( From b2548d6b8cd1f56bf9dbd4676e52011ce5fdfa16 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:47:35 -0300 Subject: [PATCH 24/59] change transform_raw_to_json to read_raw_data --- pipelines/rj_smtr/tasks.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index e414f1c70..ee99ff654 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -32,7 +32,7 @@ get_raw_data_gcs, upload_run_logs_to_bq, get_datetime_range, - transform_data_to_json, + read_raw_data, save_treated_local_func, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client @@ -899,17 +899,11 @@ def transform_raw_to_nested_structure( """ # Check previous error + if error is not None: return error, None - with open(raw_filepath, "r", encoding="utf-8") as file: - data = file.read() - # ORGANIZAR: - error, data = transform_data_to_json( - data=data, - file_type=raw_filepath.split(".")[-1], - ) # Check empty dataframe # if len(status["data"]) == 0: @@ -917,13 +911,12 @@ def transform_raw_to_nested_structure( # return {"data": pd.DataFrame(), "error": error} try: + # leitura do dado raw + error, data = read_raw_data(filepath=raw_filepath) + if primary_key is None: primary_key = [] - error = None - # leitura do dado raw - data = pd.DataFrame(data) - log( f""" Received inputs: From 307863a1d381cefeeb5a9001fb8f4ef235923cbb Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:48:30 -0300 Subject: [PATCH 25/59] transform transform_raw_data_to_json to read_raw_data --- pipelines/rj_smtr/utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 55abfc9cf..3f4281a2c 100644 --- 
a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -673,15 +673,18 @@ def get_datetime_range( return {"start": start, "end": end} -def transform_data_to_json(data: str, file_type: str, csv_args: dict = {}): +def read_raw_data(filepath: str, csv_args: dict = {}) -> tuple[str, pd.DataFrame]: try: + file_type = filepath.split(".")[-1] + if file_type == "json": - data = json.loads(data) + data = pd.read_json(filepath) + # data = json.loads(data) elif file_type in ("txt", "csv"): if csv_args is None: csv_args = {} - data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") + data = pd.read_csv(filepath, **csv_args) else: error = "Unsupported raw file extension. Supported only: json, csv and txt" From 7f2c1e3fe3db535868943404e945b5b44eefad74 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:59:43 -0300 Subject: [PATCH 26/59] fix nout task parameter --- pipelines/rj_smtr/tasks.py | 4 ++-- pipelines/rj_smtr/utils.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index ee99ff654..9beb5a87e 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -875,7 +875,7 @@ def get_previous_date(days): ############### -@task +@task(nout=2) def transform_raw_to_nested_structure( raw_filepath: str, filepath: str, @@ -1040,7 +1040,7 @@ def create_request_params( return request_params, request_url -@task(checkpoint=False) +@task(checkpoint=False, nout=2) def get_raw_from_sources( source_type: str, local_filepath: str, diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 3f4281a2c..8a8804474 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -485,7 +485,7 @@ def get_raw_data_api( # pylint: disable=R0912 api_params: dict = None, filepath: str = None, filetype: str = None, -) -> list[dict]: +) -> tuple[str, str]: """ Request data from URL API @@ -529,7 +529,7 @@ def get_raw_data_gcs( gcs_path: str, local_filepath: str, filename_to_unzip: str = None, -) -> dict: +) -> tuple[str, str]: error = None try: @@ -673,7 +673,7 @@ def get_datetime_range( return {"start": start, "end": end} -def read_raw_data(filepath: str, csv_args: dict = {}) -> tuple[str, pd.DataFrame]: +def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataFrame]: try: file_type = filepath.split(".")[-1] From 51977c10621d34ea3643004cba5bc4f990d249db Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 15:16:38 -0300 Subject: [PATCH 27/59] fix timedelta instantiation --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index 538e5b816..f19f0d8ad 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -33,7 +33,7 @@ bilhetagem_transacao_clocks = generate_execute_schedules( clock_interval=timedelta( - constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["transacao_run_interval"] + **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["transacao_run_interval"] ), labels=[ emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, From 8ef0b5df7c31ebb7f59ff719c338e029e34cf031 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 15:58:05 -0300 Subject: [PATCH 28/59] set upstream tasks --- pipelines/rj_smtr/flows.py | 1 + pipelines/rj_smtr/tasks.py | 10 +++++++--- 2 
files changed, 8 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 19ac776b7..a4044933a 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -104,6 +104,7 @@ error=error, timestamp=timestamp, primary_key=primary_key, + upstream_tasks=[RAW_UPLOADED], ) STAGING_UPLOADED = upload_staging_data_to_gcs( diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 9beb5a87e..269ee73eb 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1052,19 +1052,23 @@ def get_raw_from_sources( source_type, filetype = source_type.split("-", maxsplit=1) if source_type == "api": - return get_raw_data_api( + error, filepath = get_raw_data_api( url=source_path, secret_path=secret_path, api_params=api_params, filepath=local_filepath, filetype=filetype, ) - if source_type == "gcs": - return get_raw_data_gcs( + elif source_type == "gcs": + error, filepath = get_raw_data_gcs( gcs_path=source_path, filename_to_unzip=table_id, local_filepath=local_filepath, ) + else: + raise NotImplementedError(f"{source_type} not supported") + + return error, filepath # TODO: passar para função para dentro da transform_raw_to_nested_structure From 4f21f0af7fff375354538c868e7b4cedd7943f4d Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 16:02:09 -0300 Subject: [PATCH 29/59] declare raw_filepath --- pipelines/rj_smtr/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 8a8804474..0fd5c7d6c 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -531,6 +531,7 @@ def get_raw_data_gcs( filename_to_unzip: str = None, ) -> tuple[str, str]: error = None + raw_filepath = None try: blob = get_storage_blob(gcs_path=gcs_path) From 11b973581c7ccc103d16bccc09dccd41f86f68da Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Wed, 27 Sep 2023 16:19:43 -0300 Subject: [PATCH 30/59] update docstrings --- pipelines/rj_smtr/tasks.py | 76 +++++++++++++++++++++++++++++++------- pipelines/rj_smtr/utils.py | 50 ++++++++++++++++++------- pipelines/utils/utils.py | 17 +++++++++ 3 files changed, 116 insertions(+), 27 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 269ee73eb..b12f0604c 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -168,7 +168,14 @@ def create_date_hour_partition( timestamp: datetime, partition_date_only: bool = False ) -> str: """ - Get date hour Hive partition structure from timestamp. + Generate partition string for date and hour. + + Args: + timestamp (datetime): timestamp to be used as reference + partition_date_only (bool, optional): whether to add hour partition or not + + Returns: + str: partition string """ partition = f"data={timestamp.strftime('%Y-%m-%d')}" if not partition_date_only: @@ -614,6 +621,20 @@ def upload_raw_data_to_gcs( dataset_id: str, partitions: list, ): + """ + Upload raw data to GCS. + + Args: + error (bool): whether the upstream tasks failed or not + raw_filepath (str): Path to the saved raw .json file + timestamp (datetime): timestamp for flow run + table_id (str): table_id on BigQuery + dataset_id (str): dataset_id on BigQuery + partitions (list): list of partition strings + + Returns: + None + """ if not error: try: st_obj = Storage(table_id=table_id, dataset_id=dataset_id) @@ -649,6 +670,20 @@ def upload_staging_data_to_gcs( dataset_id: str, partitions: list, ): + """ + Upload staging data to GCS. 
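For reference, a stand-alone approximation of the partition helper documented above. The "data=" part is taken from the hunk; the "hora=" suffix is an assumption based on the Hive-style partitions used elsewhere in these pipelines, since that branch is not visible in this excerpt.

from datetime import datetime

def create_date_hour_partition_sketch(timestamp: datetime, partition_date_only: bool = False) -> str:
    # data=YYYY-MM-DD, optionally followed by the (assumed) hora=HH level
    partition = f"data={timestamp.strftime('%Y-%m-%d')}"
    if not partition_date_only:
        partition += f"/hora={timestamp.strftime('%H')}"
    return partition

print(create_date_hour_partition_sketch(datetime(2023, 9, 27, 14)))  # data=2023-09-27/hora=14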
+ + Args: + error (bool): whether the upstream tasks failed or not + staging_filepath (str): Path to the saved treated .csv file + timestamp (datetime): timestamp for flow run + table_id (str): table_id on BigQuery + dataset_id (str): dataset_id on BigQuery + partitions (list): list of partition strings + + Returns: + None + """ if not error: try: # Creates and publish table if it does not exist, append to it otherwise @@ -883,19 +918,18 @@ def transform_raw_to_nested_structure( timestamp: datetime, primary_key: list = None, ): - """Transform dataframe to nested structure + """ + Task to transform raw data to nested structure Args: - status (dict): Must contain keys - * `data`: dataframe returned from treatement - * `error`: error catched from data treatement - timestamp (datetime): timestamp of the capture - primary_key (list, optional): List of primary keys to be used for nesting. + raw_filepath (str): Path to the saved raw .json file + filepath (str): Path to the saved treated .csv file + error (str): Error catched from upstream tasks + timestamp (datetime): timestamp for flow run + primary_key (list, optional): Primary key to be used on nested structure Returns: - dict: Conatining keys - * `data` (json): nested data - * `error` (str): catched error, if any. Otherwise, returns None + str: Path to the saved treated .csv file """ # Check previous error @@ -1001,10 +1035,10 @@ def create_request_params( Task to create request params Args: - datetime_range (dict): datetime range to get params - table_params (dict): table params to get params - secret_path (str): secret path to get params - dataset_id (str): dataset id to get params + extract_params (dict): extract parameters + table_id (str): table_id on BigQuery + dataset_id (str): dataset_id on BigQuery + timestamp (datetime): timestamp for flow run Returns: request_params: host, database and query to request data @@ -1049,6 +1083,20 @@ def get_raw_from_sources( secret_path: str = None, api_params: dict = None, ): + """ + Task to get raw data from sources + + Args: + source_type (str): source type + local_filepath (str): local filepath + source_path (str, optional): source path. Defaults to None. + table_id (str, optional): table_id on BigQuery. Defaults to None. + secret_path (str, optional): secret path. Defaults to None. + api_params (dict, optional): api parameters. Defaults to None. + + Returns: + error: error + """ source_type, filetype = source_type.split("-", maxsplit=1) if source_type == "api": diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 0fd5c7d6c..801c8d336 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -490,14 +490,14 @@ def get_raw_data_api( # pylint: disable=R0912 Request data from URL API Args: - url (str): URL to send request - secret_path (str, optional): Path to secrets guardeded on Vault, if needed. - params (dict, optional): Params to be sent on request + url (str): URL to request data + secret_path (str, optional): Secret path to get headers. Defaults to None. + api_params (dict, optional): Parameters to pass to API. Defaults to None. + filepath (str, optional): Path to save raw file. Defaults to None. + filetype (str, optional): Filetype to save raw file. Defaults to None. Returns: - dict: Conatining keys - * `data` (json): data result - * `error` (str): catched error, if any. 
Otherwise, returns None + tuple[str, str]: Error and filepath """ error = None try: @@ -530,6 +530,17 @@ def get_raw_data_gcs( local_filepath: str, filename_to_unzip: str = None, ) -> tuple[str, str]: + """ + Get raw data from GCS + + Args: + gcs_path (str): GCS path to get data + local_filepath (str): Local filepath to save raw data + filename_to_unzip (str, optional): Filename to unzip. Defaults to None. + + Returns: + tuple[str, str]: Error and filepath + """ error = None raw_filepath = None @@ -568,10 +579,9 @@ def save_treated_local_func( Save treated file to CSV. Args: - filepath (str): Path which to save treated file - status (dict): Must contain keys - * `data`: dataframe returned from treatement - * `error`: error catched from data treatement + filepath (str): Path to save file + data (pd.DataFrame): Dataframe to save + error (str): Error catched during execution mode (str, optional): Folder to save locally, later folder which to upload to GCS. Returns: @@ -601,9 +611,13 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 Args: dataset_id (str): dataset_id on BigQuery - parent_table_id (str): Parent table id related to the status table - timestamp (str): ISO formatted timestamp string - error (str, optional): String associated with error caught during execution + parent_table_id (str): table_id on BigQuery + timestamp (str): timestamp to get datetime range + error (str): error catched during execution + previous_error (str): previous error catched during execution + recapture (bool): if the execution was a recapture + mode (str): folder to save locally, later folder which to upload to GCS + Returns: None """ @@ -675,6 +689,16 @@ def get_datetime_range( def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataFrame]: + """ + Read raw data from file + + Args: + filepath (str): filepath to read + csv_args (dict): arguments to pass to pandas.read_csv + + Returns: + tuple[str, pd.DataFrame]: error and data + """ try: file_type = filepath.split(".")[-1] diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index 57384f8f4..e37a88d8b 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -714,6 +714,14 @@ def get_credentials_from_env( def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> list: """ Get all blobs from a table in a dataset. + + Args: + dataset_id (str): dataset id + table_id (str): table id + mode (str, optional): mode to use. Defaults to "staging". + + Returns: + list: list of blobs """ bd_storage = bd.Storage(dataset_id=dataset_id, table_id=table_id) @@ -727,6 +735,15 @@ def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> def get_storage_blob( gcs_path: str, ): + """ + Get a blob from a path. 
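Because read_raw_data is touched by several commits in this series, a consolidated sketch of roughly where the helper ends up may help; this is an approximation for illustration, not the verbatim final code.

import traceback
import pandas as pd

def read_raw_data_sketch(filepath: str, csv_args: dict = None) -> tuple:
    # Returns (error, data): dispatches on the file extension and catches any failure
    # as a traceback string instead of raising.
    error, data = None, None
    try:
        file_type = filepath.split(".")[-1]
        if file_type == "json":
            data = pd.read_json(filepath)
        elif file_type in ("txt", "csv"):
            data = pd.read_csv(filepath, **(csv_args or {}))
        else:
            error = "Unsupported raw file extension. Supported only: json, csv and txt"
    except Exception:  # pylint: disable=broad-except
        error = traceback.format_exc()
    return error, data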
+ + Args: + gcs_path (str): path to blob + + Returns: + Blob: blob object + """ bucket = bd.Storage(dataset_id="", table_id="") return ( bucket.client["storage_staging"] From f484b880d54367e375a2ce72b02d9835f20fe4d1 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 16:20:42 -0300 Subject: [PATCH 31/59] adjust get_raw_from_sources return --- pipelines/rj_smtr/tasks.py | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 269ee73eb..023ea2796 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1051,22 +1051,28 @@ def get_raw_from_sources( ): source_type, filetype = source_type.split("-", maxsplit=1) - if source_type == "api": - error, filepath = get_raw_data_api( - url=source_path, - secret_path=secret_path, - api_params=api_params, - filepath=local_filepath, - filetype=filetype, - ) - elif source_type == "gcs": - error, filepath = get_raw_data_gcs( - gcs_path=source_path, - filename_to_unzip=table_id, - local_filepath=local_filepath, - ) - else: - raise NotImplementedError(f"{source_type} not supported") + log(f"Source type: {source_type}") + + try: + if source_type == "api": + error, filepath = get_raw_data_api( + url=source_path, + secret_path=secret_path, + api_params=api_params, + filepath=local_filepath, + filetype=filetype, + ) + elif source_type == "gcs": + error, filepath = get_raw_data_gcs( + gcs_path=source_path, + filename_to_unzip=table_id, + local_filepath=local_filepath, + ) + else: + raise NotImplementedError(f"{source_type} not supported") + except NotImplementedError as exp: + error = exp + filepath = None return error, filepath From 2df4318dc407b58ca6a6c4bf5a3bfad8db7fab37 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 16:41:00 -0300 Subject: [PATCH 32/59] fix errors --- pipelines/rj_smtr/tasks.py | 13 +++++++++++-- pipelines/rj_smtr/utils.py | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 7ff9ee637..9c2ae3be0 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1097,7 +1097,15 @@ def get_raw_from_sources( Returns: error: error """ - source_type, filetype = source_type.split("-", maxsplit=1) + error = None + filepath = None + + source_values = source_type.split("-", maxsplit=1) + source_type = source_values[0] + try: + filetype = source_values[1] + except IndexError: + filetype = None log(f"Source type: {source_type}") @@ -1120,8 +1128,9 @@ def get_raw_from_sources( raise NotImplementedError(f"{source_type} not supported") except NotImplementedError as exp: error = exp - filepath = None + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log(f"Raw extraction ended returned values: {error}, {filepath}") return error, filepath diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 801c8d336..743e955e1 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -699,6 +699,7 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF Returns: tuple[str, pd.DataFrame]: error and data """ + error = None try: file_type = filepath.split(".")[-1] From df6525ac9e946f5a3d3709b768e02f2c26aae1c8 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 16:45:37 -0300 Subject: [PATCH 33/59] change agent label to dev --- pipelines/rj_smtr/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/flows.py 
b/pipelines/rj_smtr/flows.py index a4044933a..27eaa76a4 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -119,5 +119,5 @@ default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_capture_flow.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) From 2983b687fb1910cc1086cb875367493706ed905e Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 10:54:51 -0300 Subject: [PATCH 34/59] refactore source values --- pipelines/rj_smtr/tasks.py | 36 ++++++------------------------------ 1 file changed, 6 insertions(+), 30 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 9c2ae3be0..4a7182daf 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1100,14 +1100,13 @@ def get_raw_from_sources( error = None filepath = None - source_values = source_type.split("-", maxsplit=1) - source_type = source_values[0] - try: - filetype = source_values[1] - except IndexError: - filetype = None + source_values = source_type.split("-", 1) + + source_type, filetype = ( + source_values if len(source_values) == 2 else (source_values[0], None) + ) - log(f"Source type: {source_type}") + log(f"Getting raw data from source type: {source_type}") try: if source_type == "api": @@ -1132,26 +1131,3 @@ def get_raw_from_sources( log(f"Raw extraction ended returned values: {error}, {filepath}") return error, filepath - - -# TODO: passar para função para dentro da transform_raw_to_nested_structure -# @task(checkpoint=False) -# def transform_data_to_json(status: dict, file_type: str, csv_args: dict): -# data = status["data"] -# error = status["error"] - -# if file_type == "json": -# pass - -# # todo: move to data check on specfic API # pylint: disable=W0102 -# # if isinstance(data, dict) and "DescricaoErro" in data.keys(): -# # error = data["DescricaoErro"] - -# elif file_type in ("txt", "csv"): -# if csv_args is None: -# csv_args = {} -# data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") -# else: -# error = "Unsupported raw file extension. 
Supported only: json, csv and txt" - -# return {"data": data, "error": error} From 2c78b09404680d561a5afe5096428cb44a3b8032 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 11:27:23 -0300 Subject: [PATCH 35/59] update constants --- pipelines/rj_smtr/constants.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 722d7e9e1..3afb0b1cd 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -185,6 +185,8 @@ class constants(Enum): # pylint: disable=c0103 "source_type": "api-json", "transacao_run_interval": {"minutes": 1}, "principal_run_interval": {"days": 1}, + "transacao_runs_interval_minutes": 0, + "principal_runs_interval_minutes": 15, } BILHETAGEM_TRANSACAO_CAPTURE_PARAMS = { @@ -205,7 +207,7 @@ class constants(Enum): # pylint: disable=c0103 """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS["transacao_run_interval"], }, - "primary_key": ["id"], + "primary_key": ["id"], # id column to nest data on } BILHETAGEM_CAPTURE_PARAMS = [ @@ -249,7 +251,7 @@ class constants(Enum): # pylint: disable=c0103 "principal_run_interval" ], }, - "primary_key": ["CD_GRUPO"], + "primary_key": ["CD_GRUPO"], # id column to nest data on }, { "table_id": "grupo_linha", From 1f3c2fc307e21e77de206f5ded612a690e8108cf Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 11:28:23 -0300 Subject: [PATCH 36/59] update agent --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index d7f44e3b9..793d37c0d 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -30,7 +30,7 @@ bilhetagem_transacao_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_transacao_captura.schedule = bilhetagem_transacao_schedule @@ -41,6 +41,6 @@ bilhetagem_principal_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_principal_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_principal_captura.schedule = bilhetagem_principal_schedule From 702e70d6ae1341889e333e2d07fc0fec70dd6cef Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 11:30:21 -0300 Subject: [PATCH 37/59] update schedule params --- .../rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index f19f0d8ad..e897286b0 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -26,7 +26,9 @@ dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], - runs_interval_minutes=15, + runs_interval_minutes=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value[ + 
"principal_runs_interval_minutes" + ], ) bilhetagem_principal_schedule = Schedule(clocks=untuple(bilhetagem_principal_clocks)) @@ -42,7 +44,9 @@ dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], - runs_interval_minutes=0, + runs_interval_minutes=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value[ + "transacao_runs_interval_minutes" + ], ) bilhetagem_transacao_schedule = Schedule(clocks=untuple(bilhetagem_transacao_clocks)) From b5712d2746675c4925231382f2cf436da339be94 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 11:42:25 -0300 Subject: [PATCH 38/59] update interval --- pipelines/rj_smtr/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 743e955e1..0972a22c8 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -664,21 +664,21 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 def get_datetime_range( timestamp: datetime, - interval: int, + interval: timedelta, ) -> dict: """ Task to get datetime range in UTC Args: timestamp (datetime): timestamp to get datetime range - interval (int): interval in seconds + interval (timedelta): interval to get datetime range Returns: dict: datetime range """ start = ( - (timestamp - timedelta(seconds=interval)) + (timestamp - timedelta(interval)) .astimezone(tz=pytz.timezone("UTC")) .strftime("%Y-%m-%d %H:%M:%S") ) From e3df22cc2cec64b6fcc7e0258caafdf542c8ab86 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 11:44:39 -0300 Subject: [PATCH 39/59] fix get_datetime_range interval --- pipelines/rj_smtr/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 0972a22c8..7b32e2831 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -678,7 +678,7 @@ def get_datetime_range( """ start = ( - (timestamp - timedelta(interval)) + (timestamp - interval) .astimezone(tz=pytz.timezone("UTC")) .strftime("%Y-%m-%d %H:%M:%S") ) From 6ed06dad2772cb2d4ff32e6a19393d2e24cfe47f Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 12:21:35 -0300 Subject: [PATCH 40/59] remove order by from queries --- pipelines/rj_smtr/constants.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 3afb0b1cd..4f2b1c95a 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -202,8 +202,6 @@ class constants(Enum): # pylint: disable=c0103 WHERE data_processamento BETWEEN '{start}' AND '{end}' - ORDER BY - data_processamento """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS["transacao_run_interval"], }, @@ -223,8 +221,6 @@ class constants(Enum): # pylint: disable=c0103 LINHA WHERE DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ "principal_run_interval" @@ -244,8 +240,6 @@ class constants(Enum): # pylint: disable=c0103 GRUPO WHERE DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ "principal_run_interval" @@ -265,8 +259,6 @@ class constants(Enum): # pylint: disable=c0103 GRUPO_LINHA WHERE DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ "principal_run_interval" @@ -286,8 +278,6 @@ class constants(Enum): # pylint: 
disable=c0103 matriz_integracao WHERE dt_inclusao >= '{start}' - ORDER BY - dt_inclusao """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ "principal_run_interval" From 822c59f256d4e4ff900486a6472145bcbea4b08a Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 12:22:30 -0300 Subject: [PATCH 41/59] fix get_raw_data_api --- pipelines/rj_smtr/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 7b32e2831..445389340 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -506,6 +506,13 @@ def get_raw_data_api( # pylint: disable=R0912 else: headers = get_vault_secret(secret_path)["data"] + # remove from headers, if present + # TODO: remove this before merge to master + remove_headers = ["host", "databases"] + for remove_header in remove_headers: + if remove_header in list(headers.keys()): + del headers[remove_header] + response = requests.get( url, headers=headers, From c58ea9639bcb2812484dd899de6bfd33a776aec9 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 15:41:42 -0300 Subject: [PATCH 42/59] change json read function --- pipelines/rj_smtr/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 445389340..be8ed7bbd 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -707,11 +707,14 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF tuple[str, pd.DataFrame]: error and data """ error = None + data = None try: file_type = filepath.split(".")[-1] if file_type == "json": - data = pd.read_json(filepath) + with open(filepath, "r") as file: + data = json.load(file) + data = pd.DataFrame(data) # data = json.loads(data) elif file_type in ("txt", "csv"): From 045a42368562263938b90a25feffaaed4c83318d Mon Sep 17 00:00:00 2001 From: Carolina Gomes Date: Thu, 28 Sep 2023 16:01:10 -0300 Subject: [PATCH 43/59] update read_raw_data --- pipelines/rj_smtr/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index be8ed7bbd..c0c203dcd 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -713,8 +713,8 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF if file_type == "json": with open(filepath, "r") as file: - data = json.load(file) - data = pd.DataFrame(data) + data = pd.DataFrame.from_dict(json.load(file), orient="records") + # data = json.loads(data) elif file_type in ("txt", "csv"): From d2d188f7491de19ac2554eb465e46829d04e572c Mon Sep 17 00:00:00 2001 From: Carolina Gomes Date: Thu, 28 Sep 2023 16:09:27 -0300 Subject: [PATCH 44/59] update save_raw_local_func --- pipelines/rj_smtr/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index c0c203dcd..20168b039 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -448,7 +448,7 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l def save_raw_local_func( - data: dict, filepath: str, mode: str = "raw", filetype: str = "json" + data: Union[dict, str], filepath: str, mode: str = "raw", filetype: str = "json" ) -> str: """ Saves json response from API to .json file. 
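Putting the two interval fixes from a few commits above together, the datetime-range helper behaves roughly as sketched below. The "end" side is not visible in this excerpt and is assumed to be the capture timestamp formatted the same way, since the queries use the pair in a BETWEEN clause.

from datetime import datetime, timedelta
import pytz

def get_datetime_range_sketch(timestamp: datetime, interval: timedelta) -> dict:
    fmt = "%Y-%m-%d %H:%M:%S"
    start = (timestamp - interval).astimezone(tz=pytz.timezone("UTC")).strftime(fmt)
    end = timestamp.astimezone(tz=pytz.timezone("UTC")).strftime(fmt)  # assumed
    return {"start": start, "end": end}

ts = datetime(2023, 9, 28, 12, 0, tzinfo=pytz.utc)
print(get_datetime_range_sketch(ts, timedelta(minutes=1)))
# {'start': '2023-09-28 11:59:00', 'end': '2023-09-28 12:00:00'}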
@@ -467,6 +467,8 @@ def save_raw_local_func( Path(_filepath).parent.mkdir(parents=True, exist_ok=True) if filetype == "json": + if isinstance(data, dict): + data = json.loads(data) json.dump(data, Path(_filepath).open("w", encoding="utf-8")) # if filetype == "csv": From b7c4e2fe39b2e0d3a613a68ecab8a155787f2292 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 16:18:03 -0300 Subject: [PATCH 45/59] log error --- pipelines/rj_smtr/utils.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 20168b039..6219aaa78 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -9,6 +9,8 @@ from datetime import timedelta, datetime from typing import List, Union +import traceback +import sys import io import json import zipfile @@ -52,6 +54,19 @@ def log_critical(message: str, secret_path: str = constants.CRITICAL_SECRET_PATH return send_discord_message(message=message, webhook_url=url) +def log_error(error: str): + tb = sys.exc_info()[-1] + frame = traceback.extract_tb(tb, 1)[0] + file_name = frame[0] + function_name = frame[2] + line_no = frame[1] + + log( + f"[CATCHED] Task failed in file {file_name} - ({function_name}) line: {line_no} with error: \n{error}", + level="error", + ) + + def create_or_append_table( dataset_id: str, table_id: str, path: str, partitions: str = None ): @@ -728,6 +743,7 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF except Exception as exp: error = exp - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) + # log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, data From 2bedf890ee42187088bfa645d61a0af08598f4f7 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 16:44:41 -0300 Subject: [PATCH 46/59] change raw api extraction for json --- pipelines/rj_smtr/tasks.py | 7 ++++--- pipelines/rj_smtr/utils.py | 14 +++++++++----- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 4a7182daf..be878db21 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -34,6 +34,7 @@ get_datetime_range, read_raw_data, save_treated_local_func, + log_error, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -434,7 +435,7 @@ def get_raw( # pylint: disable=R0912 error = exp if error is not None: - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) return {"data": data, "error": error} @@ -992,7 +993,7 @@ def transform_raw_to_nested_structure( error = exp if error is not None: - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) return error, filepath @@ -1127,7 +1128,7 @@ def get_raw_from_sources( raise NotImplementedError(f"{source_type} not supported") except NotImplementedError as exp: error = exp - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) log(f"Raw extraction ended returned values: {error}, {filepath}") return error, filepath diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 6219aaa78..41b29d41e 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -538,13 +538,17 @@ def get_raw_data_api( # pylint: disable=R0912 ) response.raise_for_status() - filepath = save_raw_local_func( - data=response.text, 
filepath=filepath, filetype=filetype - ) + + if filetype == "json": + data = response.json() + else: + data = response.text + + filepath = save_raw_local_func(data=data, filepath=filepath, filetype=filetype) except Exception as exp: error = exp - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) return error, filepath @@ -591,7 +595,7 @@ def get_raw_data_gcs( except Exception as exp: error = exp - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) return error, raw_filepath From 20b48dfb2950ba513c049e922b8768da9ab03e57 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 16:53:26 -0300 Subject: [PATCH 47/59] change read json function --- pipelines/rj_smtr/utils.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 41b29d41e..9c04ed701 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -55,14 +55,9 @@ def log_critical(message: str, secret_path: str = constants.CRITICAL_SECRET_PATH def log_error(error: str): - tb = sys.exc_info()[-1] - frame = traceback.extract_tb(tb, 1)[0] - file_name = frame[0] - function_name = frame[2] - line_no = frame[1] - + error = traceback.format_exc() log( - f"[CATCHED] Task failed in file {file_name} - ({function_name}) line: {line_no} with error: \n{error}", + f"[CATCHED] Task failed with error: \n{error}", level="error", ) @@ -733,11 +728,9 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF file_type = filepath.split(".")[-1] if file_type == "json": - with open(filepath, "r") as file: - data = pd.DataFrame.from_dict(json.load(file), orient="records") + data = pd.read_json(filepath) # data = json.loads(data) - elif file_type in ("txt", "csv"): if csv_args is None: csv_args = {} From 42c6ac008e6e8f569993c9b0a40958941c0750a0 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 17:45:44 -0300 Subject: [PATCH 48/59] print log traceback --- pipelines/rj_smtr/tasks.py | 23 +++++++++-------------- pipelines/rj_smtr/utils.py | 21 ++++++--------------- 2 files changed, 15 insertions(+), 29 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index be878db21..dd48d2c64 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -34,7 +34,6 @@ get_datetime_range, read_raw_data, save_treated_local_func, - log_error, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -431,11 +430,9 @@ def get_raw( # pylint: disable=R0912 "Unsupported raw file extension. 
Supported only: json, csv and txt" ) - except Exception as exp: - error = exp - - if error is not None: - log_error(error=error) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return {"data": data, "error": error} @@ -989,11 +986,9 @@ def transform_raw_to_nested_structure( # save treated local filepath = save_treated_local_func(data=data, error=error, filepath=filepath) - except Exception as exp: # pylint: disable=W0703 - error = exp - - if error is not None: - log_error(error=error) + except Exception: # pylint: disable=W0703 + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, filepath @@ -1126,9 +1121,9 @@ def get_raw_from_sources( ) else: raise NotImplementedError(f"{source_type} not supported") - except NotImplementedError as exp: - error = exp - log_error(error=error) + except NotImplementedError: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") log(f"Raw extraction ended returned values: {error}, {filepath}") return error, filepath diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 9c04ed701..553bd860a 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -54,14 +54,6 @@ def log_critical(message: str, secret_path: str = constants.CRITICAL_SECRET_PATH return send_discord_message(message=message, webhook_url=url) -def log_error(error: str): - error = traceback.format_exc() - log( - f"[CATCHED] Task failed with error: \n{error}", - level="error", - ) - - def create_or_append_table( dataset_id: str, table_id: str, path: str, partitions: str = None ): @@ -542,8 +534,8 @@ def get_raw_data_api( # pylint: disable=R0912 filepath = save_raw_local_func(data=data, filepath=filepath, filetype=filetype) except Exception as exp: - error = exp - log_error(error=error) + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, filepath @@ -589,8 +581,8 @@ def get_raw_data_gcs( ) except Exception as exp: - error = exp - log_error(error=error) + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, raw_filepath @@ -739,8 +731,7 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF error = "Unsupported raw file extension. 
Supported only: json, csv and txt" except Exception as exp: - error = exp - log_error(error=error) - # log(f"[CATCHED] Task failed with error: \n{error}", level="error") + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, data From 2a1a25d41f18e45db740921fd40ce848e184605c Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 18:52:18 -0300 Subject: [PATCH 49/59] enrich logs --- .../rj_smtr/br_rj_riodejaneiro_rdo/tasks.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py index 1594e33f9..aeca6fb75 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py @@ -172,7 +172,18 @@ def pre_treatment_br_rj_riodejaneiro_rdo( log(f"Received {len(files)} to treat") for file_info in files: log(f"Processing file {files.index(file_info)}") + + log( + f"""rdo_constants.RDO_PRE_TREATMENT_CONFIG is:\n + {rdo_constants.RDO_PRE_TREATMENT_CONFIG.value}""" + ) + log(f"File info is:\n{file_info}") + try: + with open(file_info["raw_path"], "r") as raw_file: + log(f"Opened raw file {file_info['raw_path']}") + log(f"raw_file is:\n{raw_file}") + config = rdo_constants.RDO_PRE_TREATMENT_CONFIG.value[ file_info["transport_mode"] ][file_info["report_type"]] @@ -234,6 +245,12 @@ def pre_treatment_br_rj_riodejaneiro_rdo( raw_paths.append(None) partitions.append(None) status.append({"error": e}) + + log(f"Returning treated paths:\n {treated_paths}") + log(f"Returning raw paths:\n {raw_paths}") + log(f"Returning partitions:\n {partitions}") + log(f"Returning status:\n {status}") + return treated_paths, raw_paths, partitions, status From 0cf71887e47d6415490e9e83631182748a1b3f6d Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Fri, 29 Sep 2023 16:10:51 -0300 Subject: [PATCH 50/59] treat error --- pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py index aeca6fb75..89687f6d2 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py @@ -177,12 +177,21 @@ def pre_treatment_br_rj_riodejaneiro_rdo( f"""rdo_constants.RDO_PRE_TREATMENT_CONFIG is:\n {rdo_constants.RDO_PRE_TREATMENT_CONFIG.value}""" ) + log(f"File info is:\n{file_info}") try: + if file_info["error"] is not None: + log(f"Pre Treatment failed with error: {file_info['error']}") + treated_paths.append(None) + raw_paths.append(None) + partitions.append(None) + status.append({"error": file_info["error"]}) + continue + with open(file_info["raw_path"], "r") as raw_file: log(f"Opened raw file {file_info['raw_path']}") - log(f"raw_file is:\n{raw_file}") + log(f"raw_file is:\n{raw_file.read()}") config = rdo_constants.RDO_PRE_TREATMENT_CONFIG.value[ file_info["transport_mode"] From dbdbffeb772f5011c8b5416d571256b024700cd5 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Fri, 29 Sep 2023 17:24:04 -0300 Subject: [PATCH 51/59] update to connect just once --- .../rj_smtr/br_rj_riodejaneiro_rdo/flows.py | 37 ++++++++++++++----- .../rj_smtr/br_rj_riodejaneiro_rdo/tasks.py | 20 +++++++--- pipelines/rj_smtr/tasks.py | 16 ++++++++ 3 files changed, 59 insertions(+), 14 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py 
b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py index b7be66945..d301f2aab 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py @@ -23,6 +23,7 @@ bq_upload, get_current_timestamp, set_last_run_timestamp, + connect_ftp_task, ) from pipelines.rj_smtr.schedules import every_day @@ -210,13 +211,24 @@ wait=None, ) # EXTRACT + ftp_client = connect_ftp_task( + secret_path=constants.RDO_FTPS_SECRET_PATH.value, connect_flag=True + ) + files = get_file_paths_from_ftp( - transport_mode=transport_mode, report_type=report_type, dump=dump + transport_mode=transport_mode, + report_type=report_type, + dump=dump, + ftp_client=ftp_client, ) download_files = check_files_for_download( files=files, dataset_id=constants.RDO_DATASET_ID.value, table_id=table_id ) - updated_info = download_and_save_local_from_ftp.map(file_info=download_files) + updated_info = download_and_save_local_from_ftp.map( + file_info=download_files, ftp_client=ftp_client + ) + + connect_ftp_task(ftp_client=ftp_client, disconnect_flag=True) # TRANSFORM treated_path, raw_path, partitions, status = pre_treatment_br_rj_riodejaneiro_rdo( files=updated_info @@ -258,13 +270,25 @@ wait=None, ) # EXTRACT + ftp_client = connect_ftp_task( + secret_path=constants.RDO_FTPS_SECRET_PATH.value, connect_flag=True + ) + files = get_file_paths_from_ftp( - transport_mode=transport_mode, report_type=report_type, dump=dump + transport_mode=transport_mode, + report_type=report_type, + dump=dump, + ftp_client=ftp_client, ) download_files = check_files_for_download( files=files, dataset_id=constants.RDO_DATASET_ID.value, table_id=table_id ) - updated_info = download_and_save_local_from_ftp.map(file_info=download_files) + updated_info = download_and_save_local_from_ftp.map( + file_info=download_files, ftp_client=ftp_client + ) + + connect_ftp_task(ftp_client=ftp_client, disconnect_flag=True) + # TRANSFORM treated_path, raw_path, partitions, status = pre_treatment_br_rj_riodejaneiro_rdo( files=updated_info @@ -288,8 +312,3 @@ labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) captura_stpl_rdo.schedule = every_day - - -# captura_sppo_rho = deepcopy(captura_sppo_rdo) -# captura_sppo_rho.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) -# captura_sppo_rho.run_config = KubernetesRun(image=emd_constants.DOCKER_IMAGE.value) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py index 89687f6d2..d23103953 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py @@ -34,7 +34,7 @@ @task def get_file_paths_from_ftp( - transport_mode: str, report_type: str, wait=None, dump=False + transport_mode: str, report_type: str, wait=None, dump=False, ftp_client=None ): # pylint: disable=W0613 """ Search for files inside previous interval (days) from current date, @@ -44,7 +44,9 @@ def get_file_paths_from_ftp( min_timestamp = datetime(2022, 1, 1).timestamp() # set min timestamp for search # Connect to FTP & search files # try: - ftp_client = connect_ftp(constants.RDO_FTPS_SECRET_PATH.value) + if ftp_client is None: + ftp_client = connect_ftp(constants.RDO_FTPS_SECRET_PATH.value) + files_updated_times = { file: datetime.timestamp(parser.parse(info["modify"])) for file, info in ftp_client.mlsd(transport_mode) @@ -105,7 +107,7 @@ def check_files_for_download(files: list, dataset_id: str, table_id: str): @task -def download_and_save_local_from_ftp(file_info: dict): +def 
download_and_save_local_from_ftp(file_info: dict, ftp_client=None): """ Downloads file from FTP and saves to data/raw//. """ @@ -122,6 +124,8 @@ def download_and_save_local_from_ftp(file_info: dict): mode=file_info["transport_mode"], report_type=file_info["report_type"] ) + ftp_client_quit_flag = False + # Set general local path to save file (bucket_modes: raw or staging) file_info[ "local_path" @@ -133,14 +137,20 @@ def download_and_save_local_from_ftp(file_info: dict): Path(file_info["raw_path"]).parent.mkdir(parents=True, exist_ok=True) try: # Get data from FTP - TODO: create get_raw() error alike - ftp_client = connect_ftp(constants.RDO_FTPS_SECRET_PATH.value) + if ftp_client is None: + ftp_client_quit_flag = True + ftp_client = connect_ftp(constants.RDO_FTPS_SECRET_PATH.value) + if not Path(file_info["raw_path"]).is_file(): with open(file_info["raw_path"], "wb") as raw_file: ftp_client.retrbinary( "RETR " + file_info["ftp_path"], raw_file.write, ) - ftp_client.quit() + + if ftp_client_quit_flag: + ftp_client.quit() + # Get timestamp of download time file_info["timestamp_captura"] = pendulum.now( constants.TIMEZONE.value diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index dd48d2c64..e9d360ea7 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -34,6 +34,7 @@ get_datetime_range, read_raw_data, save_treated_local_func, + connect_ftp, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -1127,3 +1128,18 @@ def get_raw_from_sources( log(f"Raw extraction ended returned values: {error}, {filepath}") return error, filepath + + +@task(checkpoint=False) +def connect_ftp_task( + secret_path: str = None, + secure: bool = True, + connect_flag: bool = False, + ftp_client=None, + disconnect_flag: bool = False, +): + if connect_flag: + return connect_ftp(secret_path, secure) + + if disconnect_flag: + ftp_client.quit() From c68d376608135f85951d0a32afdb8eb4bd1a69a1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Oct 2023 18:04:18 +0000 Subject: [PATCH 52/59] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pipelines/rj_smtr/tasks.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index efed417b0..8e8462086 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -744,7 +744,6 @@ def upload_raw_data_to_gcs( Union[str, None]: if there is an error returns it traceback, otherwise returns None """ if error is None: - try: st_obj = Storage(table_id=table_id, dataset_id=dataset_id) log( @@ -773,7 +772,6 @@ def upload_staging_data_to_gcs( dataset_id: str, partitions: list, ) -> Union[str, None]: - """ Upload staging data to GCS. 
@@ -789,7 +787,6 @@ def upload_staging_data_to_gcs( Union[str, None]: if there is an error returns it traceback, otherwise returns None """ if error is None: - try: # Creates and publish table if it does not exist, append to it otherwise create_or_append_table( @@ -812,6 +809,7 @@ def upload_staging_data_to_gcs( return error + ############### # # Daterange tasks @@ -1099,6 +1097,7 @@ def transform_raw_to_nested_structure( return error, filepath + @task(checkpoint=False) def connect_ftp_task( secret_path: str = None, From 8da9b9d3e6d1543f2ac16abe72658520afca0a18 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 2 Oct 2023 15:11:04 -0300 Subject: [PATCH 53/59] update utils --- pipelines/rj_smtr/utils.py | 338 +++++++++++++++++++++++++++++++++++-- 1 file changed, 320 insertions(+), 18 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 9ddf7d687..1d71dd3dd 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -8,12 +8,18 @@ from pathlib import Path from datetime import timedelta, datetime -from typing import List +from typing import List, Union +import traceback import io +import json +import zipfile +import pytz +import requests import basedosdados as bd from basedosdados import Table import pandas as pd -import pytz +from google.cloud.storage.blob import Blob + from prefect.schedules.clocks import IntervalClock @@ -398,46 +404,41 @@ def data_info_str(data: pd.DataFrame): def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-locals - interval: timedelta, + clock_interval: timedelta, labels: List[str], - table_parameters: list, - dataset_id: str, - secret_path: str, + table_parameters: Union[list[dict], dict], runs_interval_minutes: int = 15, start_date: datetime = datetime( 2020, 1, 1, tzinfo=pytz.timezone(emd_constants.DEFAULT_TIMEZONE.value) ), + **general_flow_params, ) -> List[IntervalClock]: """ Generates multiple schedules Args: - interval (timedelta): The interval to run the schedule + clock_interval (timedelta): The interval to run the schedule labels (List[str]): The labels to be added to the schedule - table_parameters (list): The table parameters - dataset_id (str): The dataset_id to be used in the schedule - secret_path (str): The secret path to be used in the schedule + table_parameters (list): The table parameters to iterate over runs_interval_minutes (int, optional): The interval between each schedule. Defaults to 15. start_date (datetime, optional): The start date of the schedule. Defaults to datetime(2020, 1, 1, tzinfo=pytz.timezone(emd_constants.DEFAULT_TIMEZONE.value)). 
- + general_flow_params: Any param that you want to pass to the flow Returns: List[IntervalClock]: The list of schedules """ + if isinstance(table_parameters, dict): + table_parameters = [table_parameters] clocks = [] for count, parameters in enumerate(table_parameters): - parameter_defaults = { - "table_params": parameters, - "dataset_id": dataset_id, - "secret_path": secret_path, - "interval": interval.total_seconds(), - } + parameter_defaults = parameters | general_flow_params + log(f"parameter_defaults: {parameter_defaults}") clocks.append( IntervalClock( - interval=interval, + interval=clock_interval, start_date=start_date + timedelta(minutes=runs_interval_minutes * count), labels=labels, @@ -445,3 +446,304 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l ) ) return clocks + + +def save_raw_local_func( + data: Union[dict, str], filepath: str, mode: str = "raw", filetype: str = "json" +) -> str: + """ + Saves json response from API to .json file. + Args: + filepath (str): Path which to save raw file + status (dict): Must contain keys + * data: json returned from API + * error: error catched from API request + mode (str, optional): Folder to save locally, later folder which to upload to GCS. + Returns: + str: Path to the saved file + """ + + # diferentes tipos de arquivos para salvar + _filepath = filepath.format(mode=mode, filetype=filetype) + Path(_filepath).parent.mkdir(parents=True, exist_ok=True) + + if filetype == "json": + if isinstance(data, dict): + data = json.loads(data) + json.dump(data, Path(_filepath).open("w", encoding="utf-8")) + + # if filetype == "csv": + # pass + if filetype in ("txt", "csv"): + with open(_filepath, "w", encoding="utf-8") as file: + file.write(data) + + log(f"Raw data saved to: {_filepath}") + return _filepath + + +def get_raw_data_api( # pylint: disable=R0912 + url: str, + secret_path: str = None, + api_params: dict = None, + filetype: str = None, +) -> tuple[str, str, str]: + """ + Request data from URL API + + Args: + url (str): URL to request data + secret_path (str, optional): Secret path to get headers. Defaults to None. + api_params (dict, optional): Parameters to pass to API. Defaults to None. + filetype (str, optional): Filetype to save raw file. Defaults to None. + + Returns: + tuple[str, str, str]: Error, data and filetype + """ + error = None + data = None + try: + if secret_path is None: + headers = secret_path + else: + headers = get_vault_secret(secret_path)["data"] + + response = requests.get( + url, + headers=headers, + timeout=constants.MAX_TIMEOUT_SECONDS.value, + params=api_params, + ) + + response.raise_for_status() + + if filetype == "json": + data = response.json() + else: + data = response.text + + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data, filetype + + +def get_upload_storage_blob( + dataset_id: str, + filename: str, +) -> Blob: + """ + Get a blob from upload zone in storage + + Args: + dataset_id (str): The dataset id on BigQuery. + filename (str): The filename in GCS. 
+ + Returns: + Blob: blob object + """ + bucket = bd.Storage(dataset_id="", table_id="") + blob_list = list( + bucket.client["storage_staging"] + .bucket(bucket.bucket_name) + .list_blobs(prefix=f"upload/{dataset_id}/{filename}.") + ) + return blob_list[0] + + +def get_raw_data_gcs( + dataset_id: str, + table_id: str, + zip_filename: str = None, +) -> tuple[str, str, str]: + """ + Get raw data from GCS + + Args: + dataset_id (str): The dataset id on BigQuery. + table_id (str): The table id on BigQuery. + zip_filename (str, optional): The zip file name. Defaults to None. + + Returns: + tuple[str, str, str]: Error, data and filetype + """ + error = None + data = None + filetype = None + + try: + blob_search_name = zip_filename or table_id + blob = get_upload_storage_blob(dataset_id=dataset_id, filename=blob_search_name) + + filename = blob.name + filetype = filename.split(".")[-1] + + data = blob.download_as_bytes() + + if filetype == "zip": + with zipfile.ZipFile(io.BytesIO(data), "r") as zipped_file: + filenames = zipped_file.namelist() + filename = list( + filter(lambda x: x.split(".")[0] == table_id, filenames) + )[0] + filetype = filename.split(".")[-1] + data = zipped_file.read(filename) + + data = data.decode(encoding="utf-8") + + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data, filetype + + +def save_treated_local_func( + filepath: str, data: pd.DataFrame, error: str, mode: str = "staging" +) -> str: + """ + Save treated file to CSV. + + Args: + filepath (str): Path to save file + data (pd.DataFrame): Dataframe to save + error (str): Error catched during execution + mode (str, optional): Folder to save locally, later folder which to upload to GCS. + + Returns: + str: Path to the saved file + """ + _filepath = filepath.format(mode=mode, filetype="csv") + Path(_filepath).parent.mkdir(parents=True, exist_ok=True) + if error is None: + data.to_csv(_filepath, index=False) + log(f"Treated data saved to: {_filepath}") + return _filepath + + +def upload_run_logs_to_bq( # pylint: disable=R0913 + dataset_id: str, + parent_table_id: str, + timestamp: str, + error: str = None, + previous_error: str = None, + recapture: bool = False, + mode: str = "raw", +): + """ + Upload execution status table to BigQuery. + Table is uploaded to the same dataset, named {parent_table_id}_logs. + If passing status_dict, should not pass timestamp and error. 
+ + Args: + dataset_id (str): dataset_id on BigQuery + parent_table_id (str): table_id on BigQuery + timestamp (str): timestamp to get datetime range + error (str): error catched during execution + previous_error (str): previous error catched during execution + recapture (bool): if the execution was a recapture + mode (str): folder to save locally, later folder which to upload to GCS + + Returns: + None + """ + table_id = parent_table_id + "_logs" + # Create partition directory + filename = f"{table_id}_{timestamp.isoformat()}" + partition = f"data={timestamp.date()}" + filepath = Path( + f"""data/{mode}/{dataset_id}/{table_id}/{partition}/{filename}.csv""" + ) + filepath.parent.mkdir(exist_ok=True, parents=True) + # Create dataframe to be uploaded + if not error and recapture is True: + # if the recapture is succeeded, update the column erro + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [True], + "erro": [f"[recapturado]{previous_error}"], + } + ) + log(f"Recapturing {timestamp} with previous error:\n{error}") + else: + # not recapturing or error during flow execution + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [error is None], + "erro": [error], + } + ) + # Save data local + dataframe.to_csv(filepath, index=False) + # Upload to Storage + create_or_append_table( + dataset_id=dataset_id, + table_id=table_id, + path=filepath.as_posix(), + partitions=partition, + ) + if error is not None: + raise Exception(f"Pipeline failed with error: {error}") + + +def get_datetime_range( + timestamp: datetime, + interval: timedelta, +) -> dict: + """ + Task to get datetime range in UTC + + Args: + timestamp (datetime): timestamp to get datetime range + interval (timedelta): interval to get datetime range + + Returns: + dict: datetime range + """ + + start = ( + (timestamp - interval) + .astimezone(tz=pytz.timezone("UTC")) + .strftime("%Y-%m-%d %H:%M:%S") + ) + + end = timestamp.astimezone(tz=pytz.timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") + + return {"start": start, "end": end} + + +def read_raw_data(filepath: str, csv_args: dict = None) -> tuple[str, pd.DataFrame]: + """ + Read raw data from file + + Args: + filepath (str): filepath to read + csv_args (dict): arguments to pass to pandas.read_csv + + Returns: + tuple[str, pd.DataFrame]: error and data + """ + error = None + data = None + try: + file_type = filepath.split(".")[-1] + + if file_type == "json": + data = pd.read_json(filepath) + + # data = json.loads(data) + elif file_type in ("txt", "csv"): + if csv_args is None: + csv_args = {} + data = pd.read_csv(filepath, **csv_args) + else: + error = "Unsupported raw file extension. 
Supported only: json, csv and txt" + + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data From 2732aa93af6afaa545a3beb77808c6e5aaa62e49 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 2 Oct 2023 15:16:31 -0300 Subject: [PATCH 54/59] update utils --- pipelines/utils/utils.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index efc21c133..adf89bc94 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -711,16 +711,24 @@ def get_credentials_from_env( return cred -def get_storage_blobs(dataset_id: str, table_id: str) -> list: +def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> list: """ Get all blobs from a table in a dataset. + + Args: + dataset_id (str): dataset id + table_id (str): table id + mode (str, optional): mode to use. Defaults to "staging". + + Returns: + list: list of blobs """ bd_storage = bd.Storage(dataset_id=dataset_id, table_id=table_id) return list( bd_storage.client["storage_staging"] .bucket(bd_storage.bucket_name) - .list_blobs(prefix=f"staging/{bd_storage.dataset_id}/{bd_storage.table_id}/") + .list_blobs(prefix=f"{mode}/{bd_storage.dataset_id}/{bd_storage.table_id}/") ) From fec51a299f0c0fb0a9f955aab855e29890a45668 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 2 Oct 2023 15:20:05 -0300 Subject: [PATCH 55/59] update constants --- pipelines/rj_smtr/constants.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index d30989743..52e30d9f8 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -290,22 +290,3 @@ class constants(Enum): # pylint: disable=c0103 }, ] BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" - - # GTFS - GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" - GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} - GTFS_CAPTURE_PARAMS = [ - {"table_id": "agency", "primary_key": ["agency_id"]}, - {"table_id": "calendar_dates", "primary_key": ["service_id"]}, - {"table_id": "calendar", "primary_key": ["service_id"]}, - {"table_id": "feed_info", "primary_key": ["feed_publisher_name"]}, - {"table_id": "frequencies", "primary_key": ["trip_id"]}, - {"table_id": "routes", "primary_key": ["route_id"]}, - {"table_id": "shapes", "primary_key": ["shape_id"]}, - {"table_id": "stops", "primary_key": ["stop_id"]}, - {"table_id": "trips", "primary_key": ["trip_id"]}, - {"table_id": "fare_attributes", "primary_key": ["fare_id"]}, - {"table_id": "fare_rules", "primary_key": ["fare_id"]}, - ] - GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} - GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" From a7e47e763f39464d7ee0f4b3104fa6edc530e77e Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 2 Oct 2023 16:44:17 -0300 Subject: [PATCH 56/59] atualiza estrutura de ftp_client --- .../rj_smtr/br_rj_riodejaneiro_rdo/flows.py | 18 ++++++++---------- .../rj_smtr/br_rj_riodejaneiro_rdo/tasks.py | 9 ++++----- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py index d301f2aab..781bcc181 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py @@ -211,15 +211,14 @@ wait=None, ) # EXTRACT - 
ftp_client = connect_ftp_task( - secret_path=constants.RDO_FTPS_SECRET_PATH.value, connect_flag=True - ) + # ftp_client = connect_ftp_task( + # secret_path=constants.RDO_FTPS_SECRET_PATH.value, connect_flag=True + # ) - files = get_file_paths_from_ftp( + files, ftp_client = get_file_paths_from_ftp( transport_mode=transport_mode, report_type=report_type, dump=dump, - ftp_client=ftp_client, ) download_files = check_files_for_download( files=files, dataset_id=constants.RDO_DATASET_ID.value, table_id=table_id @@ -270,15 +269,14 @@ wait=None, ) # EXTRACT - ftp_client = connect_ftp_task( - secret_path=constants.RDO_FTPS_SECRET_PATH.value, connect_flag=True - ) + # ftp_client = connect_ftp_task( + # secret_path=constants.RDO_FTPS_SECRET_PATH.value, connect_flag=True + # ) - files = get_file_paths_from_ftp( + files, ftp_client = get_file_paths_from_ftp( transport_mode=transport_mode, report_type=report_type, dump=dump, - ftp_client=ftp_client, ) download_files = check_files_for_download( files=files, dataset_id=constants.RDO_DATASET_ID.value, table_id=table_id diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py index d23103953..c126d8e1f 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py @@ -32,9 +32,9 @@ from pipelines.utils.utils import log, get_redis_client -@task +@task(nout=2) def get_file_paths_from_ftp( - transport_mode: str, report_type: str, wait=None, dump=False, ftp_client=None + transport_mode: str, report_type: str, wait=None, dump=False ): # pylint: disable=W0613 """ Search for files inside previous interval (days) from current date, @@ -44,8 +44,7 @@ def get_file_paths_from_ftp( min_timestamp = datetime(2022, 1, 1).timestamp() # set min timestamp for search # Connect to FTP & search files # try: - if ftp_client is None: - ftp_client = connect_ftp(constants.RDO_FTPS_SECRET_PATH.value) + ftp_client = connect_ftp(constants.RDO_FTPS_SECRET_PATH.value) files_updated_times = { file: datetime.timestamp(parser.parse(info["modify"])) @@ -75,7 +74,7 @@ def get_file_paths_from_ftp( files = files[:10] log(f"There are {len(files)} files at the FTP") - return files + return files, ftp_client @task From fa6dfbe639a96f03ad2a25112584cd0057a15b89 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 2 Oct 2023 20:32:34 -0300 Subject: [PATCH 57/59] altera para o agent anterior --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 4 ++-- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 793d37c0d..d7f44e3b9 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -30,7 +30,7 @@ bilhetagem_transacao_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_transacao_captura.schedule = bilhetagem_transacao_schedule @@ -41,6 +41,6 @@ bilhetagem_principal_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_principal_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + 
labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_principal_captura.schedule = bilhetagem_principal_schedule diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index e897286b0..2f7804811 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -20,7 +20,7 @@ **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["principal_run_interval"] ), labels=[ - emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, + emd_constants.RJ_SMTR_AGENT_LABEL.value, ], table_parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, @@ -38,7 +38,7 @@ **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["transacao_run_interval"] ), labels=[ - emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, + emd_constants.RJ_SMTR_AGENT_LABEL.value, ], table_parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, From 8be8f0c53712edbe613145ce9e51d588833143e8 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 2 Oct 2023 20:49:27 -0300 Subject: [PATCH 58/59] remove task connect_ftp_task --- pipelines/rj_smtr/tasks.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 8e8462086..a846851b5 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -34,7 +34,6 @@ get_datetime_range, read_raw_data, save_treated_local_func, - connect_ftp, save_raw_local_func, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client @@ -1096,18 +1095,3 @@ def transform_raw_to_nested_structure( log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, filepath - - -@task(checkpoint=False) -def connect_ftp_task( - secret_path: str = None, - secure: bool = True, - connect_flag: bool = False, - ftp_client=None, - disconnect_flag: bool = False, -): - if connect_flag: - return connect_ftp(secret_path, secure) - - if disconnect_flag: - ftp_client.quit() From cb38e921a1813b9235f96c60e8350daae1a343ff Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 2 Oct 2023 20:56:54 -0300 Subject: [PATCH 59/59] cria task download_and_save_list_local_from_ftp --- .../rj_smtr/br_rj_riodejaneiro_rdo/flows.py | 31 ++++----- .../rj_smtr/br_rj_riodejaneiro_rdo/tasks.py | 65 ++++++++++++++++++- 2 files changed, 74 insertions(+), 22 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py index 781bcc181..612b21a96 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/flows.py @@ -17,13 +17,13 @@ pre_treatment_br_rj_riodejaneiro_rdo, get_rdo_date_range, update_rdo_redis, + download_and_save_list_local_from_ftp, ) from pipelines.rj_smtr.constants import constants from pipelines.rj_smtr.tasks import ( bq_upload, get_current_timestamp, set_last_run_timestamp, - connect_ftp_task, ) from pipelines.rj_smtr.schedules import every_day @@ -210,12 +210,8 @@ now_time=get_current_timestamp(), wait=None, ) - # EXTRACT - # ftp_client = connect_ftp_task( - # secret_path=constants.RDO_FTPS_SECRET_PATH.value, connect_flag=True - # ) - files, ftp_client = get_file_paths_from_ftp( + files = get_file_paths_from_ftp( transport_mode=transport_mode, report_type=report_type, dump=dump, @@ -223,11 +219,11 @@ download_files = check_files_for_download( files=files, 
dataset_id=constants.RDO_DATASET_ID.value, table_id=table_id ) - updated_info = download_and_save_local_from_ftp.map( - file_info=download_files, ftp_client=ftp_client - ) + # updated_info = download_and_save_local_from_ftp.map( + # file_info=download_files + # ) + updated_info = download_and_save_list_local_from_ftp(files_info=download_files) - connect_ftp_task(ftp_client=ftp_client, disconnect_flag=True) # TRANSFORM treated_path, raw_path, partitions, status = pre_treatment_br_rj_riodejaneiro_rdo( files=updated_info @@ -268,12 +264,8 @@ now_time=get_current_timestamp(), wait=None, ) - # EXTRACT - # ftp_client = connect_ftp_task( - # secret_path=constants.RDO_FTPS_SECRET_PATH.value, connect_flag=True - # ) - files, ftp_client = get_file_paths_from_ftp( + files = get_file_paths_from_ftp( transport_mode=transport_mode, report_type=report_type, dump=dump, @@ -281,11 +273,10 @@ download_files = check_files_for_download( files=files, dataset_id=constants.RDO_DATASET_ID.value, table_id=table_id ) - updated_info = download_and_save_local_from_ftp.map( - file_info=download_files, ftp_client=ftp_client - ) - - connect_ftp_task(ftp_client=ftp_client, disconnect_flag=True) + # updated_info = download_and_save_local_from_ftp.map( + # file_info=download_files + # ) + updated_info = download_and_save_list_local_from_ftp(files_info=download_files) # TRANSFORM treated_path, raw_path, partitions, status = pre_treatment_br_rj_riodejaneiro_rdo( diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py index c126d8e1f..c0704e418 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_rdo/tasks.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +# flake8: noqa: E501 """ Tasks for br_rj_riodejaneiro_rdo """ @@ -32,7 +33,7 @@ from pipelines.utils.utils import log, get_redis_client -@task(nout=2) +@task def get_file_paths_from_ftp( transport_mode: str, report_type: str, wait=None, dump=False ): # pylint: disable=W0613 @@ -74,7 +75,9 @@ def get_file_paths_from_ftp( files = files[:10] log(f"There are {len(files)} files at the FTP") - return files, ftp_client + ftp_client.quit() + + return files @task @@ -337,3 +340,61 @@ def get_rdo_date_range(dataset_id: str, table_id: str, mode: str = "prod"): "date_range_start": last_run_date, "date_range_end": pendulum.now(constants.TIMEZONE.value).date().isoformat(), } + + +@task +def download_and_save_list_local_from_ftp(files_info: list) -> list: + """ + Downloads files from FTP and saves to data/raw//. 
+ """ + + file_info_list = [] + ftp_client = connect_ftp(constants.RDO_FTPS_SECRET_PATH.value) + + try: + for file_info in files_info: + if file_info["error"] is not None: + file_info_list.append(file_info) + continue + + dataset_id = constants.RDO_DATASET_ID.value + base_path = f'{os.getcwd()}/{os.getenv("DATA_FOLDER", "data")}/{{bucket_mode}}/{dataset_id}' + + table_id = build_table_id( # mudar pra task + mode=file_info["transport_mode"], report_type=file_info["report_type"] + ) + + # Set general local path to save file (bucket_modes: raw or staging) + file_info[ + "local_path" + ] = f"{base_path}/{table_id}/{file_info['partitions']}/{file_info['filename']}.{{file_ext}}" + # Get raw data + file_info["raw_path"] = file_info["local_path"].format( + bucket_mode="raw", file_ext="txt" + ) + Path(file_info["raw_path"]).parent.mkdir(parents=True, exist_ok=True) + + if not Path(file_info["raw_path"]).is_file(): + with open(file_info["raw_path"], "wb") as raw_file: + ftp_client.retrbinary( + "RETR " + file_info["ftp_path"], + raw_file.write, + ) + + # Get timestamp of download time + file_info["timestamp_captura"] = pendulum.now( + constants.TIMEZONE.value + ).isoformat() + + log(f"Timestamp captura is {file_info['timestamp_captura']}") + log(f"Update file info: {file_info}") + + file_info_list.append(file_info) + + ftp_client.quit() + + except Exception as error: # pylint: disable=W0703 + file_info["error"] = error + file_info_list.append(file_info) + + return file_info_list
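
Note on the refactored generate_execute_schedules (PATCH 53): the helper now takes a list of per-table parameter dicts plus arbitrary **general_flow_params, which are merged into each clock's parameter_defaults and staggered by runs_interval_minutes. The snippet below is a rough usage sketch only, assuming the repository context and Prefect 1.x; the agent label, table ids, dataset id and secret path are placeholders invented for illustration, not values taken from these patches.

from datetime import timedelta

from prefect.schedules import Schedule

from pipelines.rj_smtr.utils import generate_execute_schedules

# Two hypothetical capture tables on an hourly clock, offset 15 minutes apart.
clocks = generate_execute_schedules(
    clock_interval=timedelta(hours=1),
    labels=["example-agent-label"],  # placeholder agent label
    table_parameters=[
        {"table_id": "tabela_a", "primary_key": ["id"]},  # hypothetical tables
        {"table_id": "tabela_b", "primary_key": ["id"]},
    ],
    runs_interval_minutes=15,
    # Any extra keyword argument is forwarded into the parameter_defaults of every clock:
    dataset_id="exemplo_dataset",
    secret_path="exemplo_secret_path",
)

example_schedule = Schedule(clocks=clocks)

Because the extra keyword arguments are merged with the dict union operator (parameters | general_flow_params), this pattern needs Python 3.9+ and lets each flow carry its own fixed parameters without further changes to the helper's signature.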