From b6089fa2bf7fd75fd5abb1fb8affe74eb63f3ff8 Mon Sep 17 00:00:00 2001 From: fernandascovino Date: Mon, 25 Sep 2023 19:09:32 -0300 Subject: [PATCH 001/145] remove task de particao nao usada --- pipelines/rj_smtr/tasks.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index de52c03df..983f93fbf 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -181,34 +181,6 @@ def parse_timestamp_to_string(timestamp: datetime, pattern="%Y-%m-%d-%H-%M-%S") return timestamp.strftime(pattern) -@task -def create_current_date_hour_partition(capture_time=None): - """Create partitioned directory structure to save data locally based - on capture time. - - Args: - capture_time(pendulum.datetime.DateTime, optional): - if recapturing data, will create partitions based - on the failed timestamps being recaptured - - Returns: - dict: "filename" contains the name which to upload the csv, "partitions" contains - the partitioned directory path - """ - if capture_time is None: - capture_time = datetime.now(tz=constants.TIMEZONE.value).replace( - minute=0, second=0, microsecond=0 - ) - date = capture_time.strftime("%Y-%m-%d") - hour = capture_time.strftime("%H") - - return { - "filename": capture_time.strftime("%Y-%m-%d-%H-%M-%S"), - "partitions": f"data={date}/hora={hour}", - "timestamp": capture_time, - } - - @task def create_local_partition_path( dataset_id: str, table_id: str, filename: str, partitions: str = None From dc197ccac6d2be6af8b6025974cbdd6e8c826041 Mon Sep 17 00:00:00 2001 From: fernandascovino Date: Mon, 25 Sep 2023 19:17:54 -0300 Subject: [PATCH 002/145] unifica tasks de particao de data e hora --- pipelines/rj_smtr/constants.py | 11 +++++------ pipelines/rj_smtr/flows.py | 12 ++---------- pipelines/rj_smtr/tasks.py | 15 +++++---------- pipelines/rj_smtr/veiculo/flows.py | 6 +++--- 4 files changed, 15 insertions(+), 29 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 7133b8abe..b22c4a412 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -180,8 +180,7 @@ class constants(Enum): # pylint: disable=c0103 ORDER BY data_processamento """, - "primary_key": ["id"], # id column to nest data on - "flag_date_partition": False, + "primary_key": ["id"] # id column to nest data on }, ] BILHETAGEM_TABLES_PARAMS = [ @@ -199,7 +198,7 @@ class constants(Enum): # pylint: disable=c0103 DT_INCLUSAO """, "primary_key": ["CD_LINHA"], # id column to nest data on - "flag_date_partition": True, + "partition_date_only": True, }, { "table_id": "grupo", @@ -215,7 +214,7 @@ class constants(Enum): # pylint: disable=c0103 DT_INCLUSAO """, "primary_key": ["CD_GRUPO"], - "flag_date_partition": True, + "partition_date_only": True, }, { "table_id": "grupo_linha", @@ -231,7 +230,7 @@ class constants(Enum): # pylint: disable=c0103 DT_INCLUSAO """, "primary_key": ["CD_GRUPO", "CD_LINHA"], # id column to nest data on - "flag_date_partition": True, + "partition_date_only": True, }, { "table_id": "matriz_integracao", @@ -250,7 +249,7 @@ class constants(Enum): # pylint: disable=c0103 "cd_versao_matriz", "cd_integracao", ], # id column to nest data on - "flag_date_partition": True, + "partition_date_only": True, }, ] BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index f1d29ed10..bfe9d86e4 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -5,8 +5,7 @@ from 
prefect.run_configs import KubernetesRun from prefect.storage import GCS -from prefect import case, Parameter -from prefect.tasks.control_flow import merge +from prefect import Parameter # EMD Imports # @@ -19,7 +18,6 @@ # SMTR Imports # from pipelines.rj_smtr.tasks import ( - create_date_partition, create_date_hour_partition, create_local_partition_path, get_current_timestamp, @@ -66,13 +64,7 @@ dataset_id=dataset_id, ) - with case(table_params["flag_date_partition"], True): - date_partitions = create_date_partition(timestamp) - - with case(table_params["flag_date_partition"], False): - date_hour_partitions = create_date_hour_partition(timestamp) - - partitions = merge(date_partitions, date_hour_partitions) + partitions = create_date_hour_partition(timestamp, partition_date_only=table_params["partition_date_only"]) filename = parse_timestamp_to_string(timestamp) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 983f93fbf..a2a5adddc 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -158,19 +158,14 @@ def get_current_timestamp(timestamp=None, truncate_minute: bool = True) -> datet @task -def create_date_hour_partition(timestamp: datetime) -> str: +def create_date_hour_partition(timestamp: datetime, partition_date_only: bool = False) -> str: """ Get date hour Hive partition structure from timestamp. """ - return f"data={timestamp.strftime('%Y-%m-%d')}/hora={timestamp.strftime('%H')}" - - -@task -def create_date_partition(timestamp: datetime) -> str: - """ - Get date hour Hive partition structure from timestamp. - """ - return f"data={timestamp.date()}" + partition = f"data={timestamp.strftime('%Y-%m-%d')}" + if partition_date_only: + parition += f"/hora={timestamp.strftime('%H')}" + return partition @task diff --git a/pipelines/rj_smtr/veiculo/flows.py b/pipelines/rj_smtr/veiculo/flows.py index 28188a129..e1fab515e 100644 --- a/pipelines/rj_smtr/veiculo/flows.py +++ b/pipelines/rj_smtr/veiculo/flows.py @@ -30,7 +30,7 @@ every_day_hour_seven, ) from pipelines.rj_smtr.tasks import ( - create_date_partition, + create_date_hour_partition, create_local_partition_path, get_current_timestamp, get_raw, @@ -71,7 +71,7 @@ ) # SETUP # - partitions = create_date_partition(timestamp) + partitions = create_date_hour_partition(timestamp, partition_date_only=True) filename = parse_timestamp_to_string(timestamp) @@ -140,7 +140,7 @@ ) # SETUP # - partitions = create_date_partition(timestamp) + partitions = create_date_hour_partition(timestamp, partition_date_only=True) filename = parse_timestamp_to_string(timestamp) From 66e84a1e2b2b24ead92842b604c2210238fb037b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 25 Sep 2023 22:22:31 +0000 Subject: [PATCH 003/145] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pipelines/rj_smtr/constants.py | 2 +- pipelines/rj_smtr/flows.py | 4 +++- pipelines/rj_smtr/tasks.py | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index b22c4a412..93303e5b7 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -180,7 +180,7 @@ class constants(Enum): # pylint: disable=c0103 ORDER BY data_processamento """, - "primary_key": ["id"] # id column to nest data on + "primary_key": ["id"], # id column to nest data on }, ] BILHETAGEM_TABLES_PARAMS = [ diff --git a/pipelines/rj_smtr/flows.py 
b/pipelines/rj_smtr/flows.py index bfe9d86e4..87d506813 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -64,7 +64,9 @@ dataset_id=dataset_id, ) - partitions = create_date_hour_partition(timestamp, partition_date_only=table_params["partition_date_only"]) + partitions = create_date_hour_partition( + timestamp, partition_date_only=table_params["partition_date_only"] + ) filename = parse_timestamp_to_string(timestamp) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index a2a5adddc..f35a9db72 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -158,7 +158,9 @@ def get_current_timestamp(timestamp=None, truncate_minute: bool = True) -> datet @task -def create_date_hour_partition(timestamp: datetime, partition_date_only: bool = False) -> str: +def create_date_hour_partition( + timestamp: datetime, partition_date_only: bool = False +) -> str: """ Get date hour Hive partition structure from timestamp. """ From 7cb436bc9d0fc7cf045ca56248ef58a63ed634e7 Mon Sep 17 00:00:00 2001 From: fernandascovino Date: Mon, 25 Sep 2023 19:29:50 -0300 Subject: [PATCH 004/145] corrige condicional --- pipelines/rj_smtr/tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index f35a9db72..e1a0d0c7d 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -165,8 +165,8 @@ def create_date_hour_partition( Get date hour Hive partition structure from timestamp. """ partition = f"data={timestamp.strftime('%Y-%m-%d')}" - if partition_date_only: - parition += f"/hora={timestamp.strftime('%H')}" + if not partition_date_only: + partition += f"/hora={timestamp.strftime('%H')}" return partition From 588fe7d3f3cc02500930d2bd94996152b51a5bce Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 26 Sep 2023 11:20:28 -0300 Subject: [PATCH 005/145] change capture flow --- pipelines/rj_smtr/constants.py | 1 + pipelines/rj_smtr/flows.py | 44 +++++++++----- pipelines/rj_smtr/tasks.py | 45 +++++++++++++++ pipelines/rj_smtr/utils.py | 101 +++++++++++++++++++++++++++++++++ pipelines/utils/custom.py | 10 ++-- pipelines/utils/utils.py | 15 ++++- 6 files changed, 196 insertions(+), 20 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 7133b8abe..34b63781a 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -182,6 +182,7 @@ class constants(Enum): # pylint: disable=c0103 """, "primary_key": ["id"], # id column to nest data on "flag_date_partition": False, + "source": "api", }, ] BILHETAGEM_TABLES_PARAMS = [ diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index f1d29ed10..e36c8e676 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -23,13 +23,13 @@ create_date_hour_partition, create_local_partition_path, get_current_timestamp, - get_raw, parse_timestamp_to_string, save_raw_local, save_treated_local, upload_logs_to_bq, bq_upload, transform_to_nested_structure, + get_raw, ) from pipelines.rj_smtr.tasks import ( @@ -37,6 +37,14 @@ get_datetime_range, ) +with Flow( + "SMTR: Pre-Treatment", + code_owners=["caio", "fernanda", "boris", "rodrigo"], +) as default_pre_treatment_flow: + # SETUP # + table_params = Parameter("table_params", default=None) + dataset_id = Parameter("dataset_id", default=None) + with Flow( "SMTR: Captura", @@ -59,13 +67,6 @@ now_time=timestamp, ) - request_params, request_url = create_request_params( - datetime_range=datetime_range, - 
table_params=table_params, - secret_path=secret_path, - dataset_id=dataset_id, - ) - with case(table_params["flag_date_partition"], True): date_partitions = create_date_partition(timestamp) @@ -83,11 +84,28 @@ partitions=partitions, ) - raw_status = get_raw( - url=request_url, - headers=secret_path, - params=request_params, - ) + raw_status_list = [] + + with case(table_params["source"], "api"): + request_params, request_url = create_request_params( + datetime_range=datetime_range, + table_params=table_params, + secret_path=secret_path, + dataset_id=dataset_id, + ) + + api_raw_status = get_raw( + url=request_url, + headers=secret_path, + params=request_params, + ) + + raw_status_list.append(api_raw_status) + + with case(table_params["source"], "gcs"): + pass + + raw_status = merge(*raw_status_list) raw_filepath = save_raw_local(status=raw_status, file_path=filepath) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index de52c03df..49c745076 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -28,6 +28,7 @@ get_last_run_timestamp, log_critical, data_info_str, + get_raw_data_api, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -960,3 +961,47 @@ def create_request_params( } return request_params, request_url + + +# @task(checkpoint=False) +# def get_raw_from_sources( +# source: str, +# url:str, +# dataset_id:str = None, +# table_id:str = None, +# mode:str = None, +# headers: str = None, +# filetype: str = "json", +# csv_args: dict = None, +# params: dict = None, +# ): +# if source == "api": +# return get_raw_data_api( +# url=url, +# headers=headers, +# filetype=filetype, +# csv_args=csv_args, +# params=params +# ) +# if source == "gcs": +# file = + + +@task(checkpoint=False) +def save_raw_storage( + dataset_id: str, + table_id: str, + raw_filepath: str, + partitions: str = None, +): + st_obj = Storage(table_id=table_id, dataset_id=dataset_id) + log( + f"""Uploading raw file to bucket {st_obj.bucket_name} at + {st_obj.bucket_name}/{dataset_id}/{table_id}""" + ) + st_obj.upload( + path=raw_filepath, + partitions=partitions, + mode="raw", + if_exists="replace", + ) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 9ddf7d687..3b3c7377d 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -14,6 +14,8 @@ from basedosdados import Table import pandas as pd import pytz +import requests +import zipfile from prefect.schedules.clocks import IntervalClock @@ -27,6 +29,8 @@ get_vault_secret, send_discord_message, get_redis_client, + get_storage_blobs, + get_storage_blob, ) @@ -445,3 +449,100 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l ) ) return clocks + + +def get_raw_data_api( # pylint: disable=R0912 + url: str, + headers: str = None, + filetype: str = "json", + csv_args: dict = None, + params: dict = None, +) -> list[dict]: + """ + Request data from URL API + + Args: + url (str): URL to send request + headers (str, optional): Path to headers guardeded on Vault, if needed. + filetype (str, optional): Filetype to be formatted (supported only: json, csv and txt) + csv_args (dict, optional): Arguments for read_csv, if needed + params (dict, optional): Params to be sent on request + + Returns: + dict: Conatining keys + * `data` (json): data result + * `error` (str): catched error, if any. 
Otherwise, returns None + """ + data = None + error = None + + try: + if headers is not None: + headers = get_vault_secret(headers)["data"] + + # remove from headers, if present + remove_headers = ["host", "databases"] + for remove_header in remove_headers: + if remove_header in list(headers.keys()): + del headers[remove_header] + + response = requests.get( + url, + headers=headers, + timeout=constants.MAX_TIMEOUT_SECONDS.value, + params=params, + ) + + if response.ok: # status code is less than 400 + if filetype == "json": + data = response.json() + + # todo: move to data check on specfic API # pylint: disable=W0102 + if isinstance(data, dict) and "DescricaoErro" in data.keys(): + error = data["DescricaoErro"] + + elif filetype in ("txt", "csv"): + if csv_args is None: + csv_args = {} + data = pd.read_csv(io.StringIO(response.text), **csv_args).to_dict( + orient="records" + ) + else: + error = ( + "Unsupported raw file extension. Supported only: json, csv and txt" + ) + + except Exception as exp: + error = exp + + if error is not None: + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return {"data": data, "error": error} + + +def get_raw_data_gcs( + dataset_id: str, table_id: str, file_name: str, mode: str, zip_file_name: str = None +) -> dict: + error = None + data = None + try: + if zip_file_name: + blob = get_storage_blob( + dataset_id=dataset_id, + table_id=table_id, + file_name=zip_file_name, + mode=mode, + ) + compressed_data = blob.download_as_bytes() + with zipfile.ZipFile(io.BytesIO(compressed_data), "r") as zipped_file: + data = zipped_file.read(file_name).decode(encoding="utf-8") + else: + blob = get_storage_blob( + dataset_id=dataset_id, table_id=table_id, file_name=file_name, mode=mode + ) + data = blob.download_as_string() + except Exception as exp: + error = exp + + return {"data": data, "error": error} diff --git a/pipelines/utils/custom.py b/pipelines/utils/custom.py index 13ae82dd5..d91739817 100644 --- a/pipelines/utils/custom.py +++ b/pipelines/utils/custom.py @@ -68,11 +68,11 @@ def __init__( # pylint: disable=too-many-arguments, too-many-locals edges=edges, reference_tasks=reference_tasks, state_handlers=state_handlers, - on_failure=partial( - notify_discord_on_failure, - secret_path=constants.EMD_DISCORD_WEBHOOK_SECRET_PATH.value, - code_owners=code_owners, - ), + # on_failure=partial( + # notify_discord_on_failure, + # secret_path=constants.EMD_DISCORD_WEBHOOK_SECRET_PATH.value, + # code_owners=code_owners, + # ), validate=validate, result=result, terminal_state_handler=terminal_state_handler, diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index efc21c133..7042709e9 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -711,7 +711,7 @@ def get_credentials_from_env( return cred -def get_storage_blobs(dataset_id: str, table_id: str) -> list: +def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> list: """ Get all blobs from a table in a dataset. 
""" @@ -720,7 +720,18 @@ def get_storage_blobs(dataset_id: str, table_id: str) -> list: return list( bd_storage.client["storage_staging"] .bucket(bd_storage.bucket_name) - .list_blobs(prefix=f"staging/{bd_storage.dataset_id}/{bd_storage.table_id}/") + .list_blobs(prefix=f"{mode}/{bd_storage.dataset_id}/{bd_storage.table_id}/") + ) + + +def get_storage_blob( + dataset_id: str, table_id: str, file_name: str, mode: str = "staging" +): + bd_storage = bd.Storage(dataset_id=dataset_id, table_id=table_id) + return ( + bd_storage.client["storage_staging"] + .bucket(bd_storage.bucket_name) + .get_blob(blob_name=f"{mode}/{dataset_id}/{table_id}/{file_name}") ) From 97746e1c34db7410a78a69e0b5ce4e7df4b12ad7 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 26 Sep 2023 15:04:09 -0300 Subject: [PATCH 006/145] change generic capture flow --- pipelines/rj_smtr/constants.py | 39 +++++++++------ pipelines/rj_smtr/flows.py | 72 +++++++++++++-------------- pipelines/rj_smtr/tasks.py | 89 ++++++++++++++++++---------------- pipelines/rj_smtr/utils.py | 52 ++++++++------------ pipelines/utils/utils.py | 15 +++++- 5 files changed, 135 insertions(+), 132 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 34b63781a..caa4a5e23 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -167,23 +167,30 @@ class constants(Enum): # pylint: disable=c0103 BILHETAGEM_DATASET_ID = "br_rj_riodejaneiro_bilhetagem" BILHETAGEM_TRANSACAO_TABLE_PARAMS = [ { - "table_id": "transacao", - "database": "transacao_db", - "query": """ - SELECT - * - FROM - transacao - WHERE - data_processamento BETWEEN '{start}' - AND '{end}' - ORDER BY - data_processamento - """, - "primary_key": ["id"], # id column to nest data on "flag_date_partition": False, - "source": "api", - }, + "flow_run_name": "transacao", + "extraction": { + "table_id": "transacao", + "database": "transacao_db", + "query": """ + SELECT + * + FROM + transacao + WHERE + data_processamento BETWEEN '{start}' + AND '{end}' + ORDER BY + data_processamento + """, + "source": "api", + }, + "pre-treatment": { + "table_id": "transacao", + "file_type": "json", + "primary_key": ["id"], # id column to nest data on + }, + } ] BILHETAGEM_TABLES_PARAMS = [ { diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index e36c8e676..8076633c8 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -7,6 +7,7 @@ from prefect.storage import GCS from prefect import case, Parameter from prefect.tasks.control_flow import merge +from prefect.utilities.collections import DotDict # EMD Imports # @@ -29,22 +30,12 @@ upload_logs_to_bq, bq_upload, transform_to_nested_structure, - get_raw, -) - -from pipelines.rj_smtr.tasks import ( + get_raw_from_sources, + transform_data_to_json, create_request_params, get_datetime_range, ) -with Flow( - "SMTR: Pre-Treatment", - code_owners=["caio", "fernanda", "boris", "rodrigo"], -) as default_pre_treatment_flow: - # SETUP # - table_params = Parameter("table_params", default=None) - dataset_id = Parameter("dataset_id", default=None) - with Flow( "SMTR: Captura", @@ -63,7 +54,7 @@ datetime_range = get_datetime_range(timestamp, interval=interval) rename_flow_run = rename_current_flow_run_now_time( - prefix=default_capture_flow.name + " " + table_params["table_id"] + ": ", + prefix=default_capture_flow.name + " " + table_params["flow_run_name"] + ": ", now_time=timestamp, ) @@ -79,41 +70,44 @@ filepath = create_local_partition_path( dataset_id=dataset_id, - 
table_id=table_params["table_id"], + table_id=table_params["pre-treatment"]["table_id"], filename=filename, partitions=partitions, ) - raw_status_list = [] - - with case(table_params["source"], "api"): - request_params, request_url = create_request_params( - datetime_range=datetime_range, - table_params=table_params, - secret_path=secret_path, - dataset_id=dataset_id, - ) - - api_raw_status = get_raw( - url=request_url, - headers=secret_path, - params=request_params, - ) - - raw_status_list.append(api_raw_status) - - with case(table_params["source"], "gcs"): - pass + # CAPTURA + request_params, request_url = create_request_params( + datetime_range=datetime_range, + table_params=table_params, + secret_path=secret_path, + dataset_id=dataset_id, + ) - raw_status = merge(*raw_status_list) + raw_status = get_raw_from_sources( + source=table_params["extraction"]["source"], + url=request_url, + dataset_id=dataset_id, + table_id=table_params["extraction"]["table_id"], + file_name=table_params["extraction"]["file_name"], + zip_file_name=table_params["extraction"]["zip_file_name"], + mode=table_params["extraction"]["mode"], + headers=secret_path, + params=request_params, + ) raw_filepath = save_raw_local(status=raw_status, file_path=filepath) # TREAT & CLEAN # - treated_status = transform_to_nested_structure( + json_status = transform_data_to_json( status=raw_status, + file_type=table_params["pre-treatment"]["file_type"], + csv_args=table_params["pre-treatment"]["csv_args"], + ) + + treated_status = transform_to_nested_structure( + status=json_status, timestamp=timestamp, - primary_key=table_params["primary_key"], + primary_key=table_params["pre-treatment"]["primary_key"], ) treated_filepath = save_treated_local(status=treated_status, file_path=filepath) @@ -121,7 +115,7 @@ # LOAD # error = bq_upload( dataset_id=dataset_id, - table_id=table_params["table_id"], + table_id=table_params["pre-treatment"]["table_id"], filepath=treated_filepath, raw_filepath=raw_filepath, partitions=partitions, @@ -130,7 +124,7 @@ upload_logs_to_bq( dataset_id=dataset_id, - parent_table_id=table_params["table_id"], + parent_table_id=table_params["pre-treatment"]["table_id"], error=error, timestamp=timestamp, ) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 49c745076..1b9545ca8 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -29,6 +29,7 @@ log_critical, data_info_str, get_raw_data_api, + get_raw_data_gcs, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -950,58 +951,62 @@ def create_request_params( if dataset_id == constants.BILHETAGEM_DATASET_ID.value: secrets = get_vault_secret(secret_path)["data"] - database_secrets = secrets["databases"][table_params["database"]] + database_secrets = secrets["databases"][table_params["extraction"]["database"]] request_url = secrets["vpn_url"] + database_secrets["engine"] request_params = { "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado - "database": table_params["database"], - "query": table_params["query"].format(**datetime_range), + "database": table_params["extraction"]["database"], + "query": table_params["extraction"]["query"].format(**datetime_range), } return request_params, request_url -# @task(checkpoint=False) -# def get_raw_from_sources( -# source: str, -# url:str, -# dataset_id:str = None, -# table_id:str = None, -# mode:str = None, -# headers: str = None, -# filetype: str = "json", -# csv_args: 
dict = None, -# params: dict = None, -# ): -# if source == "api": -# return get_raw_data_api( -# url=url, -# headers=headers, -# filetype=filetype, -# csv_args=csv_args, -# params=params -# ) -# if source == "gcs": -# file = - - @task(checkpoint=False) -def save_raw_storage( - dataset_id: str, - table_id: str, - raw_filepath: str, +def get_raw_from_sources( + source: str, + url: str, + dataset_id: str = None, + table_id: str = None, + file_name: str = None, partitions: str = None, + zip_file_name: str = None, + mode: str = None, + headers: str = None, + params: dict = None, ): - st_obj = Storage(table_id=table_id, dataset_id=dataset_id) - log( - f"""Uploading raw file to bucket {st_obj.bucket_name} at - {st_obj.bucket_name}/{dataset_id}/{table_id}""" - ) - st_obj.upload( - path=raw_filepath, - partitions=partitions, - mode="raw", - if_exists="replace", - ) + if source == "api": + return get_raw_data_api(url=url, headers=headers, params=params) + if source == "gcs": + return get_raw_data_gcs( + dataset_id=dataset_id, + table_id=table_id, + file_name=file_name, + mode=mode, + partitions=partitions, + zip_file_name=zip_file_name, + ) + + +@task(checkpoint=False) +def transform_data_to_json(status: dict, file_type: str, csv_args: dict): + data = status["data"] + error = status["error"] + + if file_type == "json": + pass + + # todo: move to data check on specfic API # pylint: disable=W0102 + # if isinstance(data, dict) and "DescricaoErro" in data.keys(): + # error = data["DescricaoErro"] + + elif file_type in ("txt", "csv"): + if csv_args is None: + csv_args = {} + data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") + else: + error = "Unsupported raw file extension. Supported only: json, csv and txt" + + return {"data": data, "error": error} diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 3b3c7377d..c7b13bfc3 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -454,8 +454,6 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l def get_raw_data_api( # pylint: disable=R0912 url: str, headers: str = None, - filetype: str = "json", - csv_args: dict = None, params: dict = None, ) -> list[dict]: """ @@ -464,8 +462,6 @@ def get_raw_data_api( # pylint: disable=R0912 Args: url (str): URL to send request headers (str, optional): Path to headers guardeded on Vault, if needed. - filetype (str, optional): Filetype to be formatted (supported only: json, csv and txt) - csv_args (dict, optional): Arguments for read_csv, if needed params (dict, optional): Params to be sent on request Returns: @@ -493,24 +489,9 @@ def get_raw_data_api( # pylint: disable=R0912 params=params, ) - if response.ok: # status code is less than 400 - if filetype == "json": - data = response.json() + response.raise_for_status() - # todo: move to data check on specfic API # pylint: disable=W0102 - if isinstance(data, dict) and "DescricaoErro" in data.keys(): - error = data["DescricaoErro"] - - elif filetype in ("txt", "csv"): - if csv_args is None: - csv_args = {} - data = pd.read_csv(io.StringIO(response.text), **csv_args).to_dict( - orient="records" - ) - else: - error = ( - "Unsupported raw file extension. 
Supported only: json, csv and txt" - ) + data = response.text except Exception as exp: error = exp @@ -522,25 +503,30 @@ def get_raw_data_api( # pylint: disable=R0912 def get_raw_data_gcs( - dataset_id: str, table_id: str, file_name: str, mode: str, zip_file_name: str = None + dataset_id: str, + table_id: str, + file_name: str, + mode: str, + partitions: str = None, + zip_extracted_file: str = None, ) -> dict: error = None data = None try: - if zip_file_name: - blob = get_storage_blob( - dataset_id=dataset_id, - table_id=table_id, - file_name=zip_file_name, - mode=mode, - ) + blob = get_storage_blob( + dataset_id=dataset_id, + table_id=table_id, + file_name=file_name, + partitions=partitions, + mode=mode, + ) + + if zip_extracted_file: compressed_data = blob.download_as_bytes() + with zipfile.ZipFile(io.BytesIO(compressed_data), "r") as zipped_file: - data = zipped_file.read(file_name).decode(encoding="utf-8") + data = zipped_file.read(zip_extracted_file).decode(encoding="utf-8") else: - blob = get_storage_blob( - dataset_id=dataset_id, table_id=table_id, file_name=file_name, mode=mode - ) data = blob.download_as_string() except Exception as exp: error = exp diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index 7042709e9..79a264017 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -725,13 +725,24 @@ def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> def get_storage_blob( - dataset_id: str, table_id: str, file_name: str, mode: str = "staging" + dataset_id: str, + table_id: str, + file_name: str, + partitions: str = None, + mode: str = "staging", ): + path = f"{mode}/{dataset_id}/{table_id}/" + + if partitions: + path += f"{partitions}/" + + path += file_name + bd_storage = bd.Storage(dataset_id=dataset_id, table_id=table_id) return ( bd_storage.client["storage_staging"] .bucket(bd_storage.bucket_name) - .get_blob(blob_name=f"{mode}/{dataset_id}/{table_id}/{file_name}") + .get_blob(blob_name=path) ) From 6f12477d14e45a2bb83c817976a597282625a66b Mon Sep 17 00:00:00 2001 From: fernandascovino Date: Tue, 26 Sep 2023 17:18:56 -0300 Subject: [PATCH 007/145] atualiza esquema do flow padrao --- pipelines/rj_smtr/constants.py | 3 + pipelines/rj_smtr/flows.py | 121 +++++++++---------- pipelines/rj_smtr/tasks.py | 206 ++++++++++++++++++++++----------- pipelines/rj_smtr/utils.py | 163 +++++++++++++++++++++----- pipelines/utils/utils.py | 20 +--- 5 files changed, 337 insertions(+), 176 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 3b1b6dc8d..d402bb6e9 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -262,3 +262,6 @@ class constants(Enum): # pylint: disable=c0103 }, ] BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" + + # GTFS + diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index da802d277..fb763cc5a 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -22,15 +22,17 @@ create_local_partition_path, get_current_timestamp, parse_timestamp_to_string, - save_raw_local, - save_treated_local, - upload_logs_to_bq, - bq_upload, - transform_to_nested_structure, + # save_raw_local, + # save_treated_local, + # upload_logs_to_bq, + # bq_upload, + upload_raw_data_to_gcs, + upload_staging_data_to_gcs, + transform_raw_to_nested_structure, get_raw_from_sources, - transform_data_to_json, + # transform_data_to_json, create_request_params, - get_datetime_range, + # get_datetime_range, ) @@ -38,96 +40,87 @@ "SMTR: Captura", 
code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as default_capture_flow: - # SETUP # + + ### Configuração ### - table_params = Parameter("table_params", default=None) - timestamp_param = Parameter("timestamp", default=None) - interval = Parameter("interval", default=None) + table_id = Parameter("table_id", default=None) + partition_date_only = Parameter("partition_date_only", default=None) + request_params = Parameter("request_params", default=None) dataset_id = Parameter("dataset_id", default=None) secret_path = Parameter("secret_path", default=None) + primary_key = Parameter("primary_key", default=None) + source_type = Parameter("source_type", default=None) - timestamp = get_current_timestamp(timestamp_param) - - datetime_range = get_datetime_range(timestamp, interval=interval) + timestamp = get_current_timestamp() rename_flow_run = rename_current_flow_run_now_time( - prefix=default_capture_flow.name + " " + table_params["flow_run_name"] + ": ", + prefix=default_capture_flow.name + " " + table_id + ": ", now_time=timestamp, ) - request_params, request_url = create_request_params( - datetime_range=datetime_range, - table_params=table_params, - secret_path=secret_path, - dataset_id=dataset_id, - ) - partitions = create_date_hour_partition( - timestamp, partition_date_only=table_params["partition_date_only"] + timestamp, partition_date_only=partition_date_only ) filename = parse_timestamp_to_string(timestamp) filepath = create_local_partition_path( dataset_id=dataset_id, - table_id=table_params["pre-treatment"]["table_id"], + table_id=table_id, filename=filename, partitions=partitions, ) - # CAPTURA - request_params, request_url = create_request_params( - datetime_range=datetime_range, - table_params=table_params, + ### Extração ### + # é necessária task ou função dentro da extract_raw_data? + request_params, request_path = create_request_params( secret_path=secret_path, dataset_id=dataset_id, ) - raw_status = get_raw_from_sources( - source=table_params["extraction"]["source"], - url=request_url, - dataset_id=dataset_id, - table_id=table_params["extraction"]["table_id"], - file_name=table_params["extraction"]["file_name"], - zip_file_name=table_params["extraction"]["zip_file_name"], - mode=table_params["extraction"]["mode"], - headers=secret_path, - params=request_params, + error, raw_filepath = get_raw_from_sources( + source_type=source_type, # parametro de extracao, onde ficar? 
+ source_path=request_path, + zip_filename=table_id, + secret_path=secret_path, + request_params=request_params, ) - raw_filepath = save_raw_local(status=raw_status, file_path=filepath) - - # TREAT & CLEAN # - json_status = transform_data_to_json( - status=raw_status, - file_type=table_params["pre-treatment"]["file_type"], - csv_args=table_params["pre-treatment"]["csv_args"], + RAW_UPLOADED = upload_raw_data_to_gcs( + error=error, + filepath=raw_filepath, + timestamp=timestamp, + partitions=partitions ) - treated_status = transform_to_nested_structure( - status=json_status, + ### Pré-tratamento ### + + error, staging_filepath = transform_raw_to_nested_structure( + raw_filepath=raw_filepath, timestamp=timestamp, - primary_key=table_params["pre-treatment"]["primary_key"], + primary_key=primary_key, ) - treated_filepath = save_treated_local(status=treated_status, file_path=filepath) + STAGING_UPLOADED = upload_staging_data_to_gcs(error=error, filepath=staging_filepath, timestamp=timestamp) - # LOAD # - error = bq_upload( - dataset_id=dataset_id, - table_id=table_params["pre-treatment"]["table_id"], - filepath=treated_filepath, - raw_filepath=raw_filepath, - partitions=partitions, - status=treated_status, - ) + # treated_filepath = save_treated_local(status=treated_status, file_path=filepath) - upload_logs_to_bq( - dataset_id=dataset_id, - parent_table_id=table_params["pre-treatment"]["table_id"], - error=error, - timestamp=timestamp, - ) + # LOAD # + # error = bq_upload( + # dataset_id=dataset_id, + # table_id=table_params["pre-treatment"]["table_id"], + # filepath=treated_filepath, + # raw_filepath=raw_filepath, + # partitions=partitions, + # status=treated_status, + # ) + + # upload_logs_to_bq( + # dataset_id=dataset_id, + # parent_table_id=table_params["pre-treatment"]["table_id"], + # error=error, + # timestamp=timestamp, + # ) default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_capture_flow.run_config = KubernetesRun( diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index bf0aec407..b7f484171 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -30,6 +30,7 @@ data_info_str, get_raw_data_api, get_raw_data_gcs, + upload_run_logs_to_bq ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -601,6 +602,69 @@ def upload_logs_to_bq( # pylint: disable=R0913 raise Exception(f"Pipeline failed with error: {error}") +@task +def upload_raw_data_to_gcs( + error: bool, raw_filepath: str, timestamp: datetime, table_id: str, dataset_id: str, partitions: list +): + if not error: + try: + st_obj = Storage(table_id=table_id, dataset_id=dataset_id) + log( + f"""Uploading raw file to bucket {st_obj.bucket_name} at + {st_obj.bucket_name}/{dataset_id}/{table_id}""" + ) + st_obj.upload( + path=raw_filepath, + partitions=partitions, + mode="raw", + if_exists="replace", + ) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + upload_run_logs_to_bq( + dataset_id=dataset_id, + parent_table_id=table_id, + error=error, + timestamp=timestamp, + mode="raw" + ) + + +@task +def upload_staging_data_to_gcs( + error: bool, staging_filepath: str, timestamp: datetime, table_id: str, dataset_id: str, partitions: list +): + if not error: + try: + # Creates and publish table if it does not exist, append to it otherwise + create_or_append_table( + dataset_id=dataset_id, + table_id=table_id, + 
path=staging_filepath, + partitions=partitions + ) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + upload_run_logs_to_bq( + dataset_id=dataset_id, + parent_table_id=table_id, + error=error, + timestamp=timestamp, + mode="staging" + ) + + +############### +# +# Daterange tasks +# +############### + + @task( checkpoint=False, max_retries=constants.MAX_RETRIES.value, @@ -791,9 +855,16 @@ def get_previous_date(days): return now.to_date_string() +############### +# +# Pretreat data +# +############### + + @task -def transform_to_nested_structure( - status: dict, timestamp: datetime, primary_key: list = None +def transform_raw_to_nested_structure( + filepath: str, error: bool, timestamp: datetime, primary_key: list = None ): """Transform dataframe to nested structure @@ -810,21 +881,29 @@ def transform_to_nested_structure( * `error` (str): catched error, if any. Otherwise, returns None """ + # ORGANIZAR: + # json_status = transform_data_to_json( + # status=raw_status, + # file_type=table_params["pre-treatment"]["file_type"], + # csv_args=table_params["pre-treatment"]["csv_args"], + # ) + # Check previous error - if status["error"] is not None: - return {"data": pd.DataFrame(), "error": status["error"]} + if error is not None: + return {"data": pd.DataFrame(), "error": error} # Check empty dataframe - if len(status["data"]) == 0: - log("Empty dataframe, skipping transformation...") - return {"data": pd.DataFrame(), "error": status["error"]} + # if len(status["data"]) == 0: + # log("Empty dataframe, skipping transformation...") + # return {"data": pd.DataFrame(), "error": error} try: if primary_key is None: primary_key = [] error = None - data = pd.DataFrame(status["data"]) + # leitura do dado raw + # data = pd.DataFrame(status["data"]) log( f""" @@ -860,40 +939,43 @@ def transform_to_nested_structure( level="info", ) + # save treated local + filepath = _save_trated_local(data=data, filepath=filepath) + except Exception as exp: # pylint: disable=W0703 error = exp if error is not None: log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return {"data": data, "error": error} + return error, filepath -@task(checkpoint=False) -def get_datetime_range( - timestamp: datetime, - interval: int, -) -> dict: - """ - Task to get datetime range in UTC +# @task(checkpoint=False) +# def get_datetime_range( +# timestamp: datetime, +# interval: int, +# ) -> dict: +# """ +# Task to get datetime range in UTC - Args: - timestamp (datetime): timestamp to get datetime range - interval (int): interval in seconds +# Args: +# timestamp (datetime): timestamp to get datetime range +# interval (int): interval in seconds - Returns: - dict: datetime range - """ +# Returns: +# dict: datetime range +# """ - start = ( - (timestamp - timedelta(seconds=interval)) - .astimezone(tz=timezone("UTC")) - .strftime("%Y-%m-%d %H:%M:%S") - ) +# start = ( +# (timestamp - timedelta(seconds=interval)) +# .astimezone(tz=timezone("UTC")) +# .strftime("%Y-%m-%d %H:%M:%S") +# ) - end = timestamp.astimezone(tz=timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") +# end = timestamp.astimezone(tz=timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") - return {"start": start, "end": end} +# return {"start": start, "end": end} @task(checkpoint=False, nout=2) @@ -916,11 +998,8 @@ def create_request_params( if dataset_id == constants.BILHETAGEM_DATASET_ID.value: secrets = get_vault_secret(secret_path)["data"] - database_secrets = 
secrets["databases"][table_params["extraction"]["database"]] - request_url = secrets["vpn_url"] + database_secrets["engine"] - request_params = { "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado "database": table_params["extraction"]["database"], @@ -932,47 +1011,40 @@ def create_request_params( @task(checkpoint=False) def get_raw_from_sources( - source: str, - url: str, - dataset_id: str = None, - table_id: str = None, - file_name: str = None, - partitions: str = None, - zip_file_name: str = None, - mode: str = None, - headers: str = None, - params: dict = None, + source_type: str, + source_path: str = None, + zip_filename: str = None, + secret_path: str = None, + api_params: dict = None, ): - if source == "api": - return get_raw_data_api(url=url, headers=headers, params=params) - if source == "gcs": + if source_type == "api": + return get_raw_data_api(url=source_path, secret_path=secret_path, params=api_params) + if source_type == "gcs": return get_raw_data_gcs( - dataset_id=dataset_id, - table_id=table_id, - file_name=file_name, - mode=mode, - partitions=partitions, - zip_file_name=zip_file_name, + gcs_path=source_path, + mode="raw", + zip_filename=zip_filename, ) -@task(checkpoint=False) -def transform_data_to_json(status: dict, file_type: str, csv_args: dict): - data = status["data"] - error = status["error"] +# TODO: passar para função para dentro da transform_raw_to_nested_structure +# @task(checkpoint=False) +# def transform_data_to_json(status: dict, file_type: str, csv_args: dict): +# data = status["data"] +# error = status["error"] - if file_type == "json": - pass +# if file_type == "json": +# pass - # todo: move to data check on specfic API # pylint: disable=W0102 - # if isinstance(data, dict) and "DescricaoErro" in data.keys(): - # error = data["DescricaoErro"] +# # todo: move to data check on specfic API # pylint: disable=W0102 +# # if isinstance(data, dict) and "DescricaoErro" in data.keys(): +# # error = data["DescricaoErro"] - elif file_type in ("txt", "csv"): - if csv_args is None: - csv_args = {} - data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") - else: - error = "Unsupported raw file extension. Supported only: json, csv and txt" +# elif file_type in ("txt", "csv"): +# if csv_args is None: +# csv_args = {} +# data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") +# else: +# error = "Unsupported raw file extension. Supported only: json, csv and txt" - return {"data": data, "error": error} +# return {"data": data, "error": error} diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index c7b13bfc3..a4376bb88 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -451,17 +451,47 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l return clocks +def _save_raw_local(data: dict, file_path: str, mode: str = "raw", filetype: str = "json") -> str: + """ + Saves json response from API to .json file. + Args: + file_path (str): Path which to save raw file + status (dict): Must contain keys + * data: json returned from API + * error: error catched from API request + mode (str, optional): Folder to save locally, later folder which to upload to GCS. 
+ Returns: + str: Path to the saved file + """ + + # diferentes tipos de arquivos para salvar + _file_path = file_path.format(mode=mode, filetype=filetype) + Path(_file_path).parent.mkdir(parents=True, exist_ok=True) + + if filetype == "json": + json.dump(data, Path(_file_path).open("w", encoding="utf-8")) + + if filetype == "csv": + pass + if filetype == "txt": + pass + + log(f"Raw data saved to: {_file_path}") + return _file_path + + def get_raw_data_api( # pylint: disable=R0912 url: str, - headers: str = None, - params: dict = None, + secret_path: str = None, + api_params: dict = None, + filepath: str = None ) -> list[dict]: """ Request data from URL API Args: url (str): URL to send request - headers (str, optional): Path to headers guardeded on Vault, if needed. + secret_path (str, optional): Path to secrets guardeded on Vault, if needed. params (dict, optional): Params to be sent on request Returns: @@ -469,58 +499,45 @@ def get_raw_data_api( # pylint: disable=R0912 * `data` (json): data result * `error` (str): catched error, if any. Otherwise, returns None """ - data = None error = None - try: - if headers is not None: - headers = get_vault_secret(headers)["data"] - - # remove from headers, if present - remove_headers = ["host", "databases"] - for remove_header in remove_headers: - if remove_header in list(headers.keys()): - del headers[remove_header] + if secret_path is None: + headers = secret_path + else: + headers = get_vault_secret(secret_path)["data"] response = requests.get( url, headers=headers, timeout=constants.MAX_TIMEOUT_SECONDS.value, - params=params, + params=api_params, ) response.raise_for_status() - - data = response.text + filepath = _save_raw_local(data=response.text, filepath=filepath) except Exception as exp: error = exp - - if error is not None: log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return {"data": data, "error": error} + return error, filepath def get_raw_data_gcs( - dataset_id: str, - table_id: str, - file_name: str, - mode: str, - partitions: str = None, + gcs_path: str, zip_extracted_file: str = None, ) -> dict: + error = None - data = None + try: blob = get_storage_blob( - dataset_id=dataset_id, - table_id=table_id, - file_name=file_name, - partitions=partitions, - mode=mode, + gcs_path=gcs_path, + mode="raw", ) + data = blob.download_as_bytes() + if zip_extracted_file: compressed_data = blob.download_as_bytes() @@ -528,7 +545,93 @@ def get_raw_data_gcs( data = zipped_file.read(zip_extracted_file).decode(encoding="utf-8") else: data = blob.download_as_string() + except Exception as exp: error = exp return {"data": data, "error": error} + + +def _save_treated_local(file_path: str, status: dict, mode: str = "staging") -> str: + """ + Save treated file to CSV. + + Args: + file_path (str): Path which to save treated file + status (dict): Must contain keys + * `data`: dataframe returned from treatement + * `error`: error catched from data treatement + mode (str, optional): Folder to save locally, later folder which to upload to GCS. 
+ + Returns: + str: Path to the saved file + """ + _file_path = file_path.format(mode=mode, filetype="csv") + Path(_file_path).parent.mkdir(parents=True, exist_ok=True) + if status["error"] is None: + status["data"].to_csv(_file_path, index=False) + log(f"Treated data saved to: {_file_path}") + return _file_path + + +def upload_run_logs_to_bq( # pylint: disable=R0913 + dataset_id: str, + parent_table_id: str, + timestamp: str, + error: str = None, + previous_error: str = None, + recapture: bool = False, + mode: str = "raw" +): + """ + Upload execution status table to BigQuery. + Table is uploaded to the same dataset, named {parent_table_id}_logs. + If passing status_dict, should not pass timestamp and error. + + Args: + dataset_id (str): dataset_id on BigQuery + parent_table_id (str): Parent table id related to the status table + timestamp (str): ISO formatted timestamp string + error (str, optional): String associated with error caught during execution + Returns: + None + """ + table_id = parent_table_id + "_logs" + # Create partition directory + filename = f"{table_id}_{timestamp.isoformat()}" + partition = f"data={timestamp.date()}" + filepath = Path( + f"""data/{mode}/{dataset_id}/{table_id}/{partition}/{filename}.csv""" + ) + filepath.parent.mkdir(exist_ok=True, parents=True) + # Create dataframe to be uploaded + if not error and recapture is True: + # if the recapture is succeeded, update the column erro + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [True], + "erro": [f"[recapturado]{previous_error}"], + } + ) + log(f"Recapturing {timestamp} with previous error:\n{error}") + else: + # not recapturing or error during flow execution + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [error is None], + "erro": [error], + } + ) + # Save data local + dataframe.to_csv(filepath, index=False) + # Upload to Storage + create_or_append_table( + dataset_id=dataset_id, + table_id=table_id, + path=filepath.as_posix(), + partitions=partition, + ) + if error is not None: + raise Exception(f"Pipeline failed with error: {error}") \ No newline at end of file diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index 79a264017..147e54f4f 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -725,24 +725,14 @@ def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> def get_storage_blob( - dataset_id: str, - table_id: str, - file_name: str, - partitions: str = None, + gcs_path: str, mode: str = "staging", ): - path = f"{mode}/{dataset_id}/{table_id}/" - - if partitions: - path += f"{partitions}/" - - path += file_name - - bd_storage = bd.Storage(dataset_id=dataset_id, table_id=table_id) + bucket = bd.Storage() return ( - bd_storage.client["storage_staging"] - .bucket(bd_storage.bucket_name) - .get_blob(blob_name=path) + bucket.client["storage_staging"] + .bucket(bucket.bucket_name) + .get_blob(blob_name=gcs_path) ) From 0c3df1b05e8a257a20d9367cb282050a1df74cb9 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 26 Sep 2023 22:41:01 -0300 Subject: [PATCH 008/145] change default capture flow structure --- pipelines/rj_smtr/constants.py | 12 ++++- pipelines/rj_smtr/tasks.py | 87 ++++++++++++++++++++++------------ pipelines/rj_smtr/utils.py | 55 ++++++++++++--------- 3 files changed, 102 insertions(+), 52 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index d402bb6e9..00558f9cc 100644 --- a/pipelines/rj_smtr/constants.py +++ 
b/pipelines/rj_smtr/constants.py @@ -264,4 +264,14 @@ class constants(Enum): # pylint: disable=c0103 BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" # GTFS - + GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" + + GTFS_SOURCE_TYPE = "gcs" + + GTFS_AGENCY_REQUEST_PARAMS = { + "filepath": "development/br_rj_riodejaneiro_gtfs/upload/gtfs.zip" + } + + GTFS_AGENCY_TABLE_ID = "agency" + + GTFS_QUADRO_TABLE_ID = "quadro" diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index b7f484171..0a40dae26 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -30,7 +30,7 @@ data_info_str, get_raw_data_api, get_raw_data_gcs, - upload_run_logs_to_bq + upload_run_logs_to_bq, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -604,7 +604,12 @@ def upload_logs_to_bq( # pylint: disable=R0913 @task def upload_raw_data_to_gcs( - error: bool, raw_filepath: str, timestamp: datetime, table_id: str, dataset_id: str, partitions: list + error: bool, + raw_filepath: str, + timestamp: datetime, + table_id: str, + dataset_id: str, + partitions: list, ): if not error: try: @@ -622,19 +627,24 @@ def upload_raw_data_to_gcs( except Exception: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - + upload_run_logs_to_bq( dataset_id=dataset_id, parent_table_id=table_id, error=error, timestamp=timestamp, - mode="raw" + mode="raw", ) @task def upload_staging_data_to_gcs( - error: bool, staging_filepath: str, timestamp: datetime, table_id: str, dataset_id: str, partitions: list + error: bool, + staging_filepath: str, + timestamp: datetime, + table_id: str, + dataset_id: str, + partitions: list, ): if not error: try: @@ -643,20 +653,20 @@ def upload_staging_data_to_gcs( dataset_id=dataset_id, table_id=table_id, path=staging_filepath, - partitions=partitions + partitions=partitions, ) except Exception: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - + upload_run_logs_to_bq( dataset_id=dataset_id, parent_table_id=table_id, error=error, timestamp=timestamp, - mode="staging" + mode="staging", ) - + ############### # @@ -904,7 +914,7 @@ def transform_raw_to_nested_structure( error = None # leitura do dado raw # data = pd.DataFrame(status["data"]) - + data = None log( f""" Received inputs: @@ -940,7 +950,7 @@ def transform_raw_to_nested_structure( ) # save treated local - filepath = _save_trated_local(data=data, filepath=filepath) + # filepath = _save_trated_local(data=data, filepath=filepath) except Exception as exp: # pylint: disable=W0703 error = exp @@ -980,7 +990,11 @@ def transform_raw_to_nested_structure( @task(checkpoint=False, nout=2) def create_request_params( - datetime_range: dict, table_params: dict, secret_path: str, dataset_id: str + # datetime_range: dict, + # table_params: dict, + table_id: str, + secret_path: str, + dataset_id: str, ) -> tuple: """ Task to create request params @@ -995,16 +1009,28 @@ def create_request_params( request_params: host, database and query to request data request_url: url to request data """ - + request_params = None # TODO: retirar essa linha if dataset_id == constants.BILHETAGEM_DATASET_ID.value: secrets = get_vault_secret(secret_path)["data"] - database_secrets = secrets["databases"][table_params["extraction"]["database"]] - request_url = secrets["vpn_url"] + database_secrets["engine"] - request_params = { - "host": database_secrets["host"], # TODO: exibir no log em 
ambiente fechado - "database": table_params["extraction"]["database"], - "query": table_params["extraction"]["query"].format(**datetime_range), - } + + # TODO: RETIRAR ESSA LINHA + request_params = secrets + + # TODO: mudar modo de pegar os parametros + # database_secrets = secrets["databases"][table_params["extraction"]["database"]] + # request_url = secrets["vpn_url"] + database_secrets["engine"] + # request_params = { + # "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado + # "database": table_params["extraction"]["database"], + # "query": table_params["extraction"]["query"].format(**datetime_range), + # } + + elif dataset_id == constants.GTFS_DATASET_ID.value: + gtfs_base_path = "development/br_rj_riodejaneiro_gtfs/upload" + if table_id == constants.GTFS_QUADRO_ID.value: + request_url = f"{gtfs_base_path}/quadro.csv" + else: + request_url = f"{gtfs_base_path}/gtfs.zip" return request_params, request_url @@ -1013,18 +1039,21 @@ def create_request_params( def get_raw_from_sources( source_type: str, source_path: str = None, - zip_filename: str = None, + table_id: str = None, secret_path: str = None, api_params: dict = None, ): - if source_type == "api": - return get_raw_data_api(url=source_path, secret_path=secret_path, params=api_params) - if source_type == "gcs": - return get_raw_data_gcs( - gcs_path=source_path, - mode="raw", - zip_filename=zip_filename, - ) + pass + # TODO: descomentar linhas abaixo, passando argumentos corretos + # if source_type == "api": + # return get_raw_data_api( + # url=source_path, secret_path=secret_path, params=api_params + # ) + # if source_type == "gcs": + # return get_raw_data_gcs( + # gcs_path=source_path, + # filename_to_unzip=table_id, + # ) # TODO: passar para função para dentro da transform_raw_to_nested_structure diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index a4376bb88..68774c17d 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -10,12 +10,14 @@ from datetime import timedelta, datetime from typing import List import io -import basedosdados as bd -from basedosdados import Table -import pandas as pd +import json import pytz import requests import zipfile +import basedosdados as bd +from basedosdados import Table +import pandas as pd + from prefect.schedules.clocks import IntervalClock @@ -451,7 +453,9 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l return clocks -def _save_raw_local(data: dict, file_path: str, mode: str = "raw", filetype: str = "json") -> str: +def _save_raw_local( + data: dict, file_path: str, mode: str = "raw", filetype: str = "json" +) -> str: """ Saves json response from API to .json file. 
Args: @@ -471,20 +475,18 @@ def _save_raw_local(data: dict, file_path: str, mode: str = "raw", filetype: str if filetype == "json": json.dump(data, Path(_file_path).open("w", encoding="utf-8")) - if filetype == "csv": - pass + # if filetype == "csv": + # pass if filetype == "txt": - pass + with open(_file_path, "w", encoding="utf-8") as file: + file.write(data) log(f"Raw data saved to: {_file_path}") return _file_path def get_raw_data_api( # pylint: disable=R0912 - url: str, - secret_path: str = None, - api_params: dict = None, - filepath: str = None + url: str, secret_path: str = None, api_params: dict = None, filepath: str = None ) -> list[dict]: """ Request data from URL API @@ -525,9 +527,9 @@ def get_raw_data_api( # pylint: disable=R0912 def get_raw_data_gcs( gcs_path: str, - zip_extracted_file: str = None, + local_filepath: str, + filename_to_unzip: str = None, ) -> dict: - error = None try: @@ -538,18 +540,27 @@ def get_raw_data_gcs( data = blob.download_as_bytes() - if zip_extracted_file: - compressed_data = blob.download_as_bytes() - - with zipfile.ZipFile(io.BytesIO(compressed_data), "r") as zipped_file: - data = zipped_file.read(zip_extracted_file).decode(encoding="utf-8") + if filename_to_unzip: + with zipfile.ZipFile(io.BytesIO(data), "r") as zipped_file: + filenames = zipped_file.namelist() + filename = list( + filter(lambda x: x.split(".")[0] == filename_to_unzip, filenames) + )[0] + data = zipped_file.read(filename) else: - data = blob.download_as_string() + filename = blob.name + + raw_filepath = _save_raw_local( + data=data.decode(encoding="utf-8"), + file_path=local_filepath, + filetype=filename.split(".")[-1], + ) except Exception as exp: error = exp + log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return {"data": data, "error": error} + return error, raw_filepath def _save_treated_local(file_path: str, status: dict, mode: str = "staging") -> str: @@ -581,7 +592,7 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 error: str = None, previous_error: str = None, recapture: bool = False, - mode: str = "raw" + mode: str = "raw", ): """ Upload execution status table to BigQuery. @@ -634,4 +645,4 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 partitions=partition, ) if error is not None: - raise Exception(f"Pipeline failed with error: {error}") \ No newline at end of file + raise Exception(f"Pipeline failed with error: {error}") From f6ca7ab8c23ad720e30b00c1862837848ad1fad3 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 10:36:00 -0300 Subject: [PATCH 009/145] change generic capture flow --- pipelines/rj_smtr/flows.py | 53 ++++++++++------------ pipelines/rj_smtr/tasks.py | 80 +++++++++++++++++++-------------- pipelines/rj_smtr/utils.py | 91 +++++++++++++++++++++++++++++--------- 3 files changed, 141 insertions(+), 83 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index fb763cc5a..3dd834a75 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -40,8 +40,7 @@ "SMTR: Captura", code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as default_capture_flow: - - ### Configuração ### + # Configuração # table_id = Parameter("table_id", default=None) partition_date_only = Parameter("partition_date_only", default=None) @@ -71,15 +70,19 @@ partitions=partitions, ) - ### Extração ### + # Extração # # é necessária task ou função dentro da extract_raw_data? 
request_params, request_path = create_request_params( secret_path=secret_path, dataset_id=dataset_id, + request_params=request_params, + table_id=table_id, + timestamp=timestamp, ) error, raw_filepath = get_raw_from_sources( - source_type=source_type, # parametro de extracao, onde ficar? + source_type=source_type, # parametro de extracao, onde ficar? + local_filepath=filepath, source_path=request_path, zip_filename=table_id, secret_path=secret_path, @@ -87,40 +90,32 @@ ) RAW_UPLOADED = upload_raw_data_to_gcs( - error=error, - filepath=raw_filepath, - timestamp=timestamp, - partitions=partitions + error=error, + raw_filepath=raw_filepath, + timestamp=timestamp, + table_id=table_id, + dataset_id=dataset_id, + partitions=partitions, ) - ### Pré-tratamento ### + # Pré-tratamento # error, staging_filepath = transform_raw_to_nested_structure( raw_filepath=raw_filepath, + filepath=filepath, + error=error, timestamp=timestamp, primary_key=primary_key, ) - STAGING_UPLOADED = upload_staging_data_to_gcs(error=error, filepath=staging_filepath, timestamp=timestamp) - - # treated_filepath = save_treated_local(status=treated_status, file_path=filepath) - - # LOAD # - # error = bq_upload( - # dataset_id=dataset_id, - # table_id=table_params["pre-treatment"]["table_id"], - # filepath=treated_filepath, - # raw_filepath=raw_filepath, - # partitions=partitions, - # status=treated_status, - # ) - - # upload_logs_to_bq( - # dataset_id=dataset_id, - # parent_table_id=table_params["pre-treatment"]["table_id"], - # error=error, - # timestamp=timestamp, - # ) + STAGING_UPLOADED = upload_staging_data_to_gcs( + error=error, + staging_filepath=staging_filepath, + timestamp=timestamp, + table_id=table_id, + dataset_id=dataset_id, + partitions=partitions, + ) default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_capture_flow.run_config = KubernetesRun( diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 0a40dae26..89beae6f2 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -31,6 +31,9 @@ get_raw_data_api, get_raw_data_gcs, upload_run_logs_to_bq, + get_datetime_range, + transform_data_to_json, + save_treated_local_func, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -874,7 +877,11 @@ def get_previous_date(days): @task def transform_raw_to_nested_structure( - filepath: str, error: bool, timestamp: datetime, primary_key: list = None + raw_filepath: str, + filepath: str, + error: bool, + timestamp: datetime, + primary_key: list = None, ): """Transform dataframe to nested structure @@ -891,16 +898,18 @@ def transform_raw_to_nested_structure( * `error` (str): catched error, if any. 
Otherwise, returns None """ + with open(raw_filepath, "r", encoding="utf-8") as file: + data = file.read() + # ORGANIZAR: - # json_status = transform_data_to_json( - # status=raw_status, - # file_type=table_params["pre-treatment"]["file_type"], - # csv_args=table_params["pre-treatment"]["csv_args"], - # ) + error, data = transform_data_to_json( + data=data, + file_type=raw_filepath.split(".")[-1], + ) # Check previous error if error is not None: - return {"data": pd.DataFrame(), "error": error} + return error, None # Check empty dataframe # if len(status["data"]) == 0: @@ -913,8 +922,8 @@ def transform_raw_to_nested_structure( error = None # leitura do dado raw - # data = pd.DataFrame(status["data"]) - data = None + data = pd.DataFrame(data) + log( f""" Received inputs: @@ -950,7 +959,7 @@ def transform_raw_to_nested_structure( ) # save treated local - # filepath = _save_trated_local(data=data, filepath=filepath) + filepath = save_treated_local_func(data=data, error=error, filepath=filepath) except Exception as exp: # pylint: disable=W0703 error = exp @@ -992,9 +1001,11 @@ def transform_raw_to_nested_structure( def create_request_params( # datetime_range: dict, # table_params: dict, + request_params: dict, table_id: str, secret_path: str, dataset_id: str, + timestamp: datetime, ) -> tuple: """ Task to create request params @@ -1009,25 +1020,25 @@ def create_request_params( request_params: host, database and query to request data request_url: url to request data """ - request_params = None # TODO: retirar essa linha + if dataset_id == constants.BILHETAGEM_DATASET_ID.value: secrets = get_vault_secret(secret_path)["data"] - # TODO: RETIRAR ESSA LINHA - request_params = secrets + database_secrets = secrets["databases"][request_params["database"]] + request_url = secrets["vpn_url"] + database_secrets["engine"] - # TODO: mudar modo de pegar os parametros - # database_secrets = secrets["databases"][table_params["extraction"]["database"]] - # request_url = secrets["vpn_url"] + database_secrets["engine"] - # request_params = { - # "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado - # "database": table_params["extraction"]["database"], - # "query": table_params["extraction"]["query"].format(**datetime_range), - # } + datetime_range = get_datetime_range( + timestamp=timestamp, interval=request_params["run_interval"] + ) + request_params = { + "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado + "database": request_params["database"], + "query": request_params["query"].format(**datetime_range), + } elif dataset_id == constants.GTFS_DATASET_ID.value: gtfs_base_path = "development/br_rj_riodejaneiro_gtfs/upload" - if table_id == constants.GTFS_QUADRO_ID.value: + if table_id == constants.GTFS_QUADRO_TABLE_ID.value: request_url = f"{gtfs_base_path}/quadro.csv" else: request_url = f"{gtfs_base_path}/gtfs.zip" @@ -1038,22 +1049,25 @@ def create_request_params( @task(checkpoint=False) def get_raw_from_sources( source_type: str, + local_filepath: str, source_path: str = None, table_id: str = None, secret_path: str = None, api_params: dict = None, ): - pass - # TODO: descomentar linhas abaixo, passando argumentos corretos - # if source_type == "api": - # return get_raw_data_api( - # url=source_path, secret_path=secret_path, params=api_params - # ) - # if source_type == "gcs": - # return get_raw_data_gcs( - # gcs_path=source_path, - # filename_to_unzip=table_id, - # ) + if source_type == "api": + return get_raw_data_api( + url=source_path, + 
secret_path=secret_path, + api_params=api_params, + filepath=local_filepath, + ) + if source_type == "gcs": + return get_raw_data_gcs( + gcs_path=source_path, + filename_to_unzip=table_id, + local_filepath=local_filepath, + ) # TODO: passar para função para dentro da transform_raw_to_nested_structure diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 68774c17d..184a93df7 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -11,9 +11,9 @@ from typing import List import io import json +import zipfile import pytz import requests -import zipfile import basedosdados as bd from basedosdados import Table import pandas as pd @@ -453,13 +453,13 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l return clocks -def _save_raw_local( - data: dict, file_path: str, mode: str = "raw", filetype: str = "json" +def save_raw_local_func( + data: dict, filepath: str, mode: str = "raw", filetype: str = "json" ) -> str: """ Saves json response from API to .json file. Args: - file_path (str): Path which to save raw file + filepath (str): Path which to save raw file status (dict): Must contain keys * data: json returned from API * error: error catched from API request @@ -469,20 +469,20 @@ def _save_raw_local( """ # diferentes tipos de arquivos para salvar - _file_path = file_path.format(mode=mode, filetype=filetype) - Path(_file_path).parent.mkdir(parents=True, exist_ok=True) + _filepath = filepath.format(mode=mode, filetype=filetype) + Path(_filepath).parent.mkdir(parents=True, exist_ok=True) if filetype == "json": - json.dump(data, Path(_file_path).open("w", encoding="utf-8")) + json.dump(data, Path(_filepath).open("w", encoding="utf-8")) # if filetype == "csv": # pass if filetype == "txt": - with open(_file_path, "w", encoding="utf-8") as file: + with open(_filepath, "w", encoding="utf-8") as file: file.write(data) - log(f"Raw data saved to: {_file_path}") - return _file_path + log(f"Raw data saved to: {_filepath}") + return _filepath def get_raw_data_api( # pylint: disable=R0912 @@ -516,7 +516,9 @@ def get_raw_data_api( # pylint: disable=R0912 ) response.raise_for_status() - filepath = _save_raw_local(data=response.text, filepath=filepath) + filepath = save_raw_local_func( + data=response.text, filepath=filepath + ) # TODO: mudar filetype except Exception as exp: error = exp @@ -550,9 +552,9 @@ def get_raw_data_gcs( else: filename = blob.name - raw_filepath = _save_raw_local( + raw_filepath = save_raw_local_func( data=data.decode(encoding="utf-8"), - file_path=local_filepath, + filepath=local_filepath, filetype=filename.split(".")[-1], ) @@ -563,12 +565,14 @@ def get_raw_data_gcs( return error, raw_filepath -def _save_treated_local(file_path: str, status: dict, mode: str = "staging") -> str: +def save_treated_local_func( + filepath: str, data: pd.DataFrame, error: str, mode: str = "staging" +) -> str: """ Save treated file to CSV. 
Args: - file_path (str): Path which to save treated file + filepath (str): Path which to save treated file status (dict): Must contain keys * `data`: dataframe returned from treatement * `error`: error catched from data treatement @@ -577,12 +581,12 @@ def _save_treated_local(file_path: str, status: dict, mode: str = "staging") -> Returns: str: Path to the saved file """ - _file_path = file_path.format(mode=mode, filetype="csv") - Path(_file_path).parent.mkdir(parents=True, exist_ok=True) - if status["error"] is None: - status["data"].to_csv(_file_path, index=False) - log(f"Treated data saved to: {_file_path}") - return _file_path + _filepath = filepath.format(mode=mode, filetype="csv") + Path(_filepath).parent.mkdir(parents=True, exist_ok=True) + if error is None: + data.to_csv(_filepath, index=False) + log(f"Treated data saved to: {_filepath}") + return _filepath def upload_run_logs_to_bq( # pylint: disable=R0913 @@ -646,3 +650,48 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 ) if error is not None: raise Exception(f"Pipeline failed with error: {error}") + + +def get_datetime_range( + timestamp: datetime, + interval: int, +) -> dict: + """ + Task to get datetime range in UTC + + Args: + timestamp (datetime): timestamp to get datetime range + interval (int): interval in seconds + + Returns: + dict: datetime range + """ + + start = ( + (timestamp - timedelta(seconds=interval)) + .astimezone(tz=pytz.timezone("UTC")) + .strftime("%Y-%m-%d %H:%M:%S") + ) + + end = timestamp.astimezone(tz=pytz.timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") + + return {"start": start, "end": end} + + +def transform_data_to_json(data: str, file_type: str, csv_args: dict = {}): + try: + if file_type == "json": + data = json.loads(data) + + elif file_type in ("txt", "csv"): + if csv_args is None: + csv_args = {} + data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") + else: + error = "Unsupported raw file extension. 
Supported only: json, csv and txt" + + except Exception as exp: + error = exp + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data From fa17be21b41769895fb4154b78d86d373652d368 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 11:20:15 -0300 Subject: [PATCH 010/145] adjust constant structure --- pipelines/rj_smtr/constants.py | 36 +++++++++++++++++++++++++------- pipelines/rj_smtr/flows.py | 6 ++---- pipelines/rj_smtr/tasks.py | 38 +++++++++++++++------------------- 3 files changed, 48 insertions(+), 32 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 00558f9cc..7eb18ef85 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -165,6 +165,18 @@ class constants(Enum): # pylint: disable=c0103 # BILHETAGEM BILHETAGEM_DATASET_ID = "br_rj_riodejaneiro_bilhetagem" + BILHETAGEM_DATABASES = { + "principal_db": { + "engine": "mysql", + "host": "principal-database-replica.internal", + }, + "tarifa_db": {"engine": "postgres", "host": "tarifa-database-replica.internal"}, + "transacao_db": { + "engine": "postgres", + "host": "transacao-database-replica.internal", + }, + } + BILHETAGEM_VPN_URL = "http://vpn-jae.mobilidade.rio/" BILHETAGEM_TRANSACAO_TABLE_PARAMS = [ { "partition_date_only": False, @@ -264,14 +276,24 @@ class constants(Enum): # pylint: disable=c0103 BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" # GTFS - GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" + GTFS_CAPTURE_PARAMS = [ + {"table_id": "agency", "primary_key": ["agency_id"]}, + {"table_id": "calendar_dates", "primary_key": ["service_id"]}, + {"table_id": "calendar", "primary_key": ["service_id"]}, + {"table_id": "feed_info", "primary_key": ["feed_publisher_name"]}, + {"table_id": "frequencies", "primary_key": ["trip_id"]}, + {"table_id": "routes", "primary_key": ["route_id"]}, + {"table_id": "shapes", "primary_key": ["shape_id"]}, + {"table_id": "stops", "primary_key": ["stop_id"]}, + {"table_id": "trips", "primary_key": ["trip_id"]}, + {"table_id": "fare_attributes", "primary_key": ["fare_id"]}, + {"table_id": "fare_rules", "primary_key": ["fare_id"]}, + ] - GTFS_SOURCE_TYPE = "gcs" + GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} - GTFS_AGENCY_REQUEST_PARAMS = { - "filepath": "development/br_rj_riodejaneiro_gtfs/upload/gtfs.zip" - } + GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} - GTFS_AGENCY_TABLE_ID = "agency" + GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" - GTFS_QUADRO_TABLE_ID = "quadro" + GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 3dd834a75..94a3ffb93 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -44,7 +44,7 @@ table_id = Parameter("table_id", default=None) partition_date_only = Parameter("partition_date_only", default=None) - request_params = Parameter("request_params", default=None) + extract_params = Parameter("extract_params", default=None) dataset_id = Parameter("dataset_id", default=None) secret_path = Parameter("secret_path", default=None) primary_key = Parameter("primary_key", default=None) @@ -71,11 +71,9 @@ ) # Extração # - # é necessária task ou função dentro da extract_raw_data? 
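    # Illustrative trace (values abbreviated from constants.py) of how `extract_params`
    # is consumed by the tasks called below, as wired in the later commits of this series;
    # a hypothetical sketch, not flow code:
    #
    #   extract_params = {
    #       "database": "transacao_db",
    #       "query": "SELECT * FROM transacao WHERE data_processamento BETWEEN '{start}' AND '{end}'",
    #       "run_interval": {"minutes": 1},
    #   }
    #   datetime_range = get_datetime_range(timestamp, timedelta(**extract_params["run_interval"]))
    #   query = extract_params["query"].format(**datetime_range)  # start/end rendered in UTC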
request_params, request_path = create_request_params( - secret_path=secret_path, dataset_id=dataset_id, - request_params=request_params, + extract_params=extract_params, table_id=table_id, timestamp=timestamp, ) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 89beae6f2..a134dd966 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -879,7 +879,7 @@ def get_previous_date(days): def transform_raw_to_nested_structure( raw_filepath: str, filepath: str, - error: bool, + error: str, timestamp: datetime, primary_key: list = None, ): @@ -898,6 +898,10 @@ def transform_raw_to_nested_structure( * `error` (str): catched error, if any. Otherwise, returns None """ + # Check previous error + if error is not None: + return error, None + with open(raw_filepath, "r", encoding="utf-8") as file: data = file.read() @@ -907,10 +911,6 @@ def transform_raw_to_nested_structure( file_type=raw_filepath.split(".")[-1], ) - # Check previous error - if error is not None: - return error, None - # Check empty dataframe # if len(status["data"]) == 0: # log("Empty dataframe, skipping transformation...") @@ -999,11 +999,8 @@ def transform_raw_to_nested_structure( @task(checkpoint=False, nout=2) def create_request_params( - # datetime_range: dict, - # table_params: dict, - request_params: dict, + extract_params: dict, table_id: str, - secret_path: str, dataset_id: str, timestamp: datetime, ) -> tuple: @@ -1020,28 +1017,27 @@ def create_request_params( request_params: host, database and query to request data request_url: url to request data """ + request_params = None if dataset_id == constants.BILHETAGEM_DATASET_ID.value: - secrets = get_vault_secret(secret_path)["data"] - - database_secrets = secrets["databases"][request_params["database"]] - request_url = secrets["vpn_url"] + database_secrets["engine"] + database = constants.BILHETAGEM_DATABASES.value[extract_params["database"]] + request_url = constants.BILHETAGEM_VPN_URL.value + database["engine"] datetime_range = get_datetime_range( - timestamp=timestamp, interval=request_params["run_interval"] + timestamp=timestamp, interval=extract_params["run_interval"] ) + request_params = { - "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado - "database": request_params["database"], - "query": request_params["query"].format(**datetime_range), + "host": database["host"], # TODO: exibir no log em ambiente fechado + "database": extract_params["database"], + "query": extract_params["query"].format(**datetime_range), } elif dataset_id == constants.GTFS_DATASET_ID.value: - gtfs_base_path = "development/br_rj_riodejaneiro_gtfs/upload" - if table_id == constants.GTFS_QUADRO_TABLE_ID.value: - request_url = f"{gtfs_base_path}/quadro.csv" + if table_id == constants.GTFS_QUADRO_CAPTURE_PARAMS.value["table_id"]: + request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/{table_id}.csv" else: - request_url = f"{gtfs_base_path}/gtfs.zip" + request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/gtfs.zip" return request_params, request_url From bdc3881cde88840b62175e1ce8ac66a596e37feb Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 13:27:11 -0300 Subject: [PATCH 011/145] change bilhetagem to new capture flow structure --- .../schedules.py | 18 +- pipelines/rj_smtr/constants.py | 186 ++++++++++-------- pipelines/rj_smtr/tasks.py | 14 +- pipelines/rj_smtr/utils.py | 40 ++-- 4 files changed, 145 insertions(+), 113 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py 
b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index 38fca85a9..538e5b816 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -16,26 +16,32 @@ ) bilhetagem_principal_clocks = generate_execute_schedules( - interval=timedelta(days=1), + clock_interval=timedelta( + **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["principal_run_interval"] + ), labels=[ - emd_constants.RJ_SMTR_AGENT_LABEL.value, + emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, ], - table_parameters=constants.BILHETAGEM_TABLES_PARAMS.value, + table_parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, + source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], runs_interval_minutes=15, ) bilhetagem_principal_schedule = Schedule(clocks=untuple(bilhetagem_principal_clocks)) bilhetagem_transacao_clocks = generate_execute_schedules( - interval=timedelta(minutes=1), + clock_interval=timedelta( + constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["transacao_run_interval"] + ), labels=[ - emd_constants.RJ_SMTR_AGENT_LABEL.value, + emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, ], - table_parameters=constants.BILHETAGEM_TRANSACAO_TABLE_PARAMS.value, + table_parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, + source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], runs_interval_minutes=0, ) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 7eb18ef85..969ccd871 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -165,117 +165,142 @@ class constants(Enum): # pylint: disable=c0103 # BILHETAGEM BILHETAGEM_DATASET_ID = "br_rj_riodejaneiro_bilhetagem" - BILHETAGEM_DATABASES = { - "principal_db": { - "engine": "mysql", - "host": "principal-database-replica.internal", + + BILHETAGEM_GENERAL_CAPTURE_PARAMS = { + "databases": { + "principal_db": { + "engine": "mysql", + "host": "principal-database-replica.internal", + }, + "tarifa_db": { + "engine": "postgres", + "host": "tarifa-database-replica.internal", + }, + "transacao_db": { + "engine": "postgres", + "host": "transacao-database-replica.internal", + }, }, - "tarifa_db": {"engine": "postgres", "host": "tarifa-database-replica.internal"}, - "transacao_db": { - "engine": "postgres", - "host": "transacao-database-replica.internal", + "vpn_url": "http://vpn-jae.mobilidade.rio/", + "source_type": "api-json", + "transacao_run_interval": {"minutes": 1}, + "principal_run_interval": {"days": 1}, + } + + BILHETAGEM_TRANSACAO_CAPTURE_PARAMS = { + "table_id": "transacao", + "partition_date_only": False, + "extract_params": { + "database": "transacao_db", + "query": """ + SELECT + * + FROM + transacao + WHERE + data_processamento BETWEEN '{start}' + AND '{end}' + ORDER BY + data_processamento + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS["transacao_run_interval"], }, + "primary_key": ["id"], } - BILHETAGEM_VPN_URL = "http://vpn-jae.mobilidade.rio/" - BILHETAGEM_TRANSACAO_TABLE_PARAMS = [ + + BILHETAGEM_CAPTURE_PARAMS = [ { - "partition_date_only": False, - "flow_run_name": "transacao", - "extraction": { - "table_id": "transacao", - "database": "transacao_db", + "table_id": "linha", + "partition_date_only": True, + "extract_params": { + "database": "principal_db", 
"query": """ SELECT * FROM - transacao + LINHA WHERE - data_processamento BETWEEN '{start}' - AND '{end}' + DT_INCLUSAO >= '{start}' ORDER BY - data_processamento + DT_INCLUSAO """, - "source": "api", - }, - "pre-treatment": { - "table_id": "transacao", - "file_type": "json", - "primary_key": ["id"], # id column to nest data on + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], }, - } - ] - BILHETAGEM_TABLES_PARAMS = [ - { - "table_id": "linha", - "database": "principal_db", - "query": """ - SELECT - * - FROM - LINHA - WHERE - DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO - """, "primary_key": ["CD_LINHA"], # id column to nest data on - "partition_date_only": True, }, { "table_id": "grupo", - "database": "principal_db", - "query": """ - SELECT - * - FROM - GRUPO - WHERE - DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO - """, - "primary_key": ["CD_GRUPO"], "partition_date_only": True, + "extract_params": { + "database": "principal_db", + "query": """ + SELECT + * + FROM + GRUPO + WHERE + DT_INCLUSAO >= '{start}' + ORDER BY + DT_INCLUSAO + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], + }, + "primary_key": ["CD_GRUPO"], }, { "table_id": "grupo_linha", - "database": "principal_db", - "query": """ - SELECT - * - FROM - GRUPO_LINHA - WHERE - DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO - """, - "primary_key": ["CD_GRUPO", "CD_LINHA"], # id column to nest data on "partition_date_only": True, + "extract_params": { + "database": "principal_db", + "query": """ + SELECT + * + FROM + GRUPO_LINHA + WHERE + DT_INCLUSAO >= '{start}' + ORDER BY + DT_INCLUSAO + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], + }, + "primary_key": ["CD_GRUPO", "CD_LINHA"], # id column to nest data on }, { "table_id": "matriz_integracao", - "database": "tarifa_db", - "query": """ - SELECT - * - FROM - matriz_integracao - WHERE - dt_inclusao >= '{start}' - ORDER BY - dt_inclusao - """, + "partition_date_only": True, + "extract_params": { + "database": "tarifa_db", + "query": """ + SELECT + * + FROM + matriz_integracao + WHERE + dt_inclusao >= '{start}' + ORDER BY + dt_inclusao + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], + }, "primary_key": [ "cd_versao_matriz", "cd_integracao", ], # id column to nest data on - "partition_date_only": True, }, ] BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" # GTFS + GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" GTFS_CAPTURE_PARAMS = [ {"table_id": "agency", "primary_key": ["agency_id"]}, {"table_id": "calendar_dates", "primary_key": ["service_id"]}, @@ -289,11 +314,6 @@ class constants(Enum): # pylint: disable=c0103 {"table_id": "fare_attributes", "primary_key": ["fare_id"]}, {"table_id": "fare_rules", "primary_key": ["fare_id"]}, ] - GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} - GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} - - GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" - GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index a134dd966..e414f1c70 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1020,11 +1020,16 @@ def create_request_params( request_params = None if dataset_id == constants.BILHETAGEM_DATASET_ID.value: - database = constants.BILHETAGEM_DATABASES.value[extract_params["database"]] - request_url = 
constants.BILHETAGEM_VPN_URL.value + database["engine"] + database = constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["databases"][ + extract_params["database"] + ] + request_url = ( + constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["vpn_url"] + + database["engine"] + ) datetime_range = get_datetime_range( - timestamp=timestamp, interval=extract_params["run_interval"] + timestamp=timestamp, interval=timedelta(**extract_params["run_interval"]) ) request_params = { @@ -1051,12 +1056,15 @@ def get_raw_from_sources( secret_path: str = None, api_params: dict = None, ): + source_type, filetype = source_type.split("-", maxsplit=1) + if source_type == "api": return get_raw_data_api( url=source_path, secret_path=secret_path, api_params=api_params, filepath=local_filepath, + filetype=filetype, ) if source_type == "gcs": return get_raw_data_gcs( diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 184a93df7..d354ae6ab 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -8,7 +8,7 @@ from pathlib import Path from datetime import timedelta, datetime -from typing import List +from typing import List, Union import io import json import zipfile @@ -31,7 +31,6 @@ get_vault_secret, send_discord_message, get_redis_client, - get_storage_blobs, get_storage_blob, ) @@ -404,46 +403,41 @@ def data_info_str(data: pd.DataFrame): def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-locals - interval: timedelta, + clock_interval: timedelta, labels: List[str], - table_parameters: list, - dataset_id: str, - secret_path: str, + table_parameters: Union[list[dict], dict], runs_interval_minutes: int = 15, start_date: datetime = datetime( 2020, 1, 1, tzinfo=pytz.timezone(emd_constants.DEFAULT_TIMEZONE.value) ), + **general_flow_params, ) -> List[IntervalClock]: """ Generates multiple schedules Args: - interval (timedelta): The interval to run the schedule + clock_interval (timedelta): The interval to run the schedule labels (List[str]): The labels to be added to the schedule - table_parameters (list): The table parameters - dataset_id (str): The dataset_id to be used in the schedule - secret_path (str): The secret path to be used in the schedule + table_parameters (list): The table parameters to iterate over runs_interval_minutes (int, optional): The interval between each schedule. Defaults to 15. start_date (datetime, optional): The start date of the schedule. Defaults to datetime(2020, 1, 1, tzinfo=pytz.timezone(emd_constants.DEFAULT_TIMEZONE.value)). 
- + general_flow_params: Any param that you want to pass to the flow Returns: List[IntervalClock]: The list of schedules """ + if isinstance(table_parameters, dict): + table_parameters = [table_parameters] clocks = [] for count, parameters in enumerate(table_parameters): - parameter_defaults = { - "table_params": parameters, - "dataset_id": dataset_id, - "secret_path": secret_path, - "interval": interval.total_seconds(), - } + parameter_defaults = parameters | general_flow_params + log(f"parameter_defaults: {parameter_defaults}") clocks.append( IntervalClock( - interval=interval, + interval=clock_interval, start_date=start_date + timedelta(minutes=runs_interval_minutes * count), labels=labels, @@ -486,7 +480,11 @@ def save_raw_local_func( def get_raw_data_api( # pylint: disable=R0912 - url: str, secret_path: str = None, api_params: dict = None, filepath: str = None + url: str, + secret_path: str = None, + api_params: dict = None, + filepath: str = None, + filetype: str = None, ) -> list[dict]: """ Request data from URL API @@ -517,8 +515,8 @@ def get_raw_data_api( # pylint: disable=R0912 response.raise_for_status() filepath = save_raw_local_func( - data=response.text, filepath=filepath - ) # TODO: mudar filetype + data=response.text, filepath=filepath, filetype=filetype + ) except Exception as exp: error = exp From fc61c4762c7a416872ba6fbbfa5a064a43e846a4 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:24:48 -0300 Subject: [PATCH 012/145] fix get_storage_blob function --- pipelines/rj_smtr/constants.py | 2 +- pipelines/utils/utils.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 969ccd871..2faeccb25 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -316,4 +316,4 @@ class constants(Enum): # pylint: disable=c0103 ] GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} - GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" + GTFS_BASE_GCS_PATH = "raw/development/br_rj_riodejaneiro_gtfs/upload" diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index 147e54f4f..57384f8f4 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -726,9 +726,8 @@ def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> def get_storage_blob( gcs_path: str, - mode: str = "staging", ): - bucket = bd.Storage() + bucket = bd.Storage(dataset_id="", table_id="") return ( bucket.client["storage_staging"] .bucket(bucket.bucket_name) From 0fc26cbc9d786fd28b369ab35784b636c3ecdc12 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:25:27 -0300 Subject: [PATCH 013/145] fix get_storage_blob call --- pipelines/rj_smtr/utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index d354ae6ab..55abfc9cf 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -533,10 +533,7 @@ def get_raw_data_gcs( error = None try: - blob = get_storage_blob( - gcs_path=gcs_path, - mode="raw", - ) + blob = get_storage_blob(gcs_path=gcs_path) data = blob.download_as_bytes() From 634df851e41bff549fe5f9daab4801f0eb6e0858 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:45:26 -0300 Subject: [PATCH 014/145] organize constants order --- pipelines/rj_smtr/constants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
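A minimal usage sketch for the GCS extraction helpers touched above (a hypothetical local call: it assumes gtfs.zip under constants.GTFS_BASE_GCS_PATH contains an agency.txt member, and the staging bucket layout may differ in production):

    # Fetch the GTFS zip from the staging bucket and extract the "agency" member;
    # {mode} and {filetype} in the local path template are filled by save_raw_local_func.
    error, raw_filepath = get_raw_data_gcs(
        gcs_path="development/br_rj_riodejaneiro_gtfs/upload/gtfs.zip",
        filename_to_unzip="agency",
        local_filepath="/tmp/gtfs/agency/{mode}/data.{filetype}",
    )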
diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 2faeccb25..722d7e9e1 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -301,6 +301,7 @@ class constants(Enum): # pylint: disable=c0103 # GTFS GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" + GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} GTFS_CAPTURE_PARAMS = [ {"table_id": "agency", "primary_key": ["agency_id"]}, {"table_id": "calendar_dates", "primary_key": ["service_id"]}, @@ -314,6 +315,5 @@ class constants(Enum): # pylint: disable=c0103 {"table_id": "fare_attributes", "primary_key": ["fare_id"]}, {"table_id": "fare_rules", "primary_key": ["fare_id"]}, ] - GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} - GTFS_BASE_GCS_PATH = "raw/development/br_rj_riodejaneiro_gtfs/upload" + GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" From bda52aa6eedb6eedec2c6334f0843e2a80edcd4a Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:46:45 -0300 Subject: [PATCH 015/145] fix get_raw_from_sources function call --- pipelines/rj_smtr/flows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 94a3ffb93..19ac776b7 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -82,9 +82,9 @@ source_type=source_type, # parametro de extracao, onde ficar? local_filepath=filepath, source_path=request_path, - zip_filename=table_id, + table_id=table_id, secret_path=secret_path, - request_params=request_params, + api_params=request_params, ) RAW_UPLOADED = upload_raw_data_to_gcs( From b2548d6b8cd1f56bf9dbd4676e52011ce5fdfa16 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:47:35 -0300 Subject: [PATCH 016/145] change transform_raw_to_json to read_raw_data --- pipelines/rj_smtr/tasks.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index e414f1c70..ee99ff654 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -32,7 +32,7 @@ get_raw_data_gcs, upload_run_logs_to_bq, get_datetime_range, - transform_data_to_json, + read_raw_data, save_treated_local_func, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client @@ -899,17 +899,11 @@ def transform_raw_to_nested_structure( """ # Check previous error + if error is not None: return error, None - with open(raw_filepath, "r", encoding="utf-8") as file: - data = file.read() - # ORGANIZAR: - error, data = transform_data_to_json( - data=data, - file_type=raw_filepath.split(".")[-1], - ) # Check empty dataframe # if len(status["data"]) == 0: @@ -917,13 +911,12 @@ def transform_raw_to_nested_structure( # return {"data": pd.DataFrame(), "error": error} try: + # leitura do dado raw + error, data = read_raw_data(filepath=raw_filepath) + if primary_key is None: primary_key = [] - error = None - # leitura do dado raw - data = pd.DataFrame(data) - log( f""" Received inputs: From 307863a1d381cefeeb5a9001fb8f4ef235923cbb Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:48:30 -0300 Subject: [PATCH 017/145] transform transform_raw_data_to_json to read_raw_data --- pipelines/rj_smtr/utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 55abfc9cf..3f4281a2c 100644 
--- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -673,15 +673,18 @@ def get_datetime_range( return {"start": start, "end": end} -def transform_data_to_json(data: str, file_type: str, csv_args: dict = {}): +def read_raw_data(filepath: str, csv_args: dict = {}) -> tuple[str, pd.DataFrame]: try: + file_type = filepath.split(".")[-1] + if file_type == "json": - data = json.loads(data) + data = pd.read_json(filepath) + # data = json.loads(data) elif file_type in ("txt", "csv"): if csv_args is None: csv_args = {} - data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") + data = pd.read_csv(filepath, **csv_args) else: error = "Unsupported raw file extension. Supported only: json, csv and txt" From 7f2c1e3fe3db535868943404e945b5b44eefad74 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:59:43 -0300 Subject: [PATCH 018/145] fix nout task parameter --- pipelines/rj_smtr/tasks.py | 4 ++-- pipelines/rj_smtr/utils.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index ee99ff654..9beb5a87e 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -875,7 +875,7 @@ def get_previous_date(days): ############### -@task +@task(nout=2) def transform_raw_to_nested_structure( raw_filepath: str, filepath: str, @@ -1040,7 +1040,7 @@ def create_request_params( return request_params, request_url -@task(checkpoint=False) +@task(checkpoint=False, nout=2) def get_raw_from_sources( source_type: str, local_filepath: str, diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 3f4281a2c..8a8804474 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -485,7 +485,7 @@ def get_raw_data_api( # pylint: disable=R0912 api_params: dict = None, filepath: str = None, filetype: str = None, -) -> list[dict]: +) -> tuple[str, str]: """ Request data from URL API @@ -529,7 +529,7 @@ def get_raw_data_gcs( gcs_path: str, local_filepath: str, filename_to_unzip: str = None, -) -> dict: +) -> tuple[str, str]: error = None try: @@ -673,7 +673,7 @@ def get_datetime_range( return {"start": start, "end": end} -def read_raw_data(filepath: str, csv_args: dict = {}) -> tuple[str, pd.DataFrame]: +def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataFrame]: try: file_type = filepath.split(".")[-1] From 51977c10621d34ea3643004cba5bc4f990d249db Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 15:16:38 -0300 Subject: [PATCH 019/145] fix timedelta instantiation --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index 538e5b816..f19f0d8ad 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -33,7 +33,7 @@ bilhetagem_transacao_clocks = generate_execute_schedules( clock_interval=timedelta( - constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["transacao_run_interval"] + **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["transacao_run_interval"] ), labels=[ emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, From 8ef0b5df7c31ebb7f59ff719c338e029e34cf031 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 15:58:05 -0300 Subject: [PATCH 020/145] set upstream tasks --- pipelines/rj_smtr/flows.py | 1 + pipelines/rj_smtr/tasks.py | 10 
+++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 19ac776b7..a4044933a 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -104,6 +104,7 @@ error=error, timestamp=timestamp, primary_key=primary_key, + upstream_tasks=[RAW_UPLOADED], ) STAGING_UPLOADED = upload_staging_data_to_gcs( diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 9beb5a87e..269ee73eb 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1052,19 +1052,23 @@ def get_raw_from_sources( source_type, filetype = source_type.split("-", maxsplit=1) if source_type == "api": - return get_raw_data_api( + error, filepath = get_raw_data_api( url=source_path, secret_path=secret_path, api_params=api_params, filepath=local_filepath, filetype=filetype, ) - if source_type == "gcs": - return get_raw_data_gcs( + elif source_type == "gcs": + error, filepath = get_raw_data_gcs( gcs_path=source_path, filename_to_unzip=table_id, local_filepath=local_filepath, ) + else: + raise NotImplementedError(f"{source_type} not supported") + + return error, filepath # TODO: passar para função para dentro da transform_raw_to_nested_structure From 4f21f0af7fff375354538c868e7b4cedd7943f4d Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 16:02:09 -0300 Subject: [PATCH 021/145] declare raw_filepath --- pipelines/rj_smtr/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 8a8804474..0fd5c7d6c 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -531,6 +531,7 @@ def get_raw_data_gcs( filename_to_unzip: str = None, ) -> tuple[str, str]: error = None + raw_filepath = None try: blob = get_storage_blob(gcs_path=gcs_path) From 11b973581c7ccc103d16bccc09dccd41f86f68da Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Wed, 27 Sep 2023 16:19:43 -0300 Subject: [PATCH 022/145] update docstrings --- pipelines/rj_smtr/tasks.py | 76 +++++++++++++++++++++++++++++++------- pipelines/rj_smtr/utils.py | 50 ++++++++++++++++++------- pipelines/utils/utils.py | 17 +++++++++ 3 files changed, 116 insertions(+), 27 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 269ee73eb..b12f0604c 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -168,7 +168,14 @@ def create_date_hour_partition( timestamp: datetime, partition_date_only: bool = False ) -> str: """ - Get date hour Hive partition structure from timestamp. + Generate partition string for date and hour. + + Args: + timestamp (datetime): timestamp to be used as reference + partition_date_only (bool, optional): whether to add hour partition or not + + Returns: + str: partition string """ partition = f"data={timestamp.strftime('%Y-%m-%d')}" if not partition_date_only: @@ -614,6 +621,20 @@ def upload_raw_data_to_gcs( dataset_id: str, partitions: list, ): + """ + Upload raw data to GCS. + + Args: + error (bool): whether the upstream tasks failed or not + raw_filepath (str): Path to the saved raw .json file + timestamp (datetime): timestamp for flow run + table_id (str): table_id on BigQuery + dataset_id (str): dataset_id on BigQuery + partitions (list): list of partition strings + + Returns: + None + """ if not error: try: st_obj = Storage(table_id=table_id, dataset_id=dataset_id) @@ -649,6 +670,20 @@ def upload_staging_data_to_gcs( dataset_id: str, partitions: list, ): + """ + Upload staging data to GCS. 
+ + Args: + error (bool): whether the upstream tasks failed or not + staging_filepath (str): Path to the saved treated .csv file + timestamp (datetime): timestamp for flow run + table_id (str): table_id on BigQuery + dataset_id (str): dataset_id on BigQuery + partitions (list): list of partition strings + + Returns: + None + """ if not error: try: # Creates and publish table if it does not exist, append to it otherwise @@ -883,19 +918,18 @@ def transform_raw_to_nested_structure( timestamp: datetime, primary_key: list = None, ): - """Transform dataframe to nested structure + """ + Task to transform raw data to nested structure Args: - status (dict): Must contain keys - * `data`: dataframe returned from treatement - * `error`: error catched from data treatement - timestamp (datetime): timestamp of the capture - primary_key (list, optional): List of primary keys to be used for nesting. + raw_filepath (str): Path to the saved raw .json file + filepath (str): Path to the saved treated .csv file + error (str): Error catched from upstream tasks + timestamp (datetime): timestamp for flow run + primary_key (list, optional): Primary key to be used on nested structure Returns: - dict: Conatining keys - * `data` (json): nested data - * `error` (str): catched error, if any. Otherwise, returns None + str: Path to the saved treated .csv file """ # Check previous error @@ -1001,10 +1035,10 @@ def create_request_params( Task to create request params Args: - datetime_range (dict): datetime range to get params - table_params (dict): table params to get params - secret_path (str): secret path to get params - dataset_id (str): dataset id to get params + extract_params (dict): extract parameters + table_id (str): table_id on BigQuery + dataset_id (str): dataset_id on BigQuery + timestamp (datetime): timestamp for flow run Returns: request_params: host, database and query to request data @@ -1049,6 +1083,20 @@ def get_raw_from_sources( secret_path: str = None, api_params: dict = None, ): + """ + Task to get raw data from sources + + Args: + source_type (str): source type + local_filepath (str): local filepath + source_path (str, optional): source path. Defaults to None. + table_id (str, optional): table_id on BigQuery. Defaults to None. + secret_path (str, optional): secret path. Defaults to None. + api_params (dict, optional): api parameters. Defaults to None. + + Returns: + error: error + """ source_type, filetype = source_type.split("-", maxsplit=1) if source_type == "api": diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 0fd5c7d6c..801c8d336 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -490,14 +490,14 @@ def get_raw_data_api( # pylint: disable=R0912 Request data from URL API Args: - url (str): URL to send request - secret_path (str, optional): Path to secrets guardeded on Vault, if needed. - params (dict, optional): Params to be sent on request + url (str): URL to request data + secret_path (str, optional): Secret path to get headers. Defaults to None. + api_params (dict, optional): Parameters to pass to API. Defaults to None. + filepath (str, optional): Path to save raw file. Defaults to None. + filetype (str, optional): Filetype to save raw file. Defaults to None. Returns: - dict: Conatining keys - * `data` (json): data result - * `error` (str): catched error, if any. 
Otherwise, returns None + tuple[str, str]: Error and filepath """ error = None try: @@ -530,6 +530,17 @@ def get_raw_data_gcs( local_filepath: str, filename_to_unzip: str = None, ) -> tuple[str, str]: + """ + Get raw data from GCS + + Args: + gcs_path (str): GCS path to get data + local_filepath (str): Local filepath to save raw data + filename_to_unzip (str, optional): Filename to unzip. Defaults to None. + + Returns: + tuple[str, str]: Error and filepath + """ error = None raw_filepath = None @@ -568,10 +579,9 @@ def save_treated_local_func( Save treated file to CSV. Args: - filepath (str): Path which to save treated file - status (dict): Must contain keys - * `data`: dataframe returned from treatement - * `error`: error catched from data treatement + filepath (str): Path to save file + data (pd.DataFrame): Dataframe to save + error (str): Error catched during execution mode (str, optional): Folder to save locally, later folder which to upload to GCS. Returns: @@ -601,9 +611,13 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 Args: dataset_id (str): dataset_id on BigQuery - parent_table_id (str): Parent table id related to the status table - timestamp (str): ISO formatted timestamp string - error (str, optional): String associated with error caught during execution + parent_table_id (str): table_id on BigQuery + timestamp (str): timestamp to get datetime range + error (str): error catched during execution + previous_error (str): previous error catched during execution + recapture (bool): if the execution was a recapture + mode (str): folder to save locally, later folder which to upload to GCS + Returns: None """ @@ -675,6 +689,16 @@ def get_datetime_range( def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataFrame]: + """ + Read raw data from file + + Args: + filepath (str): filepath to read + csv_args (dict): arguments to pass to pandas.read_csv + + Returns: + tuple[str, pd.DataFrame]: error and data + """ try: file_type = filepath.split(".")[-1] diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index 57384f8f4..e37a88d8b 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -714,6 +714,14 @@ def get_credentials_from_env( def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> list: """ Get all blobs from a table in a dataset. + + Args: + dataset_id (str): dataset id + table_id (str): table id + mode (str, optional): mode to use. Defaults to "staging". + + Returns: + list: list of blobs """ bd_storage = bd.Storage(dataset_id=dataset_id, table_id=table_id) @@ -727,6 +735,15 @@ def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> def get_storage_blob( gcs_path: str, ): + """ + Get a blob from a path. 
+ + Args: + gcs_path (str): path to blob + + Returns: + Blob: blob object + """ bucket = bd.Storage(dataset_id="", table_id="") return ( bucket.client["storage_staging"] From f484b880d54367e375a2ce72b02d9835f20fe4d1 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 16:20:42 -0300 Subject: [PATCH 023/145] adjust get_raw_from_sources return --- pipelines/rj_smtr/tasks.py | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 269ee73eb..023ea2796 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1051,22 +1051,28 @@ def get_raw_from_sources( ): source_type, filetype = source_type.split("-", maxsplit=1) - if source_type == "api": - error, filepath = get_raw_data_api( - url=source_path, - secret_path=secret_path, - api_params=api_params, - filepath=local_filepath, - filetype=filetype, - ) - elif source_type == "gcs": - error, filepath = get_raw_data_gcs( - gcs_path=source_path, - filename_to_unzip=table_id, - local_filepath=local_filepath, - ) - else: - raise NotImplementedError(f"{source_type} not supported") + log(f"Source type: {source_type}") + + try: + if source_type == "api": + error, filepath = get_raw_data_api( + url=source_path, + secret_path=secret_path, + api_params=api_params, + filepath=local_filepath, + filetype=filetype, + ) + elif source_type == "gcs": + error, filepath = get_raw_data_gcs( + gcs_path=source_path, + filename_to_unzip=table_id, + local_filepath=local_filepath, + ) + else: + raise NotImplementedError(f"{source_type} not supported") + except NotImplementedError as exp: + error = exp + filepath = None return error, filepath From 2df4318dc407b58ca6a6c4bf5a3bfad8db7fab37 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 16:41:00 -0300 Subject: [PATCH 024/145] fix errors --- pipelines/rj_smtr/tasks.py | 13 +++++++++++-- pipelines/rj_smtr/utils.py | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 7ff9ee637..9c2ae3be0 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1097,7 +1097,15 @@ def get_raw_from_sources( Returns: error: error """ - source_type, filetype = source_type.split("-", maxsplit=1) + error = None + filepath = None + + source_values = source_type.split("-", maxsplit=1) + source_type = source_values[0] + try: + filetype = source_values[1] + except IndexError: + filetype = None log(f"Source type: {source_type}") @@ -1120,8 +1128,9 @@ def get_raw_from_sources( raise NotImplementedError(f"{source_type} not supported") except NotImplementedError as exp: error = exp - filepath = None + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log(f"Raw extraction ended returned values: {error}, {filepath}") return error, filepath diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 801c8d336..743e955e1 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -699,6 +699,7 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF Returns: tuple[str, pd.DataFrame]: error and data """ + error = None try: file_type = filepath.split(".")[-1] From df6525ac9e946f5a3d3709b768e02f2c26aae1c8 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 16:45:37 -0300 Subject: [PATCH 025/145] change agent label to dev --- pipelines/rj_smtr/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index a4044933a..27eaa76a4 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -119,5 +119,5 @@ default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_capture_flow.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) From 2983b687fb1910cc1086cb875367493706ed905e Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 10:54:51 -0300 Subject: [PATCH 026/145] refactore source values --- pipelines/rj_smtr/tasks.py | 36 ++++++------------------------------ 1 file changed, 6 insertions(+), 30 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 9c2ae3be0..4a7182daf 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1100,14 +1100,13 @@ def get_raw_from_sources( error = None filepath = None - source_values = source_type.split("-", maxsplit=1) - source_type = source_values[0] - try: - filetype = source_values[1] - except IndexError: - filetype = None + source_values = source_type.split("-", 1) + + source_type, filetype = ( + source_values if len(source_values) == 2 else (source_values[0], None) + ) - log(f"Source type: {source_type}") + log(f"Getting raw data from source type: {source_type}") try: if source_type == "api": @@ -1132,26 +1131,3 @@ def get_raw_from_sources( log(f"Raw extraction ended returned values: {error}, {filepath}") return error, filepath - - -# TODO: passar para função para dentro da transform_raw_to_nested_structure -# @task(checkpoint=False) -# def transform_data_to_json(status: dict, file_type: str, csv_args: dict): -# data = status["data"] -# error = status["error"] - -# if file_type == "json": -# pass - -# # todo: move to data check on specfic API # pylint: disable=W0102 -# # if isinstance(data, dict) and "DescricaoErro" in data.keys(): -# # error = data["DescricaoErro"] - -# elif file_type in ("txt", "csv"): -# if csv_args is None: -# csv_args = {} -# data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") -# else: -# error = "Unsupported raw file extension. 
Supported only: json, csv and txt" - -# return {"data": data, "error": error} From 2c78b09404680d561a5afe5096428cb44a3b8032 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 11:27:23 -0300 Subject: [PATCH 027/145] update constants --- pipelines/rj_smtr/constants.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 722d7e9e1..3afb0b1cd 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -185,6 +185,8 @@ class constants(Enum): # pylint: disable=c0103 "source_type": "api-json", "transacao_run_interval": {"minutes": 1}, "principal_run_interval": {"days": 1}, + "transacao_runs_interval_minutes": 0, + "principal_runs_interval_minutes": 15, } BILHETAGEM_TRANSACAO_CAPTURE_PARAMS = { @@ -205,7 +207,7 @@ class constants(Enum): # pylint: disable=c0103 """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS["transacao_run_interval"], }, - "primary_key": ["id"], + "primary_key": ["id"], # id column to nest data on } BILHETAGEM_CAPTURE_PARAMS = [ @@ -249,7 +251,7 @@ class constants(Enum): # pylint: disable=c0103 "principal_run_interval" ], }, - "primary_key": ["CD_GRUPO"], + "primary_key": ["CD_GRUPO"], # id column to nest data on }, { "table_id": "grupo_linha", From 1f3c2fc307e21e77de206f5ded612a690e8108cf Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 11:28:23 -0300 Subject: [PATCH 028/145] update agent --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index d7f44e3b9..793d37c0d 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -30,7 +30,7 @@ bilhetagem_transacao_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_transacao_captura.schedule = bilhetagem_transacao_schedule @@ -41,6 +41,6 @@ bilhetagem_principal_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_principal_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_principal_captura.schedule = bilhetagem_principal_schedule From 702e70d6ae1341889e333e2d07fc0fec70dd6cef Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 11:30:21 -0300 Subject: [PATCH 029/145] update schedule params --- .../rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index f19f0d8ad..e897286b0 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -26,7 +26,9 @@ dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], - runs_interval_minutes=15, + runs_interval_minutes=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value[ + 
"principal_runs_interval_minutes" + ], ) bilhetagem_principal_schedule = Schedule(clocks=untuple(bilhetagem_principal_clocks)) @@ -42,7 +44,9 @@ dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], - runs_interval_minutes=0, + runs_interval_minutes=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value[ + "transacao_runs_interval_minutes" + ], ) bilhetagem_transacao_schedule = Schedule(clocks=untuple(bilhetagem_transacao_clocks)) From b5712d2746675c4925231382f2cf436da339be94 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 11:42:25 -0300 Subject: [PATCH 030/145] update interval --- pipelines/rj_smtr/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 743e955e1..0972a22c8 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -664,21 +664,21 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 def get_datetime_range( timestamp: datetime, - interval: int, + interval: timedelta, ) -> dict: """ Task to get datetime range in UTC Args: timestamp (datetime): timestamp to get datetime range - interval (int): interval in seconds + interval (timedelta): interval to get datetime range Returns: dict: datetime range """ start = ( - (timestamp - timedelta(seconds=interval)) + (timestamp - timedelta(interval)) .astimezone(tz=pytz.timezone("UTC")) .strftime("%Y-%m-%d %H:%M:%S") ) From e3df22cc2cec64b6fcc7e0258caafdf542c8ab86 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 11:44:39 -0300 Subject: [PATCH 031/145] fix get_datetime_range interval --- pipelines/rj_smtr/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 0972a22c8..7b32e2831 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -678,7 +678,7 @@ def get_datetime_range( """ start = ( - (timestamp - timedelta(interval)) + (timestamp - interval) .astimezone(tz=pytz.timezone("UTC")) .strftime("%Y-%m-%d %H:%M:%S") ) From 6ed06dad2772cb2d4ff32e6a19393d2e24cfe47f Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 12:21:35 -0300 Subject: [PATCH 032/145] remove order by from queries --- pipelines/rj_smtr/constants.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 3afb0b1cd..4f2b1c95a 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -202,8 +202,6 @@ class constants(Enum): # pylint: disable=c0103 WHERE data_processamento BETWEEN '{start}' AND '{end}' - ORDER BY - data_processamento """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS["transacao_run_interval"], }, @@ -223,8 +221,6 @@ class constants(Enum): # pylint: disable=c0103 LINHA WHERE DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ "principal_run_interval" @@ -244,8 +240,6 @@ class constants(Enum): # pylint: disable=c0103 GRUPO WHERE DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ "principal_run_interval" @@ -265,8 +259,6 @@ class constants(Enum): # pylint: disable=c0103 GRUPO_LINHA WHERE DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ "principal_run_interval" @@ -286,8 +278,6 @@ class constants(Enum): # pylint: 
disable=c0103 matriz_integracao WHERE dt_inclusao >= '{start}' - ORDER BY - dt_inclusao """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ "principal_run_interval" From 822c59f256d4e4ff900486a6472145bcbea4b08a Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 12:22:30 -0300 Subject: [PATCH 033/145] fix get_raw_data_api --- pipelines/rj_smtr/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 7b32e2831..445389340 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -506,6 +506,13 @@ def get_raw_data_api( # pylint: disable=R0912 else: headers = get_vault_secret(secret_path)["data"] + # remove from headers, if present + # TODO: remove this before merge to master + remove_headers = ["host", "databases"] + for remove_header in remove_headers: + if remove_header in list(headers.keys()): + del headers[remove_header] + response = requests.get( url, headers=headers, From c58ea9639bcb2812484dd899de6bfd33a776aec9 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 15:41:42 -0300 Subject: [PATCH 034/145] change json read function --- pipelines/rj_smtr/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 445389340..be8ed7bbd 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -707,11 +707,14 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF tuple[str, pd.DataFrame]: error and data """ error = None + data = None try: file_type = filepath.split(".")[-1] if file_type == "json": - data = pd.read_json(filepath) + with open(filepath, "r") as file: + data = json.load(file) + data = pd.DataFrame(data) # data = json.loads(data) elif file_type in ("txt", "csv"): From 045a42368562263938b90a25feffaaed4c83318d Mon Sep 17 00:00:00 2001 From: Carolina Gomes Date: Thu, 28 Sep 2023 16:01:10 -0300 Subject: [PATCH 035/145] update read_raw_data --- pipelines/rj_smtr/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index be8ed7bbd..c0c203dcd 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -713,8 +713,8 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF if file_type == "json": with open(filepath, "r") as file: - data = json.load(file) - data = pd.DataFrame(data) + data = pd.DataFrame.from_dict(json.load(file), orient="records") + # data = json.loads(data) elif file_type in ("txt", "csv"): From d2d188f7491de19ac2554eb465e46829d04e572c Mon Sep 17 00:00:00 2001 From: Carolina Gomes Date: Thu, 28 Sep 2023 16:09:27 -0300 Subject: [PATCH 036/145] update save_raw_local_func --- pipelines/rj_smtr/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index c0c203dcd..20168b039 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -448,7 +448,7 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l def save_raw_local_func( - data: dict, filepath: str, mode: str = "raw", filetype: str = "json" + data: Union[dict, str], filepath: str, mode: str = "raw", filetype: str = "json" ) -> str: """ Saves json response from API to .json file. 
@@ -467,6 +467,8 @@ def save_raw_local_func( Path(_filepath).parent.mkdir(parents=True, exist_ok=True) if filetype == "json": + if isinstance(data, dict): + data = json.loads(data) json.dump(data, Path(_filepath).open("w", encoding="utf-8")) # if filetype == "csv": From b7c4e2fe39b2e0d3a613a68ecab8a155787f2292 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 16:18:03 -0300 Subject: [PATCH 037/145] log error --- pipelines/rj_smtr/utils.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 20168b039..6219aaa78 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -9,6 +9,8 @@ from datetime import timedelta, datetime from typing import List, Union +import traceback +import sys import io import json import zipfile @@ -52,6 +54,19 @@ def log_critical(message: str, secret_path: str = constants.CRITICAL_SECRET_PATH return send_discord_message(message=message, webhook_url=url) +def log_error(error: str): + tb = sys.exc_info()[-1] + frame = traceback.extract_tb(tb, 1)[0] + file_name = frame[0] + function_name = frame[2] + line_no = frame[1] + + log( + f"[CATCHED] Task failed in file {file_name} - ({function_name}) line: {line_no} with error: \n{error}", + level="error", + ) + + def create_or_append_table( dataset_id: str, table_id: str, path: str, partitions: str = None ): @@ -728,6 +743,7 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF except Exception as exp: error = exp - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) + # log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, data From 2bedf890ee42187088bfa645d61a0af08598f4f7 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 16:44:41 -0300 Subject: [PATCH 038/145] change raw api extraction for json --- pipelines/rj_smtr/tasks.py | 7 ++++--- pipelines/rj_smtr/utils.py | 14 +++++++++----- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 4a7182daf..be878db21 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -34,6 +34,7 @@ get_datetime_range, read_raw_data, save_treated_local_func, + log_error, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -434,7 +435,7 @@ def get_raw( # pylint: disable=R0912 error = exp if error is not None: - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) return {"data": data, "error": error} @@ -992,7 +993,7 @@ def transform_raw_to_nested_structure( error = exp if error is not None: - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) return error, filepath @@ -1127,7 +1128,7 @@ def get_raw_from_sources( raise NotImplementedError(f"{source_type} not supported") except NotImplementedError as exp: error = exp - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) log(f"Raw extraction ended returned values: {error}, {filepath}") return error, filepath diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 6219aaa78..41b29d41e 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -538,13 +538,17 @@ def get_raw_data_api( # pylint: disable=R0912 ) response.raise_for_status() - filepath = save_raw_local_func( - data=response.text, 
filepath=filepath, filetype=filetype - ) + + if filetype == "json": + data = response.json() + else: + data = response.text + + filepath = save_raw_local_func(data=data, filepath=filepath, filetype=filetype) except Exception as exp: error = exp - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) return error, filepath @@ -591,7 +595,7 @@ def get_raw_data_gcs( except Exception as exp: error = exp - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) return error, raw_filepath From 20b48dfb2950ba513c049e922b8768da9ab03e57 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 16:53:26 -0300 Subject: [PATCH 039/145] change read json function --- pipelines/rj_smtr/utils.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 41b29d41e..9c04ed701 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -55,14 +55,9 @@ def log_critical(message: str, secret_path: str = constants.CRITICAL_SECRET_PATH def log_error(error: str): - tb = sys.exc_info()[-1] - frame = traceback.extract_tb(tb, 1)[0] - file_name = frame[0] - function_name = frame[2] - line_no = frame[1] - + error = traceback.format_exc() log( - f"[CATCHED] Task failed in file {file_name} - ({function_name}) line: {line_no} with error: \n{error}", + f"[CATCHED] Task failed with error: \n{error}", level="error", ) @@ -733,11 +728,9 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF file_type = filepath.split(".")[-1] if file_type == "json": - with open(filepath, "r") as file: - data = pd.DataFrame.from_dict(json.load(file), orient="records") + data = pd.read_json(filepath) # data = json.loads(data) - elif file_type in ("txt", "csv"): if csv_args is None: csv_args = {} From 42c6ac008e6e8f569993c9b0a40958941c0750a0 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 17:45:44 -0300 Subject: [PATCH 040/145] print log traceback --- pipelines/rj_smtr/tasks.py | 23 +++++++++-------------- pipelines/rj_smtr/utils.py | 21 ++++++--------------- 2 files changed, 15 insertions(+), 29 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index be878db21..dd48d2c64 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -34,7 +34,6 @@ get_datetime_range, read_raw_data, save_treated_local_func, - log_error, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -431,11 +430,9 @@ def get_raw( # pylint: disable=R0912 "Unsupported raw file extension. 
Supported only: json, csv and txt" ) - except Exception as exp: - error = exp - - if error is not None: - log_error(error=error) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return {"data": data, "error": error} @@ -989,11 +986,9 @@ def transform_raw_to_nested_structure( # save treated local filepath = save_treated_local_func(data=data, error=error, filepath=filepath) - except Exception as exp: # pylint: disable=W0703 - error = exp - - if error is not None: - log_error(error=error) + except Exception: # pylint: disable=W0703 + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, filepath @@ -1126,9 +1121,9 @@ def get_raw_from_sources( ) else: raise NotImplementedError(f"{source_type} not supported") - except NotImplementedError as exp: - error = exp - log_error(error=error) + except NotImplementedError: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") log(f"Raw extraction ended returned values: {error}, {filepath}") return error, filepath diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 9c04ed701..553bd860a 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -54,14 +54,6 @@ def log_critical(message: str, secret_path: str = constants.CRITICAL_SECRET_PATH return send_discord_message(message=message, webhook_url=url) -def log_error(error: str): - error = traceback.format_exc() - log( - f"[CATCHED] Task failed with error: \n{error}", - level="error", - ) - - def create_or_append_table( dataset_id: str, table_id: str, path: str, partitions: str = None ): @@ -542,8 +534,8 @@ def get_raw_data_api( # pylint: disable=R0912 filepath = save_raw_local_func(data=data, filepath=filepath, filetype=filetype) except Exception as exp: - error = exp - log_error(error=error) + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, filepath @@ -589,8 +581,8 @@ def get_raw_data_gcs( ) except Exception as exp: - error = exp - log_error(error=error) + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, raw_filepath @@ -739,8 +731,7 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF error = "Unsupported raw file extension. 
Supported only: json, csv and txt" except Exception as exp: - error = exp - log_error(error=error) - # log(f"[CATCHED] Task failed with error: \n{error}", level="error") + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, data From 25276040630a11950a1c5f556ccff944e2202a39 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 23:08:40 -0300 Subject: [PATCH 041/145] skip pre treatment if empty df --- pipelines/rj_smtr/tasks.py | 53 ++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index dd48d2c64..8a24934ce 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -937,11 +937,6 @@ def transform_raw_to_nested_structure( # ORGANIZAR: - # Check empty dataframe - # if len(status["data"]) == 0: - # log("Empty dataframe, skipping transformation...") - # return {"data": pd.DataFrame(), "error": error} - try: # leitura do dado raw error, data = read_raw_data(filepath=raw_filepath) @@ -956,32 +951,40 @@ def transform_raw_to_nested_structure( - data:\n{data.head()}""" ) - log(f"Raw data:\n{data_info_str(data)}", level="info") + # Check empty dataframe + if data.empty: + log("Empty dataframe, skipping transformation...") + else: + log(f"Raw data:\n{data_info_str(data)}", level="info") - log("Adding captured timestamp column...", level="info") - data["timestamp_captura"] = timestamp + log("Adding captured timestamp column...", level="info") + data["timestamp_captura"] = timestamp - log("Striping string columns...", level="info") - for col in data.columns[data.dtypes == "object"].to_list(): - data[col] = data[col].str.strip() + log("Striping string columns...", level="info") + for col in data.columns[data.dtypes == "object"].to_list(): + data[col] = data[col].str.strip() - log(f"Finished cleaning! Data:\n{data_info_str(data)}", level="info") + log(f"Finished cleaning! Data:\n{data_info_str(data)}", level="info") - log("Creating nested structure...", level="info") - pk_cols = primary_key + ["timestamp_captura"] - data = ( - data.groupby(pk_cols) - .apply( - lambda x: x[data.columns.difference(pk_cols)].to_json(orient="records") + log("Creating nested structure...", level="info") + pk_cols = primary_key + ["timestamp_captura"] + data = ( + data.groupby(pk_cols) + .apply( + lambda x: x[data.columns.difference(pk_cols)].to_json( + orient="records" + ) + ) + .str.strip("[]") + .reset_index(name="content")[ + primary_key + ["content", "timestamp_captura"] + ] ) - .str.strip("[]") - .reset_index(name="content")[primary_key + ["content", "timestamp_captura"]] - ) - log( - f"Finished nested structure! Data:\n{data_info_str(data)}", - level="info", - ) + log( + f"Finished nested structure! 
Data:\n{data_info_str(data)}", + level="info", + ) # save treated local filepath = save_treated_local_func(data=data, error=error, filepath=filepath) From 0f907b977b075949373c790fabf221409bae1ca6 Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 06:47:53 -0300 Subject: [PATCH 042/145] skip save staging if dataframe is empty / save raw --- pipelines/rj_smtr/flows.py | 3 ++- pipelines/rj_smtr/tasks.py | 27 ++++++++++++++++++--------- pipelines/rj_smtr/utils.py | 27 +++++++++++---------------- 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 27eaa76a4..f1b30335a 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -98,7 +98,7 @@ # Pré-tratamento # - error, staging_filepath = transform_raw_to_nested_structure( + error, staging_filepath, flag_empty_data = transform_raw_to_nested_structure( raw_filepath=raw_filepath, filepath=filepath, error=error, @@ -114,6 +114,7 @@ table_id=table_id, dataset_id=dataset_id, partitions=partitions, + flag_empty_data=flag_empty_data, ) default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 8a24934ce..ee6d5bfa1 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -34,6 +34,7 @@ get_datetime_range, read_raw_data, save_treated_local_func, + save_raw_local_func, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -667,6 +668,7 @@ def upload_staging_data_to_gcs( table_id: str, dataset_id: str, partitions: list, + flag_empty_data: bool, ): """ Upload staging data to GCS. @@ -682,7 +684,9 @@ def upload_staging_data_to_gcs( Returns: None """ - if not error: + if flag_empty_data: + log("Empty dataframe, skipping upload") + elif not error: try: # Creates and publish table if it does not exist, append to it otherwise create_or_append_table( @@ -908,7 +912,7 @@ def get_previous_date(days): ############### -@task(nout=2) +@task(nout=3) def transform_raw_to_nested_structure( raw_filepath: str, filepath: str, @@ -931,9 +935,9 @@ def transform_raw_to_nested_structure( """ # Check previous error - + flag_empty_data = False if error is not None: - return error, None + return error, None, flag_empty_data # ORGANIZAR: @@ -953,6 +957,7 @@ def transform_raw_to_nested_structure( # Check empty dataframe if data.empty: + flag_empty_data = True log("Empty dataframe, skipping transformation...") else: log(f"Raw data:\n{data_info_str(data)}", level="info") @@ -993,7 +998,7 @@ def transform_raw_to_nested_structure( error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return error, filepath + return error, filepath, flag_empty_data # @task(checkpoint=False) @@ -1098,6 +1103,7 @@ def get_raw_from_sources( """ error = None filepath = None + data = None source_values = source_type.split("-", 1) @@ -1109,21 +1115,24 @@ def get_raw_from_sources( try: if source_type == "api": - error, filepath = get_raw_data_api( + error, data, filetype = get_raw_data_api( url=source_path, secret_path=secret_path, api_params=api_params, - filepath=local_filepath, filetype=filetype, ) elif source_type == "gcs": - error, filepath = get_raw_data_gcs( + error, data, filetype = get_raw_data_gcs( gcs_path=source_path, filename_to_unzip=table_id, - local_filepath=local_filepath, ) else: raise NotImplementedError(f"{source_type} not supported") + + filepath = 
save_raw_local_func( + data=data, filepath=local_filepath, filetype=filetype + ) + except NotImplementedError: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 553bd860a..f3ff410c4 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -487,9 +487,8 @@ def get_raw_data_api( # pylint: disable=R0912 url: str, secret_path: str = None, api_params: dict = None, - filepath: str = None, filetype: str = None, -) -> tuple[str, str]: +) -> tuple[str, str, str]: """ Request data from URL API @@ -504,6 +503,7 @@ def get_raw_data_api( # pylint: disable=R0912 tuple[str, str]: Error and filepath """ error = None + data = None try: if secret_path is None: headers = secret_path @@ -531,20 +531,17 @@ def get_raw_data_api( # pylint: disable=R0912 else: data = response.text - filepath = save_raw_local_func(data=data, filepath=filepath, filetype=filetype) - - except Exception as exp: + except Exception: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return error, filepath + return error, data, filetype def get_raw_data_gcs( gcs_path: str, - local_filepath: str, filename_to_unzip: str = None, -) -> tuple[str, str]: +) -> tuple[str, str, str]: """ Get raw data from GCS @@ -557,7 +554,8 @@ def get_raw_data_gcs( tuple[str, str]: Error and filepath """ error = None - raw_filepath = None + data = None + filetype = None try: blob = get_storage_blob(gcs_path=gcs_path) @@ -574,17 +572,14 @@ def get_raw_data_gcs( else: filename = blob.name - raw_filepath = save_raw_local_func( - data=data.decode(encoding="utf-8"), - filepath=local_filepath, - filetype=filename.split(".")[-1], - ) + data = data.decode(encoding="utf-8") + filetype = filename.split(".")[-1] - except Exception as exp: + except Exception: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return error, raw_filepath + return error, data, filetype def save_treated_local_func( From ba1dad2654709c11c0fedd9d88d7a5eb6a969c60 Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 11:30:15 -0300 Subject: [PATCH 043/145] remove skip upload if empty dataframe --- pipelines/rj_smtr/flows.py | 3 +-- pipelines/rj_smtr/tasks.py | 8 +++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index f1b30335a..27eaa76a4 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -98,7 +98,7 @@ # Pré-tratamento # - error, staging_filepath, flag_empty_data = transform_raw_to_nested_structure( + error, staging_filepath = transform_raw_to_nested_structure( raw_filepath=raw_filepath, filepath=filepath, error=error, @@ -114,7 +114,6 @@ table_id=table_id, dataset_id=dataset_id, partitions=partitions, - flag_empty_data=flag_empty_data, ) default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index ee6d5bfa1..2fe9e27ed 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -912,7 +912,7 @@ def get_previous_date(days): ############### -@task(nout=3) +@task(nout=2) def transform_raw_to_nested_structure( raw_filepath: str, filepath: str, @@ -935,9 +935,8 @@ def transform_raw_to_nested_structure( """ # Check previous error - flag_empty_data = False if error is not None: - return error, None, flag_empty_data + return error, None # ORGANIZAR: @@ -957,7 
+956,6 @@ def transform_raw_to_nested_structure( # Check empty dataframe if data.empty: - flag_empty_data = True log("Empty dataframe, skipping transformation...") else: log(f"Raw data:\n{data_info_str(data)}", level="info") @@ -998,7 +996,7 @@ def transform_raw_to_nested_structure( error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return error, filepath, flag_empty_data + return error, filepath # @task(checkpoint=False) From 4c3d1cffa53f376a8aa4ed3493db531e89bbc378 Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 11:31:37 -0300 Subject: [PATCH 044/145] update docstring and returned values --- pipelines/rj_smtr/tasks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 2fe9e27ed..14da15069 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -919,7 +919,7 @@ def transform_raw_to_nested_structure( error: str, timestamp: datetime, primary_key: list = None, -): +) -> tuple(str, str): """ Task to transform raw data to nested structure @@ -931,6 +931,7 @@ def transform_raw_to_nested_structure( primary_key (list, optional): Primary key to be used on nested structure Returns: + str: Error traceback str: Path to the saved treated .csv file """ From 39e8606ffba05a096cc05f672085c05d5b4bd091 Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 11:33:28 -0300 Subject: [PATCH 045/145] reorganize task order --- pipelines/rj_smtr/tasks.py | 226 ++++++++++++++++++------------------- 1 file changed, 113 insertions(+), 113 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 14da15069..9c372e213 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -438,6 +438,119 @@ def get_raw( # pylint: disable=R0912 return {"data": data, "error": error} +@task(checkpoint=False, nout=2) +def create_request_params( + extract_params: dict, + table_id: str, + dataset_id: str, + timestamp: datetime, +) -> tuple: + """ + Task to create request params + + Args: + extract_params (dict): extract parameters + table_id (str): table_id on BigQuery + dataset_id (str): dataset_id on BigQuery + timestamp (datetime): timestamp for flow run + + Returns: + request_params: host, database and query to request data + request_url: url to request data + """ + request_params = None + + if dataset_id == constants.BILHETAGEM_DATASET_ID.value: + database = constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["databases"][ + extract_params["database"] + ] + request_url = ( + constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["vpn_url"] + + database["engine"] + ) + + datetime_range = get_datetime_range( + timestamp=timestamp, interval=timedelta(**extract_params["run_interval"]) + ) + + request_params = { + "host": database["host"], # TODO: exibir no log em ambiente fechado + "database": extract_params["database"], + "query": extract_params["query"].format(**datetime_range), + } + + elif dataset_id == constants.GTFS_DATASET_ID.value: + if table_id == constants.GTFS_QUADRO_CAPTURE_PARAMS.value["table_id"]: + request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/{table_id}.csv" + else: + request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/gtfs.zip" + + return request_params, request_url + + +@task(checkpoint=False, nout=2) +def get_raw_from_sources( + source_type: str, + local_filepath: str, + source_path: str = None, + table_id: str = None, + secret_path: str = None, + api_params: dict = None, +): + """ + Task to get raw data 
from sources + + Args: + source_type (str): source type + local_filepath (str): local filepath + source_path (str, optional): source path. Defaults to None. + table_id (str, optional): table_id on BigQuery. Defaults to None. + secret_path (str, optional): secret path. Defaults to None. + api_params (dict, optional): api parameters. Defaults to None. + + Returns: + error: error + """ + error = None + filepath = None + data = None + + source_values = source_type.split("-", 1) + + source_type, filetype = ( + source_values if len(source_values) == 2 else (source_values[0], None) + ) + + log(f"Getting raw data from source type: {source_type}") + + try: + if source_type == "api": + error, data, filetype = get_raw_data_api( + url=source_path, + secret_path=secret_path, + api_params=api_params, + filetype=filetype, + ) + elif source_type == "gcs": + error, data, filetype = get_raw_data_gcs( + gcs_path=source_path, + filename_to_unzip=table_id, + ) + else: + raise NotImplementedError(f"{source_type} not supported") + + filepath = save_raw_local_func( + data=data, filepath=local_filepath, filetype=filetype + ) + + except NotImplementedError: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + log(f"Raw extraction ended returned values: {error}, {filepath}") + return error, filepath + + ############### # # Load data @@ -1025,116 +1138,3 @@ def transform_raw_to_nested_structure( # end = timestamp.astimezone(tz=timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") # return {"start": start, "end": end} - - -@task(checkpoint=False, nout=2) -def create_request_params( - extract_params: dict, - table_id: str, - dataset_id: str, - timestamp: datetime, -) -> tuple: - """ - Task to create request params - - Args: - extract_params (dict): extract parameters - table_id (str): table_id on BigQuery - dataset_id (str): dataset_id on BigQuery - timestamp (datetime): timestamp for flow run - - Returns: - request_params: host, database and query to request data - request_url: url to request data - """ - request_params = None - - if dataset_id == constants.BILHETAGEM_DATASET_ID.value: - database = constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["databases"][ - extract_params["database"] - ] - request_url = ( - constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["vpn_url"] - + database["engine"] - ) - - datetime_range = get_datetime_range( - timestamp=timestamp, interval=timedelta(**extract_params["run_interval"]) - ) - - request_params = { - "host": database["host"], # TODO: exibir no log em ambiente fechado - "database": extract_params["database"], - "query": extract_params["query"].format(**datetime_range), - } - - elif dataset_id == constants.GTFS_DATASET_ID.value: - if table_id == constants.GTFS_QUADRO_CAPTURE_PARAMS.value["table_id"]: - request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/{table_id}.csv" - else: - request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/gtfs.zip" - - return request_params, request_url - - -@task(checkpoint=False, nout=2) -def get_raw_from_sources( - source_type: str, - local_filepath: str, - source_path: str = None, - table_id: str = None, - secret_path: str = None, - api_params: dict = None, -): - """ - Task to get raw data from sources - - Args: - source_type (str): source type - local_filepath (str): local filepath - source_path (str, optional): source path. Defaults to None. - table_id (str, optional): table_id on BigQuery. Defaults to None. - secret_path (str, optional): secret path. Defaults to None. 
- api_params (dict, optional): api parameters. Defaults to None. - - Returns: - error: error - """ - error = None - filepath = None - data = None - - source_values = source_type.split("-", 1) - - source_type, filetype = ( - source_values if len(source_values) == 2 else (source_values[0], None) - ) - - log(f"Getting raw data from source type: {source_type}") - - try: - if source_type == "api": - error, data, filetype = get_raw_data_api( - url=source_path, - secret_path=secret_path, - api_params=api_params, - filetype=filetype, - ) - elif source_type == "gcs": - error, data, filetype = get_raw_data_gcs( - gcs_path=source_path, - filename_to_unzip=table_id, - ) - else: - raise NotImplementedError(f"{source_type} not supported") - - filepath = save_raw_local_func( - data=data, filepath=local_filepath, filetype=filetype - ) - - except NotImplementedError: - error = traceback.format_exc() - log(f"[CATCHED] Task failed with error: \n{error}", level="error") - - log(f"Raw extraction ended returned values: {error}, {filepath}") - return error, filepath From 465ee525648dae41c1e938986f4acc08ec5bac18 Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 11:47:24 -0300 Subject: [PATCH 046/145] fix tuple --- pipelines/rj_smtr/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 9c372e213..f5fb79ede 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1032,7 +1032,7 @@ def transform_raw_to_nested_structure( error: str, timestamp: datetime, primary_key: list = None, -) -> tuple(str, str): +) -> tuple[str, str]: """ Task to transform raw data to nested structure From 67a1056a3e363b01fa8573c247ab41aa453ffd2c Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:01:42 -0300 Subject: [PATCH 047/145] change zip logic --- pipelines/rj_smtr/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index f3ff410c4..338f2e07b 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -560,9 +560,11 @@ def get_raw_data_gcs( try: blob = get_storage_blob(gcs_path=gcs_path) + blob_type = blob.name.split(".")[-1] + data = blob.download_as_bytes() - if filename_to_unzip: + if blob_type == "zip": with zipfile.ZipFile(io.BytesIO(data), "r") as zipped_file: filenames = zipped_file.namelist() filename = list( From 3f5f34cabc75a05f424d0e3ed8c1443915f9656c Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:03:15 -0300 Subject: [PATCH 048/145] remove skip --- pipelines/rj_smtr/tasks.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index f5fb79ede..4d4088866 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -781,7 +781,6 @@ def upload_staging_data_to_gcs( table_id: str, dataset_id: str, partitions: list, - flag_empty_data: bool, ): """ Upload staging data to GCS. 
@@ -797,9 +796,7 @@ def upload_staging_data_to_gcs( Returns: None """ - if flag_empty_data: - log("Empty dataframe, skipping upload") - elif not error: + if not error: try: # Creates and publish table if it does not exist, append to it otherwise create_or_append_table( From 7860a4bdd58e1f35bd0260ce0088db7049a67a3f Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:09:20 -0300 Subject: [PATCH 049/145] create gtfs zip constant --- pipelines/rj_smtr/constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 4f2b1c95a..009c241e1 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -309,3 +309,4 @@ class constants(Enum): # pylint: disable=c0103 ] GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" + GTFS_ZIP_FILENAME = "gtfs.zip" From 2d7c9cb8b12f4e2ca71fcffd8b2896952acbc11a Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:13:06 -0300 Subject: [PATCH 050/145] add gtfs zip file name --- pipelines/rj_smtr/constants.py | 2 +- pipelines/rj_smtr/tasks.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 009c241e1..4975a246f 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -309,4 +309,4 @@ class constants(Enum): # pylint: disable=c0103 ] GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" - GTFS_ZIP_FILENAME = "gtfs.zip" + GTFS_ZIP_NAME = "gtfs.zip" diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 4d4088866..8863f6405 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -483,7 +483,9 @@ def create_request_params( if table_id == constants.GTFS_QUADRO_CAPTURE_PARAMS.value["table_id"]: request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/{table_id}.csv" else: - request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/gtfs.zip" + request_url = ( + f"{constants.GTFS_BASE_GCS_PATH.value}/{constants.GTFS_ZIP_NAME.value}" + ) return request_params, request_url From bfa62739a46ce2d8f287e4989755aa4e2b6505e9 Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:18:11 -0300 Subject: [PATCH 051/145] add csv to save raw / change filetype logic --- pipelines/rj_smtr/utils.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 338f2e07b..f3362bd91 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -475,7 +475,7 @@ def save_raw_local_func( # if filetype == "csv": # pass - if filetype == "txt": + if filetype in ("txt", "csv"): with open(_filepath, "w", encoding="utf-8") as file: file.write(data) @@ -560,22 +560,21 @@ def get_raw_data_gcs( try: blob = get_storage_blob(gcs_path=gcs_path) - blob_type = blob.name.split(".")[-1] + filename = blob.name + filetype = filename.split(".")[-1] data = blob.download_as_bytes() - if blob_type == "zip": + if filetype == "zip": with zipfile.ZipFile(io.BytesIO(data), "r") as zipped_file: filenames = zipped_file.namelist() filename = list( filter(lambda x: x.split(".")[0] == filename_to_unzip, filenames) )[0] + filetype = filename.split(".")[-1] data = zipped_file.read(filename) - else: - filename = blob.name data = data.decode(encoding="utf-8") - filetype = filename.split(".")[-1] 
except Exception: error = traceback.format_exc() From 524cd07363cc5866db94a1c67d4b32f04ceac87f Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:18:32 -0300 Subject: [PATCH 052/145] remove comments --- pipelines/rj_smtr/flows.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 27eaa76a4..3fc18b7b0 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -22,17 +22,11 @@ create_local_partition_path, get_current_timestamp, parse_timestamp_to_string, - # save_raw_local, - # save_treated_local, - # upload_logs_to_bq, - # bq_upload, upload_raw_data_to_gcs, upload_staging_data_to_gcs, transform_raw_to_nested_structure, get_raw_from_sources, - # transform_data_to_json, create_request_params, - # get_datetime_range, ) @@ -79,7 +73,7 @@ ) error, raw_filepath = get_raw_from_sources( - source_type=source_type, # parametro de extracao, onde ficar? + source_type=source_type, local_filepath=filepath, source_path=request_path, table_id=table_id, From 3477a2c53306bacff80f08fb2d94a7863381e61e Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:21:00 -0300 Subject: [PATCH 053/145] fix csv_args default value --- pipelines/rj_smtr/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index f3362bd91..b60d8d8ac 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -699,7 +699,7 @@ def get_datetime_range( return {"start": start, "end": end} -def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataFrame]: +def read_raw_data(filepath: str, csv_args: dict = None) -> tuple[str, pd.DataFrame]: """ Read raw data from file @@ -726,7 +726,7 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF else: error = "Unsupported raw file extension. Supported only: json, csv and txt" - except Exception as exp: + except Exception: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") From 16e61c879172bce4679d696884ba486294963e5c Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:24:43 -0300 Subject: [PATCH 054/145] change docstring get raw api --- pipelines/rj_smtr/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index b60d8d8ac..33378f049 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -496,7 +496,6 @@ def get_raw_data_api( # pylint: disable=R0912 url (str): URL to request data secret_path (str, optional): Secret path to get headers. Defaults to None. api_params (dict, optional): Parameters to pass to API. Defaults to None. - filepath (str, optional): Path to save raw file. Defaults to None. filetype (str, optional): Filetype to save raw file. Defaults to None. Returns: From 4bdaa4fe4ed5b576aa3a04af44fed9a792b274ea Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:27:33 -0300 Subject: [PATCH 055/145] change raw data gcs docstring --- pipelines/rj_smtr/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 33378f049..aeafa8ae1 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -546,7 +546,6 @@ def get_raw_data_gcs( Args: gcs_path (str): GCS path to get data - local_filepath (str): Local filepath to save raw data filename_to_unzip (str, optional): Filename to unzip. Defaults to None. 
Returns: From e3b7c140db20a62c4d8be850abab2087406c890f Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:29:21 -0300 Subject: [PATCH 056/145] remove commented task --- pipelines/rj_smtr/tasks.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 8863f6405..65ab9505e 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1110,30 +1110,3 @@ def transform_raw_to_nested_structure( log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, filepath - - -# @task(checkpoint=False) -# def get_datetime_range( -# timestamp: datetime, -# interval: int, -# ) -> dict: -# """ -# Task to get datetime range in UTC - -# Args: -# timestamp (datetime): timestamp to get datetime range -# interval (int): interval in seconds - -# Returns: -# dict: datetime range -# """ - -# start = ( -# (timestamp - timedelta(seconds=interval)) -# .astimezone(tz=timezone("UTC")) -# .strftime("%Y-%m-%d %H:%M:%S") -# ) - -# end = timestamp.astimezone(tz=timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") - -# return {"start": start, "end": end} From 0935cbd46f0fd720f3675b991f4c6fa64e5b58ef Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:30:08 -0300 Subject: [PATCH 057/145] change quadro primary key to list --- pipelines/rj_smtr/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 4975a246f..b1072b607 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -307,6 +307,6 @@ class constants(Enum): # pylint: disable=c0103 {"table_id": "fare_attributes", "primary_key": ["fare_id"]}, {"table_id": "fare_rules", "primary_key": ["fare_id"]}, ] - GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} + GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": ["servico"]} GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" GTFS_ZIP_NAME = "gtfs.zip" From e5bad98594e931bf06c4ed6aec3b8887490729c8 Mon Sep 17 00:00:00 2001 From: Carolina Gomes Date: Fri, 29 Sep 2023 13:59:40 -0300 Subject: [PATCH 058/145] update GTFS constants --- pipelines/rj_smtr/constants.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index b1072b607..d9dd7055c 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -296,16 +296,16 @@ class constants(Enum): # pylint: disable=c0103 GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} GTFS_CAPTURE_PARAMS = [ {"table_id": "agency", "primary_key": ["agency_id"]}, - {"table_id": "calendar_dates", "primary_key": ["service_id"]}, + {"table_id": "calendar_dates", "primary_key": ["service_id", "date"]}, {"table_id": "calendar", "primary_key": ["service_id"]}, {"table_id": "feed_info", "primary_key": ["feed_publisher_name"]}, - {"table_id": "frequencies", "primary_key": ["trip_id"]}, + {"table_id": "frequencies", "primary_key": ["trip_id", "start_time"]}, {"table_id": "routes", "primary_key": ["route_id"]}, - {"table_id": "shapes", "primary_key": ["shape_id"]}, + {"table_id": "shapes", "primary_key": ["shape_id", "shape_pt_sequence"]}, {"table_id": "stops", "primary_key": ["stop_id"]}, {"table_id": "trips", "primary_key": ["trip_id"]}, {"table_id": "fare_attributes", "primary_key": ["fare_id"]}, - {"table_id": "fare_rules", "primary_key": ["fare_id"]}, + 
{"table_id": "fare_rules", "primary_key": []}, ] GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": ["servico"]} GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" From d4230bb0c860e542fc066bf768ff93056c0e1dc5 Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 15:59:04 -0300 Subject: [PATCH 059/145] change upload folder structure --- pipelines/rj_smtr/constants.py | 3 +-- pipelines/rj_smtr/flows.py | 3 ++- pipelines/rj_smtr/tasks.py | 20 ++++++++--------- pipelines/rj_smtr/utils.py | 41 +++++++++++++++++++++++++++------- pipelines/utils/utils.py | 20 ----------------- 5 files changed, 45 insertions(+), 42 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index b1072b607..eece75525 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -308,5 +308,4 @@ class constants(Enum): # pylint: disable=c0103 {"table_id": "fare_rules", "primary_key": ["fare_id"]}, ] GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": ["servico"]} - GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" - GTFS_ZIP_NAME = "gtfs.zip" + GTFS_ZIP_NAME = "gtfs" diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 3fc18b7b0..c53a3f7d8 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -76,9 +76,10 @@ source_type=source_type, local_filepath=filepath, source_path=request_path, + dataset_id=dataset_id, table_id=table_id, secret_path=secret_path, - api_params=request_params, + request_params=request_params, ) RAW_UPLOADED = upload_raw_data_to_gcs( diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 65ab9505e..65e3e95b1 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -459,6 +459,7 @@ def create_request_params( request_url: url to request data """ request_params = None + request_url = None if dataset_id == constants.BILHETAGEM_DATASET_ID.value: database = constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["databases"][ @@ -480,12 +481,8 @@ def create_request_params( } elif dataset_id == constants.GTFS_DATASET_ID.value: - if table_id == constants.GTFS_QUADRO_CAPTURE_PARAMS.value["table_id"]: - request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/{table_id}.csv" - else: - request_url = ( - f"{constants.GTFS_BASE_GCS_PATH.value}/{constants.GTFS_ZIP_NAME.value}" - ) + if table_id != constants.GTFS_QUADRO_CAPTURE_PARAMS.value["table_id"]: + request_params = constants.GTFS_ZIP_NAME.value return request_params, request_url @@ -495,9 +492,10 @@ def get_raw_from_sources( source_type: str, local_filepath: str, source_path: str = None, + dataset_id: str = None, table_id: str = None, secret_path: str = None, - api_params: dict = None, + request_params: dict = None, ): """ Task to get raw data from sources @@ -506,9 +504,10 @@ def get_raw_from_sources( source_type (str): source type local_filepath (str): local filepath source_path (str, optional): source path. Defaults to None. + dataset_id (str, optional): dataset_id on BigQuery. Defaults to None. table_id (str, optional): table_id on BigQuery. Defaults to None. secret_path (str, optional): secret path. Defaults to None. - api_params (dict, optional): api parameters. Defaults to None. + request_params (dict, optional): request parameters. Defaults to None. 
Returns: error: error @@ -530,13 +529,12 @@ def get_raw_from_sources( error, data, filetype = get_raw_data_api( url=source_path, secret_path=secret_path, - api_params=api_params, + api_params=request_params, filetype=filetype, ) elif source_type == "gcs": error, data, filetype = get_raw_data_gcs( - gcs_path=source_path, - filename_to_unzip=table_id, + dataset_id=dataset_id, table_id=table_id, zip_filename=request_params ) else: raise NotImplementedError(f"{source_type} not supported") diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index aeafa8ae1..9265e1a59 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -10,7 +10,6 @@ from datetime import timedelta, datetime from typing import List, Union import traceback -import sys import io import json import zipfile @@ -33,7 +32,6 @@ get_vault_secret, send_discord_message, get_redis_client, - get_storage_blob, ) @@ -537,16 +535,42 @@ def get_raw_data_api( # pylint: disable=R0912 return error, data, filetype +def get_upload_storage_blob( + dataset_id: str, + filename: str, +): + """ + Get a blob from upload zone in storage + + Args: + dataset_id (str): The dataset id on BigQuery. + filename (str): The filename in GCS. + + + Returns: + Blob: blob object + """ + bucket = bd.Storage(dataset_id="", table_id="") + blob_list = list( + bucket.client["storage_staging"] + .bucket(bucket.bucket_name) + .list_blobs(prefix=f"upload/{dataset_id}/{filename}.") + ) + return blob_list[0] + + def get_raw_data_gcs( - gcs_path: str, - filename_to_unzip: str = None, + dataset_id: str, + table_id: str, + zip_filename: str = None, ) -> tuple[str, str, str]: """ Get raw data from GCS Args: - gcs_path (str): GCS path to get data - filename_to_unzip (str, optional): Filename to unzip. Defaults to None. + dataset_id (str): The dataset id on BigQuery. + table_id (str): The table id on BigQuery. + zip_filename (str, optional): The zip file name. Defaults to None. Returns: tuple[str, str]: Error and filepath @@ -556,7 +580,8 @@ def get_raw_data_gcs( filetype = None try: - blob = get_storage_blob(gcs_path=gcs_path) + blob_search_name = zip_filename or table_id + blob = get_upload_storage_blob(dataset_id=dataset_id, filename=blob_search_name) filename = blob.name filetype = filename.split(".")[-1] @@ -567,7 +592,7 @@ def get_raw_data_gcs( with zipfile.ZipFile(io.BytesIO(data), "r") as zipped_file: filenames = zipped_file.namelist() filename = list( - filter(lambda x: x.split(".")[0] == filename_to_unzip, filenames) + filter(lambda x: x.split(".")[0] == table_id, filenames) )[0] filetype = filename.split(".")[-1] data = zipped_file.read(filename) diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index e37a88d8b..adf89bc94 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -732,26 +732,6 @@ def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> ) -def get_storage_blob( - gcs_path: str, -): - """ - Get a blob from a path. 
-
-    Args:
-        gcs_path (str): path to blob
-
-    Returns:
-        Blob: blob object
-    """
-    bucket = bd.Storage(dataset_id="", table_id="")
-    return (
-        bucket.client["storage_staging"]
-        .bucket(bucket.bucket_name)
-        .get_blob(blob_name=gcs_path)
-    )
-
-
 def list_blobs_with_prefix(
     bucket_name: str, prefix: str, mode: str = "prod"
 ) -> List[Blob]:

From 7c43d1d0ec5dfc8abdb6f2c17e7c56606b894eb7 Mon Sep 17 00:00:00 2001
From: fernandascovino
Date: Fri, 29 Sep 2023 17:24:31 -0300
Subject: [PATCH 060/145] undo silencing of notification failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pipelines/utils/custom.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pipelines/utils/custom.py b/pipelines/utils/custom.py
index d91739817..13ae82dd5 100644
--- a/pipelines/utils/custom.py
+++ b/pipelines/utils/custom.py
@@ -68,11 +68,11 @@ def __init__( # pylint: disable=too-many-arguments, too-many-locals
             edges=edges,
             reference_tasks=reference_tasks,
             state_handlers=state_handlers,
-            # on_failure=partial(
-            #     notify_discord_on_failure,
-            #     secret_path=constants.EMD_DISCORD_WEBHOOK_SECRET_PATH.value,
-            #     code_owners=code_owners,
-            # ),
+            on_failure=partial(
+                notify_discord_on_failure,
+                secret_path=constants.EMD_DISCORD_WEBHOOK_SECRET_PATH.value,
+                code_owners=code_owners,
+            ),
             validate=validate,
             result=result,
             terminal_state_handler=terminal_state_handler,

From 089e9334300798660f6a2bde67be6e06112e4c6d Mon Sep 17 00:00:00 2001
From: Rafael
Date: Fri, 29 Sep 2023 17:40:58 -0300
Subject: [PATCH 061/145] add partition date only to transacao

---
 pipelines/rj_smtr/constants.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py
index 93303e5b7..c9f18f2fd 100644
--- a/pipelines/rj_smtr/constants.py
+++ b/pipelines/rj_smtr/constants.py
@@ -181,6 +181,7 @@ class constants(Enum): # pylint: disable=c0103
                 data_processamento
             """,
             "primary_key": ["id"],  # id column to nest data on
+            "partition_date_only": False,
         },
     ]
     BILHETAGEM_TABLES_PARAMS = [

From 685aae52143ec88eecefbe6fab61fe01953b0fe2 Mon Sep 17 00:00:00 2001
From: fernandascovino
Date: Fri, 29 Sep 2023 18:33:40 -0300
Subject: [PATCH 062/145] remove test parameters (gtfs)

---
 pipelines/rj_smtr/constants.py | 19 -------------------
 pipelines/rj_smtr/tasks.py | 4 ----
 2 files changed, 23 deletions(-)

diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py
index 943304191..d7cf3e771 100644
--- a/pipelines/rj_smtr/constants.py
+++ b/pipelines/rj_smtr/constants.py
@@ -290,22 +290,3 @@ class constants(Enum): # pylint: disable=c0103
         },
     ]
     BILHETAGEM_SECRET_PATH = "smtr_jae_access_data"
-
-    # GTFS
-    GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs"
-    GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"}
-    GTFS_CAPTURE_PARAMS = [
-        {"table_id": "agency", "primary_key": ["agency_id"]},
-        {"table_id": "calendar_dates", "primary_key": ["service_id", "date"]},
-        {"table_id": "calendar", "primary_key": ["service_id"]},
-        {"table_id": "feed_info", "primary_key": ["feed_publisher_name"]},
-        {"table_id": "frequencies", "primary_key": ["trip_id", "start_time"]},
-        {"table_id": "routes", "primary_key": ["route_id"]},
-        {"table_id": "shapes", "primary_key": ["shape_id", "shape_pt_sequence"]},
-        {"table_id": "stops", "primary_key": ["stop_id"]},
-        {"table_id": "trips", "primary_key": ["trip_id"]},
-        {"table_id": "fare_attributes", "primary_key":
["fare_id"]}, - {"table_id": "fare_rules", "primary_key": []}, - ] - GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": ["servico"]} - GTFS_ZIP_NAME = "gtfs" diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 916721a74..c5dae7741 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -480,10 +480,6 @@ def create_request_params( "query": extract_params["query"].format(**datetime_range), } - elif dataset_id == constants.GTFS_DATASET_ID.value: - if table_id != constants.GTFS_QUADRO_CAPTURE_PARAMS.value["table_id"]: - request_params = constants.GTFS_ZIP_NAME.value - return request_params, request_url From cd5048e56e26eae6cbf571d262c0278be8c6ef7e Mon Sep 17 00:00:00 2001 From: Rodrigo Cunha <66736583+eng-rodrigocunha@users.noreply.github.com> Date: Fri, 29 Sep 2023 18:36:15 -0300 Subject: [PATCH 063/145] Update pipelines/rj_smtr/constants.py Co-authored-by: Fernanda Scovino --- pipelines/rj_smtr/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index d7cf3e771..52e30d9f8 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -186,7 +186,7 @@ class constants(Enum): # pylint: disable=c0103 "transacao_run_interval": {"minutes": 1}, "principal_run_interval": {"days": 1}, "transacao_runs_interval_minutes": 0, - "principal_runs_interval_minutes": 15, + "principal_runs_interval_minutes": 5, } BILHETAGEM_TRANSACAO_CAPTURE_PARAMS = { From d17d16127ed3012b0377a7f7b4311af72a3dd911 Mon Sep 17 00:00:00 2001 From: fernandascovino Date: Fri, 29 Sep 2023 18:53:23 -0300 Subject: [PATCH 064/145] corrige encadeamento de erros no flow --- pipelines/rj_smtr/flows.py | 3 +- pipelines/rj_smtr/tasks.py | 110 +++++++++++++++++-------------------- 2 files changed, 51 insertions(+), 62 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index c53a3f7d8..0cac7769f 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -82,7 +82,7 @@ request_params=request_params, ) - RAW_UPLOADED = upload_raw_data_to_gcs( + error = upload_raw_data_to_gcs( error=error, raw_filepath=raw_filepath, timestamp=timestamp, @@ -99,7 +99,6 @@ error=error, timestamp=timestamp, primary_key=primary_key, - upstream_tasks=[RAW_UPLOADED], ) STAGING_UPLOADED = upload_staging_data_to_gcs( diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index c5dae7741..86948899f 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -743,7 +743,7 @@ def upload_raw_data_to_gcs( Returns: None """ - if not error: + if error is None: try: st_obj = Storage(table_id=table_id, dataset_id=dataset_id) log( @@ -759,14 +759,8 @@ def upload_raw_data_to_gcs( except Exception: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - - upload_run_logs_to_bq( - dataset_id=dataset_id, - parent_table_id=table_id, - error=error, - timestamp=timestamp, - mode="raw", - ) + + return error @task @@ -792,7 +786,7 @@ def upload_staging_data_to_gcs( Returns: None """ - if not error: + if error is None: try: # Creates and publish table if it does not exist, append to it otherwise create_or_append_table( @@ -813,6 +807,8 @@ def upload_staging_data_to_gcs( mode="staging", ) + return error + ############### # @@ -1040,67 +1036,61 @@ def transform_raw_to_nested_structure( str: Error traceback str: Path to the saved treated .csv file """ + if error is None: + try: + # leitura do dado raw + 
error, data = read_raw_data(filepath=raw_filepath) - # Check previous error - if error is not None: - return error, None - - # ORGANIZAR: - - try: - # leitura do dado raw - error, data = read_raw_data(filepath=raw_filepath) - - if primary_key is None: - primary_key = [] + if primary_key is None: + primary_key = [] - log( - f""" - Received inputs: - - timestamp:\n{timestamp} - - data:\n{data.head()}""" - ) + log( + f""" + Received inputs: + - timestamp:\n{timestamp} + - data:\n{data.head()}""" + ) - # Check empty dataframe - if data.empty: - log("Empty dataframe, skipping transformation...") - else: - log(f"Raw data:\n{data_info_str(data)}", level="info") + # Check empty dataframe + if data.empty: + log("Empty dataframe, skipping transformation...") + else: + log(f"Raw data:\n{data_info_str(data)}", level="info") - log("Adding captured timestamp column...", level="info") - data["timestamp_captura"] = timestamp + log("Adding captured timestamp column...", level="info") + data["timestamp_captura"] = timestamp - log("Striping string columns...", level="info") - for col in data.columns[data.dtypes == "object"].to_list(): - data[col] = data[col].str.strip() + log("Striping string columns...", level="info") + for col in data.columns[data.dtypes == "object"].to_list(): + data[col] = data[col].str.strip() - log(f"Finished cleaning! Data:\n{data_info_str(data)}", level="info") + log(f"Finished cleaning! Data:\n{data_info_str(data)}", level="info") - log("Creating nested structure...", level="info") - pk_cols = primary_key + ["timestamp_captura"] - data = ( - data.groupby(pk_cols) - .apply( - lambda x: x[data.columns.difference(pk_cols)].to_json( - orient="records" + log("Creating nested structure...", level="info") + pk_cols = primary_key + ["timestamp_captura"] + data = ( + data.groupby(pk_cols) + .apply( + lambda x: x[data.columns.difference(pk_cols)].to_json( + orient="records" + ) ) + .str.strip("[]") + .reset_index(name="content")[ + primary_key + ["content", "timestamp_captura"] + ] ) - .str.strip("[]") - .reset_index(name="content")[ - primary_key + ["content", "timestamp_captura"] - ] - ) - log( - f"Finished nested structure! Data:\n{data_info_str(data)}", - level="info", - ) + log( + f"Finished nested structure! 
Data:\n{data_info_str(data)}", + level="info", + ) - # save treated local - filepath = save_treated_local_func(data=data, error=error, filepath=filepath) + # save treated local + filepath = save_treated_local_func(data=data, error=error, filepath=filepath) - except Exception: # pylint: disable=W0703 - error = traceback.format_exc() - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + except Exception: # pylint: disable=W0703 + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, filepath From 02b948a66a7a11515dabb7995dc76ecd7d3f2c3c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 29 Sep 2023 21:53:43 +0000 Subject: [PATCH 065/145] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pipelines/rj_smtr/tasks.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 86948899f..1a6c1e876 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -759,7 +759,7 @@ def upload_raw_data_to_gcs( except Exception: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - + return error @@ -1087,7 +1087,9 @@ def transform_raw_to_nested_structure( ) # save treated local - filepath = save_treated_local_func(data=data, error=error, filepath=filepath) + filepath = save_treated_local_func( + data=data, error=error, filepath=filepath + ) except Exception: # pylint: disable=W0703 error = traceback.format_exc() From fac7821be88db6d07982591ef3793f1810430d24 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 2 Oct 2023 11:24:44 -0300 Subject: [PATCH 066/145] remove header treatment --- pipelines/rj_smtr/utils.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 9265e1a59..e2fffe8dc 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -507,13 +507,6 @@ def get_raw_data_api( # pylint: disable=R0912 else: headers = get_vault_secret(secret_path)["data"] - # remove from headers, if present - # TODO: remove this before merge to master - remove_headers = ["host", "databases"] - for remove_header in remove_headers: - if remove_header in list(headers.keys()): - del headers[remove_header] - response = requests.get( url, headers=headers, From e291e514f2b08063ba92e4112d116e4c86821392 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 2 Oct 2023 11:25:14 -0300 Subject: [PATCH 067/145] mudar agent dev para prd --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 4 ++-- pipelines/rj_smtr/flows.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 793d37c0d..d7f44e3b9 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -30,7 +30,7 @@ bilhetagem_transacao_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_transacao_captura.schedule = bilhetagem_transacao_schedule @@ -41,6 +41,6 @@ bilhetagem_principal_captura.storage = 
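The refactor in patch 064 above keeps the whole pre-treatment under a single error guard and builds the nested structure with a groupby/apply over the primary key plus timestamp_captura. A minimal, self-contained sketch of that reshaping on toy data (the column names and values below are illustrative only, not taken from the pipeline):

    # Sketch of the nesting step in transform_raw_to_nested_structure; the toy
    # columns ("valor", "status") are hypothetical, only the pattern mirrors the task.
    import pandas as pd

    data = pd.DataFrame(
        {
            "id": [1, 1, 2],
            "valor": [10.5, 7.0, 3.2],
            "status": ["ok", "ok", "erro"],
        }
    )
    data["timestamp_captura"] = "2023-09-29 18:00:00"

    primary_key = ["id"]
    pk_cols = primary_key + ["timestamp_captura"]

    nested = (
        data.groupby(pk_cols)
        .apply(lambda x: x[data.columns.difference(pk_cols)].to_json(orient="records"))
        .str.strip("[]")
        .reset_index(name="content")[primary_key + ["content", "timestamp_captura"]]
    )
    # `nested` now has one row per (id, timestamp_captura); every non-key column is
    # packed as a JSON object inside the "content" column.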
GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_principal_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_principal_captura.schedule = bilhetagem_principal_schedule diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 0cac7769f..c7638676b 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -113,5 +113,5 @@ default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_capture_flow.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) From e57d4576ddb729f3871dd190d61be7c37ae9b9a3 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 2 Oct 2023 11:33:33 -0300 Subject: [PATCH 068/145] mudar agent de dev para prd --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index e897286b0..2f7804811 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -20,7 +20,7 @@ **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["principal_run_interval"] ), labels=[ - emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, + emd_constants.RJ_SMTR_AGENT_LABEL.value, ], table_parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, @@ -38,7 +38,7 @@ **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["transacao_run_interval"] ), labels=[ - emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, + emd_constants.RJ_SMTR_AGENT_LABEL.value, ], table_parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, From 3767a5622ae6beb9a7758070e71b6830a967a84f Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 2 Oct 2023 11:33:59 -0300 Subject: [PATCH 069/145] ajustar retorno das funcoes --- pipelines/rj_smtr/flows.py | 1 - pipelines/rj_smtr/tasks.py | 30 ++++++++++++++---------------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index c7638676b..4860c6d07 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -85,7 +85,6 @@ error = upload_raw_data_to_gcs( error=error, raw_filepath=raw_filepath, - timestamp=timestamp, table_id=table_id, dataset_id=dataset_id, partitions=partitions, diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 1a6c1e876..5edee3f7c 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -8,7 +8,7 @@ import os from pathlib import Path import traceback -from typing import Dict, List +from typing import Dict, List, Union import io from basedosdados import Storage, Table @@ -722,26 +722,24 @@ def upload_logs_to_bq( # pylint: disable=R0913 @task def upload_raw_data_to_gcs( - error: bool, + error: str, raw_filepath: str, - timestamp: datetime, table_id: str, dataset_id: str, partitions: list, -): +) -> Union[str, None]: """ Upload raw data to GCS. Args: - error (bool): whether the upstream tasks failed or not + error (str): Error catched from upstream tasks. 
raw_filepath (str): Path to the saved raw .json file - timestamp (datetime): timestamp for flow run table_id (str): table_id on BigQuery dataset_id (str): dataset_id on BigQuery partitions (list): list of partition strings Returns: - None + Union[str, None]: if there is an error returns it traceback, otherwise returns None """ if error is None: try: @@ -765,26 +763,26 @@ def upload_raw_data_to_gcs( @task def upload_staging_data_to_gcs( - error: bool, + error: str, staging_filepath: str, timestamp: datetime, table_id: str, dataset_id: str, partitions: list, -): +) -> Union[str, None]: """ Upload staging data to GCS. Args: - error (bool): whether the upstream tasks failed or not - staging_filepath (str): Path to the saved treated .csv file - timestamp (datetime): timestamp for flow run - table_id (str): table_id on BigQuery - dataset_id (str): dataset_id on BigQuery - partitions (list): list of partition strings + error (str): Error catched from upstream tasks. + staging_filepath (str): Path to the saved treated .csv file. + timestamp (datetime): timestamp for flow run. + table_id (str): table_id on BigQuery. + dataset_id (str): dataset_id on BigQuery. + partitions (list): list of partition strings. Returns: - None + Union[str, None]: if there is an error returns it traceback, otherwise returns None """ if error is None: try: From 6564ea663204adb01d9b13852746da9e3eebd97f Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 2 Oct 2023 11:39:17 -0300 Subject: [PATCH 070/145] =?UTF-8?q?Atualiza=20documenta=C3=A7=C3=A3o?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/tasks.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 5edee3f7c..9d46d49ac 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -444,7 +444,7 @@ def create_request_params( table_id: str, dataset_id: str, timestamp: datetime, -) -> tuple: +) -> tuple[str, str]: """ Task to create request params @@ -492,7 +492,7 @@ def get_raw_from_sources( table_id: str = None, secret_path: str = None, request_params: dict = None, -): +) -> tuple[str, str]: """ Task to get raw data from sources @@ -506,7 +506,8 @@ def get_raw_from_sources( request_params (dict, optional): request parameters. Defaults to None. 
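Patches 064 and 069 settle the error-handling contract for the capture tasks: nothing raises; each task receives the error produced upstream, skips its own work when that error is not None, and returns it (or a fresh traceback) so the next task can do the same. A stripped-down sketch of that contract outside Prefect, with hypothetical stage names:

    # Sketch of the error-passthrough contract; stage names and bodies are hypothetical,
    # only the control flow mirrors the capture flow.
    import traceback
    from typing import Optional, Tuple


    def stage_upload_raw(error: Optional[str], raw_filepath: str) -> Optional[str]:
        if error is None:  # run only if every upstream step succeeded
            try:
                print(f"uploading {raw_filepath}")
            except Exception:
                error = traceback.format_exc()
        return error


    def stage_transform(error: Optional[str], raw_filepath: str) -> Tuple[Optional[str], Optional[str]]:
        filepath = None
        if error is None:
            try:
                filepath = raw_filepath.replace(".json", ".csv")
            except Exception:
                error = traceback.format_exc()
        return error, filepath


    error = None
    error = stage_upload_raw(error, "raw/data.json")
    error, staging_filepath = stage_transform(error, "raw/data.json")
    # A non-None error short-circuits the later stages and is ultimately written to the
    # logs table by upload_run_logs_to_bq inside upload_staging_data_to_gcs.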
Returns: - error: error + error: error catched from upstream tasks + filepath: filepath to raw data """ error = None filepath = None From 19bb0bedde0b7d7e0f14fbe6bff70196e8da5679 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 2 Oct 2023 11:52:34 -0300 Subject: [PATCH 071/145] adicionar retorno em get_upload_storage_blob --- pipelines/rj_smtr/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index e2fffe8dc..a89a95541 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -18,6 +18,7 @@ import basedosdados as bd from basedosdados import Table import pandas as pd +from google.cloud.storage.blob import Blob from prefect.schedules.clocks import IntervalClock @@ -531,7 +532,7 @@ def get_raw_data_api( # pylint: disable=R0912 def get_upload_storage_blob( dataset_id: str, filename: str, -): +) -> Blob: """ Get a blob from upload zone in storage From bc87f44aa2a0202790255a8658d14ad14454ae76 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 2 Oct 2023 11:55:31 -0300 Subject: [PATCH 072/145] =?UTF-8?q?Atualiza=20documenta=C3=A7=C3=A3o?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index a89a95541..1d71dd3dd 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -498,7 +498,7 @@ def get_raw_data_api( # pylint: disable=R0912 filetype (str, optional): Filetype to save raw file. Defaults to None. Returns: - tuple[str, str]: Error and filepath + tuple[str, str, str]: Error, data and filetype """ error = None data = None @@ -540,7 +540,6 @@ def get_upload_storage_blob( dataset_id (str): The dataset id on BigQuery. filename (str): The filename in GCS. - Returns: Blob: blob object """ @@ -567,7 +566,7 @@ def get_raw_data_gcs( zip_filename (str, optional): The zip file name. Defaults to None. 
Returns: - tuple[str, str]: Error and filepath + tuple[str, str, str]: Error, data and filetype """ error = None data = None From 185d695ff4dbcd647e93e938639721a6120276d6 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 2 Oct 2023 11:58:30 -0300 Subject: [PATCH 073/145] Atualiza string --- pipelines/rj_smtr/tasks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 9d46d49ac..a846851b5 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1045,9 +1045,9 @@ def transform_raw_to_nested_structure( log( f""" - Received inputs: - - timestamp:\n{timestamp} - - data:\n{data.head()}""" + Received inputs: + - timestamp:\n{timestamp} + - data:\n{data.head()}""" ) # Check empty dataframe From 4a975d52e2a3cd7a6a41f772320d0e6b890d4eda Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 10 Oct 2023 12:06:09 -0300 Subject: [PATCH 074/145] adiciona recaptura no flow generico --- pipelines/rj_smtr/flows.py | 101 +++++++++++++++++++++---------------- pipelines/rj_smtr/tasks.py | 4 ++ 2 files changed, 61 insertions(+), 44 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 4860c6d07..f1433f52c 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -5,15 +5,14 @@ from prefect.run_configs import KubernetesRun from prefect.storage import GCS -from prefect import Parameter +from prefect import Parameter, case, unmapped +from prefect.tasks.control_flow import merge # EMD Imports # from pipelines.constants import constants as emd_constants from pipelines.utils.decorators import Flow -from pipelines.utils.tasks import ( - rename_current_flow_run_now_time, -) +from pipelines.utils.tasks import rename_current_flow_run_now_time, get_now_time # SMTR Imports # @@ -27,6 +26,7 @@ transform_raw_to_nested_structure, get_raw_from_sources, create_request_params, + query_logs, ) @@ -43,74 +43,87 @@ secret_path = Parameter("secret_path", default=None) primary_key = Parameter("primary_key", default=None) source_type = Parameter("source_type", default=None) + recapture = Parameter("recapture", default=False) + + with case(recapture, True): + _, recapture_timestamps, previous_errors = query_logs( + dataset_id=dataset_id, + table_id=table_id, + ) - timestamp = get_current_timestamp() + with case(recapture, False): + capture_timestamp = [get_current_timestamp()] + previous_errors = [None] + + timestamps = merge(recapture_timestamps, capture_timestamp) rename_flow_run = rename_current_flow_run_now_time( prefix=default_capture_flow.name + " " + table_id + ": ", - now_time=timestamp, + now_time=get_now_time(), ) - partitions = create_date_hour_partition( - timestamp, partition_date_only=partition_date_only + partitions = create_date_hour_partition.map( + timestamps, partition_date_only=unmapped(partition_date_only) ) - filename = parse_timestamp_to_string(timestamp) + filenames = parse_timestamp_to_string.map(timestamps) - filepath = create_local_partition_path( - dataset_id=dataset_id, - table_id=table_id, - filename=filename, + filepaths = create_local_partition_path.map( + dataset_id=unmapped(dataset_id), + table_id=unmapped(table_id), + filename=filenames, partitions=partitions, ) # Extração # - request_params, request_path = create_request_params( - dataset_id=dataset_id, - extract_params=extract_params, - table_id=table_id, - timestamp=timestamp, + request_params, request_paths = create_request_params.map( + dataset_id=unmapped(dataset_id), + 
extract_params=unmapped(extract_params), + table_id=unmapped(table_id), + timestamp=timestamps, ) - error, raw_filepath = get_raw_from_sources( - source_type=source_type, - local_filepath=filepath, - source_path=request_path, - dataset_id=dataset_id, - table_id=table_id, - secret_path=secret_path, + errors, raw_filepaths = get_raw_from_sources.map( + source_type=unmapped(source_type), + local_filepath=unmapped(filepaths), + source_path=request_paths, + dataset_id=unmapped(dataset_id), + table_id=unmapped(table_id), + secret_path=unmapped(secret_path), request_params=request_params, ) - error = upload_raw_data_to_gcs( - error=error, - raw_filepath=raw_filepath, - table_id=table_id, - dataset_id=dataset_id, - partitions=partitions, + errors = upload_raw_data_to_gcs.map( + error=errors, + raw_filepath=raw_filepaths, + table_id=unmapped(table_id), + dataset_id=unmapped(dataset_id), + partitions=unmapped(partitions), ) # Pré-tratamento # - error, staging_filepath = transform_raw_to_nested_structure( - raw_filepath=raw_filepath, - filepath=filepath, - error=error, - timestamp=timestamp, - primary_key=primary_key, + errors, staging_filepaths = transform_raw_to_nested_structure.map( + raw_filepath=raw_filepaths, + filepath=filepaths, + error=errors, + timestamp=timestamps, + primary_key=unmapped(primary_key), ) - STAGING_UPLOADED = upload_staging_data_to_gcs( - error=error, - staging_filepath=staging_filepath, - timestamp=timestamp, - table_id=table_id, - dataset_id=dataset_id, + STAGING_UPLOADED = upload_staging_data_to_gcs.map( + error=errors, + staging_filepath=staging_filepaths, + timestamp=timestamps, + table_id=unmapped(table_id), + dataset_id=unmapped(dataset_id), partitions=partitions, + previous_error=previous_errors, + recapture=recapture, ) default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_capture_flow.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index a846851b5..71a1e2891 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -770,6 +770,8 @@ def upload_staging_data_to_gcs( table_id: str, dataset_id: str, partitions: list, + previous_error: str = None, + recapture: bool = False, ) -> Union[str, None]: """ Upload staging data to GCS. 
@@ -803,6 +805,8 @@ def upload_staging_data_to_gcs( parent_table_id=table_id, error=error, timestamp=timestamp, + previous_error=previous_error, + recapture=recapture, mode="staging", ) From 33a23e5b7c1a732d5b23cdc551edbbb3fac0b28b Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 11 Oct 2023 10:57:22 -0300 Subject: [PATCH 075/145] alterar labels para dev --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 568f96154..fb6e67594 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -50,7 +50,7 @@ bilhetagem_transacao_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_transacao_captura.schedule = bilhetagem_transacao_schedule @@ -61,7 +61,7 @@ bilhetagem_auxiliar_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_auxiliar_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_auxiliar_captura = set_default_parameters( @@ -79,7 +79,7 @@ bilhetagem_materializacao.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_materializacao.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_materializacao_parameters = { @@ -91,6 +91,7 @@ default_parameters=bilhetagem_materializacao_parameters, ) + # TRATAMENTO - RODA DE HORA EM HORA, CAPTURA AUXILIAR + MATERIALIZAÇÃO with Flow( "SMTR: Bilhetagem Transação - Tratamento", @@ -138,7 +139,7 @@ bilhetagem_transacao_tratamento.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_tratamento.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_transacao_tratamento.schedule = every_hour # bilhetagem_materializacao.schedule = bilhetagem_materializacao_schedule From 0eb4e92b2640fc67e3c8296addb17c51ed25b6d3 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 11 Oct 2023 10:57:39 -0300 Subject: [PATCH 076/145] adicionar logica de recaptura --- pipelines/rj_smtr/flows.py | 95 ++++++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 40 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 419a6e6a1..6012aae22 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -7,6 +7,7 @@ from prefect.storage import GCS from prefect import case, Parameter from prefect.utilities.edges import unmapped +from prefect.tasks.control_flow import merge # EMD Imports # @@ -35,6 +36,7 @@ upload_staging_data_to_gcs, get_raw_from_sources, create_request_params, + query_logs, ) from pipelines.utils.execute_dbt_model.tasks import run_dbt_model @@ -52,70 +54,83 @@ secret_path = Parameter("secret_path", default=None) primary_key = Parameter("primary_key", default=None) source_type = Parameter("source_type", default=None) + recapture = Parameter("recapture", 
default=False) - timestamp = get_current_timestamp() + with case(recapture, True): + _, recapture_timestamps, previous_errors = query_logs( + dataset_id=dataset_id, + table_id=table_id, + ) + + with case(recapture, False): + capture_timestamp = [get_current_timestamp()] + previous_errors = [None] + + timestamps = merge(recapture_timestamps, capture_timestamp) rename_flow_run = rename_current_flow_run_now_time( prefix=default_capture_flow.name + " " + table_id + ": ", - now_time=timestamp, + now_time=get_now_time(), ) - partitions = create_date_hour_partition( - timestamp, partition_date_only=partition_date_only + partitions = create_date_hour_partition.map( + timestamps, partition_date_only=unmapped(partition_date_only) ) - filename = parse_timestamp_to_string(timestamp) + filenames = parse_timestamp_to_string.map(timestamps) - filepath = create_local_partition_path( - dataset_id=dataset_id, - table_id=table_id, - filename=filename, + filepaths = create_local_partition_path.map( + dataset_id=unmapped(dataset_id), + table_id=unmapped(table_id), + filename=filenames, partitions=partitions, ) # Extração # - request_params, request_path = create_request_params( - dataset_id=dataset_id, - extract_params=extract_params, - table_id=table_id, - timestamp=timestamp, + request_params, request_paths = create_request_params.map( + dataset_id=unmapped(dataset_id), + extract_params=unmapped(extract_params), + table_id=unmapped(table_id), + timestamp=timestamps, ) - error, raw_filepath = get_raw_from_sources( - source_type=source_type, - local_filepath=filepath, - source_path=request_path, - dataset_id=dataset_id, - table_id=table_id, - secret_path=secret_path, + errors, raw_filepaths = get_raw_from_sources.map( + source_type=unmapped(source_type), + local_filepath=unmapped(filepaths), + source_path=request_paths, + dataset_id=unmapped(dataset_id), + table_id=unmapped(table_id), + secret_path=unmapped(secret_path), request_params=request_params, ) - error = upload_raw_data_to_gcs( - error=error, - raw_filepath=raw_filepath, - table_id=table_id, - dataset_id=dataset_id, - partitions=partitions, + errors = upload_raw_data_to_gcs.map( + error=errors, + raw_filepath=raw_filepaths, + table_id=unmapped(table_id), + dataset_id=unmapped(dataset_id), + partitions=unmapped(partitions), ) # Pré-tratamento # - error, staging_filepath = transform_raw_to_nested_structure( - raw_filepath=raw_filepath, - filepath=filepath, - error=error, - timestamp=timestamp, - primary_key=primary_key, + errors, staging_filepaths = transform_raw_to_nested_structure.map( + raw_filepath=raw_filepaths, + filepath=filepaths, + error=errors, + timestamp=timestamps, + primary_key=unmapped(primary_key), ) - STAGING_UPLOADED = upload_staging_data_to_gcs( - error=error, - staging_filepath=staging_filepath, - timestamp=timestamp, - table_id=table_id, - dataset_id=dataset_id, + STAGING_UPLOADED = upload_staging_data_to_gcs.map( + error=errors, + staging_filepath=staging_filepaths, + timestamp=timestamps, + table_id=unmapped(table_id), + dataset_id=unmapped(dataset_id), partitions=partitions, + previous_error=previous_errors, + recapture=recapture, ) default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) @@ -259,5 +274,5 @@ default_materialization_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_materialization_flow.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) From 
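Patches 074 and 076 turn the generic capture flow into a mapped flow: with recapture on, query_logs supplies the failed timestamps (and their previous errors); otherwise a single current timestamp is used, and the two branches are reconciled with merge before every downstream task is mapped over the resulting list. A minimal Prefect 1.x sketch of that switch, with placeholder tasks standing in for query_logs and get_current_timestamp:

    # Minimal Prefect 1.x sketch of the recapture switch; the tasks below are
    # placeholders, not the real query_logs / get_current_timestamp.
    from datetime import datetime, timedelta

    from prefect import Flow, Parameter, case, task, unmapped
    from prefect.tasks.control_flow import merge


    @task(nout=2)
    def fake_query_logs():
        now = datetime(2023, 10, 11, 10, 0)
        return [now - timedelta(hours=2), now - timedelta(hours=1)], ["timeout", "timeout"]


    @task
    def fake_current_timestamp():
        return [datetime(2023, 10, 11, 12, 0)]


    @task
    def capture(timestamp, table_id):
        print(f"capturing {table_id} @ {timestamp}")


    with Flow("recapture sketch") as flow:
        recapture = Parameter("recapture", default=False)
        table_id = Parameter("table_id", default="transacao")

        with case(recapture, True):
            # in the real flow previous_errors is also forwarded to upload_staging_data_to_gcs
            recapture_timestamps, previous_errors = fake_query_logs()

        with case(recapture, False):
            capture_timestamps = fake_current_timestamp()

        # merge keeps whichever branch actually ran; downstream tasks map over the list
        timestamps = merge(recapture_timestamps, capture_timestamps)
        capture.map(timestamp=timestamps, table_id=unmapped(table_id))

    flow.run(parameters={"recapture": True})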
ecc67d14e525b1697a46da3aabcf06d95294b7b6 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 11 Oct 2023 10:58:20 -0300 Subject: [PATCH 077/145] =?UTF-8?q?criar=20conex=C3=A3o=20com=20banco=20de?= =?UTF-8?q?=20dados?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/tasks.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index f7d687dea..bd4b45680 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -31,6 +31,7 @@ dict_contains_keys, get_raw_data_api, get_raw_data_gcs, + get_raw_data_db, upload_run_logs_to_bq, get_datetime_range, read_raw_data, @@ -534,6 +535,10 @@ def get_raw_from_sources( error, data, filetype = get_raw_data_gcs( dataset_id=dataset_id, table_id=table_id, zip_filename=request_params ) + elif source_type == "db": + error, data, filetype = get_raw_data_db( + host=source_path, secret_path=secret_path, **request_params + ) else: raise NotImplementedError(f"{source_type} not supported") @@ -771,6 +776,8 @@ def upload_staging_data_to_gcs( table_id: str, dataset_id: str, partitions: list, + previous_error: str = None, + recapture: bool = False, ) -> Union[str, None]: """ Upload staging data to GCS. @@ -805,6 +812,8 @@ def upload_staging_data_to_gcs( error=error, timestamp=timestamp, mode="staging", + previous_error=previous_error, + recapture=recapture, ) return error From 2a882865254857d5451f1666a4564e736b082df5 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 11 Oct 2023 10:58:35 -0300 Subject: [PATCH 078/145] =?UTF-8?q?criar=20conex=C3=A3o=20com=20banco=20de?= =?UTF-8?q?=20dados?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/utils.py | 58 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index f9b98afab..6a6c70ee5 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -19,6 +19,7 @@ from basedosdados import Table import pandas as pd from google.cloud.storage.blob import Blob +import pymysql from prefect.schedules.clocks import IntervalClock @@ -480,9 +481,10 @@ def save_raw_local_func( Path(_filepath).parent.mkdir(parents=True, exist_ok=True) if filetype == "json": - if isinstance(data, dict): + if isinstance(data, str): data = json.loads(data) - json.dump(data, Path(_filepath).open("w", encoding="utf-8")) + with Path(_filepath).open("w", encoding="utf-8") as fi: + json.dump(data, fi) # if filetype == "csv": # pass @@ -611,6 +613,58 @@ def get_raw_data_gcs( return error, data, filetype +def get_raw_data_db( + sql: str, dbms: str, host: str, secret_path: str, database: str +) -> tuple[str, str, str]: + """ + Get data from Databases + + Args: + sql (str): the SQL Query to execute + dbms (str): The datase management system + host (str): The database host + secret_path (str): Secret path to get credentials + database (str): The database to connect + + Returns: + tuple[str, str, str]: Error, data and filetype + """ + connection_mapping = { + # 'postgresql': {'connector': psycopg2.connect, 'port': '5432', 'cursor':{'cursor_factory': psycopg2.extras.RealDictCursor}}, + "mysql": { + "connector": pymysql.connect, + "port": "3306", + "cursor": {"cursor": pymysql.cursors.DictCursor}, + } + } + + data = None + error = None + filetype = "json" + + try: + credentials = get_vault_secret(secret_path)["data"] + + connection = connection_mapping[dbms]( + 
host=host, + user=credentials["user"], + password=credentials["password"], + database=database, + ) + + with connection: + with connection.cursor(**connection_mapping[dbms]["cursor"]) as cursor: + cursor.execute(sql) + data = cursor.fetchall() + + data = [dict(d) for d in data] + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data, filetype + + def save_treated_local_func( filepath: str, data: pd.DataFrame, error: str, mode: str = "staging" ) -> str: From ae1c88201635cf629bb2ad76ee0122cbbe0bce21 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 11 Oct 2023 12:48:36 -0300 Subject: [PATCH 079/145] =?UTF-8?q?cria=20fun=C3=A7=C3=A3o=20para=20map=20?= =?UTF-8?q?de=20multiplos=20retornos?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/flows.py | 19 +++- pipelines/rj_smtr/tasks.py | 208 +++++++++++++++++++++---------------- 2 files changed, 133 insertions(+), 94 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 6012aae22..d408c75f7 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -37,6 +37,7 @@ get_raw_from_sources, create_request_params, query_logs, + unpack_mapped_results_nout2, ) from pipelines.utils.execute_dbt_model.tasks import run_dbt_model @@ -87,14 +88,18 @@ ) # Extração # - request_params, request_paths = create_request_params.map( + create_request_params_returns = create_request_params.map( dataset_id=unmapped(dataset_id), extract_params=unmapped(extract_params), table_id=unmapped(table_id), timestamp=timestamps, ) - errors, raw_filepaths = get_raw_from_sources.map( + request_params, request_paths = unpack_mapped_results_nout2( + mapped_results=create_request_params_returns + ) + + get_raw_from_sources_returns = get_raw_from_sources.map( source_type=unmapped(source_type), local_filepath=unmapped(filepaths), source_path=request_paths, @@ -104,6 +109,10 @@ request_params=request_params, ) + errors, raw_filepaths = unpack_mapped_results_nout2( + mapped_results=get_raw_from_sources_returns + ) + errors = upload_raw_data_to_gcs.map( error=errors, raw_filepath=raw_filepaths, @@ -114,7 +123,7 @@ # Pré-tratamento # - errors, staging_filepaths = transform_raw_to_nested_structure.map( + nested_structure_returns = transform_raw_to_nested_structure.map( raw_filepath=raw_filepaths, filepath=filepaths, error=errors, @@ -122,6 +131,10 @@ primary_key=unmapped(primary_key), ) + errors, staging_filepaths = unpack_mapped_results_nout2( + mapped_results=nested_structure_returns + ) + STAGING_UPLOADED = upload_staging_data_to_gcs.map( error=errors, staging_filepath=staging_filepaths, diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index bd4b45680..5d2083e8f 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -8,7 +8,7 @@ import os from pathlib import Path import traceback -from typing import Dict, List, Union, Iterable +from typing import Dict, List, Union, Iterable, Any import io from basedosdados import Storage, Table @@ -138,6 +138,103 @@ def build_incremental_model( # pylint: disable=too-many-arguments return False +@task(checkpoint=False, nout=3) +def create_dbt_run_vars( + dataset_id: str, + dbt_vars: dict, + table_id: str, + raw_dataset_id: str, + raw_table_id: str, + mode: str, +) -> tuple[list[dict], Union[list[dict], dict, None], bool]: + """ + Create the variables to be used in dbt materialization based on a dict + + Args: + 
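Patch 078 dispatches database captures through a connection_mapping dict keyed by the DBMS name. As committed, get_raw_data_db calls connection_mapping[dbms](...) directly, but the mapped value is a dict, so the connector presumably has to be taken from its "connector" key first. A hedged sketch of that lookup with pymysql (which the patch already imports); host, credentials and query are placeholders, not real access data:

    # Hedged sketch of the DBMS dispatch used by get_raw_data_db; host, credentials
    # and query are placeholders.
    import pymysql

    connection_mapping = {
        "mysql": {
            "connector": pymysql.connect,
            "port": "3306",
            "cursor": {"cursor": pymysql.cursors.DictCursor},
        },
    }


    def fetch_rows(sql: str, dbms: str, host: str, user: str, password: str, database: str) -> list:
        config = connection_mapping[dbms]
        connection = config["connector"](  # the dict itself is not callable
            host=host,
            user=user,
            password=password,
            database=database,
        )
        with connection:
            with connection.cursor(**config["cursor"]) as cursor:  # DictCursor -> rows as dicts
                cursor.execute(sql)
                data = cursor.fetchall()
        return [dict(row) for row in data]


    # e.g. fetch_rows("SELECT 1 AS ok", "mysql", "localhost", "user", "pass", "db")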
dataset_id (str): the dataset_id to get the variables + dbt_vars (dict): dict containing the parameters + table_id (str): the table_id get the date_range variable + raw_dataset_id (str): the raw_dataset_id get the date_range variable + raw_table_id (str): the raw_table_id get the date_range variable + mode (str): the mode to get the date_range variable + + Returns: + tuple[list[dict]: the variables to be used in DBT + Union[list[dict], dict, None]: the date variable (date_range or run_date) + bool: a flag that indicates if the date_range variable came from Redis + """ + + log(f"Creating DBT variables. Parameter received: {dbt_vars}") + + if (not dbt_vars) or (not table_id): + log("dbt_vars or table_id are blank. Skiping task") + return [None], None, False + + final_vars = [] + date_var = None + flag_date_range = False + + if "date_range" in dbt_vars.keys(): + log("Creating date_range variable") + + # Set date_range variable manually + if dict_contains_keys( + dbt_vars["date_range"], ["date_range_start", "date_range_end"] + ): + date_var = { + "date_range_start": dbt_vars["date_range"]["date_range_start"], + "date_range_end": dbt_vars["date_range"]["date_range_end"], + } + # Create date_range using Redis + else: + raw_table_id = raw_table_id or table_id + + date_var = get_materialization_date_range.run( + dataset_id=dataset_id, + table_id=table_id, + raw_dataset_id=raw_dataset_id, + raw_table_id=raw_table_id, + table_run_datetime_column_name=dbt_vars["date_range"].get( + "table_run_datetime_column_name" + ), + mode=mode, + delay_hours=dbt_vars["date_range"].get("delay_hours", 0), + ) + + flag_date_range = True + + final_vars.append(date_var.copy()) + + log(f"date_range created: {date_var}") + + elif "run_date" in dbt_vars.keys(): + log("Creating run_date variable") + + date_var = get_run_dates.run( + dbt_vars["run_date"].get("date_range_start"), + dbt_vars["run_date"].get("date_range_end"), + ) + final_vars.append([d.copy() for d in date_var]) + + log(f"run_date created: {date_var}") + + if "version" in dbt_vars.keys(): + log("Creating version variable") + dataset_sha = fetch_dataset_sha.run(dataset_id=dataset_id) + + # if there are other variables inside the list, update each item adding the version variable + if final_vars: + final_vars = get_join_dict.run(dict_list=final_vars, new_dict=dataset_sha) + else: + final_vars.append(dataset_sha) + + log(f"version created: {dataset_sha}") + + log(f"All variables was created, final value is: {final_vars}") + + return final_vars, date_var, flag_date_range + + ############### # # Local file managment @@ -1107,6 +1204,13 @@ def transform_raw_to_nested_structure( return error, filepath +############### +# +# Utilitary tasks +# +############### + + @task(checkpoint=False) def coalesce_task(value_list: Iterable): """ @@ -1121,101 +1225,23 @@ def coalesce_task(value_list: Iterable): try: return next(value for value in value_list if value is not None) except StopIteration: - return + return None -@task(checkpoint=False, nout=3) -def create_dbt_run_vars( - dataset_id: str, - dbt_vars: dict, - table_id: str, - raw_dataset_id: str, - raw_table_id: str, - mode: str, -) -> tuple[list[dict], Union[list[dict], dict, None], bool]: +@task(checkpoint=False, nout=2) +def unpack_mapped_results_nout2( + mapped_results: Iterable, +) -> tuple[list[Any], list[Any]]: """ - Create the variables to be used in dbt materialization based on a dict + Task to unpack the results from an nout=2 tasks in 2 lists when it is mapped Args: - dataset_id (str): the dataset_id to get the 
variables - dbt_vars (dict): dict containing the parameters - table_id (str): the table_id get the date_range variable - raw_dataset_id (str): the raw_dataset_id get the date_range variable - raw_table_id (str): the raw_table_id get the date_range variable - mode (str): the mode to get the date_range variable + mapped_results (Iterable): The mapped task return Returns: - tuple[list[dict]: the variables to be used in DBT - Union[list[dict], dict, None]: the date variable (date_range or run_date) - bool: a flag that indicates if the date_range variable came from Redis - """ - - log(f"Creating DBT variables. Parameter received: {dbt_vars}") - - if (not dbt_vars) or (not table_id): - log("dbt_vars or table_id are blank. Skiping task") - return [None], None, False - - final_vars = [] - date_var = None - flag_date_range = False + tuple[list[Any], list[Any]]: The task original return splited in 2 lists: + - 1st list being all the first return + - 2nd list being all the second return - if "date_range" in dbt_vars.keys(): - log("Creating date_range variable") - - # Set date_range variable manually - if dict_contains_keys( - dbt_vars["date_range"], ["date_range_start", "date_range_end"] - ): - date_var = { - "date_range_start": dbt_vars["date_range"]["date_range_start"], - "date_range_end": dbt_vars["date_range"]["date_range_end"], - } - # Create date_range using Redis - else: - raw_table_id = raw_table_id or table_id - - date_var = get_materialization_date_range.run( - dataset_id=dataset_id, - table_id=table_id, - raw_dataset_id=raw_dataset_id, - raw_table_id=raw_table_id, - table_run_datetime_column_name=dbt_vars["date_range"].get( - "table_run_datetime_column_name" - ), - mode=mode, - delay_hours=dbt_vars["date_range"].get("delay_hours", 0), - ) - - flag_date_range = True - - final_vars.append(date_var.copy()) - - log(f"date_range created: {date_var}") - - elif "run_date" in dbt_vars.keys(): - log("Creating run_date variable") - - date_var = get_run_dates.run( - dbt_vars["run_date"].get("date_range_start"), - dbt_vars["run_date"].get("date_range_end"), - ) - final_vars.append([d.copy() for d in date_var]) - - log(f"run_date created: {date_var}") - - if "version" in dbt_vars.keys(): - log("Creating version variable") - dataset_sha = fetch_dataset_sha.run(dataset_id=dataset_id) - - # if there are other variables inside the list, update each item adding the version variable - if final_vars: - final_vars = get_join_dict.run(dict_list=final_vars, new_dict=dataset_sha) - else: - final_vars.append(dataset_sha) - - log(f"version created: {dataset_sha}") - - log(f"All variables was created, final value is: {final_vars}") - - return final_vars, date_var, flag_date_range + """ + return [r[0] for r in mapped_results], [r[1] for r in mapped_results] From 22bb4ce20f2a9bfa59f078921d449e1d3689defd Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 11 Oct 2023 13:19:29 -0300 Subject: [PATCH 080/145] remover unmapped dos filepaths --- pipelines/rj_smtr/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index d408c75f7..7636ce081 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -101,7 +101,7 @@ get_raw_from_sources_returns = get_raw_from_sources.map( source_type=unmapped(source_type), - local_filepath=unmapped(filepaths), + local_filepath=filepaths, source_path=request_paths, dataset_id=unmapped(dataset_id), table_id=unmapped(table_id), From 8cb440459fe77d0b016d5f67fe1e352cf047da4f Mon Sep 17 00:00:00 2001 From: 
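unpack_mapped_results_nout2 exists because a mapped task declared with nout=2 hands downstream tasks a list of 2-tuples, while the rest of the flow wants two parallel lists (for example errors and filepaths). A tiny illustration of the reshaping it performs, with made-up values:

    # What unpack_mapped_results_nout2 does to a mapped nout=2 result (made-up values).
    mapped_results = [
        (None, "raw/2023-10-11-10.json"),
        ("Traceback ...", None),
        (None, "raw/2023-10-11-12.json"),
    ]

    errors = [r[0] for r in mapped_results]
    filepaths = [r[1] for r in mapped_results]

    assert errors == [None, "Traceback ...", None]
    assert filepaths == ["raw/2023-10-11-10.json", None, "raw/2023-10-11-12.json"]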
Rafael Date: Wed, 11 Oct 2023 13:19:41 -0300 Subject: [PATCH 081/145] log para debbug --- pipelines/rj_smtr/tasks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 5d2083e8f..eea38f3f2 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -903,6 +903,8 @@ def upload_staging_data_to_gcs( error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log(f"previous_error = {previous_error}") + upload_run_logs_to_bq( dataset_id=dataset_id, parent_table_id=table_id, From e8d9fb7be62464b5160e13fbc7b2a65f95e4953a Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 11 Oct 2023 14:58:09 -0300 Subject: [PATCH 082/145] =?UTF-8?q?retirar=20unmapped=20das=20parti=C3=A7?= =?UTF-8?q?=C3=B5es?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 7636ce081..5000c6d57 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -118,7 +118,7 @@ raw_filepath=raw_filepaths, table_id=unmapped(table_id), dataset_id=unmapped(dataset_id), - partitions=unmapped(partitions), + partitions=partitions, ) # Pré-tratamento # From cb7e7e5d52b38b8285c8c3089e0575a0176f10fe Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 11 Oct 2023 15:16:51 -0300 Subject: [PATCH 083/145] adicionar unmapped no parametro recapture --- pipelines/rj_smtr/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 5000c6d57..e95eae285 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -143,7 +143,7 @@ dataset_id=unmapped(dataset_id), partitions=partitions, previous_error=previous_errors, - recapture=recapture, + recapture=unmapped(recapture), ) default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) From 59789abafd22c19d31c62e2c555c58cf493990f9 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 16 Oct 2023 15:43:29 -0300 Subject: [PATCH 084/145] adicionar psycopg2 --- poetry.lock | 121 +++++++++++++++++++++++++++++++++++++++++++------ pyproject.toml | 1 + 2 files changed, 108 insertions(+), 14 deletions(-) diff --git a/poetry.lock b/poetry.lock index f106de89b..330ce7b4b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. 
[[package]] name = "adal" @@ -2483,6 +2483,7 @@ files = [ {file = "greenlet-2.0.2-cp27-cp27m-win32.whl", hash = "sha256:6c3acb79b0bfd4fe733dff8bc62695283b57949ebcca05ae5c129eb606ff2d74"}, {file = "greenlet-2.0.2-cp27-cp27m-win_amd64.whl", hash = "sha256:283737e0da3f08bd637b5ad058507e578dd462db259f7f6e4c5c365ba4ee9343"}, {file = "greenlet-2.0.2-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:d27ec7509b9c18b6d73f2f5ede2622441de812e7b1a80bbd446cb0633bd3d5ae"}, + {file = "greenlet-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d967650d3f56af314b72df7089d96cda1083a7fc2da05b375d2bc48c82ab3f3c"}, {file = "greenlet-2.0.2-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:30bcf80dda7f15ac77ba5af2b961bdd9dbc77fd4ac6105cee85b0d0a5fcf74df"}, {file = "greenlet-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:26fbfce90728d82bc9e6c38ea4d038cba20b7faf8a0ca53a9c07b67318d46088"}, {file = "greenlet-2.0.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9190f09060ea4debddd24665d6804b995a9c122ef5917ab26e1566dcc712ceeb"}, @@ -2491,6 +2492,7 @@ files = [ {file = "greenlet-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:76ae285c8104046b3a7f06b42f29c7b73f77683df18c49ab5af7983994c2dd91"}, {file = "greenlet-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:2d4686f195e32d36b4d7cf2d166857dbd0ee9f3d20ae349b6bf8afc8485b3645"}, {file = "greenlet-2.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c4302695ad8027363e96311df24ee28978162cdcdd2006476c43970b384a244c"}, + {file = "greenlet-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d4606a527e30548153be1a9f155f4e283d109ffba663a15856089fb55f933e47"}, {file = "greenlet-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c48f54ef8e05f04d6eff74b8233f6063cb1ed960243eacc474ee73a2ea8573ca"}, {file = "greenlet-2.0.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a1846f1b999e78e13837c93c778dcfc3365902cfb8d1bdb7dd73ead37059f0d0"}, {file = "greenlet-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a06ad5312349fec0ab944664b01d26f8d1f05009566339ac6f63f56589bc1a2"}, @@ -2520,6 +2522,7 @@ files = [ {file = "greenlet-2.0.2-cp37-cp37m-win32.whl", hash = "sha256:3f6ea9bd35eb450837a3d80e77b517ea5bc56b4647f5502cd28de13675ee12f7"}, {file = "greenlet-2.0.2-cp37-cp37m-win_amd64.whl", hash = "sha256:7492e2b7bd7c9b9916388d9df23fa49d9b88ac0640db0a5b4ecc2b653bf451e3"}, {file = "greenlet-2.0.2-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:b864ba53912b6c3ab6bcb2beb19f19edd01a6bfcbdfe1f37ddd1778abfe75a30"}, + {file = "greenlet-2.0.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1087300cf9700bbf455b1b97e24db18f2f77b55302a68272c56209d5587c12d1"}, {file = "greenlet-2.0.2-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:ba2956617f1c42598a308a84c6cf021a90ff3862eddafd20c3333d50f0edb45b"}, {file = "greenlet-2.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc3a569657468b6f3fb60587e48356fe512c1754ca05a564f11366ac9e306526"}, {file = "greenlet-2.0.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8eab883b3b2a38cc1e050819ef06a7e6344d4a990d24d45bc6f2cf959045a45b"}, @@ -2528,6 +2531,7 @@ files = [ {file = "greenlet-2.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b0ef99cdbe2b682b9ccbb964743a6aca37905fda5e0452e5ee239b1654d37f2a"}, {file = "greenlet-2.0.2-cp38-cp38-win32.whl", hash = 
"sha256:b80f600eddddce72320dbbc8e3784d16bd3fb7b517e82476d8da921f27d4b249"}, {file = "greenlet-2.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:4d2e11331fc0c02b6e84b0d28ece3a36e0548ee1a1ce9ddde03752d9b79bba40"}, + {file = "greenlet-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8512a0c38cfd4e66a858ddd1b17705587900dd760c6003998e9472b77b56d417"}, {file = "greenlet-2.0.2-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:88d9ab96491d38a5ab7c56dd7a3cc37d83336ecc564e4e8816dbed12e5aaefc8"}, {file = "greenlet-2.0.2-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:561091a7be172ab497a3527602d467e2b3fbe75f9e783d8b8ce403fa414f71a6"}, {file = "greenlet-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:971ce5e14dc5e73715755d0ca2975ac88cfdaefcaab078a284fea6cfabf866df"}, @@ -3533,10 +3537,7 @@ packaging = "<24" pandas = "<3" prometheus-flask-exporter = {version = "*", optional = true, markers = "extra == \"extras\""} protobuf = ">=3.12.0,<5" -pyarrow = [ - {version = ">=4.0.0,<13"}, - {version = "*", optional = true, markers = "extra == \"extras\""}, -] +pyarrow = ">=4.0.0,<13" pysftp = {version = "*", optional = true, markers = "extra == \"extras\""} pytz = "<2024" pyyaml = ">=5.1,<7" @@ -3945,12 +3946,11 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.21.0", markers = "python_version <= \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, - {version = ">=1.19.3", markers = "python_version >= \"3.6\" and platform_system == \"Linux\" and platform_machine == \"aarch64\" or python_version >= \"3.9\""}, - {version = ">=1.17.0", markers = "python_version >= \"3.7\""}, - {version = ">=1.17.3", markers = "python_version >= \"3.8\""}, - {version = ">=1.21.2", markers = "python_version >= \"3.10\""}, + {version = ">=1.21.0", markers = "python_version <= \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\" and python_version >= \"3.8\""}, + {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, + {version = ">=1.17.3", markers = "(platform_system != \"Darwin\" and platform_system != \"Linux\") and python_version >= \"3.8\" and python_version < \"3.9\" or platform_system != \"Darwin\" and python_version >= \"3.8\" and python_version < \"3.9\" and platform_machine != \"aarch64\" or platform_machine != \"arm64\" and python_version >= \"3.8\" and python_version < \"3.9\" and platform_system != \"Linux\" or (platform_machine != \"arm64\" and platform_machine != \"aarch64\") and python_version >= \"3.8\" and python_version < \"3.9\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\""}, + {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\""}, ] [[package]] @@ -3968,6 +3968,7 @@ files = [ {file = "orjson-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a39c2529d75373b7167bf84c814ef9b8f3737a339c225ed6c0df40736df8748"}, {file = "orjson-3.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:84ebd6fdf138eb0eb4280045442331ee71c0aab5e16397ba6645f32f911bfb37"}, {file = "orjson-3.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:5a60a1cfcfe310547a1946506dd4f1ed0a7d5bd5b02c8697d9d5dcd8d2e9245e"}, + {file = "orjson-3.9.2-cp310-none-win32.whl", hash = "sha256:2ae61f5d544030a6379dbc23405df66fea0777c48a0216d2d83d3e08b69eb676"}, {file = "orjson-3.9.2-cp310-none-win_amd64.whl", hash = "sha256:c290c4f81e8fd0c1683638802c11610b2f722b540f8e5e858b6914b495cf90c8"}, {file = "orjson-3.9.2-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:02ef014f9a605e84b675060785e37ec9c0d2347a04f1307a9d6840ab8ecd6f55"}, {file = "orjson-3.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:992af54265ada1c1579500d6594ed73fe333e726de70d64919cf37f93defdd06"}, @@ -3977,6 +3978,7 @@ files = [ {file = "orjson-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:275b5a18fd9ed60b2720543d3ddac170051c43d680e47d04ff5203d2c6d8ebf1"}, {file = "orjson-3.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b9aea6dcb99fcbc9f6d1dd84fca92322fda261da7fb014514bb4689c7c2097a8"}, {file = "orjson-3.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7d74ae0e101d17c22ef67b741ba356ab896fc0fa64b301c2bf2bb0a4d874b190"}, + {file = "orjson-3.9.2-cp311-none-win32.whl", hash = "sha256:a9a7d618f99b2d67365f2b3a588686195cb6e16666cd5471da603a01315c17cc"}, {file = "orjson-3.9.2-cp311-none-win_amd64.whl", hash = "sha256:6320b28e7bdb58c3a3a5efffe04b9edad3318d82409e84670a9b24e8035a249d"}, {file = "orjson-3.9.2-cp37-cp37m-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:368e9cc91ecb7ac21f2aa475e1901204110cf3e714e98649c2502227d248f947"}, {file = "orjson-3.9.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58e9e70f0dcd6a802c35887f306b555ff7a214840aad7de24901fc8bd9cf5dde"}, @@ -3986,6 +3988,7 @@ files = [ {file = "orjson-3.9.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e46e9c5b404bb9e41d5555762fd410d5466b7eb1ec170ad1b1609cbebe71df21"}, {file = "orjson-3.9.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:8170157288714678ffd64f5de33039e1164a73fd8b6be40a8a273f80093f5c4f"}, {file = "orjson-3.9.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e3e2f087161947dafe8319ea2cfcb9cea4bb9d2172ecc60ac3c9738f72ef2909"}, + {file = "orjson-3.9.2-cp37-none-win32.whl", hash = "sha256:373b7b2ad11975d143556fdbd2c27e1150b535d2c07e0b48dc434211ce557fe6"}, {file = "orjson-3.9.2-cp37-none-win_amd64.whl", hash = "sha256:d7de3dbbe74109ae598692113cec327fd30c5a30ebca819b21dfa4052f7b08ef"}, {file = "orjson-3.9.2-cp38-cp38-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:8cd4385c59bbc1433cad4a80aca65d2d9039646a9c57f8084897549b55913b17"}, {file = "orjson-3.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a74036aab1a80c361039290cdbc51aa7adc7ea13f56e5ef94e9be536abd227bd"}, @@ -3995,6 +3998,7 @@ files = [ {file = "orjson-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1882a70bb69595b9ec5aac0040a819e94d2833fe54901e2b32f5e734bc259a8b"}, {file = "orjson-3.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:fc05e060d452145ab3c0b5420769e7356050ea311fc03cb9d79c481982917cca"}, {file = "orjson-3.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f8bc2c40d9bb26efefb10949d261a47ca196772c308babc538dd9f4b73e8d386"}, + {file = "orjson-3.9.2-cp38-none-win32.whl", hash = "sha256:302d80198d8d5b658065627da3a356cbe5efa082b89b303f162f030c622e0a17"}, {file = "orjson-3.9.2-cp38-none-win_amd64.whl", hash = 
"sha256:3164fc20a585ec30a9aff33ad5de3b20ce85702b2b2a456852c413e3f0d7ab09"}, {file = "orjson-3.9.2-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:7a6ccadf788531595ed4728aa746bc271955448d2460ff0ef8e21eb3f2a281ba"}, {file = "orjson-3.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3245d230370f571c945f69aab823c279a868dc877352817e22e551de155cb06c"}, @@ -4004,6 +4008,7 @@ files = [ {file = "orjson-3.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03fb36f187a0c19ff38f6289418863df8b9b7880cdbe279e920bef3a09d8dab1"}, {file = "orjson-3.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:20925d07a97c49c6305bff1635318d9fc1804aa4ccacb5fb0deb8a910e57d97a"}, {file = "orjson-3.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:eebfed53bec5674e981ebe8ed2cf00b3f7bcda62d634733ff779c264307ea505"}, + {file = "orjson-3.9.2-cp39-none-win32.whl", hash = "sha256:ba60f09d735f16593950c6adf033fbb526faa94d776925579a87b777db7d0838"}, {file = "orjson-3.9.2-cp39-none-win_amd64.whl", hash = "sha256:869b961df5fcedf6c79f4096119b35679b63272362e9b745e668f0391a892d39"}, {file = "orjson-3.9.2.tar.gz", hash = "sha256:24257c8f641979bf25ecd3e27251b5cc194cdd3a6e96004aac8446f5e63d9664"}, ] @@ -4594,6 +4599,84 @@ files = [ [package.extras] test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] +[[package]] +name = "psycopg2-binary" +version = "2.9.9" +description = "psycopg2 - Python-PostgreSQL Database Adapter" +optional = false +python-versions = ">=3.7" +files = [ + {file = "psycopg2-binary-2.9.9.tar.gz", hash = "sha256:7f01846810177d829c7692f1f5ada8096762d9172af1b1a28d4ab5b77c923c1c"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c2470da5418b76232f02a2fcd2229537bb2d5a7096674ce61859c3229f2eb202"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c6af2a6d4b7ee9615cbb162b0738f6e1fd1f5c3eda7e5da17861eacf4c717ea7"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75723c3c0fbbf34350b46a3199eb50638ab22a0228f93fb472ef4d9becc2382b"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:83791a65b51ad6ee6cf0845634859d69a038ea9b03d7b26e703f94c7e93dbcf9"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0ef4854e82c09e84cc63084a9e4ccd6d9b154f1dbdd283efb92ecd0b5e2b8c84"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed1184ab8f113e8d660ce49a56390ca181f2981066acc27cf637d5c1e10ce46e"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d2997c458c690ec2bc6b0b7ecbafd02b029b7b4283078d3b32a852a7ce3ddd98"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:b58b4710c7f4161b5e9dcbe73bb7c62d65670a87df7bcce9e1faaad43e715245"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0c009475ee389757e6e34611d75f6e4f05f0cf5ebb76c6037508318e1a1e0d7e"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8dbf6d1bc73f1d04ec1734bae3b4fb0ee3cb2a493d35ede9badbeb901fb40f6f"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-win32.whl", hash = "sha256:3f78fd71c4f43a13d342be74ebbc0666fe1f555b8837eb113cb7416856c79682"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-win_amd64.whl", hash = 
"sha256:876801744b0dee379e4e3c38b76fc89f88834bb15bf92ee07d94acd06ec890a0"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ee825e70b1a209475622f7f7b776785bd68f34af6e7a46e2e42f27b659b5bc26"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1ea665f8ce695bcc37a90ee52de7a7980be5161375d42a0b6c6abedbf0d81f0f"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:143072318f793f53819048fdfe30c321890af0c3ec7cb1dfc9cc87aa88241de2"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c332c8d69fb64979ebf76613c66b985414927a40f8defa16cf1bc028b7b0a7b0"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7fc5a5acafb7d6ccca13bfa8c90f8c51f13d8fb87d95656d3950f0158d3ce53"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:977646e05232579d2e7b9c59e21dbe5261f403a88417f6a6512e70d3f8a046be"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b6356793b84728d9d50ead16ab43c187673831e9d4019013f1402c41b1db9b27"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:bc7bb56d04601d443f24094e9e31ae6deec9ccb23581f75343feebaf30423359"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:77853062a2c45be16fd6b8d6de2a99278ee1d985a7bd8b103e97e41c034006d2"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:78151aa3ec21dccd5cdef6c74c3e73386dcdfaf19bced944169697d7ac7482fc"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e6f98446430fdf41bd36d4faa6cb409f5140c1c2cf58ce0bbdaf16af7d3f119"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c77e3d1862452565875eb31bdb45ac62502feabbd53429fdc39a1cc341d681ba"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = 
"sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8359bf4791968c5a78c56103702000105501adb557f3cf772b2c207284273984"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:275ff571376626195ab95a746e6a04c7df8ea34638b99fc11160de91f2fef503"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:f9b5571d33660d5009a8b3c25dc1db560206e2d2f89d3df1cb32d72c0d117d52"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:420f9bbf47a02616e8554e825208cb947969451978dceb77f95ad09c37791dae"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:4154ad09dac630a0f13f37b583eae260c6aa885d67dfbccb5b02c33f31a6d420"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a148c5d507bb9b4f2030a2025c545fccb0e1ef317393eaba42e7eabd28eb6041"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-win32.whl", hash = "sha256:68fc1f1ba168724771e38bee37d940d2865cb0f562380a1fb1ffb428b75cb692"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-win_amd64.whl", hash = "sha256:281309265596e388ef483250db3640e5f414168c5a67e9c665cafce9492eda2f"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:60989127da422b74a04345096c10d416c2b41bd7bf2a380eb541059e4e999980"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:246b123cc54bb5361588acc54218c8c9fb73068bf227a4a531d8ed56fa3ca7d6"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34eccd14566f8fe14b2b95bb13b11572f7c7d5c36da61caf414d23b91fcc5d94"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18d0ef97766055fec15b5de2c06dd8e7654705ce3e5e5eed3b6651a1d2a9a152"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d3f82c171b4ccd83bbaf35aa05e44e690113bd4f3b7b6cc54d2219b132f3ae55"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ead20f7913a9c1e894aebe47cccf9dc834e1618b7aa96155d2091a626e59c972"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ca49a8119c6cbd77375ae303b0cfd8c11f011abbbd64601167ecca18a87e7cdd"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:323ba25b92454adb36fa425dc5cf6f8f19f78948cbad2e7bc6cdf7b0d7982e59"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:1236ed0952fbd919c100bc839eaa4a39ebc397ed1c08a97fc45fee2a595aa1b3"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:729177eaf0aefca0994ce4cffe96ad3c75e377c7b6f4efa59ebf003b6d398716"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-win32.whl", hash = "sha256:804d99b24ad523a1fe18cc707bf741670332f7c7412e9d49cb5eab67e886b9b5"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-win_amd64.whl", hash = 
"sha256:a6cdcc3ede532f4a4b96000b6362099591ab4a3e913d70bcbac2b56c872446f7"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:72dffbd8b4194858d0941062a9766f8297e8868e1dd07a7b36212aaa90f49472"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:30dcc86377618a4c8f3b72418df92e77be4254d8f89f14b8e8f57d6d43603c0f"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:31a34c508c003a4347d389a9e6fcc2307cc2150eb516462a7a17512130de109e"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:15208be1c50b99203fe88d15695f22a5bed95ab3f84354c494bcb1d08557df67"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1873aade94b74715be2246321c8650cabf5a0d098a95bab81145ffffa4c13876"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a58c98a7e9c021f357348867f537017057c2ed7f77337fd914d0bedb35dace7"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4686818798f9194d03c9129a4d9a702d9e113a89cb03bffe08c6cf799e053291"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ebdc36bea43063116f0486869652cb2ed7032dbc59fbcb4445c4862b5c1ecf7f"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:ca08decd2697fdea0aea364b370b1249d47336aec935f87b8bbfd7da5b2ee9c1"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ac05fb791acf5e1a3e39402641827780fe44d27e72567a000412c648a85ba860"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-win32.whl", hash = "sha256:9dba73be7305b399924709b91682299794887cbbd88e38226ed9f6712eabee90"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-win_amd64.whl", hash = "sha256:f7ae5d65ccfbebdfa761585228eb4d0df3a8b15cfb53bd953e713e09fbb12957"}, +] + [[package]] name = "ptyprocess" version = "0.7.0" @@ -5296,6 +5379,7 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -5303,8 +5387,15 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", 
hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -5321,6 +5412,7 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -5328,6 +5420,7 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -6088,7 +6181,7 @@ files = [ ] [package.dependencies] -greenlet = {version = "!=0.4.17", markers = "platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\""} +greenlet = {version = "!=0.4.17", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""} typing-extensions = ">=4.2.0" [package.extras] @@ -6248,8 +6341,8 @@ packaging = ">=21.3" pandas = ">=0.25" patsy = ">=0.5.2" scipy = [ - {version = ">=1.3", markers = "(python_version > \"3.9\" or platform_system != \"Windows\" or platform_machine != \"x86\") and python_version < \"3.12\""}, - {version = ">=1.3,<1.9", markers = "(python_version == \"3.8\" or python_version == \"3.9\") and platform_system == \"Windows\" and platform_machine == \"x86\""}, + {version = ">=1.3", markers = "python_version > \"3.9\" and python_version < \"3.12\" or platform_system != \"Windows\" and python_version < \"3.12\" or platform_machine != \"x86\" and python_version < \"3.12\""}, + {version = ">=1.3,<1.9", markers = "python_version == \"3.8\" and platform_system == \"Windows\" and platform_machine == \"x86\" or python_version == \"3.9\" and platform_system == \"Windows\" and platform_machine == \"x86\""}, ] [package.extras] @@ -7119,4 +7212,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.11" -content-hash = "ed25c76ba0aeea3d6fc6c59725c127160d13c12b527a3cf3900cb58db177750c" +content-hash = "44c47c0f926f2494ef43ed357af82aa10b2ce5d1c5a46197a594ed94ec1e8b6a" diff --git a/pyproject.toml b/pyproject.toml index 0c8318999..36a66722a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ statsmodels = "^0.13.0" tweepy = "4.4" xarray = "^2022.6.0" xgboost = "^1.7.4" +psycopg2-binary = "^2.9.9" [tool.poetry.dev-dependencies] pylint = "^2.12.2" From 60b1a93339085bfc9ca7a6122b952e544a9e6605 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 16 Oct 2023 15:43:44 -0300 Subject: [PATCH 085/145] =?UTF-8?q?coment=C3=A1rios=20dos=20parametros?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/flows.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index e95eae285..e08bb919e 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -48,15 +48,20 @@ ) as default_capture_flow: # Configuração # + # Parâmetros Gerais # table_id = Parameter("table_id", default=None) + dataset_id = Parameter("dataset_id", default=None) partition_date_only = Parameter("partition_date_only", default=None) + + # Parâmetros Captura # extract_params = Parameter("extract_params", default=None) - dataset_id = Parameter("dataset_id", default=None) 
secret_path = Parameter("secret_path", default=None) - primary_key = Parameter("primary_key", default=None) source_type = Parameter("source_type", default=None) recapture = Parameter("recapture", default=False) + # Parâmetros Pré-tratamento # + primary_key = Parameter("primary_key", default=None) + with case(recapture, True): _, recapture_timestamps, previous_errors = query_logs( dataset_id=dataset_id, From ff7797355e8dc90db951e23eee9e7c10c32b29e0 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 16 Oct 2023 15:44:02 -0300 Subject: [PATCH 086/145] =?UTF-8?q?adicionar=20conex=C3=A3o=20com=20postgr?= =?UTF-8?q?esql?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/utils.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 6a6c70ee5..8775ca9b8 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -20,6 +20,8 @@ import pandas as pd from google.cloud.storage.blob import Blob import pymysql +import psycopg2 +import psycopg2.extras from prefect.schedules.clocks import IntervalClock @@ -435,7 +437,6 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l clocks = [] for count, parameters in enumerate(table_parameters): parameter_defaults = parameters | general_flow_params - log(f"parameter_defaults: {parameter_defaults}") clocks.append( IntervalClock( interval=clock_interval, @@ -630,12 +631,16 @@ def get_raw_data_db( tuple[str, str, str]: Error, data and filetype """ connection_mapping = { - # 'postgresql': {'connector': psycopg2.connect, 'port': '5432', 'cursor':{'cursor_factory': psycopg2.extras.RealDictCursor}}, + "postgresql": { + "connector": psycopg2.connect, + "port": "5432", + "cursor": {"cursor_factory": psycopg2.extras.RealDictCursor}, + }, "mysql": { "connector": pymysql.connect, "port": "3306", "cursor": {"cursor": pymysql.cursors.DictCursor}, - } + }, } data = None From b0843876f6c4f61094b3481bdb0da546555094d3 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 16 Oct 2023 16:19:05 -0300 Subject: [PATCH 087/145] mudar bilhetagem para extrair do db --- pipelines/rj_smtr/constants.py | 9 ++++----- pipelines/rj_smtr/tasks.py | 7 ++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 0037c6989..ee96e8a79 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -170,19 +170,18 @@ class constants(Enum): # pylint: disable=c0103 "databases": { "principal_db": { "engine": "mysql", - "host": "principal-database-replica.internal", + "host": "10.5.114.121", }, "tarifa_db": { "engine": "postgres", - "host": "tarifa-database-replica.internal", + "host": "10.5.113.254", }, "transacao_db": { "engine": "postgres", - "host": "transacao-database-replica.internal", + "host": "10.5.114.65", }, }, - "vpn_url": "http://vpn-jae.mobilidade.rio/", - "source_type": "api-json", + "source_type": "db", } BILHETAGEM_CAPTURE_RUN_INTERVAL = { diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index eea38f3f2..3969f28b9 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -564,18 +564,15 @@ def create_request_params( database = constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["databases"][ extract_params["database"] ] - request_url = ( - constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["vpn_url"] - + database["engine"] - ) + request_url = database["host"] 
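# Condensed sketch (not the repository implementation) of how the engine-keyed
# connector mapping added to get_raw_data_db above can be exercised together with
# the request_params dict built by create_request_params ("database", "engine",
# "query"). Host, credentials and the query are placeholders.
from typing import List

import pandas as pd
import psycopg2
import pymysql

CONNECTORS = {
    "postgresql": psycopg2.connect,
    "mysql": pymysql.connect,
}


def fetch_records(request_params: dict, host: str, user: str, password: str) -> List[dict]:
    """Open the connection matching request_params['engine'] and return rows as dicts."""
    connector = CONNECTORS[request_params["engine"]]
    connection = connector(
        host=host,
        user=user,
        password=password,
        database=request_params["database"],
    )
    try:
        # pandas can read directly from a DBAPI connection and return records as dicts
        return pd.read_sql(sql=request_params["query"], con=connection).to_dict(orient="records")
    finally:
        connection.close()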
datetime_range = get_datetime_range( timestamp=timestamp, interval=timedelta(**extract_params["run_interval"]) ) request_params = { - "host": database["host"], # TODO: exibir no log em ambiente fechado "database": extract_params["database"], + "engine": database["engine"], "query": extract_params["query"].format(**datetime_range), } From 032763c6f65fcf9310775e2d70a5ecba2bbfafba Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 16 Oct 2023 16:19:32 -0300 Subject: [PATCH 088/145] padronizar nomenclatura dos argumentos --- pipelines/rj_smtr/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 8775ca9b8..cf69edc2c 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -615,14 +615,14 @@ def get_raw_data_gcs( def get_raw_data_db( - sql: str, dbms: str, host: str, secret_path: str, database: str + query: str, engine: str, host: str, secret_path: str, database: str ) -> tuple[str, str, str]: """ Get data from Databases Args: - sql (str): the SQL Query to execute - dbms (str): The datase management system + query (str): the SQL Query to execute + engine (str): The datase management system host (str): The database host secret_path (str): Secret path to get credentials database (str): The database to connect @@ -650,7 +650,7 @@ def get_raw_data_db( try: credentials = get_vault_secret(secret_path)["data"] - connection = connection_mapping[dbms]( + connection = connection_mapping[engine]( host=host, user=credentials["user"], password=credentials["password"], @@ -658,8 +658,8 @@ def get_raw_data_db( ) with connection: - with connection.cursor(**connection_mapping[dbms]["cursor"]) as cursor: - cursor.execute(sql) + with connection.cursor(**connection_mapping[engine]["cursor"]) as cursor: + cursor.execute(query) data = cursor.fetchall() data = [dict(d) for d in data] From ffb205176bed69b67f5578392ac4951a40483691 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 16 Oct 2023 16:43:41 -0300 Subject: [PATCH 089/145] mudar label schedule para dev --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index 21e13f05b..6cb4b0724 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -21,7 +21,7 @@ **constants.BILHETAGEM_CAPTURE_RUN_INTERVAL.value["transacao_run_interval"] ), labels=[ - emd_constants.RJ_SMTR_AGENT_LABEL.value, + emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, ], table_parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, From 10911c60af21122883c0c6557678128b6397a595 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 16 Oct 2023 18:19:37 -0300 Subject: [PATCH 090/145] corrigir constante db bilhetagem postgresql --- pipelines/rj_smtr/constants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index ee96e8a79..5dde5c55a 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -173,11 +173,11 @@ class constants(Enum): # pylint: disable=c0103 "host": "10.5.114.121", }, "tarifa_db": { - "engine": "postgres", + "engine": "postgresql", "host": "10.5.113.254", }, "transacao_db": { - "engine": "postgres", + "engine": "postgresql", "host": 
"10.5.114.65", }, }, From 7e51e69ce7189306683ee4142c461bd1ee1a5068 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 16 Oct 2023 18:24:07 -0300 Subject: [PATCH 091/145] =?UTF-8?q?alterar=20nomea=C3=A7=C3=A3o=20para=20r?= =?UTF-8?q?uns=20de=20recaptura?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/flows.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index e08bb919e..14d8a1207 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -67,15 +67,18 @@ dataset_id=dataset_id, table_id=table_id, ) + RECAPTURE_RUNNAME_SUFIX = " Recaptura" with case(recapture, False): capture_timestamp = [get_current_timestamp()] previous_errors = [None] + CAPTURE_RUNNAME_SUFIX = "" timestamps = merge(recapture_timestamps, capture_timestamp) + runname_sufix = merge(RECAPTURE_RUNNAME_SUFIX, CAPTURE_RUNNAME_SUFIX) rename_flow_run = rename_current_flow_run_now_time( - prefix=default_capture_flow.name + " " + table_id + ": ", + prefix=default_capture_flow.name + " " + table_id + runname_sufix + ": ", now_time=get_now_time(), ) From e256f120f52b128367301b39ce8c34541c6423a7 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 16 Oct 2023 19:15:03 -0300 Subject: [PATCH 092/145] ajuste connector --- pipelines/rj_smtr/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index cf69edc2c..89bd2a3c7 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -650,7 +650,7 @@ def get_raw_data_db( try: credentials = get_vault_secret(secret_path)["data"] - connection = connection_mapping[engine]( + connection = connection_mapping[engine]["connector"]( host=host, user=credentials["user"], password=credentials["password"], From c67a93e970450b0a2316bda6c66ba1b2055c2a5f Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 09:06:48 -0300 Subject: [PATCH 093/145] alterar IP para DNS --- pipelines/rj_smtr/constants.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 5dde5c55a..9369bb465 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -170,15 +170,15 @@ class constants(Enum): # pylint: disable=c0103 "databases": { "principal_db": { "engine": "mysql", - "host": "10.5.114.121", + "host": "principal-database-replica.internal", }, "tarifa_db": { "engine": "postgresql", - "host": "10.5.113.254", + "host": "tarifa-database-replica.internal", }, "transacao_db": { "engine": "postgresql", - "host": "10.5.114.65", + "host": "transacao-database-replica.internal", }, }, "source_type": "db", From a5d342c98fb4d531ed77f2c4e3cf986dc8b85dcb Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 10:28:32 -0300 Subject: [PATCH 094/145] Serialize datetime objects / read sql with pandas --- pipelines/rj_smtr/utils.py | 64 +++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 89bd2a3c7..16ed538d3 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -7,8 +7,8 @@ from ftplib import FTP from pathlib import Path -from datetime import timedelta, datetime -from typing import List, Union +from datetime import timedelta, datetime, date +from typing import List, Union, Any import traceback import io import json @@ -462,17 +462,40 @@ def 
dict_contains_keys(input_dict: dict, keys: list[str]) -> bool: return all(x in input_dict.keys() for x in keys) +def custom_serialization(obj: Any) -> Any: + """ + Function to serialize not JSON serializable objects + + Args: + obj (Any): Object to serialize + + Returns: + Any: Serialized object + """ + if isinstance(obj, (datetime, date, pd.Timestamp)): + if obj.tzinfo is None: + obj = obj.tz_localize(emd_constants.DEFAULT_TIMEZONE.value) + else: + obj = obj.tz_convert(emd_constants.DEFAULT_TIMEZONE.value) + + return obj.isoformat() + + raise TypeError(f"Object of type {type(obj)} is not JSON serializable") + + def save_raw_local_func( - data: Union[dict, str], filepath: str, mode: str = "raw", filetype: str = "json" + data: Union[dict, str], + filepath: str, + mode: str = "raw", + filetype: str = "json", ) -> str: """ Saves json response from API to .json file. Args: + data (Union[dict, str]): Raw data to save filepath (str): Path which to save raw file - status (dict): Must contain keys - * data: json returned from API - * error: error catched from API request mode (str, optional): Folder to save locally, later folder which to upload to GCS. + filetype (str, optional): The file format Returns: str: Path to the saved file """ @@ -485,10 +508,8 @@ def save_raw_local_func( if isinstance(data, str): data = json.loads(data) with Path(_filepath).open("w", encoding="utf-8") as fi: - json.dump(data, fi) + json.dump(data, fi, default=custom_serialization) - # if filetype == "csv": - # pass if filetype in ("txt", "csv"): with open(_filepath, "w", encoding="utf-8") as file: file.write(data) @@ -630,17 +651,9 @@ def get_raw_data_db( Returns: tuple[str, str, str]: Error, data and filetype """ - connection_mapping = { - "postgresql": { - "connector": psycopg2.connect, - "port": "5432", - "cursor": {"cursor_factory": psycopg2.extras.RealDictCursor}, - }, - "mysql": { - "connector": pymysql.connect, - "port": "3306", - "cursor": {"cursor": pymysql.cursors.DictCursor}, - }, + connector_mapping = { + "postgresql": psycopg2.connect, + "mysql": pymysql.connect, } data = None @@ -650,19 +663,14 @@ def get_raw_data_db( try: credentials = get_vault_secret(secret_path)["data"] - connection = connection_mapping[engine]["connector"]( + with connector_mapping[engine]( host=host, user=credentials["user"], password=credentials["password"], database=database, - ) - - with connection: - with connection.cursor(**connection_mapping[engine]["cursor"]) as cursor: - cursor.execute(query) - data = cursor.fetchall() + ) as connection: + data = pd.read_sql(sql=query, con=connection).to_dict(orient="records") - data = [dict(d) for d in data] except Exception: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") From 16ffff35eee7cc10d56334174602ced25b2e4461 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 12:20:37 -0300 Subject: [PATCH 095/145] mudar logica do nome da run --- pipelines/rj_smtr/flows.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 14d8a1207..db72bfd3d 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -5,7 +5,7 @@ from prefect.run_configs import KubernetesRun from prefect.storage import GCS -from prefect import case, Parameter +from prefect import case, Parameter, task from prefect.utilities.edges import unmapped from prefect.tasks.control_flow import merge @@ -62,23 +62,26 @@ # Parâmetros Pré-tratamento # primary_key = 
Parameter("primary_key", default=None) + get_run_name_prefix = task( + lambda recap: "Recaptura" if recap else "Captura", + name="get_run_name_prefix", + checkpoint=False, + ) + with case(recapture, True): _, recapture_timestamps, previous_errors = query_logs( dataset_id=dataset_id, table_id=table_id, ) - RECAPTURE_RUNNAME_SUFIX = " Recaptura" with case(recapture, False): capture_timestamp = [get_current_timestamp()] previous_errors = [None] - CAPTURE_RUNNAME_SUFIX = "" timestamps = merge(recapture_timestamps, capture_timestamp) - runname_sufix = merge(RECAPTURE_RUNNAME_SUFIX, CAPTURE_RUNNAME_SUFIX) rename_flow_run = rename_current_flow_run_now_time( - prefix=default_capture_flow.name + " " + table_id + runname_sufix + ": ", + prefix="SMTR: " + get_run_name_prefix(recap=recapture) + " " + table_id + ": ", now_time=get_now_time(), ) From 55fbe34f12bbe6d26264e269de0e4b82244caa67 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 12:21:30 -0300 Subject: [PATCH 096/145] cria recaptura bilhetagem --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 51 ++++++++++++++++--- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index fb6e67594..0a1e29ba9 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -39,7 +39,14 @@ from pipelines.rj_smtr.constants import constants -from pipelines.rj_smtr.schedules import every_hour +from pipelines.rj_smtr.schedules import every_hour, every_minute + + +GENERAL_CAPTURE_DEFAULT_PARAMS = { + "dataset_id": constants.BILHETAGEM_DATASET_ID.value, + "secret_path": constants.BILHETAGEM_SECRET_PATH.value, + "source_type": constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], +} # Flows # @@ -52,7 +59,25 @@ image=emd_constants.DOCKER_IMAGE.value, labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) -bilhetagem_transacao_captura.schedule = bilhetagem_transacao_schedule + +bilhetagem_transacao_captura = set_default_parameters( + flow=bilhetagem_transacao_captura, + default_parameters=GENERAL_CAPTURE_DEFAULT_PARAMS + | constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, +) + +bilhetagem_transacao_captura.schedule = every_minute + +bilhetagem_transacao_recaptura = deepcopy(default_capture_flow) +bilhetagem_transacao_recaptura.name = "SMTR: Bilhetagem Transação - Recaptura (subflow)" +bilhetagem_transacao_recaptura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) +bilhetagem_transacao_recaptura = set_default_parameters( + flow=bilhetagem_transacao_recaptura, + default_parameters=GENERAL_CAPTURE_DEFAULT_PARAMS + | constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value + | {"recapture": True}, +) + # BILHETAGEM AUXILIAR - SUBFLOW PARA RODAR ANTES DE CADA MATERIALIZAÇÃO # @@ -66,11 +91,7 @@ bilhetagem_auxiliar_captura = set_default_parameters( flow=bilhetagem_auxiliar_captura, - default_parameters={ - "dataset_id": constants.BILHETAGEM_DATASET_ID.value, - "secret_path": constants.BILHETAGEM_SECRET_PATH.value, - "source_type": constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], - }, + default_parameters=GENERAL_CAPTURE_DEFAULT_PARAMS, ) # MATERIALIZAÇÃO - SUBFLOW DE MATERIALIZAÇÃO @@ -106,12 +127,28 @@ LABELS = get_current_flow_labels() + # Recaptura Transações + + run_recaptura = create_flow_run( + flow_name=bilhetagem_transacao_recaptura.name, + project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + labels=LABELS, + ) + + 
wait_recaptura = wait_for_flow_run( + run_recaptura, + stream_states=True, + stream_logs=True, + raise_final_state=True, + ) + # Captura runs_captura = create_flow_run.map( flow_name=unmapped(bilhetagem_auxiliar_captura.name), project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), + upstream_tasks=[wait_recaptura], ) wait_captura = wait_for_flow_run.map( From db6e6d9ada18bb35f27d1b8ee46ac473dc17bea7 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 14:32:59 -0300 Subject: [PATCH 097/145] mudar host para IP / adiciona interval_minutes --- pipelines/rj_smtr/constants.py | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 9369bb465..ccf1c6c44 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -170,25 +170,20 @@ class constants(Enum): # pylint: disable=c0103 "databases": { "principal_db": { "engine": "mysql", - "host": "principal-database-replica.internal", + "host": "10.5.114.121", }, "tarifa_db": { "engine": "postgresql", - "host": "tarifa-database-replica.internal", + "host": "10.5.113.254", }, "transacao_db": { "engine": "postgresql", - "host": "transacao-database-replica.internal", + "host": "10.5.115.1", }, }, "source_type": "db", } - BILHETAGEM_CAPTURE_RUN_INTERVAL = { - "transacao_run_interval": {"minutes": 1}, - "principal_run_interval": {"hours": 1}, - } - BILHETAGEM_TRANSACAO_CAPTURE_PARAMS = { "table_id": "transacao", "partition_date_only": False, @@ -203,9 +198,9 @@ class constants(Enum): # pylint: disable=c0103 data_processamento BETWEEN '{start}' AND '{end}' """, - "run_interval": BILHETAGEM_CAPTURE_RUN_INTERVAL["transacao_run_interval"], }, - "primary_key": ["id"], # id column to nest data on + "primary_key": ["id"], + "interval_minutes": 1, } BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" @@ -224,11 +219,9 @@ class constants(Enum): # pylint: disable=c0103 WHERE DT_INCLUSAO >= '{start}' """, - "run_interval": BILHETAGEM_CAPTURE_RUN_INTERVAL[ - "principal_run_interval" - ], }, "primary_key": ["CD_LINHA"], # id column to nest data on + "interval_minutes": 60, }, { "table_id": "grupo", @@ -243,11 +236,9 @@ class constants(Enum): # pylint: disable=c0103 WHERE DT_INCLUSAO >= '{start}' """, - "run_interval": BILHETAGEM_CAPTURE_RUN_INTERVAL[ - "principal_run_interval" - ], }, "primary_key": ["CD_GRUPO"], # id column to nest data on + "interval_minutes": 60, }, { "table_id": "grupo_linha", @@ -262,11 +253,9 @@ class constants(Enum): # pylint: disable=c0103 WHERE DT_INCLUSAO >= '{start}' """, - "run_interval": BILHETAGEM_CAPTURE_RUN_INTERVAL[ - "principal_run_interval" - ], }, - "primary_key": ["CD_GRUPO", "CD_LINHA"], # id column to nest data on + "primary_key": ["CD_GRUPO", "CD_LINHA"], + "interval_minutes": 60, }, { "table_id": "matriz_integracao", @@ -281,14 +270,12 @@ class constants(Enum): # pylint: disable=c0103 WHERE dt_inclusao >= '{start}' """, - "run_interval": BILHETAGEM_CAPTURE_RUN_INTERVAL[ - "principal_run_interval" - ], }, "primary_key": [ "cd_versao_matriz", "cd_integracao", ], # id column to nest data on + "interval_minutes": 60, }, ] From d115126bcf7ab550c8a7ac271b52c356264b0be6 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 14:33:16 -0300 Subject: [PATCH 098/145] adiciona parametro interval minutes --- pipelines/rj_smtr/flows.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index db72bfd3d..f9c40bd23 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -58,6 +58,7 @@ secret_path = Parameter("secret_path", default=None) source_type = Parameter("source_type", default=None) recapture = Parameter("recapture", default=False) + interval_minutes = Parameter("interval_minutes", default=None) # Parâmetros Pré-tratamento # primary_key = Parameter("primary_key", default=None) @@ -70,8 +71,7 @@ with case(recapture, True): _, recapture_timestamps, previous_errors = query_logs( - dataset_id=dataset_id, - table_id=table_id, + dataset_id=dataset_id, table_id=table_id, interval_minutes=interval_minutes ) with case(recapture, False): @@ -104,6 +104,7 @@ extract_params=unmapped(extract_params), table_id=unmapped(table_id), timestamp=timestamps, + interval_minutes=interval_minutes, ) request_params, request_paths = unpack_mapped_results_nout2( From 97c865ae23b2fbc83ec9d88c274cc698379db037 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 14:33:27 -0300 Subject: [PATCH 099/145] remove linha comentada --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 0a1e29ba9..df47f17eb 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -179,4 +179,3 @@ labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_transacao_tratamento.schedule = every_hour -# bilhetagem_materializacao.schedule = bilhetagem_materializacao_schedule From 0bf3ade6cc89ebc1714421ecb3207524dd969cbb Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 14:34:01 -0300 Subject: [PATCH 100/145] remove arquivo de schedules da bilhetagem --- .../schedules.py | 33 ------------------- 1 file changed, 33 deletions(-) delete mode 100644 pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py deleted file mode 100644 index 6cb4b0724..000000000 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ /dev/null @@ -1,33 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Schedules for br_rj_riodejaneiro_bilhetagem -""" - -from datetime import timedelta - -from prefect.schedules import Schedule - -from pipelines.constants import constants as emd_constants -from pipelines.utils.utils import untuple_clocks as untuple - -from pipelines.rj_smtr.constants import constants -from pipelines.rj_smtr.utils import ( - generate_execute_schedules, -) - -BILHETAGEM_TRANSACAO_INTERVAL = timedelta(minutes=1) -bilhetagem_transacao_clocks = generate_execute_schedules( - clock_interval=timedelta( - **constants.BILHETAGEM_CAPTURE_RUN_INTERVAL.value["transacao_run_interval"] - ), - labels=[ - emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, - ], - table_parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, - dataset_id=constants.BILHETAGEM_DATASET_ID.value, - secret_path=constants.BILHETAGEM_SECRET_PATH.value, - source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], - runs_interval_minutes=0, -) - -bilhetagem_transacao_schedule = Schedule(clocks=untuple(bilhetagem_transacao_clocks)) From 35c80d4532adc2c4ee12b300d4de201ee934bd7a Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 14:34:31 -0300 Subject: [PATCH 
101/145] =?UTF-8?q?generaliza=20fun=C3=A7=C3=A3o=20query?= =?UTF-8?q?=20logs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/tasks.py | 99 +++++------------------------ pipelines/rj_smtr/utils.py | 123 +++++++++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 83 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 3969f28b9..2b733aef0 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -26,7 +26,6 @@ bq_project, get_table_min_max_value, get_last_run_timestamp, - log_critical, data_info_str, dict_contains_keys, get_raw_data_api, @@ -37,6 +36,7 @@ read_raw_data, save_treated_local_func, save_raw_local_func, + query_logs_func, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -370,6 +370,7 @@ def query_logs( table_id: str, datetime_filter=None, max_recaptures: int = 60, + interval_minutes: int = 1, ): """ Queries capture logs to check for errors @@ -380,92 +381,22 @@ def query_logs( datetime_filter (pendulum.datetime.DateTime, optional): filter passed to query. This task will query the logs table for the last 1 day before datetime_filter + max_recaptures (int, optional): maximum number of recaptures to be done + interval_minutes (int, optional): interval in minutes between each recapture Returns: - list: containing timestamps for which the capture failed + lists: errors (bool), + timestamps (list of pendulum.datetime.DateTime), + previous_errors (list of previous errors) """ - if not datetime_filter: - datetime_filter = pendulum.now(constants.TIMEZONE.value).replace( - second=0, microsecond=0 - ) - elif isinstance(datetime_filter, str): - datetime_filter = datetime.fromisoformat(datetime_filter).replace( - second=0, microsecond=0 - ) - - query = f""" - with t as ( - select - datetime(timestamp_array) as timestamp_array - from - unnest(GENERATE_TIMESTAMP_ARRAY( - timestamp_sub('{datetime_filter.strftime('%Y-%m-%d %H:%M:%S')}', interval 1 day), - timestamp('{datetime_filter.strftime('%Y-%m-%d %H:%M:%S')}'), - interval 1 minute) - ) as timestamp_array - where timestamp_array < '{datetime_filter.strftime('%Y-%m-%d %H:%M:%S')}' - ), - logs as ( - select - *, - timestamp_trunc(timestamp_captura, minute) as timestamp_array - from - rj-smtr.{dataset_id}.{table_id}_logs - where - data between - date(datetime_sub('{datetime_filter.strftime('%Y-%m-%d %H:%M:%S')}', - interval 1 day)) - and date('{datetime_filter.strftime('%Y-%m-%d %H:%M:%S')}') - and - timestamp_captura between - datetime_sub('{datetime_filter.strftime('%Y-%m-%d %H:%M:%S')}', interval 1 day) - and '{datetime_filter.strftime('%Y-%m-%d %H:%M:%S')}' - order by timestamp_captura + return query_logs_func( + dataset_id=dataset_id, + table_id=table_id, + datetime_filter=datetime_filter, + max_recaptures=max_recaptures, + interval_minutes=interval_minutes, ) - select - case - when logs.timestamp_captura is not null then logs.timestamp_captura - else t.timestamp_array - end as timestamp_captura, - logs.erro - from - t - left join - logs - on - logs.timestamp_array = t.timestamp_array - where - logs.sucesso is not True - order by - timestamp_captura - """ - log(f"Run query to check logs:\n{query}") - results = bd.read_sql(query=query, billing_project_id=bq_project()) - if len(results) > 0: - results["timestamp_captura"] = ( - pd.to_datetime(results["timestamp_captura"]) - .dt.tz_localize(constants.TIMEZONE.value) - 
.to_list() - ) - log(f"Recapture data for the following {len(results)} timestamps:\n{results}") - if len(results) > max_recaptures: - message = f""" - [SPPO - Recaptures] - Encontradas {len(results)} timestamps para serem recapturadas. - Essa run processará as seguintes: - ##### - {results[:max_recaptures]} - ##### - Sobraram as seguintes para serem recapturadas na próxima run: - ##### - {results[max_recaptures:]} - ##### - """ - log_critical(message) - results = results[:max_recaptures] - return True, results["timestamp_captura"].to_list(), results["erro"].to_list() - return False, [], [] @task @@ -543,6 +474,7 @@ def create_request_params( table_id: str, dataset_id: str, timestamp: datetime, + interval_minutes: int, ) -> tuple[str, str]: """ Task to create request params @@ -552,6 +484,7 @@ def create_request_params( table_id (str): table_id on BigQuery dataset_id (str): dataset_id on BigQuery timestamp (datetime): timestamp for flow run + interval_minutes (int): interval in minutes between each capture Returns: request_params: host, database and query to request data @@ -567,7 +500,7 @@ def create_request_params( request_url = database["host"] datetime_range = get_datetime_range( - timestamp=timestamp, interval=timedelta(**extract_params["run_interval"]) + timestamp=timestamp, interval=timedelta(minutes=interval_minutes) ) request_params = { diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 16ed538d3..41dc1dd02 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -8,6 +8,7 @@ from pathlib import Path from datetime import timedelta, datetime, date +import pendulum from typing import List, Union, Any import traceback import io @@ -449,6 +450,128 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l return clocks +def query_logs_func( + dataset_id: str, + table_id: str, + datetime_filter=None, + max_recaptures: int = 60, + interval_minutes: int = 1, +): + """ + Queries capture logs to check for errors + + Args: + dataset_id (str): dataset_id on BigQuery + table_id (str): table_id on BigQuery + datetime_filter (pendulum.datetime.DateTime, optional): + filter passed to query. 
This task will query the logs table + for the last 1 day before datetime_filter + max_recaptures (int, optional): maximum number of recaptures to be done + interval_minutes (int, optional): interval in minutes between each recapture + + Returns: + lists: errors (bool), + timestamps (list of pendulum.datetime.DateTime), + previous_errors (list of previous errors) + """ + + if not datetime_filter: + datetime_filter = pendulum.now(constants.TIMEZONE.value).replace( + second=0, microsecond=0 + ) + elif isinstance(datetime_filter, str): + datetime_filter = datetime.fromisoformat(datetime_filter).replace( + second=0, microsecond=0 + ) + + datetime_filter = datetime_filter.strftime("%Y-%m-%d %H:%M:%S") + + query = f""" + WITH + t AS ( + SELECT + DATETIME(timestamp_array) AS timestamp_array + FROM + UNNEST( + GENERATE_TIMESTAMP_ARRAY( + TIMESTAMP_SUB('{datetime_filter}', INTERVAL 1 day), + TIMESTAMP('{datetime_filter}'), + INTERVAL {interval_minutes} minute) ) + AS timestamp_array + WHERE + timestamp_array < '{datetime_filter}' ), + logs_table AS ( + SELECT + SAFE_CAST(DATETIME(TIMESTAMP(timestamp_captura), + "America/Sao_Paulo") AS DATETIME) timestamp_captura, + SAFE_CAST(sucesso AS BOOLEAN) sucesso, + SAFE_CAST(erro AS STRING) erro, + SAFE_CAST(DATA AS DATE) DATA + FROM + rj-smtr-staging.{dataset_id}_staging.{table_id}_logs AS t + ), + logs AS ( + SELECT + *, + TIMESTAMP_TRUNC(timestamp_captura, minute) AS timestamp_array + FROM + logs_table + WHERE + DATA BETWEEN DATE(DATETIME_SUB('{datetime_filter}', + INTERVAL 1 day)) + AND DATE('{datetime_filter}') + AND timestamp_captura BETWEEN + DATETIME_SUB('{datetime_filter}', INTERVAL 1 day) + AND '{datetime_filter}' + ORDER BY + timestamp_captura ) + SELECT + CASE + WHEN logs.timestamp_captura IS NOT NULL THEN logs.timestamp_captura + ELSE + t.timestamp_array + END + AS timestamp_captura, + logs.erro + FROM + t + LEFT JOIN + logs + ON + logs.timestamp_array = t.timestamp_array + WHERE + logs.sucesso IS NOT TRUE + ORDER BY + timestamp_captura + """ + log(f"Run query to check logs:\n{query}") + results = bd.read_sql(query=query, billing_project_id=bq_project()) + if len(results) > 0: + results["timestamp_captura"] = ( + pd.to_datetime(results["timestamp_captura"]) + .dt.tz_localize(constants.TIMEZONE.value) + .to_list() + ) + log(f"Recapture data for the following {len(results)} timestamps:\n{results}") + if len(results) > max_recaptures: + message = f""" + [SPPO - Recaptures] + Encontradas {len(results)} timestamps para serem recapturadas. 
+ Essa run processará as seguintes: + ##### + {results[:max_recaptures]} + ##### + Sobraram as seguintes para serem recapturadas na próxima run: + ##### + {results[max_recaptures:]} + ##### + """ + log_critical(message) + results = results[:max_recaptures] + return True, results["timestamp_captura"].to_list(), results["erro"].to_list() + return False, [], [] + + def dict_contains_keys(input_dict: dict, keys: list[str]) -> bool: """ Test if the input dict has all keys present in the list From a59e353ff63699695542ce63e72871a9282412b6 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 14:45:44 -0300 Subject: [PATCH 102/145] ajuste remove schedule personalizado --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index df47f17eb..4c54424ba 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -33,10 +33,6 @@ get_current_timestamp, ) -from pipelines.rj_smtr.br_rj_riodejaneiro_bilhetagem.schedules import ( - bilhetagem_transacao_schedule, -) - from pipelines.rj_smtr.constants import constants from pipelines.rj_smtr.schedules import every_hour, every_minute From 2616565cae881262abe6eef88444f32ca2f07a81 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 15:05:06 -0300 Subject: [PATCH 103/145] unmap interval_minutes --- pipelines/rj_smtr/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index f9c40bd23..b951a18a4 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -104,7 +104,7 @@ extract_params=unmapped(extract_params), table_id=unmapped(table_id), timestamp=timestamps, - interval_minutes=interval_minutes, + interval_minutes=unmapped(interval_minutes), ) request_params, request_paths = unpack_mapped_results_nout2( From 0696626dc6057901b2344b145137cc94bc5d3e5d Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 17:50:43 -0300 Subject: [PATCH 104/145] =?UTF-8?q?altera=C3=A7=C3=A3o=20de=20pasta=20de?= =?UTF-8?q?=20grava=C3=A7=C3=A3o=20para=20teste?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/tasks.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 2b733aef0..9e80873fa 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -309,7 +309,7 @@ def create_local_partition_path( either to save raw or staging files. 
""" data_folder = os.getenv("DATA_FOLDER", "data") - file_path = f"{os.getcwd()}/{data_folder}/{{mode}}/{dataset_id}/{table_id}" + file_path = f"{os.getcwd()}/{data_folder}/{{mode}}/{dataset_id}_dev/{table_id}" file_path += f"/{partitions}/{filename}.{{filetype}}" log(f"Creating file path: {file_path}") return file_path @@ -780,7 +780,7 @@ def upload_raw_data_to_gcs( st_obj = Storage(table_id=table_id, dataset_id=dataset_id) log( f"""Uploading raw file to bucket {st_obj.bucket_name} at - {st_obj.bucket_name}/{dataset_id}/{table_id}""" + {st_obj.bucket_name}/{dataset_id}_dev/{table_id}""" ) st_obj.upload( path=raw_filepath, @@ -824,7 +824,7 @@ def upload_staging_data_to_gcs( try: # Creates and publish table if it does not exist, append to it otherwise create_or_append_table( - dataset_id=dataset_id, + dataset_id=f"{dataset_id}_dev", table_id=table_id, path=staging_filepath, partitions=partitions, @@ -836,7 +836,7 @@ def upload_staging_data_to_gcs( log(f"previous_error = {previous_error}") upload_run_logs_to_bq( - dataset_id=dataset_id, + dataset_id=f"{dataset_id}_dev", parent_table_id=table_id, error=error, timestamp=timestamp, From ee0c4408abbc5c3bc81f3fdda95605a51bad2b43 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 08:04:51 -0300 Subject: [PATCH 105/145] teste retirar timezone --- pipelines/rj_smtr/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 41dc1dd02..2ea0a8b57 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -596,10 +596,10 @@ def custom_serialization(obj: Any) -> Any: Any: Serialized object """ if isinstance(obj, (datetime, date, pd.Timestamp)): - if obj.tzinfo is None: - obj = obj.tz_localize(emd_constants.DEFAULT_TIMEZONE.value) - else: - obj = obj.tz_convert(emd_constants.DEFAULT_TIMEZONE.value) + # if obj.tzinfo is None: + # obj = obj.tz_localize(emd_constants.DEFAULT_TIMEZONE.value) + # else: + # obj = obj.tz_convert(emd_constants.DEFAULT_TIMEZONE.value) return obj.isoformat() From a8bb7f1e864fee561e9fb9e8b35d350abb8d1358 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 08:28:27 -0300 Subject: [PATCH 106/145] mudar timezone --- pipelines/rj_smtr/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 2ea0a8b57..f33e93759 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -595,7 +595,11 @@ def custom_serialization(obj: Any) -> Any: Returns: Any: Serialized object """ - if isinstance(obj, (datetime, date, pd.Timestamp)): + if isinstance(obj, pd.Timestamp): + if obj.tzinfo is None: + obj = obj.tz_localize("UTC").tz_convert( + emd_constants.DEFAULT_TIMEZONE.value + ) # if obj.tzinfo is None: # obj = obj.tz_localize(emd_constants.DEFAULT_TIMEZONE.value) # else: From d956a5317b6d75b4f8e0fe12101f59759fde73c5 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 09:15:58 -0300 Subject: [PATCH 107/145] corrigir logica de recaptura --- pipelines/rj_smtr/flows.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index b951a18a4..a63d8681d 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -76,7 +76,9 @@ with case(recapture, False): capture_timestamp = [get_current_timestamp()] - previous_errors = [None] + previous_errors = task( + lambda: [None], checkpoint=False, name="assign_none_to_previous_errors" + )() timestamps = 
merge(recapture_timestamps, capture_timestamp) From 2261952e40f7728025fd913b55588a946cf5c5cd Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 09:31:39 -0300 Subject: [PATCH 108/145] adicionar possibilidade de recapturar mais dias --- pipelines/rj_smtr/flows.py | 8 ++++++-- pipelines/rj_smtr/utils.py | 10 ++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index a63d8681d..b9ad0252f 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -57,8 +57,9 @@ extract_params = Parameter("extract_params", default=None) secret_path = Parameter("secret_path", default=None) source_type = Parameter("source_type", default=None) - recapture = Parameter("recapture", default=False) interval_minutes = Parameter("interval_minutes", default=None) + recapture = Parameter("recapture", default=False) + recapture_window_days = Parameter("recapture_window_days", default=None) # Parâmetros Pré-tratamento # primary_key = Parameter("primary_key", default=None) @@ -71,7 +72,10 @@ with case(recapture, True): _, recapture_timestamps, previous_errors = query_logs( - dataset_id=dataset_id, table_id=table_id, interval_minutes=interval_minutes + dataset_id=dataset_id, + table_id=table_id, + interval_minutes=interval_minutes, + recapture_window_days=recapture_window_days, ) with case(recapture, False): diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index f33e93759..45b7de7d8 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -456,6 +456,7 @@ def query_logs_func( datetime_filter=None, max_recaptures: int = 60, interval_minutes: int = 1, + recapture_window_days: int = 1, ): """ Queries capture logs to check for errors @@ -465,9 +466,10 @@ def query_logs_func( table_id (str): table_id on BigQuery datetime_filter (pendulum.datetime.DateTime, optional): filter passed to query. 
This task will query the logs table - for the last 1 day before datetime_filter + for the last n (n = recapture_window_days) days before datetime_filter max_recaptures (int, optional): maximum number of recaptures to be done interval_minutes (int, optional): interval in minutes between each recapture + recapture_window_days (int, optional): Number of days to query for erros Returns: lists: errors (bool), @@ -494,7 +496,7 @@ def query_logs_func( FROM UNNEST( GENERATE_TIMESTAMP_ARRAY( - TIMESTAMP_SUB('{datetime_filter}', INTERVAL 1 day), + TIMESTAMP_SUB('{datetime_filter}', INTERVAL {recapture_window_days} day), TIMESTAMP('{datetime_filter}'), INTERVAL {interval_minutes} minute) ) AS timestamp_array @@ -518,10 +520,10 @@ def query_logs_func( logs_table WHERE DATA BETWEEN DATE(DATETIME_SUB('{datetime_filter}', - INTERVAL 1 day)) + INTERVAL {recapture_window_days} day)) AND DATE('{datetime_filter}') AND timestamp_captura BETWEEN - DATETIME_SUB('{datetime_filter}', INTERVAL 1 day) + DATETIME_SUB('{datetime_filter}', INTERVAL {recapture_window_days} day) AND '{datetime_filter}' ORDER BY timestamp_captura ) From b8ac6b8465018add18986ce411d6d4241b774312 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 09:33:17 -0300 Subject: [PATCH 109/145] ajustar recapture_window_days default --- pipelines/rj_smtr/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index b9ad0252f..5017b226e 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -59,7 +59,7 @@ source_type = Parameter("source_type", default=None) interval_minutes = Parameter("interval_minutes", default=None) recapture = Parameter("recapture", default=False) - recapture_window_days = Parameter("recapture_window_days", default=None) + recapture_window_days = Parameter("recapture_window_days", default=1) # Parâmetros Pré-tratamento # primary_key = Parameter("primary_key", default=None) From 7f0c3098bd3a7f6e516c1bd2a1e72b6e19c3d3b5 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 10:18:34 -0300 Subject: [PATCH 110/145] adicionae recapture_window na task query_logs --- pipelines/rj_smtr/tasks.py | 5 ++++- pipelines/rj_smtr/utils.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 9e80873fa..40129e4fa 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -371,6 +371,7 @@ def query_logs( datetime_filter=None, max_recaptures: int = 60, interval_minutes: int = 1, + recapture_window_days: int = 1, ): """ Queries capture logs to check for errors @@ -380,9 +381,10 @@ def query_logs( table_id (str): table_id on BigQuery datetime_filter (pendulum.datetime.DateTime, optional): filter passed to query. 
This task will query the logs table - for the last 1 day before datetime_filter + for the last n (n = recapture_window_days) days before datetime_filter max_recaptures (int, optional): maximum number of recaptures to be done interval_minutes (int, optional): interval in minutes between each recapture + recapture_window_days (int, optional): Number of days to query for erros Returns: lists: errors (bool), @@ -396,6 +398,7 @@ def query_logs( datetime_filter=datetime_filter, max_recaptures=max_recaptures, interval_minutes=interval_minutes, + recapture_window_days=recapture_window_days, ) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 45b7de7d8..4d7ac329e 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -7,13 +7,13 @@ from ftplib import FTP from pathlib import Path -from datetime import timedelta, datetime, date -import pendulum +from datetime import timedelta, datetime from typing import List, Union, Any import traceback import io import json import zipfile +import pendulum import pytz import requests import basedosdados as bd From ae1774657822387e9a897d9cfdd56d1e3976bd97 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 10:54:14 -0300 Subject: [PATCH 111/145] merge previous_errors --- pipelines/rj_smtr/flows.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 5017b226e..63acc1497 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -71,7 +71,7 @@ ) with case(recapture, True): - _, recapture_timestamps, previous_errors = query_logs( + _, recapture_timestamps, recapture_previous_errors = query_logs( dataset_id=dataset_id, table_id=table_id, interval_minutes=interval_minutes, @@ -80,11 +80,12 @@ with case(recapture, False): capture_timestamp = [get_current_timestamp()] - previous_errors = task( + capture_previous_errors = task( lambda: [None], checkpoint=False, name="assign_none_to_previous_errors" )() timestamps = merge(recapture_timestamps, capture_timestamp) + previous_errors = merge(recapture_previous_errors, capture_previous_errors) rename_flow_run = rename_current_flow_run_now_time( prefix="SMTR: " + get_run_name_prefix(recap=recapture) + " " + table_id + ": ", From b172a63ff9638352dbc8c01765d3591b933855fd Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 11:24:44 -0300 Subject: [PATCH 112/145] remover log de teste --- pipelines/rj_smtr/tasks.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 40129e4fa..5f5daf1a1 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -836,8 +836,6 @@ def upload_staging_data_to_gcs( error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - log(f"previous_error = {previous_error}") - upload_run_logs_to_bq( dataset_id=f"{dataset_id}_dev", parent_table_id=table_id, From 0bfe9cf6ad9f6c67403d3a528434c10c107d52b2 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 11:24:57 -0300 Subject: [PATCH 113/145] ajustar log recaptura --- pipelines/rj_smtr/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 4d7ac329e..386053e66 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -874,7 +874,7 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 "erro": [f"[recapturado]{previous_error}"], } ) - log(f"Recapturing {timestamp} with previous 
error:\n{error}") + log(f"Recapturing {timestamp} with previous error:\n{previous_error}") else: # not recapturing or error during flow execution dataframe = pd.DataFrame( From 7ca3764070eeb8af8be68fbcefe523ebc4d8408c Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 13:19:34 -0300 Subject: [PATCH 114/145] adicionar recaptura auxiliar --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 4c54424ba..7799c3eac 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -9,6 +9,7 @@ from prefect.storage import GCS from prefect.tasks.prefect import create_flow_run, wait_for_flow_run from prefect.utilities.edges import unmapped +from prefect import Parameter # EMD Imports # @@ -109,11 +110,14 @@ ) -# TRATAMENTO - RODA DE HORA EM HORA, CAPTURA AUXILIAR + MATERIALIZAÇÃO +# TRATAMENTO - RODA DE HORA EM HORA, RECAPTURAS + CAPTURA AUXILIAR + MATERIALIZAÇÃO with Flow( "SMTR: Bilhetagem Transação - Tratamento", code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as bilhetagem_transacao_tratamento: + # Configuração # + recapture_window_days = Parameter("recapture_window_days", default=1) + timestamp = get_current_timestamp() rename_flow_run = rename_current_flow_run_now_time( @@ -123,28 +127,45 @@ LABELS = get_current_flow_labels() - # Recaptura Transações + # Recapturas - run_recaptura = create_flow_run( + run_recaptura_trasacao = create_flow_run( flow_name=bilhetagem_transacao_recaptura.name, project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, labels=LABELS, + parameters={"recapture_window_days": recapture_window_days}, ) - wait_recaptura = wait_for_flow_run( - run_recaptura, + wait_recaptura_trasacao = wait_for_flow_run( + run_recaptura_trasacao, stream_states=True, stream_logs=True, raise_final_state=True, ) + runs_recaptura_auxiliar = create_flow_run.map( + flow_name=unmapped(bilhetagem_auxiliar_captura.name), + project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), + parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value + | {"recapture": True, "recapture_window_days": recapture_window_days}, + labels=unmapped(LABELS), + upstream_tasks=[wait_recaptura_trasacao], + ) + + wait_recaptura_auxiliar = wait_for_flow_run.map( + runs_recaptura_auxiliar, + stream_states=unmapped(True), + stream_logs=unmapped(True), + raise_final_state=unmapped(True), + ) + # Captura runs_captura = create_flow_run.map( flow_name=unmapped(bilhetagem_auxiliar_captura.name), project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), - upstream_tasks=[wait_recaptura], + upstream_tasks=[wait_recaptura_auxiliar], ) wait_captura = wait_for_flow_run.map( From c5f369f9375c7c74ac1595645f91c9d703801c22 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 13:46:56 -0300 Subject: [PATCH 115/145] criar parametros recaptura tabelas auxiliares --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 48 +++++++++++-------- pipelines/rj_smtr/tasks.py | 15 ++++++ 2 files changed, 44 insertions(+), 19 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 7799c3eac..2545ee04e 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ 
b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -30,9 +30,7 @@ default_materialization_flow, ) -from pipelines.rj_smtr.tasks import ( - get_current_timestamp, -) +from pipelines.rj_smtr.tasks import get_current_timestamp, merge_dict_with_dict_list from pipelines.rj_smtr.constants import constants @@ -65,16 +63,6 @@ bilhetagem_transacao_captura.schedule = every_minute -bilhetagem_transacao_recaptura = deepcopy(default_capture_flow) -bilhetagem_transacao_recaptura.name = "SMTR: Bilhetagem Transação - Recaptura (subflow)" -bilhetagem_transacao_recaptura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) -bilhetagem_transacao_recaptura = set_default_parameters( - flow=bilhetagem_transacao_recaptura, - default_parameters=GENERAL_CAPTURE_DEFAULT_PARAMS - | constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value - | {"recapture": True}, -) - # BILHETAGEM AUXILIAR - SUBFLOW PARA RODAR ANTES DE CADA MATERIALIZAÇÃO # @@ -109,6 +97,17 @@ default_parameters=bilhetagem_materializacao_parameters, ) +# RECAPTURA + +bilhetagem_transacao_recaptura = deepcopy(default_capture_flow) +bilhetagem_transacao_recaptura.name = "SMTR: Bilhetagem Transação - Recaptura (subflow)" +bilhetagem_transacao_recaptura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) +bilhetagem_transacao_recaptura = set_default_parameters( + flow=bilhetagem_transacao_recaptura, + default_parameters=GENERAL_CAPTURE_DEFAULT_PARAMS + | constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value + | {"recapture": True}, +) # TRATAMENTO - RODA DE HORA EM HORA, RECAPTURAS + CAPTURA AUXILIAR + MATERIALIZAÇÃO with Flow( @@ -131,7 +130,8 @@ run_recaptura_trasacao = create_flow_run( flow_name=bilhetagem_transacao_recaptura.name, - project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + project_name="staging", labels=LABELS, parameters={"recapture_window_days": recapture_window_days}, ) @@ -143,11 +143,19 @@ raise_final_state=True, ) + recaptura_auxiliar_params = merge_dict_with_dict_list( + dict_list=constants.BILHETAGEM_CAPTURE_PARAMS.value, + dict_to_merge={ + "recapture": True, + "recapture_window_days": recapture_window_days, + }, + ) + runs_recaptura_auxiliar = create_flow_run.map( flow_name=unmapped(bilhetagem_auxiliar_captura.name), - project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), - parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value - | {"recapture": True, "recapture_window_days": recapture_window_days}, + # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), + project_name=unmapped("staging"), + parameters=recaptura_auxiliar_params, labels=unmapped(LABELS), upstream_tasks=[wait_recaptura_trasacao], ) @@ -162,7 +170,8 @@ # Captura runs_captura = create_flow_run.map( flow_name=unmapped(bilhetagem_auxiliar_captura.name), - project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), + # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), + project_name=unmapped("staging"), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), upstream_tasks=[wait_recaptura_auxiliar], @@ -178,7 +187,8 @@ # Materialização run_materializacao = create_flow_run( flow_name=bilhetagem_materializacao.name, - project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + project_name="staging", labels=LABELS, upstream_tasks=[wait_captura], ) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 5f5daf1a1..fa930a893 100644 --- 
a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1178,3 +1178,18 @@ def unpack_mapped_results_nout2( """ return [r[0] for r in mapped_results], [r[1] for r in mapped_results] + + +@task(checkpoint=False) +def merge_dict_with_dict_list(dict_list: list[dict], dict_to_merge: dict) -> list[dict]: + """ + Task to merge a dict with every dict inside a list + + Args: + dict_list (list[dict]): A list of dictionaries to update + dict_to_merge (dict): The dict that will be merged in every dict inside the list + + Returns: + list[dict]: The updated list + """ + return [inside_dict | dict_to_merge for inside_dict in dict_list] From e75e7a64b9e21bba495cf208cce33ff6fbc73e67 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 14:06:38 -0300 Subject: [PATCH 116/145] comentar materializacao --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 2545ee04e..1638817ef 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -185,20 +185,20 @@ ) # Materialização - run_materializacao = create_flow_run( - flow_name=bilhetagem_materializacao.name, - # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, - project_name="staging", - labels=LABELS, - upstream_tasks=[wait_captura], - ) - - wait_materializacao = wait_for_flow_run( - run_materializacao, - stream_states=True, - stream_logs=True, - raise_final_state=True, - ) + # run_materializacao = create_flow_run( + # flow_name=bilhetagem_materializacao.name, + # # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + # project_name="staging", + # labels=LABELS, + # upstream_tasks=[wait_captura], + # ) + + # wait_materializacao = wait_for_flow_run( + # run_materializacao, + # stream_states=True, + # stream_logs=True, + # raise_final_state=True, + # ) bilhetagem_transacao_tratamento.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_tratamento.run_config = KubernetesRun( From 6b6d0cbce8b341e6c4d93ef368caf1b26b2f85c4 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 15:19:21 -0300 Subject: [PATCH 117/145] teste log --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 1638817ef..9b28676da 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -11,6 +11,9 @@ from prefect.utilities.edges import unmapped from prefect import Parameter +from prefect import task +from pipelines.utils.utils import log + # EMD Imports # from pipelines.constants import constants as emd_constants @@ -151,6 +154,8 @@ }, ) + task(lambda x: log(x))(x=recaptura_auxiliar_params) + runs_recaptura_auxiliar = create_flow_run.map( flow_name=unmapped(bilhetagem_auxiliar_captura.name), # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), From ec23cf62abbd1523b8a309b57ca144c734f304a8 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 15:56:47 -0300 Subject: [PATCH 118/145] muda logica recaptura bilhetagem --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 38 ++++++------------- pipelines/rj_smtr/tasks.py | 15 -------- 2 files changed, 11 insertions(+), 42 deletions(-) diff --git 
a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 9b28676da..6e2c976c1 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -9,10 +9,7 @@ from prefect.storage import GCS from prefect.tasks.prefect import create_flow_run, wait_for_flow_run from prefect.utilities.edges import unmapped -from prefect import Parameter -from prefect import task -from pipelines.utils.utils import log # EMD Imports # @@ -33,7 +30,7 @@ default_materialization_flow, ) -from pipelines.rj_smtr.tasks import get_current_timestamp, merge_dict_with_dict_list +from pipelines.rj_smtr.tasks import get_current_timestamp from pipelines.rj_smtr.constants import constants @@ -102,14 +99,12 @@ # RECAPTURA -bilhetagem_transacao_recaptura = deepcopy(default_capture_flow) -bilhetagem_transacao_recaptura.name = "SMTR: Bilhetagem Transação - Recaptura (subflow)" -bilhetagem_transacao_recaptura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) -bilhetagem_transacao_recaptura = set_default_parameters( - flow=bilhetagem_transacao_recaptura, - default_parameters=GENERAL_CAPTURE_DEFAULT_PARAMS - | constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value - | {"recapture": True}, +bilhetagem_recaptura = deepcopy(default_capture_flow) +bilhetagem_recaptura.name = "SMTR: Bilhetagem - Recaptura (subflow)" +bilhetagem_recaptura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) +bilhetagem_recaptura = set_default_parameters( + flow=bilhetagem_recaptura, + default_parameters=GENERAL_CAPTURE_DEFAULT_PARAMS | {"recapture": True}, ) # TRATAMENTO - RODA DE HORA EM HORA, RECAPTURAS + CAPTURA AUXILIAR + MATERIALIZAÇÃO @@ -118,7 +113,6 @@ code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as bilhetagem_transacao_tratamento: # Configuração # - recapture_window_days = Parameter("recapture_window_days", default=1) timestamp = get_current_timestamp() @@ -132,11 +126,11 @@ # Recapturas run_recaptura_trasacao = create_flow_run( - flow_name=bilhetagem_transacao_recaptura.name, + flow_name=bilhetagem_recaptura.name, # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, project_name="staging", labels=LABELS, - parameters={"recapture_window_days": recapture_window_days}, + parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, ) wait_recaptura_trasacao = wait_for_flow_run( @@ -146,21 +140,11 @@ raise_final_state=True, ) - recaptura_auxiliar_params = merge_dict_with_dict_list( - dict_list=constants.BILHETAGEM_CAPTURE_PARAMS.value, - dict_to_merge={ - "recapture": True, - "recapture_window_days": recapture_window_days, - }, - ) - - task(lambda x: log(x))(x=recaptura_auxiliar_params) - runs_recaptura_auxiliar = create_flow_run.map( - flow_name=unmapped(bilhetagem_auxiliar_captura.name), + flow_name=unmapped(bilhetagem_recaptura.name), # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), project_name=unmapped("staging"), - parameters=recaptura_auxiliar_params, + parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), upstream_tasks=[wait_recaptura_trasacao], ) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index fa930a893..5f5daf1a1 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1178,18 +1178,3 @@ def unpack_mapped_results_nout2( """ return [r[0] for r in mapped_results], [r[1] for r in mapped_results] - - -@task(checkpoint=False) -def merge_dict_with_dict_list(dict_list: list[dict], dict_to_merge: dict) 
-> list[dict]: - """ - Task to merge a dict with every dict inside a list - - Args: - dict_list (list[dict]): A list of dictionaries to update - dict_to_merge (dict): The dict that will be merged in every dict inside the list - - Returns: - list[dict]: The updated list - """ - return [inside_dict | dict_to_merge for inside_dict in dict_list] From 1644b7204f0e4bd77a18248e18010d7989d7d64a Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 16:31:44 -0300 Subject: [PATCH 119/145] unmapped upstream tasks --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 6e2c976c1..787ff13e2 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -146,7 +146,7 @@ project_name=unmapped("staging"), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), - upstream_tasks=[wait_recaptura_trasacao], + upstream_tasks=unmapped([wait_recaptura_trasacao]), ) wait_recaptura_auxiliar = wait_for_flow_run.map( @@ -163,7 +163,7 @@ project_name=unmapped("staging"), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), - upstream_tasks=[wait_recaptura_auxiliar], + upstream_tasks=unmapped([wait_recaptura_auxiliar]), ) wait_captura = wait_for_flow_run.map( From a33a4b8f09007aabeebc0bd8c07731762f2dcd16 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 16:42:18 -0300 Subject: [PATCH 120/145] mudar forma de upstream --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 787ff13e2..02b89a155 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -146,9 +146,10 @@ project_name=unmapped("staging"), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), - upstream_tasks=unmapped([wait_recaptura_trasacao]), ) + runs_recaptura_auxiliar.set_upstream(wait_recaptura_trasacao) + wait_recaptura_auxiliar = wait_for_flow_run.map( runs_recaptura_auxiliar, stream_states=unmapped(True), @@ -163,9 +164,10 @@ project_name=unmapped("staging"), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), - upstream_tasks=unmapped([wait_recaptura_auxiliar]), ) + runs_captura.set_upstream(wait_recaptura_auxiliar) + wait_captura = wait_for_flow_run.map( runs_captura, stream_states=unmapped(True), From e47ff2d303e34e5618e50cc44eddc111c2724a3e Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 07:37:12 -0300 Subject: [PATCH 121/145] =?UTF-8?q?remover=20altera=C3=A7=C3=B5es=20de=20t?= =?UTF-8?q?este?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 28 +++++++++---------- pipelines/rj_smtr/tasks.py | 8 +++--- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 02b89a155..6cb5b47ae 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -176,20 +176,20 @@ ) # Materialização - # 
run_materializacao = create_flow_run( - # flow_name=bilhetagem_materializacao.name, - # # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, - # project_name="staging", - # labels=LABELS, - # upstream_tasks=[wait_captura], - # ) - - # wait_materializacao = wait_for_flow_run( - # run_materializacao, - # stream_states=True, - # stream_logs=True, - # raise_final_state=True, - # ) + run_materializacao = create_flow_run( + flow_name=bilhetagem_materializacao.name, + # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + project_name="staging", + labels=LABELS, + upstream_tasks=[wait_captura], + ) + + wait_materializacao = wait_for_flow_run( + run_materializacao, + stream_states=True, + stream_logs=True, + raise_final_state=True, + ) bilhetagem_transacao_tratamento.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_tratamento.run_config = KubernetesRun( diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 5f5daf1a1..671d6171a 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -309,7 +309,7 @@ def create_local_partition_path( either to save raw or staging files. """ data_folder = os.getenv("DATA_FOLDER", "data") - file_path = f"{os.getcwd()}/{data_folder}/{{mode}}/{dataset_id}_dev/{table_id}" + file_path = f"{os.getcwd()}/{data_folder}/{{mode}}/{dataset_id}/{table_id}" file_path += f"/{partitions}/{filename}.{{filetype}}" log(f"Creating file path: {file_path}") return file_path @@ -783,7 +783,7 @@ def upload_raw_data_to_gcs( st_obj = Storage(table_id=table_id, dataset_id=dataset_id) log( f"""Uploading raw file to bucket {st_obj.bucket_name} at - {st_obj.bucket_name}/{dataset_id}_dev/{table_id}""" + {st_obj.bucket_name}/{dataset_id}/{table_id}""" ) st_obj.upload( path=raw_filepath, @@ -827,7 +827,7 @@ def upload_staging_data_to_gcs( try: # Creates and publish table if it does not exist, append to it otherwise create_or_append_table( - dataset_id=f"{dataset_id}_dev", + dataset_id=dataset_id, table_id=table_id, path=staging_filepath, partitions=partitions, @@ -837,7 +837,7 @@ def upload_staging_data_to_gcs( log(f"[CATCHED] Task failed with error: \n{error}", level="error") upload_run_logs_to_bq( - dataset_id=f"{dataset_id}_dev", + dataset_id=dataset_id, parent_table_id=table_id, error=error, timestamp=timestamp, From ba730a4d4c453e9cf585282df4372c078c55bb55 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 07:38:48 -0300 Subject: [PATCH 122/145] mudar agent para prd --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 8 ++++---- pipelines/rj_smtr/flows.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 6cb5b47ae..66e834170 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -52,7 +52,7 @@ bilhetagem_transacao_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_transacao_captura = set_default_parameters( @@ -71,7 +71,7 @@ bilhetagem_auxiliar_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_auxiliar_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + 
labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_auxiliar_captura = set_default_parameters( @@ -85,7 +85,7 @@ bilhetagem_materializacao.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_materializacao.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_materializacao_parameters = { @@ -194,6 +194,6 @@ bilhetagem_transacao_tratamento.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_tratamento.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_transacao_tratamento.schedule = every_hour diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 63acc1497..18a7fb1a3 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -168,7 +168,7 @@ default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_capture_flow.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) with Flow( @@ -306,5 +306,5 @@ default_materialization_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_materialization_flow.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) From 4421043e7915528f7760266e57749c24b76e7c73 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 13:02:59 -0300 Subject: [PATCH 123/145] corrigir project_name --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 66e834170..5f4a82f75 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -142,8 +142,7 @@ runs_recaptura_auxiliar = create_flow_run.map( flow_name=unmapped(bilhetagem_recaptura.name), - # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), - project_name=unmapped("staging"), + project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), ) @@ -160,8 +159,7 @@ # Captura runs_captura = create_flow_run.map( flow_name=unmapped(bilhetagem_auxiliar_captura.name), - # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), - project_name=unmapped("staging"), + project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), ) From f1a3bbd0dec741f314d1e1fb7ec636eeb1753715 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 13:34:42 -0300 Subject: [PATCH 124/145] passar tirar query_logs_func --- pipelines/rj_smtr/tasks.py | 107 +++++++++++++++++++++++++++++++++---- 1 file changed, 97 insertions(+), 10 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 671d6171a..e329069af 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -36,7 +36,7 @@ read_raw_data, save_treated_local_func, save_raw_local_func, - query_logs_func, + log_critical, ) from 
pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -159,7 +159,7 @@ def create_dbt_run_vars( mode (str): the mode to get the date_range variable Returns: - tuple[list[dict]: the variables to be used in DBT + list[dict]: the variables to be used in DBT Union[list[dict], dict, None]: the date variable (date_range or run_date) bool: a flag that indicates if the date_range variable came from Redis """ @@ -392,14 +392,101 @@ def query_logs( previous_errors (list of previous errors) """ - return query_logs_func( - dataset_id=dataset_id, - table_id=table_id, - datetime_filter=datetime_filter, - max_recaptures=max_recaptures, - interval_minutes=interval_minutes, - recapture_window_days=recapture_window_days, - ) + if not datetime_filter: + datetime_filter = pendulum.now(constants.TIMEZONE.value).replace( + second=0, microsecond=0 + ) + elif isinstance(datetime_filter, str): + datetime_filter = datetime.fromisoformat(datetime_filter).replace( + second=0, microsecond=0 + ) + + datetime_filter = datetime_filter.strftime("%Y-%m-%d %H:%M:%S") + + query = f""" + WITH + t AS ( + SELECT + DATETIME(timestamp_array) AS timestamp_array + FROM + UNNEST( + GENERATE_TIMESTAMP_ARRAY( + TIMESTAMP_SUB('{datetime_filter}', INTERVAL {recapture_window_days} day), + TIMESTAMP('{datetime_filter}'), + INTERVAL {interval_minutes} minute) ) + AS timestamp_array + WHERE + timestamp_array < '{datetime_filter}' ), + logs_table AS ( + SELECT + SAFE_CAST(DATETIME(TIMESTAMP(timestamp_captura), + "America/Sao_Paulo") AS DATETIME) timestamp_captura, + SAFE_CAST(sucesso AS BOOLEAN) sucesso, + SAFE_CAST(erro AS STRING) erro, + SAFE_CAST(DATA AS DATE) DATA + FROM + rj-smtr-staging.{dataset_id}_staging.{table_id}_logs AS t + ), + logs AS ( + SELECT + *, + TIMESTAMP_TRUNC(timestamp_captura, minute) AS timestamp_array + FROM + logs_table + WHERE + DATA BETWEEN DATE(DATETIME_SUB('{datetime_filter}', + INTERVAL {recapture_window_days} day)) + AND DATE('{datetime_filter}') + AND timestamp_captura BETWEEN + DATETIME_SUB('{datetime_filter}', INTERVAL {recapture_window_days} day) + AND '{datetime_filter}' + ORDER BY + timestamp_captura ) + SELECT + CASE + WHEN logs.timestamp_captura IS NOT NULL THEN logs.timestamp_captura + ELSE + t.timestamp_array + END + AS timestamp_captura, + logs.erro + FROM + t + LEFT JOIN + logs + ON + logs.timestamp_array = t.timestamp_array + WHERE + logs.sucesso IS NOT TRUE + ORDER BY + timestamp_captura + """ + log(f"Run query to check logs:\n{query}") + results = bd.read_sql(query=query, billing_project_id=bq_project()) + if len(results) > 0: + results["timestamp_captura"] = ( + pd.to_datetime(results["timestamp_captura"]) + .dt.tz_localize(constants.TIMEZONE.value) + .to_list() + ) + log(f"Recapture data for the following {len(results)} timestamps:\n{results}") + if len(results) > max_recaptures: + message = f""" + [SPPO - Recaptures] + Encontradas {len(results)} timestamps para serem recapturadas. 
+ Essa run processará as seguintes: + ##### + {results[:max_recaptures]} + ##### + Sobraram as seguintes para serem recapturadas na próxima run: + ##### + {results[max_recaptures:]} + ##### + """ + log_critical(message) + results = results[:max_recaptures] + return True, results["timestamp_captura"].to_list(), results["erro"].to_list() + return False, [], [] @task From 517a9a2c88f15ce6f0ff298d988be11821d6c349 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 13:35:04 -0300 Subject: [PATCH 125/145] corrigir project_name --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 5f4a82f75..096e5d3e3 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -127,8 +127,7 @@ run_recaptura_trasacao = create_flow_run( flow_name=bilhetagem_recaptura.name, - # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, - project_name="staging", + project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, labels=LABELS, parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, ) @@ -176,8 +175,7 @@ # Materialização run_materializacao = create_flow_run( flow_name=bilhetagem_materializacao.name, - # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, - project_name="staging", + project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, labels=LABELS, upstream_tasks=[wait_captura], ) From df6ee96a5dad1f0d49fa9988a53f2edda7d9d12f Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 13:35:21 -0300 Subject: [PATCH 126/145] =?UTF-8?q?remover=20coment=C3=A1rios?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/utils.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 386053e66..bf9fb6778 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -602,10 +602,6 @@ def custom_serialization(obj: Any) -> Any: obj = obj.tz_localize("UTC").tz_convert( emd_constants.DEFAULT_TIMEZONE.value ) - # if obj.tzinfo is None: - # obj = obj.tz_localize(emd_constants.DEFAULT_TIMEZONE.value) - # else: - # obj = obj.tz_convert(emd_constants.DEFAULT_TIMEZONE.value) return obj.isoformat() From 7b2e1dfce2e8f353fef7dadf7453552a49b54f78 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 14:12:52 -0300 Subject: [PATCH 127/145] remover query_logs_func --- pipelines/rj_smtr/utils.py | 124 ------------------------------------- 1 file changed, 124 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index bf9fb6778..0d05ffb09 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -450,130 +450,6 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l return clocks -def query_logs_func( - dataset_id: str, - table_id: str, - datetime_filter=None, - max_recaptures: int = 60, - interval_minutes: int = 1, - recapture_window_days: int = 1, -): - """ - Queries capture logs to check for errors - - Args: - dataset_id (str): dataset_id on BigQuery - table_id (str): table_id on BigQuery - datetime_filter (pendulum.datetime.DateTime, optional): - filter passed to query. 
This task will query the logs table - for the last n (n = recapture_window_days) days before datetime_filter - max_recaptures (int, optional): maximum number of recaptures to be done - interval_minutes (int, optional): interval in minutes between each recapture - recapture_window_days (int, optional): Number of days to query for erros - - Returns: - lists: errors (bool), - timestamps (list of pendulum.datetime.DateTime), - previous_errors (list of previous errors) - """ - - if not datetime_filter: - datetime_filter = pendulum.now(constants.TIMEZONE.value).replace( - second=0, microsecond=0 - ) - elif isinstance(datetime_filter, str): - datetime_filter = datetime.fromisoformat(datetime_filter).replace( - second=0, microsecond=0 - ) - - datetime_filter = datetime_filter.strftime("%Y-%m-%d %H:%M:%S") - - query = f""" - WITH - t AS ( - SELECT - DATETIME(timestamp_array) AS timestamp_array - FROM - UNNEST( - GENERATE_TIMESTAMP_ARRAY( - TIMESTAMP_SUB('{datetime_filter}', INTERVAL {recapture_window_days} day), - TIMESTAMP('{datetime_filter}'), - INTERVAL {interval_minutes} minute) ) - AS timestamp_array - WHERE - timestamp_array < '{datetime_filter}' ), - logs_table AS ( - SELECT - SAFE_CAST(DATETIME(TIMESTAMP(timestamp_captura), - "America/Sao_Paulo") AS DATETIME) timestamp_captura, - SAFE_CAST(sucesso AS BOOLEAN) sucesso, - SAFE_CAST(erro AS STRING) erro, - SAFE_CAST(DATA AS DATE) DATA - FROM - rj-smtr-staging.{dataset_id}_staging.{table_id}_logs AS t - ), - logs AS ( - SELECT - *, - TIMESTAMP_TRUNC(timestamp_captura, minute) AS timestamp_array - FROM - logs_table - WHERE - DATA BETWEEN DATE(DATETIME_SUB('{datetime_filter}', - INTERVAL {recapture_window_days} day)) - AND DATE('{datetime_filter}') - AND timestamp_captura BETWEEN - DATETIME_SUB('{datetime_filter}', INTERVAL {recapture_window_days} day) - AND '{datetime_filter}' - ORDER BY - timestamp_captura ) - SELECT - CASE - WHEN logs.timestamp_captura IS NOT NULL THEN logs.timestamp_captura - ELSE - t.timestamp_array - END - AS timestamp_captura, - logs.erro - FROM - t - LEFT JOIN - logs - ON - logs.timestamp_array = t.timestamp_array - WHERE - logs.sucesso IS NOT TRUE - ORDER BY - timestamp_captura - """ - log(f"Run query to check logs:\n{query}") - results = bd.read_sql(query=query, billing_project_id=bq_project()) - if len(results) > 0: - results["timestamp_captura"] = ( - pd.to_datetime(results["timestamp_captura"]) - .dt.tz_localize(constants.TIMEZONE.value) - .to_list() - ) - log(f"Recapture data for the following {len(results)} timestamps:\n{results}") - if len(results) > max_recaptures: - message = f""" - [SPPO - Recaptures] - Encontradas {len(results)} timestamps para serem recapturadas. 
- Essa run processará as seguintes: - ##### - {results[:max_recaptures]} - ##### - Sobraram as seguintes para serem recapturadas na próxima run: - ##### - {results[max_recaptures:]} - ##### - """ - log_critical(message) - results = results[:max_recaptures] - return True, results["timestamp_captura"].to_list(), results["erro"].to_list() - return False, [], [] - - def dict_contains_keys(input_dict: dict, keys: list[str]) -> bool: """ Test if the input dict has all keys present in the list From 5f303849ea483f5f37115238a5084c9aa207f2fd Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 16:12:44 -0300 Subject: [PATCH 128/145] aumentar max_recaptures --- pipelines/rj_smtr/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index e329069af..79cd84751 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -369,7 +369,7 @@ def query_logs( dataset_id: str, table_id: str, datetime_filter=None, - max_recaptures: int = 60, + max_recaptures: int = 360, interval_minutes: int = 1, recapture_window_days: int = 1, ): From 431f0047af35e41eaf03fddc67bb8a62d852a4cf Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 20:16:21 -0300 Subject: [PATCH 129/145] adiciona extracao tracking --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 18 +++++++++++++++++ pipelines/rj_smtr/constants.py | 20 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 096e5d3e3..11c66ba69 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -63,6 +63,24 @@ bilhetagem_transacao_captura.schedule = every_minute +# BILHETAGEM GPS + +bilhetagem_tracking_captura = deepcopy(default_capture_flow) +bilhetagem_tracking_captura.name = "SMTR: Bilhetagem GPS VALIDADOR - Captura" +bilhetagem_tracking_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) +bilhetagem_tracking_captura.run_config = KubernetesRun( + image=emd_constants.DOCKER_IMAGE.value, + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], +) + +bilhetagem_tracking_captura = set_default_parameters( + flow=bilhetagem_tracking_captura, + default_parameters=GENERAL_CAPTURE_DEFAULT_PARAMS + | constants.BILHETAGEM_TRACKING_CAPTURE_PARAMS.value, +) + +bilhetagem_tracking_captura.schedule = every_minute + # BILHETAGEM AUXILIAR - SUBFLOW PARA RODAR ANTES DE CADA MATERIALIZAÇÃO # diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index ccf1c6c44..923383d04 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -180,6 +180,7 @@ class constants(Enum): # pylint: disable=c0103 "engine": "postgresql", "host": "10.5.115.1", }, + "tracking_db": {"engine": "postgresql", "host": "10.5.15.25"}, }, "source_type": "db", } @@ -203,6 +204,25 @@ class constants(Enum): # pylint: disable=c0103 "interval_minutes": 1, } + BILHETAGEM_TRACKING_CAPTURE_PARAMS = { + "table_id": "gps_validador", + "partition_date_only": False, + "extract_params": { + "database": "tracking_db", + "query": """ + SELECT + * + FROM + tracking_detalhe + WHERE + data_tracking BETWEEN '{start}' + AND '{end}' + """, + }, + "primary_key": ["id"], + "interval_minutes": 1, + } + BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" BILHETAGEM_CAPTURE_PARAMS = [ From e7ca572108773525a4002fab189c078a6d2c1465 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 19 Oct 2023 23:21:24 +0000 Subject: [PATCH 130/145] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pipelines/rj_smtr/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 222440ca0..9d7465e9e 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -219,7 +219,7 @@ class constants(Enum): # pylint: disable=c0103 """, }, "primary_key": ["id"], - "interval_minutes": 1, + "interval_minutes": 1, } BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" From dae77e84160a57fb4e17803d3cf2f0cc6922032f Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 20:22:06 -0300 Subject: [PATCH 131/145] muda agent para dev --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 11c66ba69..cf8e39644 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -70,7 +70,7 @@ bilhetagem_tracking_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_tracking_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_tracking_captura = set_default_parameters( From 5fb96e6fd09077dc9f36c1fe1b3e2662af51eec9 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 20:23:41 -0300 Subject: [PATCH 132/145] corrige constante --- pipelines/rj_smtr/constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 9d7465e9e..923383d04 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -202,6 +202,7 @@ class constants(Enum): # pylint: disable=c0103 }, "primary_key": ["id"], "interval_minutes": 1, + } BILHETAGEM_TRACKING_CAPTURE_PARAMS = { "table_id": "gps_validador", From 7c3de1a436e3412fc3b5133a38bc01414307fae6 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 20:26:24 -0300 Subject: [PATCH 133/145] formatar constante database --- pipelines/rj_smtr/constants.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 923383d04..79aa91b5d 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -180,7 +180,10 @@ class constants(Enum): # pylint: disable=c0103 "engine": "postgresql", "host": "10.5.115.1", }, - "tracking_db": {"engine": "postgresql", "host": "10.5.15.25"}, + "tracking_db": { + "engine": "postgresql", + "host": "10.5.15.25", + }, }, "source_type": "db", } From de245398ddd95f937f056750dccd4b1bae4d182c Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 20:36:44 -0300 Subject: [PATCH 134/145] altera nome do flow --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 61cec24dd..a9489c7d8 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -66,7 +66,7 @@ # 
BILHETAGEM GPS bilhetagem_tracking_captura = deepcopy(default_capture_flow) -bilhetagem_tracking_captura.name = "SMTR: Bilhetagem GPS VALIDADOR - Captura" +bilhetagem_tracking_captura.name = "SMTR: Bilhetagem GPS Validador - Captura" bilhetagem_tracking_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_tracking_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, From ccddeead4099e172c2e4445d97f63f149cc91ba4 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 23 Oct 2023 08:41:20 -0300 Subject: [PATCH 135/145] alterar queries bilhetagem auxiliar --- pipelines/rj_smtr/constants.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index ccf1c6c44..0f1c8dff0 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -217,7 +217,8 @@ class constants(Enum): # pylint: disable=c0103 FROM LINHA WHERE - DT_INCLUSAO >= '{start}' + DT_INCLUSAO BETWEEN '{start}' + AND '{end}' """, }, "primary_key": ["CD_LINHA"], # id column to nest data on @@ -234,7 +235,8 @@ class constants(Enum): # pylint: disable=c0103 FROM GRUPO WHERE - DT_INCLUSAO >= '{start}' + DT_INCLUSAO BETWEEN '{start}' + AND '{end}' """, }, "primary_key": ["CD_GRUPO"], # id column to nest data on @@ -251,7 +253,8 @@ class constants(Enum): # pylint: disable=c0103 FROM GRUPO_LINHA WHERE - DT_INCLUSAO >= '{start}' + DT_INCLUSAO BETWEEN '{start}' + AND '{end}' """, }, "primary_key": ["CD_GRUPO", "CD_LINHA"], @@ -268,7 +271,8 @@ class constants(Enum): # pylint: disable=c0103 FROM matriz_integracao WHERE - dt_inclusao >= '{start}' + dt_inclusao BETWEEN '{start}' + AND '{end}' """, }, "primary_key": [ From 830c52f98179b800a816981a367c7f4ab3c6da10 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 23 Oct 2023 11:53:08 -0300 Subject: [PATCH 136/145] ajuste na logica de recaptura bilhetagem auxiliar --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 57 +++++++++------ pipelines/rj_smtr/constants.py | 4 + pipelines/rj_smtr/flows.py | 10 ++- pipelines/rj_smtr/tasks.py | 73 ++++++++++++++++++- 4 files changed, 117 insertions(+), 27 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 096e5d3e3..dd1c9865d 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -30,7 +30,7 @@ default_materialization_flow, ) -from pipelines.rj_smtr.tasks import get_current_timestamp +from pipelines.rj_smtr.tasks import get_rounded_timestamp, join_dicts from pipelines.rj_smtr.constants import constants @@ -114,7 +114,7 @@ ) as bilhetagem_transacao_tratamento: # Configuração # - timestamp = get_current_timestamp() + timestamp = get_rounded_timestamp(interval_minutes=60) rename_flow_run = rename_current_flow_run_now_time( prefix=bilhetagem_transacao_tratamento.name + " ", @@ -123,13 +123,17 @@ LABELS = get_current_flow_labels() - # Recapturas - + # Recaptura Transação + transacao_recapture_params = join_dicts( + original_dict=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, + dict_to_join={"timestamp": timestamp}, + ) run_recaptura_trasacao = create_flow_run( flow_name=bilhetagem_recaptura.name, - project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + project_name="staging", labels=LABELS, - parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, + 
parameters=transacao_recapture_params, ) wait_recaptura_trasacao = wait_for_flow_run( @@ -139,34 +143,41 @@ raise_final_state=True, ) - runs_recaptura_auxiliar = create_flow_run.map( - flow_name=unmapped(bilhetagem_recaptura.name), - project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), - parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, + # Captura Auxiliar + + auxiliar_capture_params = join_dicts( + original_dict=constants.BILHETAGEM_CAPTURE_PARAMS.value, + dict_to_join={"timestamp": timestamp}, + ) + runs_captura = create_flow_run.map( + flow_name=unmapped(bilhetagem_auxiliar_captura.name), + # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), + project_name=unmapped("staging"), + parameters=auxiliar_capture_params, labels=unmapped(LABELS), ) - runs_recaptura_auxiliar.set_upstream(wait_recaptura_trasacao) - - wait_recaptura_auxiliar = wait_for_flow_run.map( - runs_recaptura_auxiliar, + wait_captura = wait_for_flow_run.map( + runs_captura, stream_states=unmapped(True), stream_logs=unmapped(True), raise_final_state=unmapped(True), ) - # Captura - runs_captura = create_flow_run.map( - flow_name=unmapped(bilhetagem_auxiliar_captura.name), - project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), - parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, + # Recaptura Auxiliar + + runs_recaptura_auxiliar = create_flow_run.map( + flow_name=unmapped(bilhetagem_recaptura.name), + # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), + project_name=unmapped("staging"), + parameters=auxiliar_capture_params, labels=unmapped(LABELS), ) - runs_captura.set_upstream(wait_recaptura_auxiliar) + runs_recaptura_auxiliar.set_upstream(wait_captura) - wait_captura = wait_for_flow_run.map( - runs_captura, + wait_recaptura_auxiliar = wait_for_flow_run.map( + runs_recaptura_auxiliar, stream_states=unmapped(True), stream_logs=unmapped(True), raise_final_state=unmapped(True), @@ -190,6 +201,6 @@ bilhetagem_transacao_tratamento.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_tratamento.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_transacao_tratamento.schedule = every_hour diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 0f1c8dff0..63a959419 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -223,6 +223,7 @@ class constants(Enum): # pylint: disable=c0103 }, "primary_key": ["CD_LINHA"], # id column to nest data on "interval_minutes": 60, + "truncate_hour": True, }, { "table_id": "grupo", @@ -241,6 +242,7 @@ class constants(Enum): # pylint: disable=c0103 }, "primary_key": ["CD_GRUPO"], # id column to nest data on "interval_minutes": 60, + "truncate_hour": True, }, { "table_id": "grupo_linha", @@ -259,6 +261,7 @@ class constants(Enum): # pylint: disable=c0103 }, "primary_key": ["CD_GRUPO", "CD_LINHA"], "interval_minutes": 60, + "truncate_hour": True, }, { "table_id": "matriz_integracao", @@ -280,6 +283,7 @@ class constants(Enum): # pylint: disable=c0103 "cd_integracao", ], # id column to nest data on "interval_minutes": 60, + "truncate_hour": True, }, ] diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 18a7fb1a3..7340e6a3b 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -26,7 +26,7 @@ from pipelines.rj_smtr.tasks import ( create_date_hour_partition, 
create_local_partition_path, - get_current_timestamp, + get_rounded_timestamp, parse_timestamp_to_string, transform_raw_to_nested_structure, create_dbt_run_vars, @@ -52,6 +52,7 @@ table_id = Parameter("table_id", default=None) dataset_id = Parameter("dataset_id", default=None) partition_date_only = Parameter("partition_date_only", default=None) + timestamp = Parameter("timestamp", default=None) # Parâmetros Captura # extract_params = Parameter("extract_params", default=None) @@ -70,16 +71,21 @@ checkpoint=False, ) + current_timestamp = get_rounded_timestamp( + timestamp=timestamp, interval_minutes=interval_minutes + ) + with case(recapture, True): _, recapture_timestamps, recapture_previous_errors = query_logs( dataset_id=dataset_id, table_id=table_id, + datetime_filter=current_timestamp, interval_minutes=interval_minutes, recapture_window_days=recapture_window_days, ) with case(recapture, False): - capture_timestamp = [get_current_timestamp()] + capture_timestamp = [current_timestamp] capture_previous_errors = task( lambda: [None], checkpoint=False, name="assign_none_to_previous_errors" )() diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 79cd84751..4bb7e481d 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -243,13 +243,54 @@ def create_dbt_run_vars( @task -def get_current_timestamp(timestamp=None, truncate_minute: bool = True) -> datetime: +def get_rounded_timestamp( + timestamp: Union[str, datetime, None] = None, + interval_minutes: Union[int, None] = None, +) -> datetime: + """ + Calculate rounded timestamp for flow run. + + Args: + timestamp (Union[str, datetime, None]): timestamp to be used as reference + interval_minutes (Union[int, None], optional): interval in minutes between each recapture + + Returns: + datetime: timestamp for flow run + """ + if isinstance(timestamp, str): + timestamp = datetime.fromisoformat(timestamp) + + if not timestamp: + timestamp = datetime.now(tz=timezone(constants.TIMEZONE.value)) + + timestamp = timestamp.replace(second=0, microsecond=0) + + if interval_minutes: + if interval_minutes >= 60: + hours = interval_minutes / 60 + interval_minutes = round(((hours) % 1) * 60) + + if interval_minutes == 0: + rounded_minutes = interval_minutes + else: + rounded_minutes = (timestamp.minute // interval_minutes) * interval_minutes + + timestamp = timestamp.replace(minute=rounded_minutes) + + return timestamp + + +@task +def get_current_timestamp( + timestamp=None, truncate_minute: bool = True, truncate_hour: bool = False +) -> datetime: """ Get current timestamp for flow run. 
Args: timestamp: timestamp to be used as reference (optionally, it can be a string) truncate_minute: whether to truncate the timestamp to the minute or not + truncate_hour: whether to truncate the timestamp to the hour or not Returns: datetime: timestamp for flow run @@ -259,7 +300,9 @@ def get_current_timestamp(timestamp=None, truncate_minute: bool = True) -> datet if not timestamp: timestamp = datetime.now(tz=timezone(constants.TIMEZONE.value)) if truncate_minute: - return timestamp.replace(second=0, microsecond=0) + timestamp = timestamp.replace(second=0, microsecond=0) + if truncate_hour: + timestamp = timestamp.replace(minute=0) return timestamp @@ -385,6 +428,7 @@ def query_logs( max_recaptures (int, optional): maximum number of recaptures to be done interval_minutes (int, optional): interval in minutes between each recapture recapture_window_days (int, optional): Number of days to query for erros + truncate_hour: whether to truncate the timestamp to the hour or not Returns: lists: errors (bool), @@ -1265,3 +1309,28 @@ def unpack_mapped_results_nout2( """ return [r[0] for r in mapped_results], [r[1] for r in mapped_results] + + +@task(checkpoint=False) +def join_dicts( + original_dict: Union[dict, list[dict]], dict_to_join: dict +) -> Union[dict, list[dict]]: + """ + Task to join a dict or list of dicts with another dict + + Args: + original_dict (Union[dict, list[dict]]): The input dict or list of dicts + dict_to_join (dict): The dict to be joined with original_dict + + Returns: + Union[dict, list[dict]]: The joined value + """ + + if isinstance(original_dict, list): + return [d | dict_to_join for d in original_dict] + elif isinstance(original_dict, dict): + return original_dict | dict_to_join + else: + raise ValueError( + f"original_dict must be dict or list, received: {type(original_dict)}" + ) From d4e16db9e7ddaf87156fd9142d944643a28a5077 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 23 Oct 2023 12:59:22 -0300 Subject: [PATCH 137/145] remover parametro timestamp --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 17 ++++--------- pipelines/rj_smtr/flows.py | 5 +--- pipelines/rj_smtr/tasks.py | 25 ------------------- 3 files changed, 6 insertions(+), 41 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 1c8c5c934..8dcfe80cd 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -30,7 +30,7 @@ default_materialization_flow, ) -from pipelines.rj_smtr.tasks import get_rounded_timestamp, join_dicts +from pipelines.rj_smtr.tasks import get_rounded_timestamp from pipelines.rj_smtr.constants import constants @@ -141,16 +141,13 @@ LABELS = get_current_flow_labels() # Recaptura Transação - transacao_recapture_params = join_dicts( - original_dict=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, - dict_to_join={"timestamp": timestamp}, - ) + run_recaptura_trasacao = create_flow_run( flow_name=bilhetagem_recaptura.name, # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, project_name="staging", labels=LABELS, - parameters=transacao_recapture_params, + parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, ) wait_recaptura_trasacao = wait_for_flow_run( @@ -162,15 +159,11 @@ # Captura Auxiliar - auxiliar_capture_params = join_dicts( - original_dict=constants.BILHETAGEM_CAPTURE_PARAMS.value, - dict_to_join={"timestamp": timestamp}, - ) runs_captura = create_flow_run.map( 
flow_name=unmapped(bilhetagem_auxiliar_captura.name), # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), project_name=unmapped("staging"), - parameters=auxiliar_capture_params, + parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), ) @@ -187,7 +180,7 @@ flow_name=unmapped(bilhetagem_recaptura.name), # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), project_name=unmapped("staging"), - parameters=auxiliar_capture_params, + parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), ) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 7340e6a3b..0dddf166b 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -52,7 +52,6 @@ table_id = Parameter("table_id", default=None) dataset_id = Parameter("dataset_id", default=None) partition_date_only = Parameter("partition_date_only", default=None) - timestamp = Parameter("timestamp", default=None) # Parâmetros Captura # extract_params = Parameter("extract_params", default=None) @@ -71,9 +70,7 @@ checkpoint=False, ) - current_timestamp = get_rounded_timestamp( - timestamp=timestamp, interval_minutes=interval_minutes - ) + current_timestamp = get_rounded_timestamp(interval_minutes=interval_minutes) with case(recapture, True): _, recapture_timestamps, recapture_previous_errors = query_logs( diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 4bb7e481d..887e2c29f 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1309,28 +1309,3 @@ def unpack_mapped_results_nout2( """ return [r[0] for r in mapped_results], [r[1] for r in mapped_results] - - -@task(checkpoint=False) -def join_dicts( - original_dict: Union[dict, list[dict]], dict_to_join: dict -) -> Union[dict, list[dict]]: - """ - Task to join a dict or list of dicts with another dict - - Args: - original_dict (Union[dict, list[dict]]): The input dict or list of dicts - dict_to_join (dict): The dict to be joined with original_dict - - Returns: - Union[dict, list[dict]]: The joined value - """ - - if isinstance(original_dict, list): - return [d | dict_to_join for d in original_dict] - elif isinstance(original_dict, dict): - return original_dict | dict_to_join - else: - raise ValueError( - f"original_dict must be dict or list, received: {type(original_dict)}" - ) From af89e2fe07244b9878c429fa7b0606b8a852c26b Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 23 Oct 2023 13:09:46 -0300 Subject: [PATCH 138/145] remove truncate hour --- pipelines/rj_smtr/tasks.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 887e2c29f..2c7e60c16 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -281,16 +281,13 @@ def get_rounded_timestamp( @task -def get_current_timestamp( - timestamp=None, truncate_minute: bool = True, truncate_hour: bool = False -) -> datetime: +def get_current_timestamp(timestamp=None, truncate_minute: bool = True) -> datetime: """ Get current timestamp for flow run. 
Args: timestamp: timestamp to be used as reference (optionally, it can be a string) truncate_minute: whether to truncate the timestamp to the minute or not - truncate_hour: whether to truncate the timestamp to the hour or not Returns: datetime: timestamp for flow run @@ -301,8 +298,7 @@ def get_current_timestamp( timestamp = datetime.now(tz=timezone(constants.TIMEZONE.value)) if truncate_minute: timestamp = timestamp.replace(second=0, microsecond=0) - if truncate_hour: - timestamp = timestamp.replace(minute=0) + return timestamp From ddffd6cdf69f20eae8c92775ab4d19bd16dc4cfe Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 23 Oct 2023 13:32:44 -0300 Subject: [PATCH 139/145] mudar agent para prd --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 8dcfe80cd..899ad3127 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -211,6 +211,6 @@ bilhetagem_transacao_tratamento.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_tratamento.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_transacao_tratamento.schedule = every_hour From a4660d169b4fcc60c0fa8af176d26ca034fcb55b Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 23 Oct 2023 15:25:28 -0300 Subject: [PATCH 140/145] mudar project name --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 899ad3127..4ca7bc6ec 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -144,8 +144,7 @@ run_recaptura_trasacao = create_flow_run( flow_name=bilhetagem_recaptura.name, - # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, - project_name="staging", + project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, labels=LABELS, parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, ) @@ -161,8 +160,7 @@ runs_captura = create_flow_run.map( flow_name=unmapped(bilhetagem_auxiliar_captura.name), - # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), - project_name=unmapped("staging"), + project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), ) @@ -178,8 +176,7 @@ runs_recaptura_auxiliar = create_flow_run.map( flow_name=unmapped(bilhetagem_recaptura.name), - # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), - project_name=unmapped("staging"), + project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), ) From 5f3596b40afe8e078b6ae037fb7fe47f940b9c89 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 23 Oct 2023 15:33:21 -0300 Subject: [PATCH 141/145] criar constante interval --- .../rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 4 +++- pipelines/rj_smtr/constants.py | 10 ++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py 
b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 4ca7bc6ec..03293ca0c 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -131,7 +131,9 @@ ) as bilhetagem_transacao_tratamento: # Configuração # - timestamp = get_rounded_timestamp(interval_minutes=60) + timestamp = get_rounded_timestamp( + interval_minutes=constants.BILHETAGEM_AUXILIAR_INTERVAL.value + ) rename_flow_run = rename_current_flow_run_now_time( prefix=bilhetagem_transacao_tratamento.name + " ", diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 0c8c5e134..a85e2af73 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -228,6 +228,8 @@ class constants(Enum): # pylint: disable=c0103 BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" + BILHETAGEM_AUXILIAR_INTERVAL = 60 + BILHETAGEM_CAPTURE_PARAMS = [ { "table_id": "linha", @@ -245,7 +247,7 @@ class constants(Enum): # pylint: disable=c0103 """, }, "primary_key": ["CD_LINHA"], # id column to nest data on - "interval_minutes": 60, + "interval_minutes": BILHETAGEM_AUXILIAR_INTERVAL, "truncate_hour": True, }, { @@ -264,7 +266,7 @@ class constants(Enum): # pylint: disable=c0103 """, }, "primary_key": ["CD_GRUPO"], # id column to nest data on - "interval_minutes": 60, + "interval_minutes": BILHETAGEM_AUXILIAR_INTERVAL, "truncate_hour": True, }, { @@ -283,7 +285,7 @@ class constants(Enum): # pylint: disable=c0103 """, }, "primary_key": ["CD_GRUPO", "CD_LINHA"], - "interval_minutes": 60, + "interval_minutes": BILHETAGEM_AUXILIAR_INTERVAL, "truncate_hour": True, }, { @@ -305,7 +307,7 @@ class constants(Enum): # pylint: disable=c0103 "cd_versao_matriz", "cd_integracao", ], # id column to nest data on - "interval_minutes": 60, + "interval_minutes": BILHETAGEM_AUXILIAR_INTERVAL, "truncate_hour": True, }, ] From 9a057080da825807b738de162b8cdded57150c7c Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 23 Oct 2023 15:42:15 -0300 Subject: [PATCH 142/145] criar recaptura gps --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 36 +++++++++++++++++-- pipelines/rj_smtr/constants.py | 10 +++--- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 03293ca0c..02c675860 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -70,7 +70,7 @@ bilhetagem_tracking_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_tracking_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_tracking_captura = set_default_parameters( @@ -132,7 +132,7 @@ # Configuração # timestamp = get_rounded_timestamp( - interval_minutes=constants.BILHETAGEM_AUXILIAR_INTERVAL.value + interval_minutes=constants.BILHETAGEM_TRATAMENTO_INTERVAL.value ) rename_flow_run = rename_current_flow_run_now_time( @@ -213,3 +213,35 @@ labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_transacao_tratamento.schedule = every_hour + + +with Flow( + "SMTR: Bilhetagem GPS Validador - Tratamento", + code_owners=["caio", "fernanda", "boris", "rodrigo"], +) as bilhetagem_gps_tratamento: + timestamp = get_rounded_timestamp( + interval_minutes=constants.BILHETAGEM_TRATAMENTO_INTERVAL.value + ) + + rename_flow_run = 
rename_current_flow_run_now_time( + prefix=bilhetagem_transacao_tratamento.name + " ", + now_time=timestamp, + ) + + LABELS = get_current_flow_labels() + + # Recaptura Transação + + run_recaptura_gps = create_flow_run( + flow_name=bilhetagem_recaptura.name, + project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + labels=LABELS, + parameters=constants.BILHETAGEM_TRACKING_CAPTURE_PARAMS.value, + ) + + wait_recaptura_gps = wait_for_flow_run( + run_recaptura_gps, + stream_states=True, + stream_logs=True, + raise_final_state=True, + ) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index a85e2af73..a4a8c375f 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -228,7 +228,7 @@ class constants(Enum): # pylint: disable=c0103 BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" - BILHETAGEM_AUXILIAR_INTERVAL = 60 + BILHETAGEM_TRATAMENTO_INTERVAL = 60 BILHETAGEM_CAPTURE_PARAMS = [ { @@ -247,7 +247,7 @@ class constants(Enum): # pylint: disable=c0103 """, }, "primary_key": ["CD_LINHA"], # id column to nest data on - "interval_minutes": BILHETAGEM_AUXILIAR_INTERVAL, + "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, "truncate_hour": True, }, { @@ -266,7 +266,7 @@ class constants(Enum): # pylint: disable=c0103 """, }, "primary_key": ["CD_GRUPO"], # id column to nest data on - "interval_minutes": BILHETAGEM_AUXILIAR_INTERVAL, + "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, "truncate_hour": True, }, { @@ -285,7 +285,7 @@ class constants(Enum): # pylint: disable=c0103 """, }, "primary_key": ["CD_GRUPO", "CD_LINHA"], - "interval_minutes": BILHETAGEM_AUXILIAR_INTERVAL, + "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, "truncate_hour": True, }, { @@ -307,7 +307,7 @@ class constants(Enum): # pylint: disable=c0103 "cd_versao_matriz", "cd_integracao", ], # id column to nest data on - "interval_minutes": BILHETAGEM_AUXILIAR_INTERVAL, + "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, "truncate_hour": True, }, ] From b5a403d9f9f8822487b43f15068100f90b3a5886 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 23 Oct 2023 15:42:26 -0300 Subject: [PATCH 143/145] corrigir docstring --- pipelines/rj_smtr/tasks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 2c7e60c16..9a8188ebe 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -424,7 +424,6 @@ def query_logs( max_recaptures (int, optional): maximum number of recaptures to be done interval_minutes (int, optional): interval in minutes between each recapture recapture_window_days (int, optional): Number of days to query for erros - truncate_hour: whether to truncate the timestamp to the hour or not Returns: lists: errors (bool), From 165f9abedd825ac8aff14fe2215f2f1ed659b996 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 23 Oct 2023 15:43:47 -0300 Subject: [PATCH 144/145] alterar comentario recaptura --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 02c675860..04f6eb61f 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -230,7 +230,7 @@ LABELS = get_current_flow_labels() - # Recaptura Transação + # Recaptura GPS run_recaptura_gps = create_flow_run( flow_name=bilhetagem_recaptura.name, From e8711b6bc9cbcc4bbfee46c7e2fc8752cccd231d 
Mon Sep 17 00:00:00 2001
From: Rafael
Date: Mon, 23 Oct 2023 15:51:34 -0300
Subject: [PATCH 145/145] voltar task get_current_timestamp

---
 pipelines/rj_smtr/tasks.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py
index 9a8188ebe..236988282 100644
--- a/pipelines/rj_smtr/tasks.py
+++ b/pipelines/rj_smtr/tasks.py
@@ -297,9 +297,8 @@ def get_current_timestamp(timestamp=None, truncate_minute: bool = True) -> datetime:
     if not timestamp:
         timestamp = datetime.now(tz=timezone(constants.TIMEZONE.value))
     if truncate_minute:
-        timestamp = timestamp.replace(second=0, microsecond=0)
-
-    return timestamp
+        return timestamp.replace(second=0, microsecond=0)
+    return timestamp
 
 
 @task
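
The core helper this series converges on is get_rounded_timestamp, which floors a reference timestamp to the capture interval so that scheduled runs and recaptures agree on the same partition boundary. Below is a minimal standalone sketch of that rounding behaviour, assuming only the standard library: the function name round_timestamp, the sample timestamps, and the omission of the project timezone (constants.TIMEZONE) and ISO-string handling are illustrative assumptions, not part of the repository's task.

from datetime import datetime
from typing import Optional


def round_timestamp(timestamp: datetime, interval_minutes: Optional[int] = None) -> datetime:
    """Drop seconds/microseconds and floor the minutes to the capture interval."""
    timestamp = timestamp.replace(second=0, microsecond=0)

    if interval_minutes:
        if interval_minutes >= 60:
            # Intervals of an hour or more keep only the sub-hour remainder,
            # so interval_minutes=60 becomes 0 and the minutes reset to zero.
            interval_minutes = round((interval_minutes / 60 % 1) * 60)

        if interval_minutes == 0:
            rounded_minutes = 0
        else:
            rounded_minutes = (timestamp.minute // interval_minutes) * interval_minutes

        timestamp = timestamp.replace(minute=rounded_minutes)

    return timestamp


if __name__ == "__main__":
    reference = datetime(2023, 10, 23, 15, 47, 31)
    print(round_timestamp(reference, interval_minutes=60))  # 2023-10-23 15:00:00
    print(round_timestamp(reference, interval_minutes=10))  # 2023-10-23 15:40:00

With an hourly interval (the BILHETAGEM_TRATAMENTO_INTERVAL case) every run inside the same hour resolves to the top of that hour, which is what lets the treatment flow and the mapped auxiliary capture/recapture flows share one timestamp without passing it as a parameter.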