From 3edf4587bb7992acdcb9cd4ca7f4fc155b4b4d8c Mon Sep 17 00:00:00 2001 From: d116626 Date: Mon, 18 Sep 2023 17:04:05 -0300 Subject: [PATCH 01/41] chore: add sms sigma estoque --- pipelines/rj_sms/__init__.py | 1 + pipelines/rj_sms/dump_db_sigma/__init__.py | 0 pipelines/rj_sms/dump_db_sigma/flows.py | 47 ++++ pipelines/rj_sms/dump_db_sigma/schedules.py | 232 ++++++++++++++++++++ 4 files changed, 280 insertions(+) create mode 100644 pipelines/rj_sms/dump_db_sigma/__init__.py create mode 100644 pipelines/rj_sms/dump_db_sigma/flows.py create mode 100644 pipelines/rj_sms/dump_db_sigma/schedules.py diff --git a/pipelines/rj_sms/__init__.py b/pipelines/rj_sms/__init__.py index e73a968a6..eab9c1cc4 100644 --- a/pipelines/rj_sms/__init__.py +++ b/pipelines/rj_sms/__init__.py @@ -3,5 +3,6 @@ Prefect flows for rj_sms project """ +from pipelines.rj_sms.dump_db_sigma.flows import * from pipelines.rj_sms.dump_db_sivep.flows import * from pipelines.rj_sms.pubsub.flows import * diff --git a/pipelines/rj_sms/dump_db_sigma/__init__.py b/pipelines/rj_sms/dump_db_sigma/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pipelines/rj_sms/dump_db_sigma/flows.py b/pipelines/rj_sms/dump_db_sigma/flows.py new file mode 100644 index 000000000..8c238a995 --- /dev/null +++ b/pipelines/rj_sms/dump_db_sigma/flows.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +""" +Database dumping flows for SMS SIGMA system +""" + +from copy import deepcopy + +from prefect.run_configs import KubernetesRun +from prefect.storage import GCS + +from pipelines.constants import constants + +# importa o schedule +from pipelines.rj_sms.dump_db_sigma.schedules import ( + sigma_daily_update_schedule, +) +from pipelines.utils.dump_db.flows import dump_sql_flow +from pipelines.utils.utils import set_default_parameters + +rj_sms_dump_db_sigma_flow = deepcopy(dump_sql_flow) +rj_sms_dump_db_sigma_flow.name = ( + "SMS: SIGMA - ESTOQUE MEDICAMENTOS - Ingerir tabelas de banco SQL" +) +rj_sms_dump_db_sigma_flow.storage = GCS(constants.GCS_FLOWS_BUCKET.value) + +rj_sms_dump_db_sigma_flow.run_config = KubernetesRun( + image=constants.DOCKER_IMAGE.value, + labels=[ + constants.RJ_SMS_AGENT_LABEL.value, # label do agente + ], +) + +rj_sms_dump_db_sigma_default_parameters = { + "db_database": "CP01.SMF", + "db_host": "10.90.31.22", + "db_port": "1521", + "db_type": "oracle", + "dataset_id": "saude_estoque_medicamentos_sigma", + "vault_secret_path": "db-sigma", +} + +rj_sms_dump_db_sigma_flow = set_default_parameters( + rj_sms_dump_db_sigma_flow, + default_parameters=rj_sms_dump_db_sigma_default_parameters, +) + +rj_sms_dump_db_sigma_flow.schedule = sigma_daily_update_schedule diff --git a/pipelines/rj_sms/dump_db_sigma/schedules.py b/pipelines/rj_sms/dump_db_sigma/schedules.py new file mode 100644 index 000000000..b9c032b52 --- /dev/null +++ b/pipelines/rj_sms/dump_db_sigma/schedules.py @@ -0,0 +1,232 @@ +# -*- coding: utf-8 -*- +""" +Schedules for the SMS SIGMA dump_db pipeline. 
+""" + +from datetime import timedelta, datetime + +from prefect.schedules import Schedule +import pytz + +from pipelines.constants import constants +from pipelines.utils.dump_db.utils import generate_dump_db_schedules +from pipelines.utils.utils import untuple_clocks as untuple + + +##################################### +# +# SMS SIGMA Schedules +# +##################################### + +_sigma_queries = { + "classe": { + "biglake_table": True, + "materialize_after_dump": True, + "materialization_mode": "prod", + "dump_mode": "overwrite", + "execute_query": """ + SELECT + CD_GRUPO, + CD_CLASSE, + DS_CLASSE, + ST_STATUS + FROM SIGMA.VW_CLASSE + """, # noqa + }, + "fornecedor": { + "biglake_table": True, + "materialize_after_dump": True, + "materialization_mode": "prod", + "dump_mode": "overwrite", + "execute_query": """ + SELECT + CPF_CNPJ, + TIPO_CPF_CNPJ, + INSCRICAO_MUNICIPAL, + INSCRICAO_ESTADUAL, + RAZAO_SOCIAL, + NOME_FANTASIA, + NOME_CONTATO, + EMAIL, + EMAIL_CONTATO, + FAX, + DDD, + DDI, + RAMAL, + TELEFONE, + LOGRADOURO, + NUMERO_PORTA, + COMPLEMENTO, + BAIRRO, + MUNICIPIO, + UF, + CEP, + ATIVO_INATIVO_BLOQUEADO, + CD_NATUREZA_JURIDICA, + DS_NATUREZA_JURIDICA, + RAMO_ATIVIDADE, + CD_PORTE_EMPRESA, + DATA_ULTIMA_ATUALIZACAO, + FORNECEDOR_EVENTUAL + FROM SIGMA.VW_FORNECEDOR + """, # noqa + }, + "fornecedor_sem_vinculo": { + "biglake_table": True, + "materialize_after_dump": True, + "materialization_mode": "prod", + "dump_mode": "overwrite", + "execute_query": """ + SELECT + CPF_CNPJ, + TIPO_CPF_CNPJ, + NOME, + NUMERO_PORTA, + COMPLEMENTO + FROM SIGMA.VW_FORNECEDOR_SEM_VINCULO + """, # noqa + }, + "grupo": { + "biglake_table": True, + "materialize_after_dump": True, + "materialization_mode": "prod", + "dump_mode": "overwrite", + "execute_query": """ + SELECT + CD_GRUPO, + DS_GRUPO, + ST_STATUS + FROM SIGMA.VW_GRUPO + """, # noqa + }, + "material": { + "biglake_table": True, + "materialize_after_dump": True, + "materialization_mode": "prod", + "dump_mode": "overwrite", + "execute_query": """ + SELECT + CD_MATERIAL, + CD_GRUPO, + CD_CLASSE, + CD_SUBCLASSE, + SEQUENCIAL, + DV1, + DV2, + NM_PADRONIZADO, + NM_COMPLEMENTAR_MATERIAL, + UNIDADE, + DS_DETALHE_MATERIAL, + DT_DESATIVACAO, + ST_STATUS, + REMUME + FROM SIGMA.VW_MATERIAL + """, # noqa + }, + "movimentacao": { + "biglake_table": True, + "materialize_after_dump": True, + "materialization_mode": "prod", + "dump_mode": "overwrite", + "execute_query": """ + SELECT + CD_MATERIAL, + CNPJ_FORNECEDOR, + NOTA_FISCAL, + SERIE, + DATA_NOTA_FISCAL, + QUANTIDADE_ITEM, + PRECO_ITEM, + TOTAL_ITEM, + DATA_ULTIMA_ATUALIZACAO, + CD_MOVIMENTACAO, + DS_MOVIMENTACAO, + TP_ALMOXARIFADO, + CD_SECRETARIA, + DS_SECRETARIA, + CD_ALMOXARIFADO_DESTINO, + DS_ALMOXARIFADO_DESTINO, + CD_ALMOXARIFADO_ORIGEM, + DS_ALMOXARIFADO_ORIGEM, + CD_OS, + DT_INI_CONTRATO_OS, + DT_FIM_CONTRATO_OS, + NR_EMPENHO, + CNPJ_FABRICANTE + FROM SIGMA.VW_MOVIMENTACAO + """, # noqa + }, + "ramo_atividade": { + "biglake_table": True, + "materialize_after_dump": True, + "materialization_mode": "prod", + "dump_mode": "overwrite", + "execute_query": """ + SELECT + CD_RAMO, + DS_RAMO, + ST_RAMO + FROM SIGMA.VW_RAMO_ATIVIDADE + """, # noqa + }, + "servico": { + "biglake_table": True, + "materialize_after_dump": True, + "materialization_mode": "prod", + "dump_mode": "overwrite", + "execute_query": """ + SELECT + CD_SERV, + CD_SEQ, + CD_SERVICO, + DS_SERVICO, + ST_STATUS + FROM SIGMA.VW_SERVICO + """, # noqa + }, + "subclasse": { + "biglake_table": True, + "materialize_after_dump": True, + 
"materialization_mode": "prod", + "dump_mode": "overwrite", + "execute_query": """ + SELECT + CD_GRUPO, + CD_CLASSE, + CD_SUBCLASSE, + DS_SUBCLASSE, + ST_STATUS + FROM SIGMA.VW_SUBCLASSE + """, # noqa + }, + "unidade": { + "biglake_table": True, + "materialize_after_dump": True, + "materialization_mode": "prod", + "dump_mode": "overwrite", + "execute_query": """ + SELECT + UNIDADE, + DS_UNIDADE + FROM SIGMA.VW_UNIDADE + """, # noqa + }, +} + +sigma_infra_clocks = generate_dump_db_schedules( + interval=timedelta(days=1), + start_date=datetime(2022, 3, 21, 1, 0, tzinfo=pytz.timezone("America/Sao_Paulo")), + labels=[ + constants.RJ_SMS_AGENT_LABEL.value, + ], + db_database="CP01.SMF", + db_host="10.90.31.22", + db_port="1521", + db_type="oracle", + dataset_id="saude_estoque_medicamentos_sigma", + vault_secret_path="db-sigma", + table_parameters=_sigma_queries, +) + +sigma_daily_update_schedule = Schedule(clocks=untuple(sigma_infra_clocks)) From 70d5df1c25ced0ea304d7da118288aeff39f4186 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 20:04:55 +0000 Subject: [PATCH 02/41] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pipelines/rj_sms/dump_db_sigma/schedules.py | 36 ++++++++++----------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/pipelines/rj_sms/dump_db_sigma/schedules.py b/pipelines/rj_sms/dump_db_sigma/schedules.py index b9c032b52..1a2549a94 100644 --- a/pipelines/rj_sms/dump_db_sigma/schedules.py +++ b/pipelines/rj_sms/dump_db_sigma/schedules.py @@ -26,10 +26,10 @@ "materialization_mode": "prod", "dump_mode": "overwrite", "execute_query": """ - SELECT - CD_GRUPO, - CD_CLASSE, - DS_CLASSE, + SELECT + CD_GRUPO, + CD_CLASSE, + DS_CLASSE, ST_STATUS FROM SIGMA.VW_CLASSE """, # noqa @@ -40,7 +40,7 @@ "materialization_mode": "prod", "dump_mode": "overwrite", "execute_query": """ - SELECT + SELECT CPF_CNPJ, TIPO_CPF_CNPJ, INSCRICAO_MUNICIPAL, @@ -78,7 +78,7 @@ "materialization_mode": "prod", "dump_mode": "overwrite", "execute_query": """ - SELECT + SELECT CPF_CNPJ, TIPO_CPF_CNPJ, NOME, @@ -93,9 +93,9 @@ "materialization_mode": "prod", "dump_mode": "overwrite", "execute_query": """ - SELECT - CD_GRUPO, - DS_GRUPO, + SELECT + CD_GRUPO, + DS_GRUPO, ST_STATUS FROM SIGMA.VW_GRUPO """, # noqa @@ -106,7 +106,7 @@ "materialization_mode": "prod", "dump_mode": "overwrite", "execute_query": """ - SELECT + SELECT CD_MATERIAL, CD_GRUPO, CD_CLASSE, @@ -130,7 +130,7 @@ "materialization_mode": "prod", "dump_mode": "overwrite", "execute_query": """ - SELECT + SELECT CD_MATERIAL, CNPJ_FORNECEDOR, NOTA_FISCAL, @@ -163,9 +163,9 @@ "materialization_mode": "prod", "dump_mode": "overwrite", "execute_query": """ - SELECT - CD_RAMO, - DS_RAMO, + SELECT + CD_RAMO, + DS_RAMO, ST_RAMO FROM SIGMA.VW_RAMO_ATIVIDADE """, # noqa @@ -176,7 +176,7 @@ "materialization_mode": "prod", "dump_mode": "overwrite", "execute_query": """ - SELECT + SELECT CD_SERV, CD_SEQ, CD_SERVICO, @@ -191,7 +191,7 @@ "materialization_mode": "prod", "dump_mode": "overwrite", "execute_query": """ - SELECT + SELECT CD_GRUPO, CD_CLASSE, CD_SUBCLASSE, @@ -206,8 +206,8 @@ "materialization_mode": "prod", "dump_mode": "overwrite", "execute_query": """ - SELECT - UNIDADE, + SELECT + UNIDADE, DS_UNIDADE FROM SIGMA.VW_UNIDADE """, # noqa From a889247b284641278cfd43b41245ac247c4e8a9e Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Tue, 19 Sep 2023 19:44:02 -0300 Subject: [PATCH 03/41] starting inea flow 
using ftp --- .../rj_escritorio/dump_ftp_inea/flows.py | 73 +++++++++++ .../rj_escritorio/dump_ftp_inea/schedules.py | 100 +++++++++++++++ .../rj_escritorio/dump_ftp_inea/tasks.py | 120 ++++++++++++++++++ 3 files changed, 293 insertions(+) create mode 100644 pipelines/rj_escritorio/dump_ftp_inea/flows.py create mode 100644 pipelines/rj_escritorio/dump_ftp_inea/schedules.py create mode 100644 pipelines/rj_escritorio/dump_ftp_inea/tasks.py diff --git a/pipelines/rj_escritorio/dump_ftp_inea/flows.py b/pipelines/rj_escritorio/dump_ftp_inea/flows.py new file mode 100644 index 000000000..94312214a --- /dev/null +++ b/pipelines/rj_escritorio/dump_ftp_inea/flows.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- +""" +Dumping data from INEA FTP to BigQuery +""" +# pylint: disable=E1101,C0103 + +from copy import deepcopy + +from prefect import Parameter +from prefect.run_configs import LocalRun +from prefect.storage import GCS +from prefect.utilities.edges import unmapped + +from pipelines.constants import constants +from pipelines.rj_escritorio.dump_ftp_inea.tasks import ( + get_ftp_client, + get_files_to_download, + download_files, + upload_file_to_gcs, +) +from pipelines.rj_escritorio.dump_ftp_inea.schedules import ( + every_5_minutes, + every_5_minutes_mac, +) +from pipelines.utils.decorators import Flow + + +with Flow( + "INEA: Captura dados de radar (Guaratiba)", + code_owners=[ + "paty", + ], +) as inea_ftp_radar_flow: + bucket_name = Parameter("bucket_name", default="rj-escritorio-dev", required=False) + prefix = Parameter( + "prefix", default="raw/meio_ambiente_clima/inea_radar_hdf5", required=False + ) + mode = Parameter("mode", default="prod", required=False) + radar = Parameter("radar", default="mac", required=False) + product = Parameter("product", default="ppi", required=False) + + client = get_ftp_client() + + files_to_download = get_files_to_download( + client=client, + radar=radar, + ) + + files_to_upload = download_files( + client=client, + files=files_to_download, + radar=radar, + ) + + upload_file_to_gcs.map( + file_to_upload=files_to_upload, + bucket_name=unmapped(bucket_name), + prefix=unmapped(prefix), + mode=unmapped(mode), + radar=unmapped(radar), + product=unmapped(product), + ) + + +inea_ftp_radar_flow.storage = GCS(constants.GCS_FLOWS_BUCKET.value) +inea_ftp_radar_flow.run_config = LocalRun(labels=[constants.INEA_AGENT_LABEL.value]) +inea_ftp_radar_flow.schedule = every_5_minutes + +inea_ftp_radar_flow_mac = deepcopy(inea_ftp_radar_flow) +inea_ftp_radar_flow_mac.name = "INEA: Captura dados de radar (Macaé)" +inea_ftp_radar_flow_mac.storage = GCS(constants.GCS_FLOWS_BUCKET.value) +inea_ftp_radar_flow_mac.run_config = LocalRun(labels=[constants.INEA_AGENT_LABEL.value]) +inea_ftp_radar_flow_mac.schedule = every_5_minutes_mac diff --git a/pipelines/rj_escritorio/dump_ftp_inea/schedules.py b/pipelines/rj_escritorio/dump_ftp_inea/schedules.py new file mode 100644 index 000000000..ae59322ff --- /dev/null +++ b/pipelines/rj_escritorio/dump_ftp_inea/schedules.py @@ -0,0 +1,100 @@ +# -*- coding: utf-8 -*- +# pylint: disable=C0103 +""" +Schedules for the INEA flows. 
+""" + +from datetime import timedelta, datetime + +from prefect.schedules import Schedule +from prefect.schedules.clocks import IntervalClock +import pytz + +from pipelines.constants import constants + +every_5_minutes = Schedule( + clocks=[ + IntervalClock( + interval=timedelta(minutes=5), + start_date=datetime(2021, 1, 1, tzinfo=pytz.timezone("America/Sao_Paulo")), + labels=[ + constants.INEA_AGENT_LABEL.value, + ], + parameter_defaults={ + "bucket_name": "rj-escritorio-dev", + "convert_params": "-k=ODIM2.1 -M=All", + "mode": "prod", + "output_format": "HDF5", + "prefix": "raw/meio_ambiente_clima/inea_radar_hdf5", + "product": "ppi", + "radar": "gua", + "vols_remote_directory": "/var/opt/edge/vols", + }, + ), + ] +) +every_5_minutes_mac = Schedule( + clocks=[ + IntervalClock( + interval=timedelta(minutes=5), + start_date=datetime(2021, 1, 1, tzinfo=pytz.timezone("America/Sao_Paulo")), + labels=[ + constants.INEA_AGENT_LABEL.value, + ], + parameter_defaults={ + "bucket_name": "rj-escritorio-dev", + "convert_params": "-k=ODIM2.1 -M=All", + "mode": "prod", + "output_format": "HDF5", + "prefix": "raw/meio_ambiente_clima/inea_radar_hdf5", + "product": "ppi", + "radar": "mac", + "vols_remote_directory": "/var/opt/edge/vols", + }, + ), + ] +) +every_1_day = Schedule( + clocks=[ + IntervalClock( + interval=timedelta(days=1), + start_date=datetime(2021, 1, 1, tzinfo=pytz.timezone("America/Sao_Paulo")), + labels=[ + constants.INEA_AGENT_LABEL.value, + ], + parameter_defaults={ + "bucket_name": "rj-escritorio-dev", + "convert_params": "-k=ODIM2.1 -M=All", + "mode": "prod", + "output_format": "HDF5", + "prefix": "raw/meio_ambiente_clima/inea_radar_hdf5", + "product": "ppi", + "radar": "gua", + "get_only_last_file": False, + "vols_remote_directory": "/var/opt/edge/vols", + }, + ), + ] +) +every_1_day_mac = Schedule( + clocks=[ + IntervalClock( + interval=timedelta(days=1), + start_date=datetime(2021, 1, 1, tzinfo=pytz.timezone("America/Sao_Paulo")), + labels=[ + constants.INEA_AGENT_LABEL.value, + ], + parameter_defaults={ + "bucket_name": "rj-escritorio-dev", + "convert_params": "-k=ODIM2.1 -M=All", + "mode": "prod", + "output_format": "HDF5", + "prefix": "raw/meio_ambiente_clima/inea_radar_hdf5", + "product": "ppi", + "radar": "mac", + "get_only_last_file": False, + "vols_remote_directory": "/var/opt/edge/vols", + }, + ), + ] +) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py new file mode 100644 index 000000000..f63b0d92c --- /dev/null +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- +""" +Tasks to dump data from a INEA FTP to BigQuery +""" +# pylint: disable=E0702,E1137,E1136,E1101,C0207,W0613 +from datetime import datetime, timedelta +from pathlib import Path + +from google.cloud import storage +from prefect import task + +from pipelines.utils.ftp.client import FTPClient +from pipelines.utils.utils import ( + log, + get_credentials_from_env, + get_vault_secret, +) + + +@task +def get_ftp_client(wait=None): + """ + Get FTP client + """ + inea_secret = get_vault_secret("ftp_inea_radar") + hostname = inea_secret["data"]["hostname"] + username = inea_secret["data"]["username"] + password = inea_secret["data"]["password"] + + return FTPClient( + hostname=hostname, + username=username, + password=password, + ) + + +@task( + max_retries=3, + retry_delay=timedelta(seconds=30), +) +def get_files_to_download(client, radar): + """ + Get files to download FTP and GCS + """ + + client.connect() + 
files = client.list_files(path=f"./{radar.upper()}/") + files = files[-4:] + log(f"files: {files}") + + return files + + +@task( + max_retries=3, + retry_delay=timedelta(seconds=30), +) +def download_files(client, files, radar): + """ + Download files from FTP + """ + + save_path = Path(radar.upper()) + save_path.mkdir(parents=True, exist_ok=True) + + client.connect() + files_downloaded = [] + for file in files: + # file_path = save_path / file + file_path = file + client.download(remote_path=file, local_path=file_path) + files_downloaded.append(file_path) + log(f"files_downloaded: {files_downloaded}") + return files_downloaded + + +@task( + max_retries=3, + retry_delay=timedelta(seconds=30), +) +# pylint: disable=too-many-arguments, too-many-locals +def upload_file_to_gcs( + file_to_upload: str, + bucket_name: str, + prefix: str, + radar: str, + product: str, + mode="prod", + task_mode="partitioned", + unlink: bool = True, +): + """ + Upload files to GCS + """ + credentials = get_credentials_from_env(mode=mode) + storage_client = storage.Client(credentials=credentials) + + bucket = storage_client.bucket(bucket_name) + + file = Path(file_to_upload) + if file.is_file(): + if task_mode == "partitioned": + # Converted file path is in the format: + # /var/opt/edge/.../YYYYMMDD/.nc.gz + # We need to get the datetime for the file + date_str = file.parent.name + date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d") + blob_name = f"{prefix}/radar={radar}/produto={product}/data_particao={date}/{file.name}" + blob_name = blob_name.replace("//", "/") + elif task_mode == "raw": + blob_name = f"{prefix}/{file.name}" + else: + raise ValueError(f"Invalid task_mode: {task_mode}") + log(f"Uploading file {file} to GCS...") + log(f"Blob name will be {blob_name}") + blob = bucket.blob(blob_name) + blob.upload_from_filename(file) + log(f"File {file} uploaded to GCS.") + if unlink: + file.unlink() From cbc7b8c7efe7b9ddf74f5df5323092463915375f Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Tue, 19 Sep 2023 19:45:08 -0300 Subject: [PATCH 04/41] modifying init --- pipelines/rj_escritorio/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/rj_escritorio/__init__.py b/pipelines/rj_escritorio/__init__.py index 0813a42ba..a5c864245 100644 --- a/pipelines/rj_escritorio/__init__.py +++ b/pipelines/rj_escritorio/__init__.py @@ -18,6 +18,7 @@ from pipelines.rj_escritorio.waze.flows import * from pipelines.rj_escritorio.geolocator.flows import * from pipelines.rj_escritorio.inea.flows import * +from pipelines.rj_escritorio.dump_ftp_inea.flows import * from pipelines.rj_escritorio.seconserva_buracos_refresh_data.flows import * from pipelines.rj_escritorio.dump_url_turismo.flows import * from pipelines.rj_escritorio.dump_policy_matrix.flows import * From e8186624d25bd9908e0094b4aba971dcfc9b4cc8 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 20 Sep 2023 09:36:39 -0300 Subject: [PATCH 05/41] bugfix --- pipelines/utils/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/utils/__init__.py b/pipelines/utils/__init__.py index 58e3911d7..b5ffe2d88 100644 --- a/pipelines/utils/__init__.py +++ b/pipelines/utils/__init__.py @@ -9,6 +9,7 @@ from pipelines.utils.dump_to_gcs.flows import * from pipelines.utils.dump_url.flows import * from pipelines.utils.execute_dbt_model.flows import * +from pipelines.utils.ftp.client import * from pipelines.utils.georeference.flows import * from pipelines.utils.predict_flow.flows import * from pipelines.utils.whatsapp_bot.flows import 
* From d9a10d39bd99aba44ef3c28bb849d629f76eda52 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 20 Sep 2023 11:10:41 -0300 Subject: [PATCH 06/41] undoing adding on init --- pipelines/utils/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/utils/__init__.py b/pipelines/utils/__init__.py index b5ffe2d88..58e3911d7 100644 --- a/pipelines/utils/__init__.py +++ b/pipelines/utils/__init__.py @@ -9,7 +9,6 @@ from pipelines.utils.dump_to_gcs.flows import * from pipelines.utils.dump_url.flows import * from pipelines.utils.execute_dbt_model.flows import * -from pipelines.utils.ftp.client import * from pipelines.utils.georeference.flows import * from pipelines.utils.predict_flow.flows import * from pipelines.utils.whatsapp_bot.flows import * From d75a92205b7a2492d5275543680a86def53b36e3 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 20 Sep 2023 11:15:59 -0300 Subject: [PATCH 07/41] changing flows name --- pipelines/rj_escritorio/dump_ftp_inea/flows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/flows.py b/pipelines/rj_escritorio/dump_ftp_inea/flows.py index 94312214a..60e9add45 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/flows.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/flows.py @@ -26,7 +26,7 @@ with Flow( - "INEA: Captura dados de radar (Guaratiba)", + "INEA: Captura FTP dados de radar (Guaratiba)", code_owners=[ "paty", ], @@ -67,7 +67,7 @@ inea_ftp_radar_flow.schedule = every_5_minutes inea_ftp_radar_flow_mac = deepcopy(inea_ftp_radar_flow) -inea_ftp_radar_flow_mac.name = "INEA: Captura dados de radar (Macaé)" +inea_ftp_radar_flow_mac.name = "INEA: Captura FTP dados de radar (Macaé)" inea_ftp_radar_flow_mac.storage = GCS(constants.GCS_FLOWS_BUCKET.value) inea_ftp_radar_flow_mac.run_config = LocalRun(labels=[constants.INEA_AGENT_LABEL.value]) inea_ftp_radar_flow_mac.schedule = every_5_minutes_mac From f43d58a4ad0f15c4597f873760b8b55939c17be8 Mon Sep 17 00:00:00 2001 From: d116626 Date: Wed, 20 Sep 2023 11:42:14 -0300 Subject: [PATCH 08/41] chore: change from repo sms to smfp --- pipelines/rj_smfp/__init__.py | 1 + .../dump_db_sigma_medicamentos}/__init__.py | 0 .../dump_db_sigma_medicamentos/flows.py | 47 +++++++++++++++++++ .../dump_db_sigma_medicamentos}/schedules.py | 4 +- pipelines/rj_sms/__init__.py | 1 - pipelines/rj_sms/dump_db_sigma/flows.py | 47 ------------------- 6 files changed, 50 insertions(+), 50 deletions(-) rename pipelines/{rj_sms/dump_db_sigma => rj_smfp/dump_db_sigma_medicamentos}/__init__.py (100%) create mode 100644 pipelines/rj_smfp/dump_db_sigma_medicamentos/flows.py rename pipelines/{rj_sms/dump_db_sigma => rj_smfp/dump_db_sigma_medicamentos}/schedules.py (98%) delete mode 100644 pipelines/rj_sms/dump_db_sigma/flows.py diff --git a/pipelines/rj_smfp/__init__.py b/pipelines/rj_smfp/__init__.py index 022606109..ea6519e3b 100644 --- a/pipelines/rj_smfp/__init__.py +++ b/pipelines/rj_smfp/__init__.py @@ -6,6 +6,7 @@ from pipelines.rj_smfp.dump_db_ergon_comlurb.flows import * from pipelines.rj_smfp.dump_db_metas.flows import * from pipelines.rj_smfp.dump_db_sigma.flows import * +from pipelines.rj_smfp.dump_db_sigma_medicamentos.flows import * from pipelines.rj_smfp.dump_inadimplente.flows import * from pipelines.rj_smfp.dump_url_metas.flows import * from pipelines.rj_smfp.goals_dashboard_dbt.flows import * diff --git a/pipelines/rj_sms/dump_db_sigma/__init__.py b/pipelines/rj_smfp/dump_db_sigma_medicamentos/__init__.py similarity index 100% rename from 
pipelines/rj_sms/dump_db_sigma/__init__.py rename to pipelines/rj_smfp/dump_db_sigma_medicamentos/__init__.py diff --git a/pipelines/rj_smfp/dump_db_sigma_medicamentos/flows.py b/pipelines/rj_smfp/dump_db_sigma_medicamentos/flows.py new file mode 100644 index 000000000..d8b6bd62e --- /dev/null +++ b/pipelines/rj_smfp/dump_db_sigma_medicamentos/flows.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +""" +Database dumping flows for SMFP SIGMA MEDICAMENTOS +""" + +from copy import deepcopy + +from prefect.run_configs import KubernetesRun +from prefect.storage import GCS + +from pipelines.constants import constants + +# importa o schedule +from pipelines.rj_smfp.dump_db_sigma_medicamentos.schedules import ( + sigma_daily_update_schedule, +) +from pipelines.utils.dump_db.flows import dump_sql_flow +from pipelines.utils.utils import set_default_parameters + +rj_smfp_dump_db_sigma_medicamentos_flow = deepcopy(dump_sql_flow) +rj_smfp_dump_db_sigma_medicamentos_flow.name = ( + "SMFP: SIGMA - MEDICAMENTOS - Ingerir tabelas de banco SQL" +) +rj_smfp_dump_db_sigma_medicamentos_flow.storage = GCS(constants.GCS_FLOWS_BUCKET.value) + +rj_smfp_dump_db_sigma_medicamentos_flow.run_config = KubernetesRun( + image=constants.DOCKER_IMAGE.value, + labels=[ + constants.RJ_SMFP_AGENT_LABEL.value, # label do agente + ], +) + +rj_smfp_dump_db_sigma_medicamentos_default_parameters = { + "db_database": "CP01.SMF", + "db_host": "10.90.31.22", + "db_port": "1521", + "db_type": "oracle", + "dataset_id": "saude_medicamentos_sigma", + "vault_secret_path": "db-sigma", +} + +rj_smfp_dump_db_sigma_medicamentos_flow = set_default_parameters( + rj_smfp_dump_db_sigma_medicamentos_flow, + default_parameters=rj_smfp_dump_db_sigma_medicamentos_default_parameters, +) + +rj_smfp_dump_db_sigma_medicamentos_flow.schedule = sigma_daily_update_schedule diff --git a/pipelines/rj_sms/dump_db_sigma/schedules.py b/pipelines/rj_smfp/dump_db_sigma_medicamentos/schedules.py similarity index 98% rename from pipelines/rj_sms/dump_db_sigma/schedules.py rename to pipelines/rj_smfp/dump_db_sigma_medicamentos/schedules.py index 1a2549a94..0f61241ff 100644 --- a/pipelines/rj_sms/dump_db_sigma/schedules.py +++ b/pipelines/rj_smfp/dump_db_sigma_medicamentos/schedules.py @@ -218,13 +218,13 @@ interval=timedelta(days=1), start_date=datetime(2022, 3, 21, 1, 0, tzinfo=pytz.timezone("America/Sao_Paulo")), labels=[ - constants.RJ_SMS_AGENT_LABEL.value, + constants.RJ_SMFP_AGENT_LABEL.value, ], db_database="CP01.SMF", db_host="10.90.31.22", db_port="1521", db_type="oracle", - dataset_id="saude_estoque_medicamentos_sigma", + dataset_id="saude_medicamentos_sigma", vault_secret_path="db-sigma", table_parameters=_sigma_queries, ) diff --git a/pipelines/rj_sms/__init__.py b/pipelines/rj_sms/__init__.py index eab9c1cc4..e73a968a6 100644 --- a/pipelines/rj_sms/__init__.py +++ b/pipelines/rj_sms/__init__.py @@ -3,6 +3,5 @@ Prefect flows for rj_sms project """ -from pipelines.rj_sms.dump_db_sigma.flows import * from pipelines.rj_sms.dump_db_sivep.flows import * from pipelines.rj_sms.pubsub.flows import * diff --git a/pipelines/rj_sms/dump_db_sigma/flows.py b/pipelines/rj_sms/dump_db_sigma/flows.py deleted file mode 100644 index 8c238a995..000000000 --- a/pipelines/rj_sms/dump_db_sigma/flows.py +++ /dev/null @@ -1,47 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Database dumping flows for SMS SIGMA system -""" - -from copy import deepcopy - -from prefect.run_configs import KubernetesRun -from prefect.storage import GCS - -from pipelines.constants import constants - -# importa o 
schedule -from pipelines.rj_sms.dump_db_sigma.schedules import ( - sigma_daily_update_schedule, -) -from pipelines.utils.dump_db.flows import dump_sql_flow -from pipelines.utils.utils import set_default_parameters - -rj_sms_dump_db_sigma_flow = deepcopy(dump_sql_flow) -rj_sms_dump_db_sigma_flow.name = ( - "SMS: SIGMA - ESTOQUE MEDICAMENTOS - Ingerir tabelas de banco SQL" -) -rj_sms_dump_db_sigma_flow.storage = GCS(constants.GCS_FLOWS_BUCKET.value) - -rj_sms_dump_db_sigma_flow.run_config = KubernetesRun( - image=constants.DOCKER_IMAGE.value, - labels=[ - constants.RJ_SMS_AGENT_LABEL.value, # label do agente - ], -) - -rj_sms_dump_db_sigma_default_parameters = { - "db_database": "CP01.SMF", - "db_host": "10.90.31.22", - "db_port": "1521", - "db_type": "oracle", - "dataset_id": "saude_estoque_medicamentos_sigma", - "vault_secret_path": "db-sigma", -} - -rj_sms_dump_db_sigma_flow = set_default_parameters( - rj_sms_dump_db_sigma_flow, - default_parameters=rj_sms_dump_db_sigma_default_parameters, -) - -rj_sms_dump_db_sigma_flow.schedule = sigma_daily_update_schedule From 67304638e4e70b6fd2018ec08aa737e91e48dee5 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 20 Sep 2023 14:49:04 -0300 Subject: [PATCH 09/41] changing run --- pipelines/rj_escritorio/dump_ftp_inea/flows.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/flows.py b/pipelines/rj_escritorio/dump_ftp_inea/flows.py index 60e9add45..da5d9c0af 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/flows.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/flows.py @@ -7,7 +7,7 @@ from copy import deepcopy from prefect import Parameter -from prefect.run_configs import LocalRun +from prefect.run_configs import KubernetesRun from prefect.storage import GCS from prefect.utilities.edges import unmapped @@ -63,11 +63,17 @@ inea_ftp_radar_flow.storage = GCS(constants.GCS_FLOWS_BUCKET.value) -inea_ftp_radar_flow.run_config = LocalRun(labels=[constants.INEA_AGENT_LABEL.value]) +inea_ftp_radar_flow.run_config = KubernetesRun( + image=constants.DOCKER_IMAGE.value, + labels=[constants.RJ_ESCRITORIO_DEV_AGENT_LABEL.value], +) inea_ftp_radar_flow.schedule = every_5_minutes inea_ftp_radar_flow_mac = deepcopy(inea_ftp_radar_flow) inea_ftp_radar_flow_mac.name = "INEA: Captura FTP dados de radar (Macaé)" inea_ftp_radar_flow_mac.storage = GCS(constants.GCS_FLOWS_BUCKET.value) -inea_ftp_radar_flow_mac.run_config = LocalRun(labels=[constants.INEA_AGENT_LABEL.value]) +inea_ftp_radar_flow_mac.run_config = KubernetesRun( + image=constants.DOCKER_IMAGE.value, + labels=[constants.RJ_ESCRITORIO_DEV_AGENT_LABEL.value], +) inea_ftp_radar_flow_mac.schedule = every_5_minutes_mac From 1f4b1357f94116499b1f099e33796d8f1768d466 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 20 Sep 2023 15:29:38 -0300 Subject: [PATCH 10/41] bugfix --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index f63b0d92c..13c1f9eb1 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -71,6 +71,8 @@ def download_files(client, files, radar): client.download(remote_path=file, local_path=file_path) files_downloaded.append(file_path) log(f"files_downloaded: {files_downloaded}") + file = Path(files_downloaded[0]) + log(f"DEBUGGGG: {file.name}") return files_downloaded @@ -103,7 +105,8 @@ def 
upload_file_to_gcs( # Converted file path is in the format: # /var/opt/edge/.../YYYYMMDD/.nc.gz # We need to get the datetime for the file - date_str = file.parent.name + log(f"DEBUG: {file} e {file.name}") + date_str = file.split("-")[2] date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d") blob_name = f"{prefix}/radar={radar}/produto={product}/data_particao={date}/{file.name}" blob_name = blob_name.replace("//", "/") From 6820a5558a2e9113834386008d035e8f08f87b3a Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 20 Sep 2023 19:44:32 -0300 Subject: [PATCH 11/41] bugfix --- .../rj_escritorio/dump_ftp_inea/tasks.py | 35 +++++++++---------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index 13c1f9eb1..c9a888dbb 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -99,25 +99,22 @@ def upload_file_to_gcs( bucket = storage_client.bucket(bucket_name) - file = Path(file_to_upload) - if file.is_file(): - if task_mode == "partitioned": - # Converted file path is in the format: - # /var/opt/edge/.../YYYYMMDD/.nc.gz - # We need to get the datetime for the file - log(f"DEBUG: {file} e {file.name}") - date_str = file.split("-")[2] - date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d") - blob_name = f"{prefix}/radar={radar}/produto={product}/data_particao={date}/{file.name}" - blob_name = blob_name.replace("//", "/") - elif task_mode == "raw": - blob_name = f"{prefix}/{file.name}" - else: - raise ValueError(f"Invalid task_mode: {task_mode}") - log(f"Uploading file {file} to GCS...") + if task_mode == "partitioned": + # We need to get the datetime for the file + log(f"DEBUG: {file_to_upload} e {file_to_upload.name}") + date_str = file_to_upload.split("-")[2] + date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d") + blob_name = ( + f"{prefix}/radar={radar}/produto={product}/" + f"data_particao={date}/{file_to_upload.name}" + ) + blob_name = blob_name.replace("//", "/") + elif task_mode == "raw": + blob_name = f"{prefix}/{file_to_upload.name}" + log(f"Uploading file {file_to_upload} to GCS...") log(f"Blob name will be {blob_name}") blob = bucket.blob(blob_name) - blob.upload_from_filename(file) - log(f"File {file} uploaded to GCS.") + blob.upload_from_filename(file_to_upload) + log(f"File {file_to_upload} uploaded to GCS.") if unlink: - file.unlink() + file_to_upload.unlink() From b42a3ed15a0cc073947ae589909f186733d982dd Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 20 Sep 2023 21:10:51 -0300 Subject: [PATCH 12/41] bugfix --- .../rj_escritorio/dump_ftp_inea/tasks.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index c9a888dbb..101434d5e 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -72,7 +72,7 @@ def download_files(client, files, radar): files_downloaded.append(file_path) log(f"files_downloaded: {files_downloaded}") file = Path(files_downloaded[0]) - log(f"DEBUGGGG: {file.name}") + log(f"DEBUGGGG: {file.name.split('-')[2]}") return files_downloaded @@ -99,22 +99,21 @@ def upload_file_to_gcs( bucket = storage_client.bucket(bucket_name) + file = Path(file_to_upload) if task_mode == "partitioned": - # We need to get the datetime for the file - log(f"DEBUG: {file_to_upload} e 
{file_to_upload.name}") - date_str = file_to_upload.split("-")[2] + log(f"DEBUG: {file} e {file.name}") + date_str = file.name.split("-")[2] date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d") blob_name = ( - f"{prefix}/radar={radar}/produto={product}/" - f"data_particao={date}/{file_to_upload.name}" + f"{prefix}/radar={radar}/produto={product}/data_particao={date}/{file.name}" ) blob_name = blob_name.replace("//", "/") elif task_mode == "raw": - blob_name = f"{prefix}/{file_to_upload.name}" - log(f"Uploading file {file_to_upload} to GCS...") + blob_name = f"{prefix}/{file.name}" + log(f"Uploading file {file} to GCS...") log(f"Blob name will be {blob_name}") blob = bucket.blob(blob_name) - blob.upload_from_filename(file_to_upload) - log(f"File {file_to_upload} uploaded to GCS.") + blob.upload_from_filename(file) + log(f"File {file} uploaded to GCS.") if unlink: - file_to_upload.unlink() + file.unlink() From cfb370dc031c7c1d684ae793cfd728185d11e8c6 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 20 Sep 2023 21:34:09 -0300 Subject: [PATCH 13/41] bugfix --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index 101434d5e..40fabcdc4 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -110,10 +110,11 @@ def upload_file_to_gcs( blob_name = blob_name.replace("//", "/") elif task_mode == "raw": blob_name = f"{prefix}/{file.name}" - log(f"Uploading file {file} to GCS...") - log(f"Blob name will be {blob_name}") - blob = bucket.blob(blob_name) - blob.upload_from_filename(file) - log(f"File {file} uploaded to GCS.") - if unlink: - file.unlink() + + log(f"Uploading file {file} to GCS...") + log(f"Blob name will be {blob_name}") + blob = bucket.blob(blob_name) + blob.upload_from_filename(file) + log(f"File {file} uploaded to GCS.") + if unlink: + file.unlink() From 885f4f84652d172e82c531e490823e70cd80ad48 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 21 Sep 2023 10:36:06 -0300 Subject: [PATCH 14/41] saving filenames on redis --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index 40fabcdc4..dc76f8051 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -8,6 +8,8 @@ from google.cloud import storage from prefect import task +from prefect.engine.signals import ENDRUN +from prefect.engine.state import Skipped from pipelines.utils.ftp.client import FTPClient from pipelines.utils.utils import ( @@ -38,15 +40,25 @@ def get_ftp_client(wait=None): max_retries=3, retry_delay=timedelta(seconds=30), ) -def get_files_to_download(client, radar): +def get_files_to_download(client, radar, redis_files): """ Get files to download FTP and GCS """ client.connect() files = client.list_files(path=f"./{radar.upper()}/") - files = files[-4:] - log(f"files: {files}") + log(f"\n\nAvailable files on FTP: {files}") + log(f"\nFiles already saved on redis_files: {redis_files}") + files = [file for file in files if file not in redis_files] + log(f"\nFiles to be downloaded: {files}") + files = files[-4:] # remover + log(f"\nFiles to be downloaded: {files}") + + # Skip task if there is no new file + if len(files) == 0: + 
log("No new available files") + skip = Skipped("No new available files") + raise ENDRUN(state=skip) return files From 3e0dac6b0a99c5019cfabdaf6119950d4ea81e34 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 21 Sep 2023 11:04:12 -0300 Subject: [PATCH 15/41] saving filenames on redis --- pipelines/rj_escritorio/dump_ftp_inea/flows.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/flows.py b/pipelines/rj_escritorio/dump_ftp_inea/flows.py index da5d9c0af..5f9bdea3f 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/flows.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/flows.py @@ -22,6 +22,10 @@ every_5_minutes, every_5_minutes_mac, ) +from pipelines.rj_cor.tasks import ( + get_on_redis, + save_on_redis, +) from pipelines.utils.decorators import Flow @@ -41,9 +45,14 @@ client = get_ftp_client() + redis_files = get_on_redis( + dataset_id="meio_ambiente_clima", table_id=radar, mode=mode + ) + files_to_download = get_files_to_download( client=client, radar=radar, + redis_files=redis_files, ) files_to_upload = download_files( @@ -52,7 +61,7 @@ radar=radar, ) - upload_file_to_gcs.map( + upload_files = upload_file_to_gcs.map( file_to_upload=files_to_upload, bucket_name=unmapped(bucket_name), prefix=unmapped(prefix), @@ -61,6 +70,13 @@ product=unmapped(product), ) + save_on_redis( + dataset_id="meio_ambiente_clima", + table_id=radar, + mode=mode, + files=files_to_upload, + wait=upload_files, + ) inea_ftp_radar_flow.storage = GCS(constants.GCS_FLOWS_BUCKET.value) inea_ftp_radar_flow.run_config = KubernetesRun( From 33e571d143a002deba122f11d45abbb4227a3a38 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 21 Sep 2023 16:24:26 -0300 Subject: [PATCH 16/41] adding read files from datalake and choose if wants last file --- .../rj_escritorio/dump_ftp_inea/flows.py | 15 +++ .../rj_escritorio/dump_ftp_inea/tasks.py | 121 +++++++++++++++++- 2 files changed, 132 insertions(+), 4 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/flows.py b/pipelines/rj_escritorio/dump_ftp_inea/flows.py index 5f9bdea3f..fb1ee08c4 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/flows.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/flows.py @@ -14,6 +14,7 @@ from pipelines.constants import constants from pipelines.rj_escritorio.dump_ftp_inea.tasks import ( get_ftp_client, + get_files_datalake, get_files_to_download, download_files, upload_file_to_gcs, @@ -36,6 +37,9 @@ ], ) as inea_ftp_radar_flow: bucket_name = Parameter("bucket_name", default="rj-escritorio-dev", required=False) + date = Parameter("date", default=None, required=False) + get_only_last_file = Parameter("get_only_last_file", default=True, required=False) + greater_than = Parameter("greater_than", default=None, required=False) prefix = Parameter( "prefix", default="raw/meio_ambiente_clima/inea_radar_hdf5", required=False ) @@ -49,10 +53,21 @@ dataset_id="meio_ambiente_clima", table_id=radar, mode=mode ) + datalake_files = get_files_datalake( + bucket_name=bucket_name, + prefix=prefix, + radar=radar, + product=product, + date=date, + mode=mode, + ) + files_to_download = get_files_to_download( client=client, radar=radar, redis_files=redis_files, + datalake_files=datalake_files, + get_only_last_file=get_only_last_file, ) files_to_upload = download_files( diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index dc76f8051..be871d4b2 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ 
b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -5,6 +5,7 @@ # pylint: disable=E0702,E1137,E1136,E1101,C0207,W0613 from datetime import datetime, timedelta from pathlib import Path +from typing import List, Tuple from google.cloud import storage from prefect import task @@ -16,9 +17,96 @@ log, get_credentials_from_env, get_vault_secret, + list_blobs_with_prefix, ) +@task( + nout=2, + max_retries=2, + retry_delay=timedelta(seconds=10), +) +# pylint: disable=too-many-arguments,too-many-locals, too-many-branches +def get_files_datalake( + bucket_name: str, + prefix: str, + radar: str, + product: str, + date: str = None, + greater_than: str = None, + mode: str = "prod", +) -> Tuple[List[str], str]: + """ + List files from INEA server + + Args: + product (str): "ppi" + date (str): Date of the files to be fetched (e.g. 2022-01-25) + greater_than (str): Fetch files with a date greater than this one + less_than (str): Fetch files with a date less than this one + output_directory (str): Directory where the files will be saved + radar (str): Radar name. Must be `gua` or `mac` + get_only_last_file (bool): Treat only the last file available + + How to use: + to get real time data: + let `greater_than` and `date` as None and `get_only_last_file` as True + This will prevent the flow to be stucked treating all files when something happend + and stoped the flow. Otherwise the flow will take a long time to treat all files + and came back to real time. + to fill missing files up to two days ago: + let `greater_than` and `date` as None and `get_only_last_file` as False + for backfill or to fill missing files for dates greather than two days ago: + add a `greater_than` date and let `date` as None and `get_only_last_file` as False + get all files for one day + let `greater_than` as None and `get_only_last_file` as False and fill `date` + """ + search_prefix = f"{prefix}/radar={radar}/produto={product}" + + # Get today's blobs + current_date = datetime.now().date() + current_date_str = current_date.strftime("%Y-%m-%d") + blobs = list_blobs_with_prefix( + bucket_name=bucket_name, + prefix=f"{search_prefix}/data_particao={current_date_str}", + mode=mode, + ) + log( + f"Searched for blobs with prefix {search_prefix}/data_particao={current_date_str}" + ) + + if greater_than is None: + greater_than = current_date - timedelta(days=1) + else: + greater_than = datetime.strptime(greater_than, "%Y-%m-%d") + + # Next, we get past day's blobs + past_date = greater_than.date() + while past_date < current_date: + past_date_str = past_date.strftime("%Y-%m-%d") + past_blobs = list_blobs_with_prefix( + bucket_name=bucket_name, + prefix=f"{search_prefix}/data_particao={past_date_str}", + mode=mode, + ) + log( + f"Searched for blobs with prefix {search_prefix}/data_particao={past_date_str}" + ) + # Then, we merge the two lists + blobs += past_blobs + past_date += timedelta(days=1) + + # Now, we sort it by `blob.name` + blobs.sort(key=lambda blob: blob.name) + # Get only the filenames + datalake_files = [blob.name.split("/")[-1] for blob in blobs] + # Format of the name is 9921GUA-PPIVol-20220930-121010-0004.hdf + # We need remove the last characters to stay with 9921GUA-PPIVol-20220930-121010 + datalake_files = ["-".join(fname.split("-")[:-1]) for fname in datalake_files] + + return datalake_files + + @task def get_ftp_client(wait=None): """ @@ -40,7 +128,13 @@ def get_ftp_client(wait=None): max_retries=3, retry_delay=timedelta(seconds=30), ) -def get_files_to_download(client, radar, redis_files): +def get_files_to_download( 
+ client, + radar, + redis_files, + datalake_files, + get_only_last_file: bool = True, +): """ Get files to download FTP and GCS """ @@ -49,10 +143,19 @@ def get_files_to_download(client, radar, redis_files): files = client.list_files(path=f"./{radar.upper()}/") log(f"\n\nAvailable files on FTP: {files}") log(f"\nFiles already saved on redis_files: {redis_files}") + # Files obtained direct from INEA ends with 0000 as "9915MAC-PPIVol-20230921-123000-0000.hdf" + # Files from FTP ends with an alphanumeric string as "9915MAC-PPIVol-20230921-142000-54d4.hdf" + # We need to be careful when changing one pipeline to other + # Check if files are already on redis files = [file for file in files if file not in redis_files] - log(f"\nFiles to be downloaded: {files}") - files = files[-4:] # remover - log(f"\nFiles to be downloaded: {files}") + + # Check if files are already on datalake + if len(datalake_files) > 0: + files = [ + file + for file in files + if "-".join(file.split("-")[:-1]) not in datalake_files + ] # Skip task if there is no new file if len(files) == 0: @@ -60,6 +163,16 @@ def get_files_to_download(client, radar, redis_files): skip = Skipped("No new available files") raise ENDRUN(state=skip) + files.sort() + + log(f"\nFiles to be downloaded: {files}") + if len(files) > 20: + files = files[-20:] # remover + + if get_only_last_file: + files = files[-1] + log(f"\nFiles to be downloaded: {files}") + return files From 1531bbf3212a1bf206e3a8a1146788c7d4f88cd6 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 21 Sep 2023 16:58:53 -0300 Subject: [PATCH 17/41] bugfix on date --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index be871d4b2..119276668 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -76,12 +76,12 @@ def get_files_datalake( ) if greater_than is None: - greater_than = current_date - timedelta(days=1) + past_date = current_date - timedelta(days=1) else: - greater_than = datetime.strptime(greater_than, "%Y-%m-%d") + past_date = datetime.strptime(greater_than, "%Y-%m-%d") + past_date = past_date.date() # Next, we get past day's blobs - past_date = greater_than.date() while past_date < current_date: past_date_str = past_date.strftime("%Y-%m-%d") past_blobs = list_blobs_with_prefix( From fe532f328405b2ba14b8ff0a1389dd889dc6eff0 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Fri, 22 Sep 2023 11:42:19 -0300 Subject: [PATCH 18/41] bugfix --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index 119276668..ff13b0586 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -170,9 +170,8 @@ def get_files_to_download( files = files[-20:] # remover if get_only_last_file: - files = files[-1] + files = list(files[-1]) log(f"\nFiles to be downloaded: {files}") - return files @@ -191,11 +190,12 @@ def download_files(client, files, radar): client.connect() files_downloaded = [] for file in files: + log(f"Downloading file: {file}") # file_path = save_path / file file_path = file client.download(remote_path=file, local_path=file_path) files_downloaded.append(file_path) - log(f"files_downloaded: {files_downloaded}") + log(f"Downloaded: 
{files_downloaded}") file = Path(files_downloaded[0]) log(f"DEBUGGGG: {file.name.split('-')[2]}") return files_downloaded From 9e356b62fe9861a5c57d92358132ed99d1a922bb Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Fri, 22 Sep 2023 19:27:59 -0300 Subject: [PATCH 19/41] bugfix --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index ff13b0586..ec619333f 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -170,7 +170,7 @@ def get_files_to_download( files = files[-20:] # remover if get_only_last_file: - files = list(files[-1]) + files = [files[-1]] log(f"\nFiles to be downloaded: {files}") return files From 009a099dddec72eb058a821e5f756bf233aec02d Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Mon, 25 Sep 2023 16:52:56 -0300 Subject: [PATCH 20/41] add date and greater_than parameters and flows schedulers --- .../rj_escritorio/dump_ftp_inea/flows.py | 55 ++++-- .../rj_escritorio/dump_ftp_inea/schedules.py | 24 +-- .../rj_escritorio/dump_ftp_inea/tasks.py | 164 ++++++++++-------- 3 files changed, 140 insertions(+), 103 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/flows.py b/pipelines/rj_escritorio/dump_ftp_inea/flows.py index fb1ee08c4..eba999e77 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/flows.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/flows.py @@ -2,7 +2,7 @@ """ Dumping data from INEA FTP to BigQuery """ -# pylint: disable=E1101,C0103 +# pylint: disable=E1101,C0103,bad-continuation from copy import deepcopy @@ -22,24 +22,23 @@ from pipelines.rj_escritorio.dump_ftp_inea.schedules import ( every_5_minutes, every_5_minutes_mac, + every_1_day, + every_1_day_mac, ) -from pipelines.rj_cor.tasks import ( - get_on_redis, - save_on_redis, -) +from pipelines.rj_cor.tasks import get_on_redis, save_on_redis from pipelines.utils.decorators import Flow with Flow( - "INEA: Captura FTP dados de radar (Guaratiba)", - code_owners=[ - "paty", - ], + "INEA: Captura FTP dados de radar (Guaratiba)", code_owners=["paty"] ) as inea_ftp_radar_flow: bucket_name = Parameter("bucket_name", default="rj-escritorio-dev", required=False) date = Parameter("date", default=None, required=False) get_only_last_file = Parameter("get_only_last_file", default=True, required=False) greater_than = Parameter("greater_than", default=None, required=False) + check_datalake_files = Parameter( + "check_datalake_files", default=True, required=False + ) prefix = Parameter( "prefix", default="raw/meio_ambiente_clima/inea_radar_hdf5", required=False ) @@ -59,6 +58,8 @@ radar=radar, product=product, date=date, + greater_than=greater_than, + check_datalake_files=check_datalake_files, mode=mode, ) @@ -71,9 +72,7 @@ ) files_to_upload = download_files( - client=client, - files=files_to_download, - radar=radar, + client=client, files=files_to_download, radar=radar ) upload_files = upload_file_to_gcs.map( @@ -90,6 +89,7 @@ table_id=radar, mode=mode, files=files_to_upload, + keep_last=14400, # last 30 days files wait=upload_files, ) @@ -108,3 +108,34 @@ labels=[constants.RJ_ESCRITORIO_DEV_AGENT_LABEL.value], ) inea_ftp_radar_flow_mac.schedule = every_5_minutes_mac + +inea_ftp_radar_flow_fill_missing = deepcopy(inea_ftp_radar_flow) +inea_ftp_radar_flow_fill_missing.name = ( + "INEA: Captura FTP dados de radar (Guaratiba): preenchimento de arquivos faltantes" +) 
+inea_ftp_radar_flow_fill_missing.storage = GCS(constants.GCS_FLOWS_BUCKET.value) +inea_ftp_radar_flow_fill_missing.run_config = KubernetesRun( + image=constants.DOCKER_IMAGE.value, + labels=[constants.RJ_ESCRITORIO_DEV_AGENT_LABEL.value], +) +inea_ftp_radar_flow_fill_missing.schedule = every_1_day + +inea_ftp_radar_flow_fill_missing_mac = deepcopy(inea_ftp_radar_flow) +inea_ftp_radar_flow_fill_missing_mac.name = ( + "INEA: Captura FTP dados de radar (Macaé): preenchimento de arquivos faltantes" +) +inea_ftp_radar_flow_fill_missing_mac.storage = GCS(constants.GCS_FLOWS_BUCKET.value) +inea_ftp_radar_flow_fill_missing_mac.run_config = KubernetesRun( + image=constants.DOCKER_IMAGE.value, + labels=[constants.RJ_ESCRITORIO_DEV_AGENT_LABEL.value], +) +inea_ftp_radar_flow_fill_missing_mac.schedule = every_1_day_mac + +inea_ftp_backfill_radar_flow = deepcopy(inea_ftp_radar_flow) +inea_ftp_backfill_radar_flow.name = "INEA: Captura dados de radar (backfill)" +inea_ftp_backfill_radar_flow.storage = GCS(constants.GCS_FLOWS_BUCKET.value) +inea_ftp_backfill_radar_flow.run_config = KubernetesRun( + image=constants.DOCKER_IMAGE.value, + labels=[constants.RJ_ESCRITORIO_DEV_AGENT_LABEL.value], +) +inea_ftp_backfill_radar_flow.schedule = None diff --git a/pipelines/rj_escritorio/dump_ftp_inea/schedules.py b/pipelines/rj_escritorio/dump_ftp_inea/schedules.py index ae59322ff..a8db99996 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/schedules.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/schedules.py @@ -17,9 +17,7 @@ IntervalClock( interval=timedelta(minutes=5), start_date=datetime(2021, 1, 1, tzinfo=pytz.timezone("America/Sao_Paulo")), - labels=[ - constants.INEA_AGENT_LABEL.value, - ], + labels=[constants.INEA_AGENT_LABEL.value], parameter_defaults={ "bucket_name": "rj-escritorio-dev", "convert_params": "-k=ODIM2.1 -M=All", @@ -30,7 +28,7 @@ "radar": "gua", "vols_remote_directory": "/var/opt/edge/vols", }, - ), + ) ] ) every_5_minutes_mac = Schedule( @@ -38,9 +36,7 @@ IntervalClock( interval=timedelta(minutes=5), start_date=datetime(2021, 1, 1, tzinfo=pytz.timezone("America/Sao_Paulo")), - labels=[ - constants.INEA_AGENT_LABEL.value, - ], + labels=[constants.INEA_AGENT_LABEL.value], parameter_defaults={ "bucket_name": "rj-escritorio-dev", "convert_params": "-k=ODIM2.1 -M=All", @@ -51,7 +47,7 @@ "radar": "mac", "vols_remote_directory": "/var/opt/edge/vols", }, - ), + ) ] ) every_1_day = Schedule( @@ -59,9 +55,7 @@ IntervalClock( interval=timedelta(days=1), start_date=datetime(2021, 1, 1, tzinfo=pytz.timezone("America/Sao_Paulo")), - labels=[ - constants.INEA_AGENT_LABEL.value, - ], + labels=[constants.INEA_AGENT_LABEL.value], parameter_defaults={ "bucket_name": "rj-escritorio-dev", "convert_params": "-k=ODIM2.1 -M=All", @@ -73,7 +67,7 @@ "get_only_last_file": False, "vols_remote_directory": "/var/opt/edge/vols", }, - ), + ) ] ) every_1_day_mac = Schedule( @@ -81,9 +75,7 @@ IntervalClock( interval=timedelta(days=1), start_date=datetime(2021, 1, 1, tzinfo=pytz.timezone("America/Sao_Paulo")), - labels=[ - constants.INEA_AGENT_LABEL.value, - ], + labels=[constants.INEA_AGENT_LABEL.value], parameter_defaults={ "bucket_name": "rj-escritorio-dev", "convert_params": "-k=ODIM2.1 -M=All", @@ -95,6 +87,6 @@ "get_only_last_file": False, "vols_remote_directory": "/var/opt/edge/vols", }, - ), + ) ] ) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index ec619333f..f90f23ac4 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ 
b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -2,7 +2,7 @@ """ Tasks to dump data from a INEA FTP to BigQuery """ -# pylint: disable=E0702,E1137,E1136,E1101,C0207,W0613 +# pylint: disable=E0702,E1137,E1136,E1101,W0613,bad-continuation from datetime import datetime, timedelta from pathlib import Path from typing import List, Tuple @@ -21,11 +21,7 @@ ) -@task( - nout=2, - max_retries=2, - retry_delay=timedelta(seconds=10), -) +@task(nout=2, max_retries=2, retry_delay=timedelta(seconds=10)) # pylint: disable=too-many-arguments,too-many-locals, too-many-branches def get_files_datalake( bucket_name: str, @@ -34,17 +30,16 @@ def get_files_datalake( product: str, date: str = None, greater_than: str = None, + check_datalake_files: bool = True, mode: str = "prod", ) -> Tuple[List[str], str]: """ - List files from INEA server + List files from INEA saved on datalake Args: product (str): "ppi" date (str): Date of the files to be fetched (e.g. 2022-01-25) - greater_than (str): Fetch files with a date greater than this one - less_than (str): Fetch files with a date less than this one - output_directory (str): Directory where the files will be saved + greater_than (str): Fetch files with a date greater than this one (e.g. 2022-01-25) radar (str): Radar name. Must be `gua` or `mac` get_only_last_file (bool): Treat only the last file available @@ -61,48 +56,48 @@ def get_files_datalake( get all files for one day let `greater_than` as None and `get_only_last_file` as False and fill `date` """ - search_prefix = f"{prefix}/radar={radar}/produto={product}" - - # Get today's blobs - current_date = datetime.now().date() - current_date_str = current_date.strftime("%Y-%m-%d") - blobs = list_blobs_with_prefix( - bucket_name=bucket_name, - prefix=f"{search_prefix}/data_particao={current_date_str}", - mode=mode, - ) - log( - f"Searched for blobs with prefix {search_prefix}/data_particao={current_date_str}" - ) - - if greater_than is None: - past_date = current_date - timedelta(days=1) + + if check_datalake_files: + search_prefix = f"{prefix}/radar={radar}/produto={product}" + + # Get today's blobs + if date: + current_date = datetime.strptime(date, "%Y-%m-%d") + else: + current_date = datetime.now().date() + + if greater_than is None: + past_date = current_date - timedelta(days=1) + else: + past_date = datetime.strptime(greater_than, "%Y-%m-%d") + past_date = past_date.date() + + blobs = [] + # Next, we get past day's blobs + while past_date <= current_date: + past_date_str = past_date.strftime("%Y-%m-%d") + past_blobs = list_blobs_with_prefix( + bucket_name=bucket_name, + prefix=f"{search_prefix}/data_particao={past_date_str}", + mode=mode, + ) + log( + f"Searched for blobs with prefix {search_prefix}/data_particao={past_date_str}" + ) + # Then, we merge the two lists + blobs += past_blobs + past_date += timedelta(days=1) + + # Now, we sort it by `blob.name` + blobs.sort(key=lambda blob: blob.name) + # Get only the filenames + datalake_files = [blob.name.split("/")[-1] for blob in blobs] + # Format of the name is 9921GUA-PPIVol-20220930-121010-0004.hdf + # We need remove the last characters to stay with 9921GUA-PPIVol-20220930-121010 + datalake_files = ["-".join(fname.split("-")[:-1]) for fname in datalake_files] + else: - past_date = datetime.strptime(greater_than, "%Y-%m-%d") - past_date = past_date.date() - - # Next, we get past day's blobs - while past_date < current_date: - past_date_str = past_date.strftime("%Y-%m-%d") - past_blobs = list_blobs_with_prefix( - bucket_name=bucket_name, - 
prefix=f"{search_prefix}/data_particao={past_date_str}", - mode=mode, - ) - log( - f"Searched for blobs with prefix {search_prefix}/data_particao={past_date_str}" - ) - # Then, we merge the two lists - blobs += past_blobs - past_date += timedelta(days=1) - - # Now, we sort it by `blob.name` - blobs.sort(key=lambda blob: blob.name) - # Get only the filenames - datalake_files = [blob.name.split("/")[-1] for blob in blobs] - # Format of the name is 9921GUA-PPIVol-20220930-121010-0004.hdf - # We need remove the last characters to stay with 9921GUA-PPIVol-20220930-121010 - datalake_files = ["-".join(fname.split("-")[:-1]) for fname in datalake_files] + datalake_files = [] return datalake_files @@ -117,35 +112,64 @@ def get_ftp_client(wait=None): username = inea_secret["data"]["username"] password = inea_secret["data"]["password"] - return FTPClient( - hostname=hostname, - username=username, - password=password, - ) + return FTPClient(hostname=hostname, username=username, password=password) -@task( - max_retries=3, - retry_delay=timedelta(seconds=30), -) +@task(max_retries=3, retry_delay=timedelta(seconds=30)) +# pylint: disable=too-many-arguments def get_files_to_download( client, radar, redis_files, datalake_files, + date: str = None, + greater_than: str = None, get_only_last_file: bool = True, ): """ - Get files to download FTP and GCS + List and get files to download FTP + + Args: + radar (str): Radar name. Must be `gua` or `mac` + redis_files (list): List with last files saved on GCP and redis + datalake_files (list): List with filenames saved on GCP + date (str): Date of the files to be fetched (e.g. 2022-01-25) + greater_than (str): Fetch files with a date greater than this one (e.g. 2022-01-25) + get_only_last_file (bool): Treat only the last file available + + How to use: + to get real time data: + let `greater_than` and `date` as None and `get_only_last_file` as True + This will prevent the flow to be stucked treating all files when something happend + and stoped the flow. Otherwise the flow will take a long time to treat all files + and came back to real time. 
+ to fill missing files up to two days ago: + let `greater_than` and `date` as None and `get_only_last_file` as False + for backfill or to fill missing files for dates greather than two days ago: + add a `greater_than` date and let `date` as None and `get_only_last_file` as False + get all files for one day + let `greater_than` as None and `get_only_last_file` as False and fill `date` """ client.connect() files = client.list_files(path=f"./{radar.upper()}/") - log(f"\n\nAvailable files on FTP: {files}") - log(f"\nFiles already saved on redis_files: {redis_files}") + # log(f"\n\nAvailable files on FTP: {files}") + # log(f"\nFiles already saved on redis_files: {redis_files}") + # Files obtained direct from INEA ends with 0000 as "9915MAC-PPIVol-20230921-123000-0000.hdf" # Files from FTP ends with an alphanumeric string as "9915MAC-PPIVol-20230921-142000-54d4.hdf" # We need to be careful when changing one pipeline to other + + # Get specific files based on date and greater_than parameters + if date: + files = [file for file in files if file.split("-")[2] == date.replace("-", "")] + if greater_than: + files = [ + file + for file in files + if file.split("-")[2] >= greater_than.replace("-", "") + ] + # Check if files are already on redis files = [file for file in files if file not in redis_files] @@ -165,20 +189,13 @@ def get_files_to_download( files.sort() - log(f"\nFiles to be downloaded: {files}") - if len(files) > 20: - files = files[-20:] # remover - if get_only_last_file: files = [files[-1]] log(f"\nFiles to be downloaded: {files}") return files -@task( - max_retries=3, - retry_delay=timedelta(seconds=30), -) +@task(max_retries=3, retry_delay=timedelta(seconds=30)) def download_files(client, files, radar): """ Download files from FTP @@ -201,10 +218,7 @@ def download_files(client, files, radar): return files_downloaded -@task( - max_retries=3, - retry_delay=timedelta(seconds=30), -) +@task(max_retries=3, retry_delay=timedelta(seconds=30)) # pylint: disable=too-many-arguments, too-many-locals def upload_file_to_gcs( file_to_upload: str, From 40249a6f250322739c2711906aa27216b7da7112 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Mon, 25 Sep 2023 19:12:32 -0300 Subject: [PATCH 21/41] chancging back parameter --- pipelines/rj_escritorio/dump_ftp_inea/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/flows.py b/pipelines/rj_escritorio/dump_ftp_inea/flows.py index eba999e77..aa2f979b9 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/flows.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/flows.py @@ -43,7 +43,7 @@ "prefix", default="raw/meio_ambiente_clima/inea_radar_hdf5", required=False ) mode = Parameter("mode", default="prod", required=False) - radar = Parameter("radar", default="mac", required=False) + radar = Parameter("radar", default="gua", required=False) product = Parameter("product", default="ppi", required=False) client = get_ftp_client() From f7c9f0fa0477dd9e351d2244ba886018df12fc42 Mon Sep 17 00:00:00 2001 From: Rodrigo Cunha <66736583+eng-rodrigocunha@users.noreply.github.com> Date: Tue, 26 Sep 2023 10:46:37 -0300 Subject: [PATCH 22/41] hotfix: update `bq_upload` data check (#516) * change agent to test * change data check * update agent * change data check * update log * update agents * enrich logs * enrich logs * update tasks * Revert "change agent to test" This reverts commit 36dacb0559b0ef11fc7687321980fef02072cfff. * Revert "update agent" This reverts commit 741a2a6e75d6fecfaea89ee714c8847b48b9a933. 
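A small sketch of the date filtering added above in `get_files_to_download`: radar file names carry the capture date as the third dash-separated token (e.g. 9915MAC-PPIVol-20230921-142000-54d4.hdf), so `date` selects one day and `greater_than` selects that day onwards. The file names below are illustrative.

files = [
    "9915MAC-PPIVol-20230920-235000-54d4.hdf",
    "9915MAC-PPIVol-20230921-142000-a1b2.hdf",
    "9915MAC-PPIVol-20230922-001000-c3d4.hdf",
]
date = "2023-09-21"
greater_than = "2023-09-21"

# keep only files captured exactly on `date`
by_date = [file for file in files if file.split("-")[2] == date.replace("-", "")]

# keep files captured on or after `greater_than`
by_greater_than = [
    file for file in files if file.split("-")[2] >= greater_than.replace("-", "")
]

print(by_date)          # only the 2023-09-21 file
print(by_greater_than)  # the 2023-09-21 and 2023-09-22 files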
* Revert "update agents" This reverts commit 70f0ed33817afa7f259ea5dafefe47fac7ff466f. * update task bq_upload * update rdo agents for testing * comment update_rdo_redis + limit 10 files * Revert "comment update_rdo_redis + limit 10 files" This reverts commit e899c5c2fe8067beb6abe2e98c6a67637d330ad5. * Revert "update rdo agents for testing" This reverts commit 6cb36cbe8efc536c105ab715b10275ee650d81cb. * remove checking empty file --- pipelines/rj_smtr/tasks.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index de52c03df..5b476e8de 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -502,11 +502,8 @@ def bq_upload( if status["error"] is not None: return status["error"] - if len(status["data"]) == 0: - log("Empty dataframe, skipping upload") - return None - error = None + try: # Upload raw to staging if raw_filepath: @@ -848,7 +845,7 @@ def transform_to_nested_structure( # Check empty dataframe if len(status["data"]) == 0: - log("Empty dataframe, skipping transformation") + log("Empty dataframe, skipping transformation...") return {"data": pd.DataFrame(), "error": status["error"]} try: From 9938bbe02cda4cd3810edc100065e1830bbbce1a Mon Sep 17 00:00:00 2001 From: Fernanda Scovino Date: Tue, 26 Sep 2023 11:45:45 -0300 Subject: [PATCH 23/41] hotfix: desativa flow gps stpl (#518) --- pipelines/rj_smtr/br_rj_riodejaneiro_stpl_gps/flows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_stpl_gps/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_stpl_gps/flows.py index 615b9b11f..7d8cf1574 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_stpl_gps/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_stpl_gps/flows.py @@ -106,5 +106,5 @@ image=emd_constants.DOCKER_IMAGE.value, labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) -# Seguindo o padrão de captura adotado pelo BRT -captura_stpl.schedule = every_minute +# Captura descontinuada (sem dados), avaliar quando voltar +# captura_stpl.schedule = every_minute From 323b505c43894fa79ba9aa4f2e7e5796666152ba Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Tue, 26 Sep 2023 12:12:20 -0300 Subject: [PATCH 24/41] bugfix --- pipelines/rj_escritorio/dump_ftp_inea/flows.py | 2 ++ pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 16 +++++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/flows.py b/pipelines/rj_escritorio/dump_ftp_inea/flows.py index aa2f979b9..0f1d87eef 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/flows.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/flows.py @@ -68,6 +68,8 @@ radar=radar, redis_files=redis_files, datalake_files=datalake_files, + date=date, + greater_than=greater_than, get_only_last_file=get_only_last_file, ) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index f90f23ac4..18f123085 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -5,7 +5,7 @@ # pylint: disable=E0702,E1137,E1136,E1101,W0613,bad-continuation from datetime import datetime, timedelta from pathlib import Path -from typing import List, Tuple +from typing import List from google.cloud import storage from prefect import task @@ -32,7 +32,7 @@ def get_files_datalake( greater_than: str = None, check_datalake_files: bool = True, mode: str = "prod", -) -> Tuple[List[str], str]: +) -> List[str]: """ List files from 
INEA saved on datalake @@ -95,9 +95,11 @@ def get_files_datalake( # Format of the name is 9921GUA-PPIVol-20220930-121010-0004.hdf # We need remove the last characters to stay with 9921GUA-PPIVol-20220930-121010 datalake_files = ["-".join(fname.split("-")[:-1]) for fname in datalake_files] + log(f"Last 5 datalake files: {datalake_files[-5:]}") else: datalake_files = [] + log("This run is not considering datalake files") return datalake_files @@ -119,13 +121,13 @@ def get_ftp_client(wait=None): # pylint: disable=too-many-arguments def get_files_to_download( client, - radar, - redis_files, - datalake_files, + radar: str, + redis_files: list, + datalake_files: list, date: str = None, greater_than: str = None, get_only_last_file: bool = True, -): +) -> List[str]: """ List and get files to download FTP @@ -196,7 +198,7 @@ def get_files_to_download( @task(max_retries=3, retry_delay=timedelta(seconds=30)) -def download_files(client, files, radar): +def download_files(client, files, radar) -> List[str]: """ Download files from FTP """ From 3a406a6412620efe3f56024d99f25b4205e1a253 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Tue, 26 Sep 2023 12:29:47 -0300 Subject: [PATCH 25/41] bugfix --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index 18f123085..921b278f6 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -176,11 +176,16 @@ def get_files_to_download( files = [file for file in files if file not in redis_files] # Check if files are already on datalake + # Some datalake files use the pattern "9915MAC-PPIVol-20230921-123000-0000.hdf" + # Files from FTP use the pattern "./MAC/9915MAC-PPIVol-20230921-123000-3f28.hdf" + # We are going to compare "9915MAC-PPIVol-20230921-123000" from both places if len(datalake_files) > 0: + log("Removing files that are already on datalake") files = [ file for file in files - if "-".join(file.split("-")[:-1]) not in datalake_files + if "-".join(file.split("/")[-1].split("-")[:-1]) + not in ["-".join(dfile.split("-")[:-1]) for dfile in datalake_files] ] # Skip task if there is no new file From 6345a78d5e23215a167fee3de8130b021a2ca571 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 27 Sep 2023 14:31:30 -0300 Subject: [PATCH 26/41] bugfix --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index 921b278f6..691d195f2 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -2,7 +2,7 @@ """ Tasks to dump data from a INEA FTP to BigQuery """ -# pylint: disable=E0702,E1137,E1136,E1101,W0613,bad-continuation +# pylint: disable=E0702,E1137,E1136,E1101,W0613 from datetime import datetime, timedelta from pathlib import Path from typing import List @@ -184,8 +184,7 @@ def get_files_to_download( files = [ file for file in files - if "-".join(file.split("/")[-1].split("-")[:-1]) - not in ["-".join(dfile.split("-")[:-1]) for dfile in datalake_files] + if "-".join(file.split("/")[-1].split("-")[:-1]) not in datalake_files ] # Skip task if there is no new file From 27739dd28d6a3301f9c1d7c727021ff117f1fc26 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 27 Sep 2023 14:58:48 -0300 Subject: [PATCH 27/41] 
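A runnable sketch of the comparison described in the comments above: datalake names are stored without the trailing segment, FTP names come with a directory prefix and a random suffix, and both are reduced to the "9915MAC-PPIVol-20230921-123000" stem before comparing. Values are illustrative.

datalake_files = ["9915MAC-PPIVol-20230921-123000"]  # already stripped of the trailing segment
ftp_files = [
    "./MAC/9915MAC-PPIVol-20230921-123000-3f28.hdf",  # same capture, different suffix
    "./MAC/9915MAC-PPIVol-20230921-124000-a1b2.hdf",  # new capture, not on the datalake yet
]


def normalize(name: str) -> str:
    # drop the directory prefix and the trailing "-<suffix>.hdf" segment
    return "-".join(name.split("/")[-1].split("-")[:-1])


to_download = [file for file in ftp_files if normalize(file) not in datalake_files]
print(to_download)  # only the 12:40:00 file remains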
bugfix --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index 691d195f2..04069ca71 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -95,7 +95,7 @@ def get_files_datalake( # Format of the name is 9921GUA-PPIVol-20220930-121010-0004.hdf # We need remove the last characters to stay with 9921GUA-PPIVol-20220930-121010 datalake_files = ["-".join(fname.split("-")[:-1]) for fname in datalake_files] - log(f"Last 5 datalake files: {datalake_files[-5:]}") + log(f"Last 10 datalake files: {datalake_files[-10:]}") else: datalake_files = [] @@ -165,15 +165,21 @@ def get_files_to_download( # Get specific files based on date and greater_than parameters if date: files = [file for file in files if file.split("-")[2] == date.replace("-", "")] + log(f"Last 10 files on FTP for date {date}: {files[-10:]}") + if greater_than: files = [ file for file in files if file.split("-")[2] >= greater_than.replace("-", "") ] + log( + f"Last 10 files on FTP for date greater than {greater_than}: {files[-10:]}" + ) # Check if files are already on redis files = [file for file in files if file not in redis_files] + log(f"Last 10 files on FTP that are not on redis: {files[-10:]}") # Check if files are already on datalake # Some datalake files use the pattern "9915MAC-PPIVol-20230921-123000-0000.hdf" From 13dd1e71055146aaa1417375201418f917a0f232 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 27 Sep 2023 15:13:55 -0300 Subject: [PATCH 28/41] adding no new files on ftp --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index 04069ca71..e2d7121a8 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -155,6 +155,13 @@ def get_files_to_download( client.connect() files = client.list_files(path=f"./{radar.upper()}/") + + # Skip task if there is no new file on FTP + if len(files) == 0: + log("No new available files on FTP") + skip = Skipped("No new available files on FTP") + raise ENDRUN(state=skip) + # log(f"\n\nAvailable files on FTP: {files}") # log(f"\nFiles already saved on redis_files: {redis_files}") From 010d5cdd3597e3d908b28bde33be9970519bf88c Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 27 Sep 2023 15:40:01 -0300 Subject: [PATCH 29/41] bugfix --- .../rj_escritorio/dump_ftp_inea/tasks.py | 36 +++++++++++++------ 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index e2d7121a8..cdf5b63ef 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -119,9 +119,32 @@ def get_ftp_client(wait=None): @task(max_retries=3, retry_delay=timedelta(seconds=30)) # pylint: disable=too-many-arguments -def get_files_to_download( +def get_files_from_ftp( client, radar: str, +) -> List[str]: + """ + List and get files to download FTP + """ + + client.connect() + files = client.list_files(path=f"./{radar.upper()}/") + + # Skip task if there is no new file on FTP + if len(files) == 0: + log("No new available files on FTP") + skip = Skipped("No new available files on FTP") + raise ENDRUN(state=skip) + + log(f"Last 10 
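The early exit added above relies on Prefect 1's ENDRUN signal carrying a Skipped state, so downstream tasks are skipped instead of failing. A minimal sketch, assuming the usual Prefect 1.x import paths (the imports are not shown in this diff):

from prefect import task
from prefect.engine.runner import ENDRUN
from prefect.engine.state import Skipped


@task
def get_files_from_ftp(files):
    # end this task run (and skip everything downstream) when there is nothing to do
    if len(files) == 0:
        raise ENDRUN(state=Skipped("No new available files on FTP"))
    return files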
files on FTP: {files[-10:]} {len(files)}") + + return files + + +@task(max_retries=3, retry_delay=timedelta(seconds=30)) +# pylint: disable=too-many-arguments +def select_files_to_download( + files: list, redis_files: list, datalake_files: list, date: str = None, @@ -129,7 +152,7 @@ def get_files_to_download( get_only_last_file: bool = True, ) -> List[str]: """ - List and get files to download FTP + Select files to download Args: radar (str): Radar name. Must be `gua` or `mac` @@ -153,15 +176,6 @@ def get_files_to_download( let `greater_than` as None and `get_only_last_file` as False and fill `date` """ - client.connect() - files = client.list_files(path=f"./{radar.upper()}/") - - # Skip task if there is no new file on FTP - if len(files) == 0: - log("No new available files on FTP") - skip = Skipped("No new available files on FTP") - raise ENDRUN(state=skip) - # log(f"\n\nAvailable files on FTP: {files}") # log(f"\nFiles already saved on redis_files: {redis_files}") From 55a238028d645e958b6af0ce14d24441662bceae Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 27 Sep 2023 15:53:01 -0300 Subject: [PATCH 30/41] bugfix --- .../rj_escritorio/dump_ftp_inea/flows.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/flows.py b/pipelines/rj_escritorio/dump_ftp_inea/flows.py index 0f1d87eef..b60de657b 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/flows.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/flows.py @@ -15,8 +15,9 @@ from pipelines.rj_escritorio.dump_ftp_inea.tasks import ( get_ftp_client, get_files_datalake, - get_files_to_download, + get_files_from_ftp, download_files, + select_files_to_download, upload_file_to_gcs, ) from pipelines.rj_escritorio.dump_ftp_inea.schedules import ( @@ -48,8 +49,16 @@ client = get_ftp_client() + files = get_files_from_ftp( + client=client, + radar=radar, + ) + redis_files = get_on_redis( - dataset_id="meio_ambiente_clima", table_id=radar, mode=mode + dataset_id="meio_ambiente_clima", + table_id=radar, + mode=mode, + wait=files, ) datalake_files = get_files_datalake( @@ -61,11 +70,11 @@ greater_than=greater_than, check_datalake_files=check_datalake_files, mode=mode, + wait=files, ) - files_to_download = get_files_to_download( - client=client, - radar=radar, + files_to_download = select_files_to_download( + files=files, redis_files=redis_files, datalake_files=datalake_files, date=date, From 372d1fd2fc6ca7656854005d876b8c02666d686e Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 27 Sep 2023 16:16:53 -0300 Subject: [PATCH 31/41] bugfix --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index cdf5b63ef..faa91bee7 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -32,6 +32,7 @@ def get_files_datalake( greater_than: str = None, check_datalake_files: bool = True, mode: str = "prod", + wait=None, # pylint: disable=unused-argument ) -> List[str]: """ List files from INEA saved on datalake From 81b31999bc62a994011eafff213fb77326f77aa4 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 27 Sep 2023 20:16:19 -0300 Subject: [PATCH 32/41] bugfix --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index 
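In the flow change above, `wait=files` exists only to force execution order: the downstream task ignores the value, but receiving it as an argument makes Prefect run it after the FTP listing. A minimal Prefect 1.x sketch with illustrative task names:

from prefect import Flow, task


@task
def list_ftp_files():
    return ["a.hdf", "b.hdf"]


@task
def get_redis_files(wait=None):  # pylint: disable=unused-argument
    # `wait` is ignored; passing it only creates the dependency edge
    return ["a.hdf"]


with Flow("ordering sketch") as flow:
    files = list_ftp_files()
    redis_files = get_redis_files(wait=files)  # now runs only after list_ftp_files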
faa91bee7..cbdd6864a 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -138,6 +138,7 @@ def get_files_from_ftp( raise ENDRUN(state=skip) log(f"Last 10 files on FTP: {files[-10:]} {len(files)}") + log(f"files on FTP: {files}") return files From e05c30e2eaee0c0a7e590568c868a07a2744e7fd Mon Sep 17 00:00:00 2001 From: d116626 Date: Thu, 28 Sep 2023 15:11:11 -0300 Subject: [PATCH 33/41] chore: change interval for total_contagem --- pipelines/rj_smfp/dump_db_ergon/flows.py | 2 +- pipelines/rj_smfp/dump_db_ergon/schedules.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smfp/dump_db_ergon/flows.py b/pipelines/rj_smfp/dump_db_ergon/flows.py index dbc04cb08..4e0324338 100644 --- a/pipelines/rj_smfp/dump_db_ergon/flows.py +++ b/pipelines/rj_smfp/dump_db_ergon/flows.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -Database dumping flows for segovi project +Database dumping flows for segovi project. """ from copy import deepcopy diff --git a/pipelines/rj_smfp/dump_db_ergon/schedules.py b/pipelines/rj_smfp/dump_db_ergon/schedules.py index d709e913b..cb5b9b2b0 100644 --- a/pipelines/rj_smfp/dump_db_ergon/schedules.py +++ b/pipelines/rj_smfp/dump_db_ergon/schedules.py @@ -206,6 +206,7 @@ TOTAL_ANOS,DATA_PROXIMO,NOME_PROXIMO,EMP_CODIGO FROM ERGON.TOTAL_CONTA """, + "interval": timedelta(days=15), }, "pre_contagem": { "materialize_after_dump": True, From 1ef01d70161c033498c1b483191dd2beab44ca45 Mon Sep 17 00:00:00 2001 From: Fernanda Scovino Date: Fri, 29 Sep 2023 17:05:34 -0300 Subject: [PATCH 34/41] =?UTF-8?q?Unifica=20tasks=20de=20parti=C3=A7=C3=A3o?= =?UTF-8?q?=20de=20data=20e=20hora=20(#517)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * remove task de particao nao usada * unifica tasks de particao de data e hora * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * corrige condicional --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- pipelines/rj_smtr/constants.py | 9 +++--- pipelines/rj_smtr/flows.py | 14 +++------- pipelines/rj_smtr/tasks.py | 45 +++++------------------------- pipelines/rj_smtr/veiculo/flows.py | 6 ++-- 4 files changed, 18 insertions(+), 56 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 7133b8abe..93303e5b7 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -181,7 +181,6 @@ class constants(Enum): # pylint: disable=c0103 data_processamento """, "primary_key": ["id"], # id column to nest data on - "flag_date_partition": False, }, ] BILHETAGEM_TABLES_PARAMS = [ @@ -199,7 +198,7 @@ class constants(Enum): # pylint: disable=c0103 DT_INCLUSAO """, "primary_key": ["CD_LINHA"], # id column to nest data on - "flag_date_partition": True, + "partition_date_only": True, }, { "table_id": "grupo", @@ -215,7 +214,7 @@ class constants(Enum): # pylint: disable=c0103 DT_INCLUSAO """, "primary_key": ["CD_GRUPO"], - "flag_date_partition": True, + "partition_date_only": True, }, { "table_id": "grupo_linha", @@ -231,7 +230,7 @@ class constants(Enum): # pylint: disable=c0103 DT_INCLUSAO """, "primary_key": ["CD_GRUPO", "CD_LINHA"], # id column to nest data on - "flag_date_partition": True, + "partition_date_only": True, }, { "table_id": "matriz_integracao", @@ -250,7 +249,7 @@ class constants(Enum): # 
pylint: disable=c0103 "cd_versao_matriz", "cd_integracao", ], # id column to nest data on - "flag_date_partition": True, + "partition_date_only": True, }, ] BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index f1d29ed10..87d506813 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -5,8 +5,7 @@ from prefect.run_configs import KubernetesRun from prefect.storage import GCS -from prefect import case, Parameter -from prefect.tasks.control_flow import merge +from prefect import Parameter # EMD Imports # @@ -19,7 +18,6 @@ # SMTR Imports # from pipelines.rj_smtr.tasks import ( - create_date_partition, create_date_hour_partition, create_local_partition_path, get_current_timestamp, @@ -66,13 +64,9 @@ dataset_id=dataset_id, ) - with case(table_params["flag_date_partition"], True): - date_partitions = create_date_partition(timestamp) - - with case(table_params["flag_date_partition"], False): - date_hour_partitions = create_date_hour_partition(timestamp) - - partitions = merge(date_partitions, date_hour_partitions) + partitions = create_date_hour_partition( + timestamp, partition_date_only=table_params["partition_date_only"] + ) filename = parse_timestamp_to_string(timestamp) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 5b476e8de..e8b239957 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -158,19 +158,16 @@ def get_current_timestamp(timestamp=None, truncate_minute: bool = True) -> datet @task -def create_date_hour_partition(timestamp: datetime) -> str: - """ - Get date hour Hive partition structure from timestamp. - """ - return f"data={timestamp.strftime('%Y-%m-%d')}/hora={timestamp.strftime('%H')}" - - -@task -def create_date_partition(timestamp: datetime) -> str: +def create_date_hour_partition( + timestamp: datetime, partition_date_only: bool = False +) -> str: """ Get date hour Hive partition structure from timestamp. """ - return f"data={timestamp.date()}" + partition = f"data={timestamp.strftime('%Y-%m-%d')}" + if not partition_date_only: + partition += f"/hora={timestamp.strftime('%H')}" + return partition @task @@ -181,34 +178,6 @@ def parse_timestamp_to_string(timestamp: datetime, pattern="%Y-%m-%d-%H-%M-%S") return timestamp.strftime(pattern) -@task -def create_current_date_hour_partition(capture_time=None): - """Create partitioned directory structure to save data locally based - on capture time. 
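The unified partition helper above boils down to a pure string builder; a quick sketch of its output (same body as the task, shown here without the @task decorator):

from datetime import datetime


def create_date_hour_partition(timestamp, partition_date_only=False):
    partition = f"data={timestamp.strftime('%Y-%m-%d')}"
    if not partition_date_only:
        partition += f"/hora={timestamp.strftime('%H')}"
    return partition


ts = datetime(2023, 9, 29, 17, 5)
print(create_date_hour_partition(ts))                            # data=2023-09-29/hora=17
print(create_date_hour_partition(ts, partition_date_only=True))  # data=2023-09-29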
- - Args: - capture_time(pendulum.datetime.DateTime, optional): - if recapturing data, will create partitions based - on the failed timestamps being recaptured - - Returns: - dict: "filename" contains the name which to upload the csv, "partitions" contains - the partitioned directory path - """ - if capture_time is None: - capture_time = datetime.now(tz=constants.TIMEZONE.value).replace( - minute=0, second=0, microsecond=0 - ) - date = capture_time.strftime("%Y-%m-%d") - hour = capture_time.strftime("%H") - - return { - "filename": capture_time.strftime("%Y-%m-%d-%H-%M-%S"), - "partitions": f"data={date}/hora={hour}", - "timestamp": capture_time, - } - - @task def create_local_partition_path( dataset_id: str, table_id: str, filename: str, partitions: str = None diff --git a/pipelines/rj_smtr/veiculo/flows.py b/pipelines/rj_smtr/veiculo/flows.py index 28188a129..e1fab515e 100644 --- a/pipelines/rj_smtr/veiculo/flows.py +++ b/pipelines/rj_smtr/veiculo/flows.py @@ -30,7 +30,7 @@ every_day_hour_seven, ) from pipelines.rj_smtr.tasks import ( - create_date_partition, + create_date_hour_partition, create_local_partition_path, get_current_timestamp, get_raw, @@ -71,7 +71,7 @@ ) # SETUP # - partitions = create_date_partition(timestamp) + partitions = create_date_hour_partition(timestamp, partition_date_only=True) filename = parse_timestamp_to_string(timestamp) @@ -140,7 +140,7 @@ ) # SETUP # - partitions = create_date_partition(timestamp) + partitions = create_date_hour_partition(timestamp, partition_date_only=True) filename = parse_timestamp_to_string(timestamp) From c16dc74e78fc4b33c71469ba654ecc0cfaaf19cc Mon Sep 17 00:00:00 2001 From: Rafael Carvalho Pinheiro <74972217+pixuimpou@users.noreply.github.com> Date: Fri, 29 Sep 2023 17:46:08 -0300 Subject: [PATCH 35/41] =?UTF-8?q?Corrige=20par=C3=A2metro=20de=20`partitio?= =?UTF-8?q?n=5Fdate=5Fonly`=20no=20flow=20de=20bilhetagem=20(#521)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 93303e5b7..c9f18f2fd 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -181,6 +181,7 @@ class constants(Enum): # pylint: disable=c0103 data_processamento """, "primary_key": ["id"], # id column to nest data on + "partition_date_only": False, }, ] BILHETAGEM_TABLES_PARAMS = [ From f1fc682256464af418803f5853ab73019fe716c0 Mon Sep 17 00:00:00 2001 From: Rafael Carvalho Pinheiro <74972217+pixuimpou@users.noreply.github.com> Date: Mon, 2 Oct 2023 12:12:07 -0300 Subject: [PATCH 36/41] =?UTF-8?q?Altera=20flow=20de=20captura=20gen=C3=A9r?= =?UTF-8?q?ica=20(#520)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * remove task de particao nao usada * unifica tasks de particao de data e hora * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * corrige condicional * change capture flow * change generic capture flow * atualiza esquema do flow padrao * change default capture flow structure * change generic capture flow * adjust constant structure * change bilhetagem to new capture flow structure * fix get_storage_blob function * fix get_storage_blob call * organize constants order * fix get_raw_from_sources function call * change transform_raw_to_json to read_raw_data * transform transform_raw_data_to_json to read_raw_data * fix nout task parameter * fix 
timedelta instantiation * set upstream tasks * declare raw_filepath * update docstrings * adjust get_raw_from_sources return * fix errors * change agent label to dev * refactore source values * update constants * update agent * update schedule params * update interval * fix get_datetime_range interval * remove order by from queries * fix get_raw_data_api * change json read function * update read_raw_data * update save_raw_local_func * log error * change raw api extraction for json * change read json function * print log traceback * skip pre treatment if empty df * skip save staging if dataframe is empty / save raw * remove skip upload if empty dataframe * update docstring and returned values * reorganize task order * fix tuple * change zip logic * remove skip * create gtfs zip constant * add gtfs zip file name * add csv to save raw / change filetype logic * remove comments * fix csv_args default value * change docstring get raw api * change raw data gcs docstring * remove commented task * change quadro primary key to list * update GTFS constants * change upload folder structure * undo silenciamento de falha de notificação * remove parametros de testes (gtfs) * Update pipelines/rj_smtr/constants.py Co-authored-by: Fernanda Scovino * corrige encadeamento de erros no flow * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove header treatment * mudar agent dev para prd * mudar agent de dev para prd * ajustar retorno das funcoes * Atualiza documentação * adicionar retorno em get_upload_storage_blob * Atualiza documentação * Atualiza string --------- Co-authored-by: fernandascovino Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: eng-rodrigocunha Co-authored-by: Carolina Gomes Co-authored-by: Rodrigo Cunha <66736583+eng-rodrigocunha@users.noreply.github.com> --- .../schedules.py | 22 +- pipelines/rj_smtr/constants.py | 150 ++++--- pipelines/rj_smtr/flows.py | 94 ++-- pipelines/rj_smtr/tasks.py | 423 ++++++++++++------ pipelines/rj_smtr/utils.py | 338 +++++++++++++- pipelines/utils/utils.py | 12 +- 6 files changed, 781 insertions(+), 258 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index 38fca85a9..2f7804811 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -16,27 +16,37 @@ ) bilhetagem_principal_clocks = generate_execute_schedules( - interval=timedelta(days=1), + clock_interval=timedelta( + **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["principal_run_interval"] + ), labels=[ emd_constants.RJ_SMTR_AGENT_LABEL.value, ], - table_parameters=constants.BILHETAGEM_TABLES_PARAMS.value, + table_parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, - runs_interval_minutes=15, + source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], + runs_interval_minutes=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value[ + "principal_runs_interval_minutes" + ], ) bilhetagem_principal_schedule = Schedule(clocks=untuple(bilhetagem_principal_clocks)) bilhetagem_transacao_clocks = generate_execute_schedules( - interval=timedelta(minutes=1), + clock_interval=timedelta( + 
**constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["transacao_run_interval"] + ), labels=[ emd_constants.RJ_SMTR_AGENT_LABEL.value, ], - table_parameters=constants.BILHETAGEM_TRANSACAO_TABLE_PARAMS.value, + table_parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, - runs_interval_minutes=0, + source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], + runs_interval_minutes=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value[ + "transacao_runs_interval_minutes" + ], ) bilhetagem_transacao_schedule = Schedule(clocks=untuple(bilhetagem_transacao_clocks)) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index c9f18f2fd..52e30d9f8 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -165,9 +165,34 @@ class constants(Enum): # pylint: disable=c0103 # BILHETAGEM BILHETAGEM_DATASET_ID = "br_rj_riodejaneiro_bilhetagem" - BILHETAGEM_TRANSACAO_TABLE_PARAMS = [ - { - "table_id": "transacao", + + BILHETAGEM_GENERAL_CAPTURE_PARAMS = { + "databases": { + "principal_db": { + "engine": "mysql", + "host": "principal-database-replica.internal", + }, + "tarifa_db": { + "engine": "postgres", + "host": "tarifa-database-replica.internal", + }, + "transacao_db": { + "engine": "postgres", + "host": "transacao-database-replica.internal", + }, + }, + "vpn_url": "http://vpn-jae.mobilidade.rio/", + "source_type": "api-json", + "transacao_run_interval": {"minutes": 1}, + "principal_run_interval": {"days": 1}, + "transacao_runs_interval_minutes": 0, + "principal_runs_interval_minutes": 5, + } + + BILHETAGEM_TRANSACAO_CAPTURE_PARAMS = { + "table_id": "transacao", + "partition_date_only": False, + "extract_params": { "database": "transacao_db", "query": """ SELECT @@ -177,80 +202,91 @@ class constants(Enum): # pylint: disable=c0103 WHERE data_processamento BETWEEN '{start}' AND '{end}' - ORDER BY - data_processamento """, - "primary_key": ["id"], # id column to nest data on - "partition_date_only": False, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS["transacao_run_interval"], }, - ] - BILHETAGEM_TABLES_PARAMS = [ + "primary_key": ["id"], # id column to nest data on + } + + BILHETAGEM_CAPTURE_PARAMS = [ { "table_id": "linha", - "database": "principal_db", - "query": """ - SELECT - * - FROM - LINHA - WHERE - DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO - """, - "primary_key": ["CD_LINHA"], # id column to nest data on "partition_date_only": True, + "extract_params": { + "database": "principal_db", + "query": """ + SELECT + * + FROM + LINHA + WHERE + DT_INCLUSAO >= '{start}' + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], + }, + "primary_key": ["CD_LINHA"], # id column to nest data on }, { "table_id": "grupo", - "database": "principal_db", - "query": """ - SELECT - * - FROM - GRUPO - WHERE - DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO - """, - "primary_key": ["CD_GRUPO"], "partition_date_only": True, + "extract_params": { + "database": "principal_db", + "query": """ + SELECT + * + FROM + GRUPO + WHERE + DT_INCLUSAO >= '{start}' + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], + }, + "primary_key": ["CD_GRUPO"], # id column to nest data on }, { "table_id": "grupo_linha", - "database": "principal_db", - "query": """ - SELECT - * - FROM - GRUPO_LINHA - WHERE - DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO - """, - "primary_key": 
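A sketch of how the `{start}`/`{end}` placeholders in the capture queries above get filled: the run interval dict becomes a timedelta and the query is formatted with a UTC datetime range. The `get_datetime_range` stand-in below assumes the utils helper keeps the logic of the removed task (UTC strings, window of one run interval) while receiving a timedelta; timestamp and interval values are illustrative.

from datetime import datetime, timedelta

import pytz


def get_datetime_range(timestamp, interval):
    # assumed to mirror the removed task: UTC-formatted start/end of the capture window
    start = (timestamp - interval).astimezone(pytz.utc).strftime("%Y-%m-%d %H:%M:%S")
    end = timestamp.astimezone(pytz.utc).strftime("%Y-%m-%d %H:%M:%S")
    return {"start": start, "end": end}


query = """
    SELECT *
    FROM transacao
    WHERE data_processamento BETWEEN '{start}' AND '{end}'
"""

timestamp = pytz.timezone("America/Sao_Paulo").localize(datetime(2023, 9, 29, 17, 0))
run_interval = {"minutes": 1}
print(query.format(**get_datetime_range(timestamp, timedelta(**run_interval))))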
["CD_GRUPO", "CD_LINHA"], # id column to nest data on "partition_date_only": True, + "extract_params": { + "database": "principal_db", + "query": """ + SELECT + * + FROM + GRUPO_LINHA + WHERE + DT_INCLUSAO >= '{start}' + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], + }, + "primary_key": ["CD_GRUPO", "CD_LINHA"], # id column to nest data on }, { "table_id": "matriz_integracao", - "database": "tarifa_db", - "query": """ - SELECT - * - FROM - matriz_integracao - WHERE - dt_inclusao >= '{start}' - ORDER BY - dt_inclusao - """, + "partition_date_only": True, + "extract_params": { + "database": "tarifa_db", + "query": """ + SELECT + * + FROM + matriz_integracao + WHERE + dt_inclusao >= '{start}' + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], + }, "primary_key": [ "cd_versao_matriz", "cd_integracao", ], # id column to nest data on - "partition_date_only": True, }, ] BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 87d506813..4860c6d07 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -21,18 +21,12 @@ create_date_hour_partition, create_local_partition_path, get_current_timestamp, - get_raw, parse_timestamp_to_string, - save_raw_local, - save_treated_local, - upload_logs_to_bq, - bq_upload, - transform_to_nested_structure, -) - -from pipelines.rj_smtr.tasks import ( + upload_raw_data_to_gcs, + upload_staging_data_to_gcs, + transform_raw_to_nested_structure, + get_raw_from_sources, create_request_params, - get_datetime_range, ) @@ -40,75 +34,79 @@ "SMTR: Captura", code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as default_capture_flow: - # SETUP # + # Configuração # - table_params = Parameter("table_params", default=None) - timestamp_param = Parameter("timestamp", default=None) - interval = Parameter("interval", default=None) + table_id = Parameter("table_id", default=None) + partition_date_only = Parameter("partition_date_only", default=None) + extract_params = Parameter("extract_params", default=None) dataset_id = Parameter("dataset_id", default=None) secret_path = Parameter("secret_path", default=None) + primary_key = Parameter("primary_key", default=None) + source_type = Parameter("source_type", default=None) - timestamp = get_current_timestamp(timestamp_param) - - datetime_range = get_datetime_range(timestamp, interval=interval) + timestamp = get_current_timestamp() rename_flow_run = rename_current_flow_run_now_time( - prefix=default_capture_flow.name + " " + table_params["table_id"] + ": ", + prefix=default_capture_flow.name + " " + table_id + ": ", now_time=timestamp, ) - request_params, request_url = create_request_params( - datetime_range=datetime_range, - table_params=table_params, - secret_path=secret_path, - dataset_id=dataset_id, - ) - partitions = create_date_hour_partition( - timestamp, partition_date_only=table_params["partition_date_only"] + timestamp, partition_date_only=partition_date_only ) filename = parse_timestamp_to_string(timestamp) filepath = create_local_partition_path( dataset_id=dataset_id, - table_id=table_params["table_id"], + table_id=table_id, filename=filename, partitions=partitions, ) - raw_status = get_raw( - url=request_url, - headers=secret_path, - params=request_params, + # Extração # + request_params, request_path = create_request_params( + dataset_id=dataset_id, + extract_params=extract_params, + table_id=table_id, + timestamp=timestamp, ) - raw_filepath = 
save_raw_local(status=raw_status, file_path=filepath) + error, raw_filepath = get_raw_from_sources( + source_type=source_type, + local_filepath=filepath, + source_path=request_path, + dataset_id=dataset_id, + table_id=table_id, + secret_path=secret_path, + request_params=request_params, + ) - # TREAT & CLEAN # - treated_status = transform_to_nested_structure( - status=raw_status, - timestamp=timestamp, - primary_key=table_params["primary_key"], + error = upload_raw_data_to_gcs( + error=error, + raw_filepath=raw_filepath, + table_id=table_id, + dataset_id=dataset_id, + partitions=partitions, ) - treated_filepath = save_treated_local(status=treated_status, file_path=filepath) + # Pré-tratamento # - # LOAD # - error = bq_upload( - dataset_id=dataset_id, - table_id=table_params["table_id"], - filepath=treated_filepath, + error, staging_filepath = transform_raw_to_nested_structure( raw_filepath=raw_filepath, - partitions=partitions, - status=treated_status, + filepath=filepath, + error=error, + timestamp=timestamp, + primary_key=primary_key, ) - upload_logs_to_bq( - dataset_id=dataset_id, - parent_table_id=table_params["table_id"], + STAGING_UPLOADED = upload_staging_data_to_gcs( error=error, + staging_filepath=staging_filepath, timestamp=timestamp, + table_id=table_id, + dataset_id=dataset_id, + partitions=partitions, ) default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index e8b239957..a846851b5 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -8,7 +8,7 @@ import os from pathlib import Path import traceback -from typing import Dict, List +from typing import Dict, List, Union import io from basedosdados import Storage, Table @@ -28,6 +28,13 @@ get_last_run_timestamp, log_critical, data_info_str, + get_raw_data_api, + get_raw_data_gcs, + upload_run_logs_to_bq, + get_datetime_range, + read_raw_data, + save_treated_local_func, + save_raw_local_func, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -162,7 +169,14 @@ def create_date_hour_partition( timestamp: datetime, partition_date_only: bool = False ) -> str: """ - Get date hour Hive partition structure from timestamp. + Create a date (and hour) Hive partition structure from timestamp. + + Args: + timestamp (datetime): timestamp to be used as reference + partition_date_only (bool, optional): whether to add hour partition or not + + Returns: + str: partition string """ partition = f"data={timestamp.strftime('%Y-%m-%d')}" if not partition_date_only: @@ -417,15 +431,123 @@ def get_raw( # pylint: disable=R0912 "Unsupported raw file extension. 
Supported only: json, csv and txt" ) - except Exception as exp: - error = exp - - if error is not None: + except Exception: + error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") return {"data": data, "error": error} +@task(checkpoint=False, nout=2) +def create_request_params( + extract_params: dict, + table_id: str, + dataset_id: str, + timestamp: datetime, +) -> tuple[str, str]: + """ + Task to create request params + + Args: + extract_params (dict): extract parameters + table_id (str): table_id on BigQuery + dataset_id (str): dataset_id on BigQuery + timestamp (datetime): timestamp for flow run + + Returns: + request_params: host, database and query to request data + request_url: url to request data + """ + request_params = None + request_url = None + + if dataset_id == constants.BILHETAGEM_DATASET_ID.value: + database = constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["databases"][ + extract_params["database"] + ] + request_url = ( + constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["vpn_url"] + + database["engine"] + ) + + datetime_range = get_datetime_range( + timestamp=timestamp, interval=timedelta(**extract_params["run_interval"]) + ) + + request_params = { + "host": database["host"], # TODO: exibir no log em ambiente fechado + "database": extract_params["database"], + "query": extract_params["query"].format(**datetime_range), + } + + return request_params, request_url + + +@task(checkpoint=False, nout=2) +def get_raw_from_sources( + source_type: str, + local_filepath: str, + source_path: str = None, + dataset_id: str = None, + table_id: str = None, + secret_path: str = None, + request_params: dict = None, +) -> tuple[str, str]: + """ + Task to get raw data from sources + + Args: + source_type (str): source type + local_filepath (str): local filepath + source_path (str, optional): source path. Defaults to None. + dataset_id (str, optional): dataset_id on BigQuery. Defaults to None. + table_id (str, optional): table_id on BigQuery. Defaults to None. + secret_path (str, optional): secret path. Defaults to None. + request_params (dict, optional): request parameters. Defaults to None. 
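The `source_type` parsing in `get_raw_from_sources` above packs the source and the raw filetype into one string ("api-json", plain "gcs"); a standalone sketch of the same split:

def split_source_type(source_type):
    # "api-json" -> ("api", "json"); "gcs" -> ("gcs", None)
    source_values = source_type.split("-", 1)
    if len(source_values) == 2:
        return tuple(source_values)
    return source_values[0], None


print(split_source_type("api-json"))  # ('api', 'json')
print(split_source_type("gcs"))       # ('gcs', None)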
+ + Returns: + error: error catched from upstream tasks + filepath: filepath to raw data + """ + error = None + filepath = None + data = None + + source_values = source_type.split("-", 1) + + source_type, filetype = ( + source_values if len(source_values) == 2 else (source_values[0], None) + ) + + log(f"Getting raw data from source type: {source_type}") + + try: + if source_type == "api": + error, data, filetype = get_raw_data_api( + url=source_path, + secret_path=secret_path, + api_params=request_params, + filetype=filetype, + ) + elif source_type == "gcs": + error, data, filetype = get_raw_data_gcs( + dataset_id=dataset_id, table_id=table_id, zip_filename=request_params + ) + else: + raise NotImplementedError(f"{source_type} not supported") + + filepath = save_raw_local_func( + data=data, filepath=local_filepath, filetype=filetype + ) + + except NotImplementedError: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + log(f"Raw extraction ended returned values: {error}, {filepath}") + return error, filepath + + ############### # # Load data @@ -599,6 +721,101 @@ def upload_logs_to_bq( # pylint: disable=R0913 raise Exception(f"Pipeline failed with error: {error}") +@task +def upload_raw_data_to_gcs( + error: str, + raw_filepath: str, + table_id: str, + dataset_id: str, + partitions: list, +) -> Union[str, None]: + """ + Upload raw data to GCS. + + Args: + error (str): Error catched from upstream tasks. + raw_filepath (str): Path to the saved raw .json file + table_id (str): table_id on BigQuery + dataset_id (str): dataset_id on BigQuery + partitions (list): list of partition strings + + Returns: + Union[str, None]: if there is an error returns it traceback, otherwise returns None + """ + if error is None: + try: + st_obj = Storage(table_id=table_id, dataset_id=dataset_id) + log( + f"""Uploading raw file to bucket {st_obj.bucket_name} at + {st_obj.bucket_name}/{dataset_id}/{table_id}""" + ) + st_obj.upload( + path=raw_filepath, + partitions=partitions, + mode="raw", + if_exists="replace", + ) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error + + +@task +def upload_staging_data_to_gcs( + error: str, + staging_filepath: str, + timestamp: datetime, + table_id: str, + dataset_id: str, + partitions: list, +) -> Union[str, None]: + """ + Upload staging data to GCS. + + Args: + error (str): Error catched from upstream tasks. + staging_filepath (str): Path to the saved treated .csv file. + timestamp (datetime): timestamp for flow run. + table_id (str): table_id on BigQuery. + dataset_id (str): dataset_id on BigQuery. + partitions (list): list of partition strings. 
+ + Returns: + Union[str, None]: if there is an error returns it traceback, otherwise returns None + """ + if error is None: + try: + # Creates and publish table if it does not exist, append to it otherwise + create_or_append_table( + dataset_id=dataset_id, + table_id=table_id, + path=staging_filepath, + partitions=partitions, + ) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + upload_run_logs_to_bq( + dataset_id=dataset_id, + parent_table_id=table_id, + error=error, + timestamp=timestamp, + mode="staging", + ) + + return error + + +############### +# +# Daterange tasks +# +############### + + @task( checkpoint=False, max_retries=constants.MAX_RETRIES.value, @@ -789,140 +1006,92 @@ def get_previous_date(days): return now.to_date_string() -@task -def transform_to_nested_structure( - status: dict, timestamp: datetime, primary_key: list = None -): - """Transform dataframe to nested structure - - Args: - status (dict): Must contain keys - * `data`: dataframe returned from treatement - * `error`: error catched from data treatement - timestamp (datetime): timestamp of the capture - primary_key (list, optional): List of primary keys to be used for nesting. - - Returns: - dict: Conatining keys - * `data` (json): nested data - * `error` (str): catched error, if any. Otherwise, returns None - """ - - # Check previous error - if status["error"] is not None: - return {"data": pd.DataFrame(), "error": status["error"]} - - # Check empty dataframe - if len(status["data"]) == 0: - log("Empty dataframe, skipping transformation...") - return {"data": pd.DataFrame(), "error": status["error"]} - - try: - if primary_key is None: - primary_key = [] - - error = None - data = pd.DataFrame(status["data"]) - - log( - f""" - Received inputs: - - timestamp:\n{timestamp} - - data:\n{data.head()}""" - ) - - log(f"Raw data:\n{data_info_str(data)}", level="info") - - log("Adding captured timestamp column...", level="info") - data["timestamp_captura"] = timestamp - - log("Striping string columns...", level="info") - for col in data.columns[data.dtypes == "object"].to_list(): - data[col] = data[col].str.strip() - - log(f"Finished cleaning! Data:\n{data_info_str(data)}", level="info") - - log("Creating nested structure...", level="info") - pk_cols = primary_key + ["timestamp_captura"] - data = ( - data.groupby(pk_cols) - .apply( - lambda x: x[data.columns.difference(pk_cols)].to_json(orient="records") - ) - .str.strip("[]") - .reset_index(name="content")[primary_key + ["content", "timestamp_captura"]] - ) - - log( - f"Finished nested structure! 
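The upload tasks above follow the error-handling convention used throughout this refactor: each step returns a traceback string (or None) instead of raising, and the next step only does real work while the error is still None. A generic sketch of that contract (the failing division is purely illustrative):

import traceback


def extract():
    error, data = None, None
    try:
        data = 1 / 0  # illustrative failure
    except Exception:  # pylint: disable=broad-except
        error = traceback.format_exc()
    return error, data


def upload(error, data):
    if error is None:
        print(f"uploading {data}")
    return error


error, data = extract()
error = upload(error, data)
print("error is carried forward, not raised:", error is not None)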
Data:\n{data_info_str(data)}", - level="info", - ) - - except Exception as exp: # pylint: disable=W0703 - error = exp - - if error is not None: - log(f"[CATCHED] Task failed with error: \n{error}", level="error") - - return {"data": data, "error": error} +############### +# +# Pretreat data +# +############### -@task(checkpoint=False) -def get_datetime_range( +@task(nout=2) +def transform_raw_to_nested_structure( + raw_filepath: str, + filepath: str, + error: str, timestamp: datetime, - interval: int, -) -> dict: + primary_key: list = None, +) -> tuple[str, str]: """ - Task to get datetime range in UTC + Task to transform raw data to nested structure Args: - timestamp (datetime): timestamp to get datetime range - interval (int): interval in seconds + raw_filepath (str): Path to the saved raw .json file + filepath (str): Path to the saved treated .csv file + error (str): Error catched from upstream tasks + timestamp (datetime): timestamp for flow run + primary_key (list, optional): Primary key to be used on nested structure Returns: - dict: datetime range - """ - - start = ( - (timestamp - timedelta(seconds=interval)) - .astimezone(tz=timezone("UTC")) - .strftime("%Y-%m-%d %H:%M:%S") - ) - - end = timestamp.astimezone(tz=timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") - - return {"start": start, "end": end} - - -@task(checkpoint=False, nout=2) -def create_request_params( - datetime_range: dict, table_params: dict, secret_path: str, dataset_id: str -) -> tuple: + str: Error traceback + str: Path to the saved treated .csv file """ - Task to create request params + if error is None: + try: + # leitura do dado raw + error, data = read_raw_data(filepath=raw_filepath) - Args: - datetime_range (dict): datetime range to get params - table_params (dict): table params to get params - secret_path (str): secret path to get params - dataset_id (str): dataset id to get params + if primary_key is None: + primary_key = [] - Returns: - request_params: host, database and query to request data - request_url: url to request data - """ + log( + f""" + Received inputs: + - timestamp:\n{timestamp} + - data:\n{data.head()}""" + ) - if dataset_id == constants.BILHETAGEM_DATASET_ID.value: - secrets = get_vault_secret(secret_path)["data"] + # Check empty dataframe + if data.empty: + log("Empty dataframe, skipping transformation...") + else: + log(f"Raw data:\n{data_info_str(data)}", level="info") + + log("Adding captured timestamp column...", level="info") + data["timestamp_captura"] = timestamp + + log("Striping string columns...", level="info") + for col in data.columns[data.dtypes == "object"].to_list(): + data[col] = data[col].str.strip() + + log(f"Finished cleaning! Data:\n{data_info_str(data)}", level="info") + + log("Creating nested structure...", level="info") + pk_cols = primary_key + ["timestamp_captura"] + data = ( + data.groupby(pk_cols) + .apply( + lambda x: x[data.columns.difference(pk_cols)].to_json( + orient="records" + ) + ) + .str.strip("[]") + .reset_index(name="content")[ + primary_key + ["content", "timestamp_captura"] + ] + ) - database_secrets = secrets["databases"][table_params["database"]] + log( + f"Finished nested structure! 
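A runnable sketch of the nesting step above, with illustrative data: rows are grouped by the primary key plus `timestamp_captura`, and the remaining columns are serialized into a JSON `content` column.

import pandas as pd

data = pd.DataFrame(
    {
        "id": [1, 1, 2],
        "timestamp_captura": ["2023-09-29 17:00:00"] * 3,
        "valor": [10, 20, 30],
        "status": ["ok", "ok", "erro"],
    }
)

primary_key = ["id"]
pk_cols = primary_key + ["timestamp_captura"]
nested = (
    data.groupby(pk_cols)
    .apply(lambda x: x[data.columns.difference(pk_cols)].to_json(orient="records"))
    .str.strip("[]")
    .reset_index(name="content")[primary_key + ["content", "timestamp_captura"]]
)
print(nested)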
Data:\n{data_info_str(data)}", + level="info", + ) - request_url = secrets["vpn_url"] + database_secrets["engine"] + # save treated local + filepath = save_treated_local_func( + data=data, error=error, filepath=filepath + ) - request_params = { - "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado - "database": table_params["database"], - "query": table_params["query"].format(**datetime_range), - } + except Exception: # pylint: disable=W0703 + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return request_params, request_url + return error, filepath diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 9ddf7d687..1d71dd3dd 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -8,12 +8,18 @@ from pathlib import Path from datetime import timedelta, datetime -from typing import List +from typing import List, Union +import traceback import io +import json +import zipfile +import pytz +import requests import basedosdados as bd from basedosdados import Table import pandas as pd -import pytz +from google.cloud.storage.blob import Blob + from prefect.schedules.clocks import IntervalClock @@ -398,46 +404,41 @@ def data_info_str(data: pd.DataFrame): def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-locals - interval: timedelta, + clock_interval: timedelta, labels: List[str], - table_parameters: list, - dataset_id: str, - secret_path: str, + table_parameters: Union[list[dict], dict], runs_interval_minutes: int = 15, start_date: datetime = datetime( 2020, 1, 1, tzinfo=pytz.timezone(emd_constants.DEFAULT_TIMEZONE.value) ), + **general_flow_params, ) -> List[IntervalClock]: """ Generates multiple schedules Args: - interval (timedelta): The interval to run the schedule + clock_interval (timedelta): The interval to run the schedule labels (List[str]): The labels to be added to the schedule - table_parameters (list): The table parameters - dataset_id (str): The dataset_id to be used in the schedule - secret_path (str): The secret path to be used in the schedule + table_parameters (list): The table parameters to iterate over runs_interval_minutes (int, optional): The interval between each schedule. Defaults to 15. start_date (datetime, optional): The start date of the schedule. Defaults to datetime(2020, 1, 1, tzinfo=pytz.timezone(emd_constants.DEFAULT_TIMEZONE.value)). - + general_flow_params: Any param that you want to pass to the flow Returns: List[IntervalClock]: The list of schedules """ + if isinstance(table_parameters, dict): + table_parameters = [table_parameters] clocks = [] for count, parameters in enumerate(table_parameters): - parameter_defaults = { - "table_params": parameters, - "dataset_id": dataset_id, - "secret_path": secret_path, - "interval": interval.total_seconds(), - } + parameter_defaults = parameters | general_flow_params + log(f"parameter_defaults: {parameter_defaults}") clocks.append( IntervalClock( - interval=interval, + interval=clock_interval, start_date=start_date + timedelta(minutes=runs_interval_minutes * count), labels=labels, @@ -445,3 +446,304 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l ) ) return clocks + + +def save_raw_local_func( + data: Union[dict, str], filepath: str, mode: str = "raw", filetype: str = "json" +) -> str: + """ + Saves json response from API to .json file. 
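A condensed sketch of what the reworked `generate_execute_schedules` above produces: one IntervalClock per table, all sharing the same interval and labels, each with a start_date staggered by `runs_interval_minutes` and with `parameter_defaults` merged from the table dict and the shared flow params. The label value below is an illustrative placeholder.

from datetime import datetime, timedelta

import pytz
from prefect.schedules.clocks import IntervalClock

table_parameters = [{"table_id": "linha"}, {"table_id": "grupo"}]
general_flow_params = {
    "dataset_id": "br_rj_riodejaneiro_bilhetagem",
    "source_type": "api-json",
}
start_date = datetime(2020, 1, 1, tzinfo=pytz.timezone("America/Sao_Paulo"))
runs_interval_minutes = 5

clocks = [
    IntervalClock(
        interval=timedelta(days=1),
        start_date=start_date + timedelta(minutes=runs_interval_minutes * count),
        labels=["illustrative-agent-label"],
        parameter_defaults=parameters | general_flow_params,
    )
    for count, parameters in enumerate(table_parameters)
]
print(len(clocks))  # one staggered clock per table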
+ Args: + filepath (str): Path which to save raw file + status (dict): Must contain keys + * data: json returned from API + * error: error catched from API request + mode (str, optional): Folder to save locally, later folder which to upload to GCS. + Returns: + str: Path to the saved file + """ + + # diferentes tipos de arquivos para salvar + _filepath = filepath.format(mode=mode, filetype=filetype) + Path(_filepath).parent.mkdir(parents=True, exist_ok=True) + + if filetype == "json": + if isinstance(data, dict): + data = json.loads(data) + json.dump(data, Path(_filepath).open("w", encoding="utf-8")) + + # if filetype == "csv": + # pass + if filetype in ("txt", "csv"): + with open(_filepath, "w", encoding="utf-8") as file: + file.write(data) + + log(f"Raw data saved to: {_filepath}") + return _filepath + + +def get_raw_data_api( # pylint: disable=R0912 + url: str, + secret_path: str = None, + api_params: dict = None, + filetype: str = None, +) -> tuple[str, str, str]: + """ + Request data from URL API + + Args: + url (str): URL to request data + secret_path (str, optional): Secret path to get headers. Defaults to None. + api_params (dict, optional): Parameters to pass to API. Defaults to None. + filetype (str, optional): Filetype to save raw file. Defaults to None. + + Returns: + tuple[str, str, str]: Error, data and filetype + """ + error = None + data = None + try: + if secret_path is None: + headers = secret_path + else: + headers = get_vault_secret(secret_path)["data"] + + response = requests.get( + url, + headers=headers, + timeout=constants.MAX_TIMEOUT_SECONDS.value, + params=api_params, + ) + + response.raise_for_status() + + if filetype == "json": + data = response.json() + else: + data = response.text + + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data, filetype + + +def get_upload_storage_blob( + dataset_id: str, + filename: str, +) -> Blob: + """ + Get a blob from upload zone in storage + + Args: + dataset_id (str): The dataset id on BigQuery. + filename (str): The filename in GCS. + + Returns: + Blob: blob object + """ + bucket = bd.Storage(dataset_id="", table_id="") + blob_list = list( + bucket.client["storage_staging"] + .bucket(bucket.bucket_name) + .list_blobs(prefix=f"upload/{dataset_id}/{filename}.") + ) + return blob_list[0] + + +def get_raw_data_gcs( + dataset_id: str, + table_id: str, + zip_filename: str = None, +) -> tuple[str, str, str]: + """ + Get raw data from GCS + + Args: + dataset_id (str): The dataset id on BigQuery. + table_id (str): The table id on BigQuery. + zip_filename (str, optional): The zip file name. Defaults to None. 
+ + Returns: + tuple[str, str, str]: Error, data and filetype + """ + error = None + data = None + filetype = None + + try: + blob_search_name = zip_filename or table_id + blob = get_upload_storage_blob(dataset_id=dataset_id, filename=blob_search_name) + + filename = blob.name + filetype = filename.split(".")[-1] + + data = blob.download_as_bytes() + + if filetype == "zip": + with zipfile.ZipFile(io.BytesIO(data), "r") as zipped_file: + filenames = zipped_file.namelist() + filename = list( + filter(lambda x: x.split(".")[0] == table_id, filenames) + )[0] + filetype = filename.split(".")[-1] + data = zipped_file.read(filename) + + data = data.decode(encoding="utf-8") + + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data, filetype + + +def save_treated_local_func( + filepath: str, data: pd.DataFrame, error: str, mode: str = "staging" +) -> str: + """ + Save treated file to CSV. + + Args: + filepath (str): Path to save file + data (pd.DataFrame): Dataframe to save + error (str): Error catched during execution + mode (str, optional): Folder to save locally, later folder which to upload to GCS. + + Returns: + str: Path to the saved file + """ + _filepath = filepath.format(mode=mode, filetype="csv") + Path(_filepath).parent.mkdir(parents=True, exist_ok=True) + if error is None: + data.to_csv(_filepath, index=False) + log(f"Treated data saved to: {_filepath}") + return _filepath + + +def upload_run_logs_to_bq( # pylint: disable=R0913 + dataset_id: str, + parent_table_id: str, + timestamp: str, + error: str = None, + previous_error: str = None, + recapture: bool = False, + mode: str = "raw", +): + """ + Upload execution status table to BigQuery. + Table is uploaded to the same dataset, named {parent_table_id}_logs. + If passing status_dict, should not pass timestamp and error. 
+ + Args: + dataset_id (str): dataset_id on BigQuery + parent_table_id (str): table_id on BigQuery + timestamp (str): timestamp to get datetime range + error (str): error catched during execution + previous_error (str): previous error catched during execution + recapture (bool): if the execution was a recapture + mode (str): folder to save locally, later folder which to upload to GCS + + Returns: + None + """ + table_id = parent_table_id + "_logs" + # Create partition directory + filename = f"{table_id}_{timestamp.isoformat()}" + partition = f"data={timestamp.date()}" + filepath = Path( + f"""data/{mode}/{dataset_id}/{table_id}/{partition}/{filename}.csv""" + ) + filepath.parent.mkdir(exist_ok=True, parents=True) + # Create dataframe to be uploaded + if not error and recapture is True: + # if the recapture is succeeded, update the column erro + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [True], + "erro": [f"[recapturado]{previous_error}"], + } + ) + log(f"Recapturing {timestamp} with previous error:\n{error}") + else: + # not recapturing or error during flow execution + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [error is None], + "erro": [error], + } + ) + # Save data local + dataframe.to_csv(filepath, index=False) + # Upload to Storage + create_or_append_table( + dataset_id=dataset_id, + table_id=table_id, + path=filepath.as_posix(), + partitions=partition, + ) + if error is not None: + raise Exception(f"Pipeline failed with error: {error}") + + +def get_datetime_range( + timestamp: datetime, + interval: timedelta, +) -> dict: + """ + Task to get datetime range in UTC + + Args: + timestamp (datetime): timestamp to get datetime range + interval (timedelta): interval to get datetime range + + Returns: + dict: datetime range + """ + + start = ( + (timestamp - interval) + .astimezone(tz=pytz.timezone("UTC")) + .strftime("%Y-%m-%d %H:%M:%S") + ) + + end = timestamp.astimezone(tz=pytz.timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") + + return {"start": start, "end": end} + + +def read_raw_data(filepath: str, csv_args: dict = None) -> tuple[str, pd.DataFrame]: + """ + Read raw data from file + + Args: + filepath (str): filepath to read + csv_args (dict): arguments to pass to pandas.read_csv + + Returns: + tuple[str, pd.DataFrame]: error and data + """ + error = None + data = None + try: + file_type = filepath.split(".")[-1] + + if file_type == "json": + data = pd.read_json(filepath) + + # data = json.loads(data) + elif file_type in ("txt", "csv"): + if csv_args is None: + csv_args = {} + data = pd.read_csv(filepath, **csv_args) + else: + error = "Unsupported raw file extension. Supported only: json, csv and txt" + + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index efc21c133..adf89bc94 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -711,16 +711,24 @@ def get_credentials_from_env( return cred -def get_storage_blobs(dataset_id: str, table_id: str) -> list: +def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> list: """ Get all blobs from a table in a dataset. + + Args: + dataset_id (str): dataset id + table_id (str): table id + mode (str, optional): mode to use. Defaults to "staging". 
+ + Returns: + list: list of blobs """ bd_storage = bd.Storage(dataset_id=dataset_id, table_id=table_id) return list( bd_storage.client["storage_staging"] .bucket(bd_storage.bucket_name) - .list_blobs(prefix=f"staging/{bd_storage.dataset_id}/{bd_storage.table_id}/") + .list_blobs(prefix=f"{mode}/{bd_storage.dataset_id}/{bd_storage.table_id}/") ) From fa29fc38850e11c4058293e1f830cbbe1a0adcc6 Mon Sep 17 00:00:00 2001 From: Gabriel Gazola Milan Date: Mon, 2 Oct 2023 12:41:38 -0300 Subject: [PATCH 37/41] feat: update query --- pipelines/rj_segovi/dump_db_1746/schedules.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_segovi/dump_db_1746/schedules.py b/pipelines/rj_segovi/dump_db_1746/schedules.py index 50e416106..f8f6819ad 100644 --- a/pipelines/rj_segovi/dump_db_1746/schedules.py +++ b/pipelines/rj_segovi/dump_db_1746/schedules.py @@ -374,7 +374,8 @@ case when cv.ic_vinculo = 'O' or cv.ic_vinculo = 'S' then cv.id_chamado_pai_fk end ) as 'reclamacoes', - no_justificativa + no_justificativa, + oc.id_origem_ocorrencia from tb_chamado as ch inner join ( @@ -550,7 +551,8 @@ chs.dt_alvo_finalizacao, chs.dt_alvo_diagnostico, cl.dt_real_diagnostico, - no_justificativa + no_justificativa, + oc.id_origem_ocorrencia """ _1746_queries = { From 533212e34d0ef3d6507fb9037b66b7a9fd3bc57a Mon Sep 17 00:00:00 2001 From: Fernanda Scovino Date: Mon, 2 Oct 2023 19:19:26 -0300 Subject: [PATCH 38/41] =?UTF-8?q?Adiciona=20novos=20`code=5Fowners`=20da?= =?UTF-8?q?=20SMTR=20=F0=9F=AB=82=20=20(#519)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add carol e rafa como code owners * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- code_owners.yaml | 2 ++ pipelines/constants.py | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/code_owners.yaml b/code_owners.yaml index f2f563c5f..775494551 100644 --- a/code_owners.yaml +++ b/code_owners.yaml @@ -20,6 +20,8 @@ pipelines: - fernandascovino - eng-rodrigocunha - borismarinho + - pixuimpou + - lingsv rj_escritorio: owners: - gabriel-milan diff --git a/pipelines/constants.py b/pipelines/constants.py index 309325d35..900e2ebf9 100644 --- a/pipelines/constants.py +++ b/pipelines/constants.py @@ -138,4 +138,12 @@ class constants(Enum): # pylint: disable=c0103 "user_id": "369657115012366336", "type": "user_nickname", }, + "rafaelpinheiro": { + "user_id": "1131538976101109772", + "type": "user_nickname", + }, + "carolinagomes": { + "user_id": "620000269392019469", + "type": "user_nickname", + }, } From c689b4e67531c494476d55f10277a2b863113e50 Mon Sep 17 00:00:00 2001 From: Rafael Carvalho Pinheiro <74972217+pixuimpou@users.noreply.github.com> Date: Thu, 5 Oct 2023 11:26:43 -0300 Subject: [PATCH 39/41] =?UTF-8?q?Cria=20flow=20generico=20de=20materializa?= =?UTF-8?q?=C3=A7=C3=A3o=20+=20Adiciona=20tratamento=20transa=C3=A7=C3=A3o?= =?UTF-8?q?=20Ja=C3=A9=20(#513)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * create default materialization flow * create tasks for default materialization flow * make generate_execute_schedules more generic * create bilhetagem materialization flow * adapt bilhetagem schedules for the new model * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add run config and storage * Update utils.py * fix sub tasks * fix 
fetch_dataset_sha run * add run_date variable to materialization flow * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove discord notifications for testing * add manual date_range / fix flow run name * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix missing table_id logic * fix empty return * fix empty return * add flag_date_range when var_params is blank * change rename logic when has date variables * change return values of create_dbt_run_vars * create dict aux function * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove *args from task * change coalesce task * fix rename task * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix task order * add docstrings * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix line too long * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * pre-commit hook * adjust tasks * mudar estrutura do flow materializacao * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * adicionar schedule de bilhetagem * adicionar schedule no flow de materialização * ajuste nome da coluna de datetime * ajustar nome coluna * mudar coluna de data para datetime_transacao * ajusta variavel date_range manual * mudar nome parametro de variável dbt * cria flow de orquestração materialização * volta notificação do discord * ajusta wait_flow_run * mudar query para teste * reverter query teste * usar copy no dicionario de variaveis de data * adjust constant run interval * remover funcao comentada * alterar padrão de nome dos flows * remove imports comentados * remove schedules nao utilizados * remove task comentada * mudar agent para produção --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: Rodrigo Cunha <66736583+eng-rodrigocunha@users.noreply.github.com> --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 116 +++++++++++++++-- .../schedules.py | 21 +--- pipelines/rj_smtr/constants.py | 32 +++-- pipelines/rj_smtr/flows.py | 84 ++++++++++++- pipelines/rj_smtr/tasks.py | 117 +++++++++++++++++- pipelines/rj_smtr/utils.py | 14 ++- 6 files changed, 344 insertions(+), 40 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index d7f44e3b9..568f96154 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -7,26 +7,46 @@ from prefect.run_configs import KubernetesRun from prefect.storage import GCS +from prefect.tasks.prefect import create_flow_run, wait_for_flow_run +from prefect.utilities.edges import unmapped # EMD Imports # from pipelines.constants import constants as emd_constants +from pipelines.utils.decorators import Flow +from pipelines.utils.tasks import ( + rename_current_flow_run_now_time, + get_current_flow_labels, +) + + +from pipelines.utils.utils import set_default_parameters # SMTR Imports # -from pipelines.rj_smtr.flows import default_capture_flow +from pipelines.rj_smtr.flows import ( + default_capture_flow, + default_materialization_flow, +) + +from pipelines.rj_smtr.tasks import ( + 
get_current_timestamp, +) from pipelines.rj_smtr.br_rj_riodejaneiro_bilhetagem.schedules import ( - bilhetagem_principal_schedule, bilhetagem_transacao_schedule, ) +from pipelines.rj_smtr.constants import constants + +from pipelines.rj_smtr.schedules import every_hour + # Flows # # BILHETAGEM TRANSAÇÃO - CAPTURA A CADA MINUTO # bilhetagem_transacao_captura = deepcopy(default_capture_flow) -bilhetagem_transacao_captura.name = "SMTR: Bilhetagem Transação (captura)" +bilhetagem_transacao_captura.name = "SMTR: Bilhetagem Transação - Captura" bilhetagem_transacao_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, @@ -34,13 +54,91 @@ ) bilhetagem_transacao_captura.schedule = bilhetagem_transacao_schedule -# BILHETAGEM PRINCIPAL - CAPTURA DIÁRIA DE DIVERSAS TABELAS # +# BILHETAGEM AUXILIAR - SUBFLOW PARA RODAR ANTES DE CADA MATERIALIZAÇÃO # + +bilhetagem_auxiliar_captura = deepcopy(default_capture_flow) +bilhetagem_auxiliar_captura.name = "SMTR: Bilhetagem Auxiliar - Captura (subflow)" +bilhetagem_auxiliar_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) +bilhetagem_auxiliar_captura.run_config = KubernetesRun( + image=emd_constants.DOCKER_IMAGE.value, + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], +) + +bilhetagem_auxiliar_captura = set_default_parameters( + flow=bilhetagem_auxiliar_captura, + default_parameters={ + "dataset_id": constants.BILHETAGEM_DATASET_ID.value, + "secret_path": constants.BILHETAGEM_SECRET_PATH.value, + "source_type": constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], + }, +) + +# MATERIALIZAÇÃO - SUBFLOW DE MATERIALIZAÇÃO +bilhetagem_materializacao = deepcopy(default_materialization_flow) +bilhetagem_materializacao.name = "SMTR: Bilhetagem Transação - Materialização (subflow)" +bilhetagem_materializacao.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) +bilhetagem_materializacao.run_config = KubernetesRun( + image=emd_constants.DOCKER_IMAGE.value, + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], +) + +bilhetagem_materializacao_parameters = { + "dataset_id": constants.BILHETAGEM_DATASET_ID.value +} | constants.BILHETAGEM_MATERIALIZACAO_PARAMS.value + +bilhetagem_materializacao = set_default_parameters( + flow=bilhetagem_materializacao, + default_parameters=bilhetagem_materializacao_parameters, +) + +# TRATAMENTO - RODA DE HORA EM HORA, CAPTURA AUXILIAR + MATERIALIZAÇÃO +with Flow( + "SMTR: Bilhetagem Transação - Tratamento", + code_owners=["caio", "fernanda", "boris", "rodrigo"], +) as bilhetagem_transacao_tratamento: + timestamp = get_current_timestamp() + + rename_flow_run = rename_current_flow_run_now_time( + prefix=bilhetagem_transacao_tratamento.name + " ", + now_time=timestamp, + ) + + LABELS = get_current_flow_labels() + + # Captura + runs_captura = create_flow_run.map( + flow_name=unmapped(bilhetagem_auxiliar_captura.name), + project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), + parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, + labels=unmapped(LABELS), + ) + + wait_captura = wait_for_flow_run.map( + runs_captura, + stream_states=unmapped(True), + stream_logs=unmapped(True), + raise_final_state=unmapped(True), + ) + + # Materialização + run_materializacao = create_flow_run( + flow_name=bilhetagem_materializacao.name, + project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + labels=LABELS, + upstream_tasks=[wait_captura], + ) + + wait_materializacao = wait_for_flow_run( + run_materializacao, + 
stream_states=True, + stream_logs=True, + raise_final_state=True, + ) -bilhetagem_principal_captura = deepcopy(default_capture_flow) -bilhetagem_principal_captura.name = "SMTR: Bilhetagem Principal (captura)" -bilhetagem_principal_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) -bilhetagem_principal_captura.run_config = KubernetesRun( +bilhetagem_transacao_tratamento.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) +bilhetagem_transacao_tratamento.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) -bilhetagem_principal_captura.schedule = bilhetagem_principal_schedule +bilhetagem_transacao_tratamento.schedule = every_hour +# bilhetagem_materializacao.schedule = bilhetagem_materializacao_schedule diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index 2f7804811..c2ee21164 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -15,27 +15,10 @@ generate_execute_schedules, ) -bilhetagem_principal_clocks = generate_execute_schedules( - clock_interval=timedelta( - **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["principal_run_interval"] - ), - labels=[ - emd_constants.RJ_SMTR_AGENT_LABEL.value, - ], - table_parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, - dataset_id=constants.BILHETAGEM_DATASET_ID.value, - secret_path=constants.BILHETAGEM_SECRET_PATH.value, - source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], - runs_interval_minutes=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value[ - "principal_runs_interval_minutes" - ], -) - -bilhetagem_principal_schedule = Schedule(clocks=untuple(bilhetagem_principal_clocks)) - +BILHETAGEM_TRANSACAO_INTERVAL = timedelta(minutes=1) bilhetagem_transacao_clocks = generate_execute_schedules( clock_interval=timedelta( - **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["transacao_run_interval"] + **constants.BILHETAGEM_CAPTURE_RUN_INTERVAL.value["transacao_run_interval"] ), labels=[ emd_constants.RJ_SMTR_AGENT_LABEL.value, diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 52e30d9f8..ee8a22cd2 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -183,12 +183,15 @@ class constants(Enum): # pylint: disable=c0103 }, "vpn_url": "http://vpn-jae.mobilidade.rio/", "source_type": "api-json", - "transacao_run_interval": {"minutes": 1}, - "principal_run_interval": {"days": 1}, "transacao_runs_interval_minutes": 0, "principal_runs_interval_minutes": 5, } + BILHETAGEM_CAPTURE_RUN_INTERVAL = { + "transacao_run_interval": {"minutes": 1}, + "principal_run_interval": {"days": 1}, + } + BILHETAGEM_TRANSACAO_CAPTURE_PARAMS = { "table_id": "transacao", "partition_date_only": False, @@ -203,11 +206,13 @@ class constants(Enum): # pylint: disable=c0103 data_processamento BETWEEN '{start}' AND '{end}' """, - "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS["transacao_run_interval"], + "run_interval": BILHETAGEM_CAPTURE_RUN_INTERVAL["transacao_run_interval"], }, "primary_key": ["id"], # id column to nest data on } + BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" + BILHETAGEM_CAPTURE_PARAMS = [ { "table_id": "linha", @@ -222,7 +227,7 @@ class constants(Enum): # pylint: disable=c0103 WHERE DT_INCLUSAO >= '{start}' """, - "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "run_interval": BILHETAGEM_CAPTURE_RUN_INTERVAL[ 
"principal_run_interval" ], }, @@ -241,7 +246,7 @@ class constants(Enum): # pylint: disable=c0103 WHERE DT_INCLUSAO >= '{start}' """, - "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "run_interval": BILHETAGEM_CAPTURE_RUN_INTERVAL[ "principal_run_interval" ], }, @@ -260,7 +265,7 @@ class constants(Enum): # pylint: disable=c0103 WHERE DT_INCLUSAO >= '{start}' """, - "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "run_interval": BILHETAGEM_CAPTURE_RUN_INTERVAL[ "principal_run_interval" ], }, @@ -279,7 +284,7 @@ class constants(Enum): # pylint: disable=c0103 WHERE dt_inclusao >= '{start}' """, - "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "run_interval": BILHETAGEM_CAPTURE_RUN_INTERVAL[ "principal_run_interval" ], }, @@ -289,4 +294,15 @@ class constants(Enum): # pylint: disable=c0103 ], # id column to nest data on }, ] - BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" + + BILHETAGEM_MATERIALIZACAO_PARAMS = { + "table_id": BILHETAGEM_TRANSACAO_CAPTURE_PARAMS["table_id"], + "upstream": True, + "dbt_vars": { + "date_range": { + "table_run_datetime_column_name": "datetime_transacao", + "delay_hours": 1, + }, + "version": {}, + }, + } diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 4860c6d07..0efb69b17 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -5,7 +5,8 @@ from prefect.run_configs import KubernetesRun from prefect.storage import GCS -from prefect import Parameter +from prefect import case, Parameter +from prefect.utilities.edges import unmapped # EMD Imports # @@ -13,7 +14,11 @@ from pipelines.utils.decorators import Flow from pipelines.utils.tasks import ( rename_current_flow_run_now_time, + get_now_time, + get_current_flow_labels, + get_current_flow_mode, ) +from pipelines.utils.execute_dbt_model.tasks import get_k8s_dbt_client # SMTR Imports # @@ -22,13 +27,17 @@ create_local_partition_path, get_current_timestamp, parse_timestamp_to_string, + transform_raw_to_nested_structure, + create_dbt_run_vars, + set_last_run_timestamp, + coalesce_task, upload_raw_data_to_gcs, upload_staging_data_to_gcs, - transform_raw_to_nested_structure, get_raw_from_sources, create_request_params, ) +from pipelines.utils.execute_dbt_model.tasks import run_dbt_model with Flow( "SMTR: Captura", @@ -114,3 +123,74 @@ image=emd_constants.DOCKER_IMAGE.value, labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) + +with Flow( + "SMTR: Materialização", + code_owners=["caio", "fernanda", "boris", "rodrigo"], +) as default_materialization_flow: + # SETUP # + + dataset_id = Parameter("dataset_id", default=None) + table_id = Parameter("table_id", default=None) + raw_table_id = Parameter("raw_table_id", default=None) + dbt_alias = Parameter("dbt_alias", default=False) + upstream = Parameter("upstream", default=None) + downstream = Parameter("downstream", default=None) + exclude = Parameter("exclude", default=None) + flags = Parameter("flags", default=None) + dbt_vars = Parameter("dbt_vars", default=dict()) + + # treated_table_params = treat_dbt_table_params(table_params=table_params) + + LABELS = get_current_flow_labels() + MODE = get_current_flow_mode(LABELS) + + _vars, date_var, flag_date_range = create_dbt_run_vars( + dataset_id=dataset_id, + dbt_vars=dbt_vars, + table_id=table_id, + raw_dataset_id=dataset_id, + raw_table_id=raw_table_id, + mode=MODE, + ) + + # Rename flow run + + flow_name_prefix = coalesce_task([table_id, dataset_id]) + + flow_name_now_time = coalesce_task([date_var, get_now_time()]) + + rename_flow_run = 
rename_current_flow_run_now_time( + prefix=default_materialization_flow.name + " " + flow_name_prefix + ": ", + now_time=flow_name_now_time, + ) + + dbt_client = get_k8s_dbt_client(mode=MODE, wait=rename_flow_run) + + RUNS = run_dbt_model.map( + dbt_client=unmapped(dbt_client), + dataset_id=unmapped(dataset_id), + table_id=unmapped(table_id), + _vars=_vars, + dbt_alias=unmapped(dbt_alias), + upstream=unmapped(upstream), + downstream=unmapped(downstream), + exclude=unmapped(exclude), + flags=unmapped(flags), + ) + + with case(flag_date_range, True): + set_last_run_timestamp( + dataset_id=dataset_id, + table_id=table_id, + timestamp=date_var["date_range_end"], + wait=RUNS, + mode=MODE, + ) + + +default_materialization_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) +default_materialization_flow.run_config = KubernetesRun( + image=emd_constants.DOCKER_IMAGE.value, + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], +) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index a846851b5..f7d687dea 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -8,7 +8,7 @@ import os from pathlib import Path import traceback -from typing import Dict, List, Union +from typing import Dict, List, Union, Iterable import io from basedosdados import Storage, Table @@ -28,6 +28,7 @@ get_last_run_timestamp, log_critical, data_info_str, + dict_contains_keys, get_raw_data_api, get_raw_data_gcs, upload_run_logs_to_bq, @@ -1095,3 +1096,117 @@ def transform_raw_to_nested_structure( log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, filepath + + +@task(checkpoint=False) +def coalesce_task(value_list: Iterable): + """ + Task to get the first non None value of a list + + Args: + value_list (Iterable): a iterable object with the values + Returns: + any: value_list's first non None item + """ + + try: + return next(value for value in value_list if value is not None) + except StopIteration: + return + + +@task(checkpoint=False, nout=3) +def create_dbt_run_vars( + dataset_id: str, + dbt_vars: dict, + table_id: str, + raw_dataset_id: str, + raw_table_id: str, + mode: str, +) -> tuple[list[dict], Union[list[dict], dict, None], bool]: + """ + Create the variables to be used in dbt materialization based on a dict + + Args: + dataset_id (str): the dataset_id to get the variables + dbt_vars (dict): dict containing the parameters + table_id (str): the table_id get the date_range variable + raw_dataset_id (str): the raw_dataset_id get the date_range variable + raw_table_id (str): the raw_table_id get the date_range variable + mode (str): the mode to get the date_range variable + + Returns: + tuple[list[dict]: the variables to be used in DBT + Union[list[dict], dict, None]: the date variable (date_range or run_date) + bool: a flag that indicates if the date_range variable came from Redis + """ + + log(f"Creating DBT variables. Parameter received: {dbt_vars}") + + if (not dbt_vars) or (not table_id): + log("dbt_vars or table_id are blank. 
Skiping task") + return [None], None, False + + final_vars = [] + date_var = None + flag_date_range = False + + if "date_range" in dbt_vars.keys(): + log("Creating date_range variable") + + # Set date_range variable manually + if dict_contains_keys( + dbt_vars["date_range"], ["date_range_start", "date_range_end"] + ): + date_var = { + "date_range_start": dbt_vars["date_range"]["date_range_start"], + "date_range_end": dbt_vars["date_range"]["date_range_end"], + } + # Create date_range using Redis + else: + raw_table_id = raw_table_id or table_id + + date_var = get_materialization_date_range.run( + dataset_id=dataset_id, + table_id=table_id, + raw_dataset_id=raw_dataset_id, + raw_table_id=raw_table_id, + table_run_datetime_column_name=dbt_vars["date_range"].get( + "table_run_datetime_column_name" + ), + mode=mode, + delay_hours=dbt_vars["date_range"].get("delay_hours", 0), + ) + + flag_date_range = True + + final_vars.append(date_var.copy()) + + log(f"date_range created: {date_var}") + + elif "run_date" in dbt_vars.keys(): + log("Creating run_date variable") + + date_var = get_run_dates.run( + dbt_vars["run_date"].get("date_range_start"), + dbt_vars["run_date"].get("date_range_end"), + ) + final_vars.append([d.copy() for d in date_var]) + + log(f"run_date created: {date_var}") + + if "version" in dbt_vars.keys(): + log("Creating version variable") + dataset_sha = fetch_dataset_sha.run(dataset_id=dataset_id) + + # if there are other variables inside the list, update each item adding the version variable + if final_vars: + final_vars = get_join_dict.run(dict_list=final_vars, new_dict=dataset_sha) + else: + final_vars.append(dataset_sha) + + log(f"version created: {dataset_sha}") + + log(f"All variables was created, final value is: {final_vars}") + + return final_vars, date_var, flag_date_range diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 1d71dd3dd..f9b98afab 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -434,7 +434,6 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l clocks = [] for count, parameters in enumerate(table_parameters): parameter_defaults = parameters | general_flow_params - log(f"parameter_defaults: {parameter_defaults}") clocks.append( IntervalClock( @@ -448,6 +447,19 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l return clocks +def dict_contains_keys(input_dict: dict, keys: list[str]) -> bool: + """ + Test if the input dict has all keys present in the list + + Args: + input_dict (dict): the dict to test if has the keys + keys (list[str]): the list containing the keys to check + Returns: + bool: True if the input_dict has all the keys otherwise False + """ + return all(x in input_dict.keys() for x in keys) + + def save_raw_local_func( data: Union[dict, str], filepath: str, mode: str = "raw", filetype: str = "json" ) -> str: From b847649c388670ba6314ac11b791bebad985f396 Mon Sep 17 00:00:00 2001 From: d116626 Date: Thu, 5 Oct 2023 15:55:14 -0300 Subject: [PATCH 40/41] fix: smfp sigma dataset name --- pipelines/rj_smfp/__init__.py | 2 +- .../__init__.py | 0 .../flows.py | 12 ++++++------ .../schedules.py | 5 +++-- 4 files changed, 10 insertions(+), 9 deletions(-) rename pipelines/rj_smfp/{dump_db_sigma_medicamentos => dump_db_sigma_compras_materiais}/__init__.py (100%) rename pipelines/rj_smfp/{dump_db_sigma_medicamentos => dump_db_sigma_compras_materiais}/flows.py (74%) rename pipelines/rj_smfp/{dump_db_sigma_medicamentos => 
dump_db_sigma_compras_materiais}/schedules.py (97%) diff --git a/pipelines/rj_smfp/__init__.py b/pipelines/rj_smfp/__init__.py index ea6519e3b..a8b9019d0 100644 --- a/pipelines/rj_smfp/__init__.py +++ b/pipelines/rj_smfp/__init__.py @@ -6,7 +6,7 @@ from pipelines.rj_smfp.dump_db_ergon_comlurb.flows import * from pipelines.rj_smfp.dump_db_metas.flows import * from pipelines.rj_smfp.dump_db_sigma.flows import * -from pipelines.rj_smfp.dump_db_sigma_medicamentos.flows import * +from pipelines.rj_smfp.dump_db_sigma_compras_materiais.flows import * from pipelines.rj_smfp.dump_inadimplente.flows import * from pipelines.rj_smfp.dump_url_metas.flows import * from pipelines.rj_smfp.goals_dashboard_dbt.flows import * diff --git a/pipelines/rj_smfp/dump_db_sigma_medicamentos/__init__.py b/pipelines/rj_smfp/dump_db_sigma_compras_materiais/__init__.py similarity index 100% rename from pipelines/rj_smfp/dump_db_sigma_medicamentos/__init__.py rename to pipelines/rj_smfp/dump_db_sigma_compras_materiais/__init__.py diff --git a/pipelines/rj_smfp/dump_db_sigma_medicamentos/flows.py b/pipelines/rj_smfp/dump_db_sigma_compras_materiais/flows.py similarity index 74% rename from pipelines/rj_smfp/dump_db_sigma_medicamentos/flows.py rename to pipelines/rj_smfp/dump_db_sigma_compras_materiais/flows.py index d8b6bd62e..8c8c1fdf9 100644 --- a/pipelines/rj_smfp/dump_db_sigma_medicamentos/flows.py +++ b/pipelines/rj_smfp/dump_db_sigma_compras_materiais/flows.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -Database dumping flows for SMFP SIGMA MEDICAMENTOS +Database dumping flows for SMFP SIGMA COMPRAS MATERIAIS """ from copy import deepcopy @@ -11,15 +11,15 @@ from pipelines.constants import constants # importa o schedule -from pipelines.rj_smfp.dump_db_sigma_medicamentos.schedules import ( - sigma_daily_update_schedule, +from pipelines.rj_smfp.dump_db_sigma_compras_materiais.schedules import ( + compras_sigma_daily_update_schedule, ) from pipelines.utils.dump_db.flows import dump_sql_flow from pipelines.utils.utils import set_default_parameters rj_smfp_dump_db_sigma_medicamentos_flow = deepcopy(dump_sql_flow) rj_smfp_dump_db_sigma_medicamentos_flow.name = ( - "SMFP: SIGMA - MEDICAMENTOS - Ingerir tabelas de banco SQL" + "SMFP: COMPRAS MATERIAIS SERVICOS SIGMA - Ingerir tabelas de banco SQL" ) rj_smfp_dump_db_sigma_medicamentos_flow.storage = GCS(constants.GCS_FLOWS_BUCKET.value) @@ -35,7 +35,7 @@ "db_host": "10.90.31.22", "db_port": "1521", "db_type": "oracle", - "dataset_id": "saude_medicamentos_sigma", + "dataset_id": "compras_materiais_servicos_sigma", "vault_secret_path": "db-sigma", } @@ -44,4 +44,4 @@ default_parameters=rj_smfp_dump_db_sigma_medicamentos_default_parameters, ) -rj_smfp_dump_db_sigma_medicamentos_flow.schedule = sigma_daily_update_schedule +rj_smfp_dump_db_sigma_medicamentos_flow.schedule = compras_sigma_daily_update_schedule diff --git a/pipelines/rj_smfp/dump_db_sigma_medicamentos/schedules.py b/pipelines/rj_smfp/dump_db_sigma_compras_materiais/schedules.py similarity index 97% rename from pipelines/rj_smfp/dump_db_sigma_medicamentos/schedules.py rename to pipelines/rj_smfp/dump_db_sigma_compras_materiais/schedules.py index 0f61241ff..dd6847c56 100644 --- a/pipelines/rj_smfp/dump_db_sigma_medicamentos/schedules.py +++ b/pipelines/rj_smfp/dump_db_sigma_compras_materiais/schedules.py @@ -156,6 +156,7 @@ CNPJ_FABRICANTE FROM SIGMA.VW_MOVIMENTACAO """, # noqa + "interval": timedelta(days=7), }, "ramo_atividade": { "biglake_table": True, @@ -224,9 +225,9 @@ db_host="10.90.31.22", db_port="1521", 
db_type="oracle", - dataset_id="saude_medicamentos_sigma", + dataset_id="compras_materiais_servicos_sigma", vault_secret_path="db-sigma", table_parameters=_sigma_queries, ) -sigma_daily_update_schedule = Schedule(clocks=untuple(sigma_infra_clocks)) +compras_sigma_daily_update_schedule = Schedule(clocks=untuple(sigma_infra_clocks)) From 5a95a5aa0c28b47a04586d195d80c464c1c52402 Mon Sep 17 00:00:00 2001 From: Rafael Carvalho Pinheiro <74972217+pixuimpou@users.noreply.github.com> Date: Thu, 5 Oct 2023 16:18:04 -0300 Subject: [PATCH 41/41] Alterar interval tabelas auxiliares bilhetagem (#525) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * create default materialization flow * create tasks for default materialization flow * make generate_execute_schedules more generic * create bilhetagem materialization flow * adapt bilhetagem schedules for the new model * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add run config and storage * Update utils.py * fix sub tasks * fix fetch_dataset_sha run * add run_date variable to materialization flow * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove discord notifications for testing * add manual date_range / fix flow run name * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix missing table_id logic * fix empty return * fix empty return * add flag_date_range when var_params is blank * change rename logic when has date variables * change return values of create_dbt_run_vars * create dict aux function * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove *args from task * change coalesce task * fix rename task * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix task order * add docstrings * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix line too long * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * pre-commit hook * adjust tasks * mudar estrutura do flow materializacao * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * adicionar schedule de bilhetagem * adicionar schedule no flow de materialização * ajuste nome da coluna de datetime * ajustar nome coluna * mudar coluna de data para datetime_transacao * ajusta variavel date_range manual * mudar nome parametro de variável dbt * cria flow de orquestração materialização * volta notificação do discord * ajusta wait_flow_run * mudar query para teste * reverter query teste * usar copy no dicionario de variaveis de data * adjust constant run interval * remover funcao comentada * alterar padrão de nome dos flows * remove imports comentados * remove schedules nao utilizados * remove task comentada * mudar agent para produção * mudar run interval tabelas auxiliares * remove tratamento comentado * ajusta dicionario constante --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: Rodrigo Cunha <66736583+eng-rodrigocunha@users.noreply.github.com> --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py | 4 +--- pipelines/rj_smtr/constants.py | 4 +--- pipelines/rj_smtr/flows.py | 2 -- 3 
files changed, 2 insertions(+), 8 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index c2ee21164..21e13f05b 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -27,9 +27,7 @@ dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], - runs_interval_minutes=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value[ - "transacao_runs_interval_minutes" - ], + runs_interval_minutes=0, ) bilhetagem_transacao_schedule = Schedule(clocks=untuple(bilhetagem_transacao_clocks)) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index ee8a22cd2..0037c6989 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -183,13 +183,11 @@ class constants(Enum): # pylint: disable=c0103 }, "vpn_url": "http://vpn-jae.mobilidade.rio/", "source_type": "api-json", - "transacao_runs_interval_minutes": 0, - "principal_runs_interval_minutes": 5, } BILHETAGEM_CAPTURE_RUN_INTERVAL = { "transacao_run_interval": {"minutes": 1}, - "principal_run_interval": {"days": 1}, + "principal_run_interval": {"hours": 1}, } BILHETAGEM_TRANSACAO_CAPTURE_PARAMS = { diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 0efb69b17..d4292129c 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -140,8 +140,6 @@ flags = Parameter("flags", default=None) dbt_vars = Parameter("dbt_vars", default=dict()) - # treated_table_params = treat_dbt_table_params(table_params=table_params) - LABELS = get_current_flow_labels() MODE = get_current_flow_mode(LABELS)
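
A minimal, self-contained sketch of the nesting step performed by `transform_raw_to_nested_structure` in the patches above. The column names (`id`, `valor`) and the capture timestamp are hypothetical, used only for illustration; the groupby/apply logic mirrors the task's code, collapsing all non-key columns into a JSON `content` column keyed by the primary key plus `timestamp_captura`.

# -*- coding: utf-8 -*-
"""Sketch (not part of any patch) of the nesting used in transform_raw_to_nested_structure."""
import pandas as pd

# Hypothetical raw capture: two records share the primary key id == 1.
data = pd.DataFrame(
    {
        "id": [1, 1, 2],
        "valor": [10, 20, 30],
        "timestamp_captura": ["2023-10-05 12:00:00"] * 3,
    }
)

primary_key = ["id"]
pk_cols = primary_key + ["timestamp_captura"]

# Group by primary key + capture timestamp and serialize the remaining
# columns into a JSON "content" column, as the task does before saving.
nested = (
    data.groupby(pk_cols)
    .apply(lambda x: x[data.columns.difference(pk_cols)].to_json(orient="records"))
    .str.strip("[]")
    .reset_index(name="content")[primary_key + ["content", "timestamp_captura"]]
)

print(nested)
# id 1 -> content '{"valor":10},{"valor":20}'; id 2 -> content '{"valor":30}'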