From 3edf4587bb7992acdcb9cd4ca7f4fc155b4b4d8c Mon Sep 17 00:00:00 2001 From: d116626 Date: Mon, 18 Sep 2023 17:04:05 -0300 Subject: [PATCH 01/41] chore: add sms sigma estoque --- pipelines/rj_sms/__init__.py | 1 + pipelines/rj_sms/dump_db_sigma/__init__.py | 0 pipelines/rj_sms/dump_db_sigma/flows.py | 47 ++++ pipelines/rj_sms/dump_db_sigma/schedules.py | 232 ++++++++++++++++++++ 4 files changed, 280 insertions(+) create mode 100644 pipelines/rj_sms/dump_db_sigma/__init__.py create mode 100644 pipelines/rj_sms/dump_db_sigma/flows.py create mode 100644 pipelines/rj_sms/dump_db_sigma/schedules.py diff --git a/pipelines/rj_sms/__init__.py b/pipelines/rj_sms/__init__.py index e73a968a6..eab9c1cc4 100644 --- a/pipelines/rj_sms/__init__.py +++ b/pipelines/rj_sms/__init__.py @@ -3,5 +3,6 @@ Prefect flows for rj_sms project """ +from pipelines.rj_sms.dump_db_sigma.flows import * from pipelines.rj_sms.dump_db_sivep.flows import * from pipelines.rj_sms.pubsub.flows import * diff --git a/pipelines/rj_sms/dump_db_sigma/__init__.py b/pipelines/rj_sms/dump_db_sigma/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pipelines/rj_sms/dump_db_sigma/flows.py b/pipelines/rj_sms/dump_db_sigma/flows.py new file mode 100644 index 000000000..8c238a995 --- /dev/null +++ b/pipelines/rj_sms/dump_db_sigma/flows.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +""" +Database dumping flows for SMS SIGMA system +""" + +from copy import deepcopy + +from prefect.run_configs import KubernetesRun +from prefect.storage import GCS + +from pipelines.constants import constants + +# importa o schedule +from pipelines.rj_sms.dump_db_sigma.schedules import ( + sigma_daily_update_schedule, +) +from pipelines.utils.dump_db.flows import dump_sql_flow +from pipelines.utils.utils import set_default_parameters + +rj_sms_dump_db_sigma_flow = deepcopy(dump_sql_flow) +rj_sms_dump_db_sigma_flow.name = ( + "SMS: SIGMA - ESTOQUE MEDICAMENTOS - Ingerir tabelas de banco SQL" +) +rj_sms_dump_db_sigma_flow.storage = GCS(constants.GCS_FLOWS_BUCKET.value) + +rj_sms_dump_db_sigma_flow.run_config = KubernetesRun( + image=constants.DOCKER_IMAGE.value, + labels=[ + constants.RJ_SMS_AGENT_LABEL.value, # label do agente + ], +) + +rj_sms_dump_db_sigma_default_parameters = { + "db_database": "CP01.SMF", + "db_host": "10.90.31.22", + "db_port": "1521", + "db_type": "oracle", + "dataset_id": "saude_estoque_medicamentos_sigma", + "vault_secret_path": "db-sigma", +} + +rj_sms_dump_db_sigma_flow = set_default_parameters( + rj_sms_dump_db_sigma_flow, + default_parameters=rj_sms_dump_db_sigma_default_parameters, +) + +rj_sms_dump_db_sigma_flow.schedule = sigma_daily_update_schedule diff --git a/pipelines/rj_sms/dump_db_sigma/schedules.py b/pipelines/rj_sms/dump_db_sigma/schedules.py new file mode 100644 index 000000000..b9c032b52 --- /dev/null +++ b/pipelines/rj_sms/dump_db_sigma/schedules.py @@ -0,0 +1,232 @@ +# -*- coding: utf-8 -*- +""" +Schedules for the SMS SIGMA dump_db pipeline. 
+""" + +from datetime import timedelta, datetime + +from prefect.schedules import Schedule +import pytz + +from pipelines.constants import constants +from pipelines.utils.dump_db.utils import generate_dump_db_schedules +from pipelines.utils.utils import untuple_clocks as untuple + + +##################################### +# +# SMS SIGMA Schedules +# +##################################### + +_sigma_queries = { + "classe": { + "biglake_table": True, + "materialize_after_dump": True, + "materialization_mode": "prod", + "dump_mode": "overwrite", + "execute_query": """ + SELECT + CD_GRUPO, + CD_CLASSE, + DS_CLASSE, + ST_STATUS + FROM SIGMA.VW_CLASSE + """, # noqa + }, + "fornecedor": { + "biglake_table": True, + "materialize_after_dump": True, + "materialization_mode": "prod", + "dump_mode": "overwrite", + "execute_query": """ + SELECT + CPF_CNPJ, + TIPO_CPF_CNPJ, + INSCRICAO_MUNICIPAL, + INSCRICAO_ESTADUAL, + RAZAO_SOCIAL, + NOME_FANTASIA, + NOME_CONTATO, + EMAIL, + EMAIL_CONTATO, + FAX, + DDD, + DDI, + RAMAL, + TELEFONE, + LOGRADOURO, + NUMERO_PORTA, + COMPLEMENTO, + BAIRRO, + MUNICIPIO, + UF, + CEP, + ATIVO_INATIVO_BLOQUEADO, + CD_NATUREZA_JURIDICA, + DS_NATUREZA_JURIDICA, + RAMO_ATIVIDADE, + CD_PORTE_EMPRESA, + DATA_ULTIMA_ATUALIZACAO, + FORNECEDOR_EVENTUAL + FROM SIGMA.VW_FORNECEDOR + """, # noqa + }, + "fornecedor_sem_vinculo": { + "biglake_table": True, + "materialize_after_dump": True, + "materialization_mode": "prod", + "dump_mode": "overwrite", + "execute_query": """ + SELECT + CPF_CNPJ, + TIPO_CPF_CNPJ, + NOME, + NUMERO_PORTA, + COMPLEMENTO + FROM SIGMA.VW_FORNECEDOR_SEM_VINCULO + """, # noqa + }, + "grupo": { + "biglake_table": True, + "materialize_after_dump": True, + "materialization_mode": "prod", + "dump_mode": "overwrite", + "execute_query": """ + SELECT + CD_GRUPO, + DS_GRUPO, + ST_STATUS + FROM SIGMA.VW_GRUPO + """, # noqa + }, + "material": { + "biglake_table": True, + "materialize_after_dump": True, + "materialization_mode": "prod", + "dump_mode": "overwrite", + "execute_query": """ + SELECT + CD_MATERIAL, + CD_GRUPO, + CD_CLASSE, + CD_SUBCLASSE, + SEQUENCIAL, + DV1, + DV2, + NM_PADRONIZADO, + NM_COMPLEMENTAR_MATERIAL, + UNIDADE, + DS_DETALHE_MATERIAL, + DT_DESATIVACAO, + ST_STATUS, + REMUME + FROM SIGMA.VW_MATERIAL + """, # noqa + }, + "movimentacao": { + "biglake_table": True, + "materialize_after_dump": True, + "materialization_mode": "prod", + "dump_mode": "overwrite", + "execute_query": """ + SELECT + CD_MATERIAL, + CNPJ_FORNECEDOR, + NOTA_FISCAL, + SERIE, + DATA_NOTA_FISCAL, + QUANTIDADE_ITEM, + PRECO_ITEM, + TOTAL_ITEM, + DATA_ULTIMA_ATUALIZACAO, + CD_MOVIMENTACAO, + DS_MOVIMENTACAO, + TP_ALMOXARIFADO, + CD_SECRETARIA, + DS_SECRETARIA, + CD_ALMOXARIFADO_DESTINO, + DS_ALMOXARIFADO_DESTINO, + CD_ALMOXARIFADO_ORIGEM, + DS_ALMOXARIFADO_ORIGEM, + CD_OS, + DT_INI_CONTRATO_OS, + DT_FIM_CONTRATO_OS, + NR_EMPENHO, + CNPJ_FABRICANTE + FROM SIGMA.VW_MOVIMENTACAO + """, # noqa + }, + "ramo_atividade": { + "biglake_table": True, + "materialize_after_dump": True, + "materialization_mode": "prod", + "dump_mode": "overwrite", + "execute_query": """ + SELECT + CD_RAMO, + DS_RAMO, + ST_RAMO + FROM SIGMA.VW_RAMO_ATIVIDADE + """, # noqa + }, + "servico": { + "biglake_table": True, + "materialize_after_dump": True, + "materialization_mode": "prod", + "dump_mode": "overwrite", + "execute_query": """ + SELECT + CD_SERV, + CD_SEQ, + CD_SERVICO, + DS_SERVICO, + ST_STATUS + FROM SIGMA.VW_SERVICO + """, # noqa + }, + "subclasse": { + "biglake_table": True, + "materialize_after_dump": True, + 
"materialization_mode": "prod", + "dump_mode": "overwrite", + "execute_query": """ + SELECT + CD_GRUPO, + CD_CLASSE, + CD_SUBCLASSE, + DS_SUBCLASSE, + ST_STATUS + FROM SIGMA.VW_SUBCLASSE + """, # noqa + }, + "unidade": { + "biglake_table": True, + "materialize_after_dump": True, + "materialization_mode": "prod", + "dump_mode": "overwrite", + "execute_query": """ + SELECT + UNIDADE, + DS_UNIDADE + FROM SIGMA.VW_UNIDADE + """, # noqa + }, +} + +sigma_infra_clocks = generate_dump_db_schedules( + interval=timedelta(days=1), + start_date=datetime(2022, 3, 21, 1, 0, tzinfo=pytz.timezone("America/Sao_Paulo")), + labels=[ + constants.RJ_SMS_AGENT_LABEL.value, + ], + db_database="CP01.SMF", + db_host="10.90.31.22", + db_port="1521", + db_type="oracle", + dataset_id="saude_estoque_medicamentos_sigma", + vault_secret_path="db-sigma", + table_parameters=_sigma_queries, +) + +sigma_daily_update_schedule = Schedule(clocks=untuple(sigma_infra_clocks)) From 70d5df1c25ced0ea304d7da118288aeff39f4186 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 20:04:55 +0000 Subject: [PATCH 02/41] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pipelines/rj_sms/dump_db_sigma/schedules.py | 36 ++++++++++----------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/pipelines/rj_sms/dump_db_sigma/schedules.py b/pipelines/rj_sms/dump_db_sigma/schedules.py index b9c032b52..1a2549a94 100644 --- a/pipelines/rj_sms/dump_db_sigma/schedules.py +++ b/pipelines/rj_sms/dump_db_sigma/schedules.py @@ -26,10 +26,10 @@ "materialization_mode": "prod", "dump_mode": "overwrite", "execute_query": """ - SELECT - CD_GRUPO, - CD_CLASSE, - DS_CLASSE, + SELECT + CD_GRUPO, + CD_CLASSE, + DS_CLASSE, ST_STATUS FROM SIGMA.VW_CLASSE """, # noqa @@ -40,7 +40,7 @@ "materialization_mode": "prod", "dump_mode": "overwrite", "execute_query": """ - SELECT + SELECT CPF_CNPJ, TIPO_CPF_CNPJ, INSCRICAO_MUNICIPAL, @@ -78,7 +78,7 @@ "materialization_mode": "prod", "dump_mode": "overwrite", "execute_query": """ - SELECT + SELECT CPF_CNPJ, TIPO_CPF_CNPJ, NOME, @@ -93,9 +93,9 @@ "materialization_mode": "prod", "dump_mode": "overwrite", "execute_query": """ - SELECT - CD_GRUPO, - DS_GRUPO, + SELECT + CD_GRUPO, + DS_GRUPO, ST_STATUS FROM SIGMA.VW_GRUPO """, # noqa @@ -106,7 +106,7 @@ "materialization_mode": "prod", "dump_mode": "overwrite", "execute_query": """ - SELECT + SELECT CD_MATERIAL, CD_GRUPO, CD_CLASSE, @@ -130,7 +130,7 @@ "materialization_mode": "prod", "dump_mode": "overwrite", "execute_query": """ - SELECT + SELECT CD_MATERIAL, CNPJ_FORNECEDOR, NOTA_FISCAL, @@ -163,9 +163,9 @@ "materialization_mode": "prod", "dump_mode": "overwrite", "execute_query": """ - SELECT - CD_RAMO, - DS_RAMO, + SELECT + CD_RAMO, + DS_RAMO, ST_RAMO FROM SIGMA.VW_RAMO_ATIVIDADE """, # noqa @@ -176,7 +176,7 @@ "materialization_mode": "prod", "dump_mode": "overwrite", "execute_query": """ - SELECT + SELECT CD_SERV, CD_SEQ, CD_SERVICO, @@ -191,7 +191,7 @@ "materialization_mode": "prod", "dump_mode": "overwrite", "execute_query": """ - SELECT + SELECT CD_GRUPO, CD_CLASSE, CD_SUBCLASSE, @@ -206,8 +206,8 @@ "materialization_mode": "prod", "dump_mode": "overwrite", "execute_query": """ - SELECT - UNIDADE, + SELECT + UNIDADE, DS_UNIDADE FROM SIGMA.VW_UNIDADE """, # noqa From a889247b284641278cfd43b41245ac247c4e8a9e Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Tue, 19 Sep 2023 19:44:02 -0300 Subject: [PATCH 03/41] starting inea flow 
using ftp --- .../rj_escritorio/dump_ftp_inea/flows.py | 73 +++++++++++ .../rj_escritorio/dump_ftp_inea/schedules.py | 100 +++++++++++++++ .../rj_escritorio/dump_ftp_inea/tasks.py | 120 ++++++++++++++++++ 3 files changed, 293 insertions(+) create mode 100644 pipelines/rj_escritorio/dump_ftp_inea/flows.py create mode 100644 pipelines/rj_escritorio/dump_ftp_inea/schedules.py create mode 100644 pipelines/rj_escritorio/dump_ftp_inea/tasks.py diff --git a/pipelines/rj_escritorio/dump_ftp_inea/flows.py b/pipelines/rj_escritorio/dump_ftp_inea/flows.py new file mode 100644 index 000000000..94312214a --- /dev/null +++ b/pipelines/rj_escritorio/dump_ftp_inea/flows.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- +""" +Dumping data from INEA FTP to BigQuery +""" +# pylint: disable=E1101,C0103 + +from copy import deepcopy + +from prefect import Parameter +from prefect.run_configs import LocalRun +from prefect.storage import GCS +from prefect.utilities.edges import unmapped + +from pipelines.constants import constants +from pipelines.rj_escritorio.dump_ftp_inea.tasks import ( + get_ftp_client, + get_files_to_download, + download_files, + upload_file_to_gcs, +) +from pipelines.rj_escritorio.dump_ftp_inea.schedules import ( + every_5_minutes, + every_5_minutes_mac, +) +from pipelines.utils.decorators import Flow + + +with Flow( + "INEA: Captura dados de radar (Guaratiba)", + code_owners=[ + "paty", + ], +) as inea_ftp_radar_flow: + bucket_name = Parameter("bucket_name", default="rj-escritorio-dev", required=False) + prefix = Parameter( + "prefix", default="raw/meio_ambiente_clima/inea_radar_hdf5", required=False + ) + mode = Parameter("mode", default="prod", required=False) + radar = Parameter("radar", default="mac", required=False) + product = Parameter("product", default="ppi", required=False) + + client = get_ftp_client() + + files_to_download = get_files_to_download( + client=client, + radar=radar, + ) + + files_to_upload = download_files( + client=client, + files=files_to_download, + radar=radar, + ) + + upload_file_to_gcs.map( + file_to_upload=files_to_upload, + bucket_name=unmapped(bucket_name), + prefix=unmapped(prefix), + mode=unmapped(mode), + radar=unmapped(radar), + product=unmapped(product), + ) + + +inea_ftp_radar_flow.storage = GCS(constants.GCS_FLOWS_BUCKET.value) +inea_ftp_radar_flow.run_config = LocalRun(labels=[constants.INEA_AGENT_LABEL.value]) +inea_ftp_radar_flow.schedule = every_5_minutes + +inea_ftp_radar_flow_mac = deepcopy(inea_ftp_radar_flow) +inea_ftp_radar_flow_mac.name = "INEA: Captura dados de radar (Macaé)" +inea_ftp_radar_flow_mac.storage = GCS(constants.GCS_FLOWS_BUCKET.value) +inea_ftp_radar_flow_mac.run_config = LocalRun(labels=[constants.INEA_AGENT_LABEL.value]) +inea_ftp_radar_flow_mac.schedule = every_5_minutes_mac diff --git a/pipelines/rj_escritorio/dump_ftp_inea/schedules.py b/pipelines/rj_escritorio/dump_ftp_inea/schedules.py new file mode 100644 index 000000000..ae59322ff --- /dev/null +++ b/pipelines/rj_escritorio/dump_ftp_inea/schedules.py @@ -0,0 +1,100 @@ +# -*- coding: utf-8 -*- +# pylint: disable=C0103 +""" +Schedules for the INEA flows. 
+""" + +from datetime import timedelta, datetime + +from prefect.schedules import Schedule +from prefect.schedules.clocks import IntervalClock +import pytz + +from pipelines.constants import constants + +every_5_minutes = Schedule( + clocks=[ + IntervalClock( + interval=timedelta(minutes=5), + start_date=datetime(2021, 1, 1, tzinfo=pytz.timezone("America/Sao_Paulo")), + labels=[ + constants.INEA_AGENT_LABEL.value, + ], + parameter_defaults={ + "bucket_name": "rj-escritorio-dev", + "convert_params": "-k=ODIM2.1 -M=All", + "mode": "prod", + "output_format": "HDF5", + "prefix": "raw/meio_ambiente_clima/inea_radar_hdf5", + "product": "ppi", + "radar": "gua", + "vols_remote_directory": "/var/opt/edge/vols", + }, + ), + ] +) +every_5_minutes_mac = Schedule( + clocks=[ + IntervalClock( + interval=timedelta(minutes=5), + start_date=datetime(2021, 1, 1, tzinfo=pytz.timezone("America/Sao_Paulo")), + labels=[ + constants.INEA_AGENT_LABEL.value, + ], + parameter_defaults={ + "bucket_name": "rj-escritorio-dev", + "convert_params": "-k=ODIM2.1 -M=All", + "mode": "prod", + "output_format": "HDF5", + "prefix": "raw/meio_ambiente_clima/inea_radar_hdf5", + "product": "ppi", + "radar": "mac", + "vols_remote_directory": "/var/opt/edge/vols", + }, + ), + ] +) +every_1_day = Schedule( + clocks=[ + IntervalClock( + interval=timedelta(days=1), + start_date=datetime(2021, 1, 1, tzinfo=pytz.timezone("America/Sao_Paulo")), + labels=[ + constants.INEA_AGENT_LABEL.value, + ], + parameter_defaults={ + "bucket_name": "rj-escritorio-dev", + "convert_params": "-k=ODIM2.1 -M=All", + "mode": "prod", + "output_format": "HDF5", + "prefix": "raw/meio_ambiente_clima/inea_radar_hdf5", + "product": "ppi", + "radar": "gua", + "get_only_last_file": False, + "vols_remote_directory": "/var/opt/edge/vols", + }, + ), + ] +) +every_1_day_mac = Schedule( + clocks=[ + IntervalClock( + interval=timedelta(days=1), + start_date=datetime(2021, 1, 1, tzinfo=pytz.timezone("America/Sao_Paulo")), + labels=[ + constants.INEA_AGENT_LABEL.value, + ], + parameter_defaults={ + "bucket_name": "rj-escritorio-dev", + "convert_params": "-k=ODIM2.1 -M=All", + "mode": "prod", + "output_format": "HDF5", + "prefix": "raw/meio_ambiente_clima/inea_radar_hdf5", + "product": "ppi", + "radar": "mac", + "get_only_last_file": False, + "vols_remote_directory": "/var/opt/edge/vols", + }, + ), + ] +) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py new file mode 100644 index 000000000..f63b0d92c --- /dev/null +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- +""" +Tasks to dump data from a INEA FTP to BigQuery +""" +# pylint: disable=E0702,E1137,E1136,E1101,C0207,W0613 +from datetime import datetime, timedelta +from pathlib import Path + +from google.cloud import storage +from prefect import task + +from pipelines.utils.ftp.client import FTPClient +from pipelines.utils.utils import ( + log, + get_credentials_from_env, + get_vault_secret, +) + + +@task +def get_ftp_client(wait=None): + """ + Get FTP client + """ + inea_secret = get_vault_secret("ftp_inea_radar") + hostname = inea_secret["data"]["hostname"] + username = inea_secret["data"]["username"] + password = inea_secret["data"]["password"] + + return FTPClient( + hostname=hostname, + username=username, + password=password, + ) + + +@task( + max_retries=3, + retry_delay=timedelta(seconds=30), +) +def get_files_to_download(client, radar): + """ + Get files to download FTP and GCS + """ + + client.connect() + 
files = client.list_files(path=f"./{radar.upper()}/") + files = files[-4:] + log(f"files: {files}") + + return files + + +@task( + max_retries=3, + retry_delay=timedelta(seconds=30), +) +def download_files(client, files, radar): + """ + Download files from FTP + """ + + save_path = Path(radar.upper()) + save_path.mkdir(parents=True, exist_ok=True) + + client.connect() + files_downloaded = [] + for file in files: + # file_path = save_path / file + file_path = file + client.download(remote_path=file, local_path=file_path) + files_downloaded.append(file_path) + log(f"files_downloaded: {files_downloaded}") + return files_downloaded + + +@task( + max_retries=3, + retry_delay=timedelta(seconds=30), +) +# pylint: disable=too-many-arguments, too-many-locals +def upload_file_to_gcs( + file_to_upload: str, + bucket_name: str, + prefix: str, + radar: str, + product: str, + mode="prod", + task_mode="partitioned", + unlink: bool = True, +): + """ + Upload files to GCS + """ + credentials = get_credentials_from_env(mode=mode) + storage_client = storage.Client(credentials=credentials) + + bucket = storage_client.bucket(bucket_name) + + file = Path(file_to_upload) + if file.is_file(): + if task_mode == "partitioned": + # Converted file path is in the format: + # /var/opt/edge/.../YYYYMMDD/.nc.gz + # We need to get the datetime for the file + date_str = file.parent.name + date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d") + blob_name = f"{prefix}/radar={radar}/produto={product}/data_particao={date}/{file.name}" + blob_name = blob_name.replace("//", "/") + elif task_mode == "raw": + blob_name = f"{prefix}/{file.name}" + else: + raise ValueError(f"Invalid task_mode: {task_mode}") + log(f"Uploading file {file} to GCS...") + log(f"Blob name will be {blob_name}") + blob = bucket.blob(blob_name) + blob.upload_from_filename(file) + log(f"File {file} uploaded to GCS.") + if unlink: + file.unlink() From cbc7b8c7efe7b9ddf74f5df5323092463915375f Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Tue, 19 Sep 2023 19:45:08 -0300 Subject: [PATCH 04/41] modifying init --- pipelines/rj_escritorio/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/rj_escritorio/__init__.py b/pipelines/rj_escritorio/__init__.py index 0813a42ba..a5c864245 100644 --- a/pipelines/rj_escritorio/__init__.py +++ b/pipelines/rj_escritorio/__init__.py @@ -18,6 +18,7 @@ from pipelines.rj_escritorio.waze.flows import * from pipelines.rj_escritorio.geolocator.flows import * from pipelines.rj_escritorio.inea.flows import * +from pipelines.rj_escritorio.dump_ftp_inea.flows import * from pipelines.rj_escritorio.seconserva_buracos_refresh_data.flows import * from pipelines.rj_escritorio.dump_url_turismo.flows import * from pipelines.rj_escritorio.dump_policy_matrix.flows import * From e8186624d25bd9908e0094b4aba971dcfc9b4cc8 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 20 Sep 2023 09:36:39 -0300 Subject: [PATCH 05/41] bugfix --- pipelines/utils/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/utils/__init__.py b/pipelines/utils/__init__.py index 58e3911d7..b5ffe2d88 100644 --- a/pipelines/utils/__init__.py +++ b/pipelines/utils/__init__.py @@ -9,6 +9,7 @@ from pipelines.utils.dump_to_gcs.flows import * from pipelines.utils.dump_url.flows import * from pipelines.utils.execute_dbt_model.flows import * +from pipelines.utils.ftp.client import * from pipelines.utils.georeference.flows import * from pipelines.utils.predict_flow.flows import * from pipelines.utils.whatsapp_bot.flows import 
* From d9a10d39bd99aba44ef3c28bb849d629f76eda52 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 20 Sep 2023 11:10:41 -0300 Subject: [PATCH 06/41] undoing adding on init --- pipelines/utils/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/utils/__init__.py b/pipelines/utils/__init__.py index b5ffe2d88..58e3911d7 100644 --- a/pipelines/utils/__init__.py +++ b/pipelines/utils/__init__.py @@ -9,7 +9,6 @@ from pipelines.utils.dump_to_gcs.flows import * from pipelines.utils.dump_url.flows import * from pipelines.utils.execute_dbt_model.flows import * -from pipelines.utils.ftp.client import * from pipelines.utils.georeference.flows import * from pipelines.utils.predict_flow.flows import * from pipelines.utils.whatsapp_bot.flows import * From d75a92205b7a2492d5275543680a86def53b36e3 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 20 Sep 2023 11:15:59 -0300 Subject: [PATCH 07/41] changing flows name --- pipelines/rj_escritorio/dump_ftp_inea/flows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/flows.py b/pipelines/rj_escritorio/dump_ftp_inea/flows.py index 94312214a..60e9add45 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/flows.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/flows.py @@ -26,7 +26,7 @@ with Flow( - "INEA: Captura dados de radar (Guaratiba)", + "INEA: Captura FTP dados de radar (Guaratiba)", code_owners=[ "paty", ], @@ -67,7 +67,7 @@ inea_ftp_radar_flow.schedule = every_5_minutes inea_ftp_radar_flow_mac = deepcopy(inea_ftp_radar_flow) -inea_ftp_radar_flow_mac.name = "INEA: Captura dados de radar (Macaé)" +inea_ftp_radar_flow_mac.name = "INEA: Captura FTP dados de radar (Macaé)" inea_ftp_radar_flow_mac.storage = GCS(constants.GCS_FLOWS_BUCKET.value) inea_ftp_radar_flow_mac.run_config = LocalRun(labels=[constants.INEA_AGENT_LABEL.value]) inea_ftp_radar_flow_mac.schedule = every_5_minutes_mac From f43d58a4ad0f15c4597f873760b8b55939c17be8 Mon Sep 17 00:00:00 2001 From: d116626 Date: Wed, 20 Sep 2023 11:42:14 -0300 Subject: [PATCH 08/41] chore: change from repo sms to smfp --- pipelines/rj_smfp/__init__.py | 1 + .../dump_db_sigma_medicamentos}/__init__.py | 0 .../dump_db_sigma_medicamentos/flows.py | 47 +++++++++++++++++++ .../dump_db_sigma_medicamentos}/schedules.py | 4 +- pipelines/rj_sms/__init__.py | 1 - pipelines/rj_sms/dump_db_sigma/flows.py | 47 ------------------- 6 files changed, 50 insertions(+), 50 deletions(-) rename pipelines/{rj_sms/dump_db_sigma => rj_smfp/dump_db_sigma_medicamentos}/__init__.py (100%) create mode 100644 pipelines/rj_smfp/dump_db_sigma_medicamentos/flows.py rename pipelines/{rj_sms/dump_db_sigma => rj_smfp/dump_db_sigma_medicamentos}/schedules.py (98%) delete mode 100644 pipelines/rj_sms/dump_db_sigma/flows.py diff --git a/pipelines/rj_smfp/__init__.py b/pipelines/rj_smfp/__init__.py index 022606109..ea6519e3b 100644 --- a/pipelines/rj_smfp/__init__.py +++ b/pipelines/rj_smfp/__init__.py @@ -6,6 +6,7 @@ from pipelines.rj_smfp.dump_db_ergon_comlurb.flows import * from pipelines.rj_smfp.dump_db_metas.flows import * from pipelines.rj_smfp.dump_db_sigma.flows import * +from pipelines.rj_smfp.dump_db_sigma_medicamentos.flows import * from pipelines.rj_smfp.dump_inadimplente.flows import * from pipelines.rj_smfp.dump_url_metas.flows import * from pipelines.rj_smfp.goals_dashboard_dbt.flows import * diff --git a/pipelines/rj_sms/dump_db_sigma/__init__.py b/pipelines/rj_smfp/dump_db_sigma_medicamentos/__init__.py similarity index 100% rename from 
pipelines/rj_sms/dump_db_sigma/__init__.py rename to pipelines/rj_smfp/dump_db_sigma_medicamentos/__init__.py diff --git a/pipelines/rj_smfp/dump_db_sigma_medicamentos/flows.py b/pipelines/rj_smfp/dump_db_sigma_medicamentos/flows.py new file mode 100644 index 000000000..d8b6bd62e --- /dev/null +++ b/pipelines/rj_smfp/dump_db_sigma_medicamentos/flows.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +""" +Database dumping flows for SMFP SIGMA MEDICAMENTOS +""" + +from copy import deepcopy + +from prefect.run_configs import KubernetesRun +from prefect.storage import GCS + +from pipelines.constants import constants + +# importa o schedule +from pipelines.rj_smfp.dump_db_sigma_medicamentos.schedules import ( + sigma_daily_update_schedule, +) +from pipelines.utils.dump_db.flows import dump_sql_flow +from pipelines.utils.utils import set_default_parameters + +rj_smfp_dump_db_sigma_medicamentos_flow = deepcopy(dump_sql_flow) +rj_smfp_dump_db_sigma_medicamentos_flow.name = ( + "SMFP: SIGMA - MEDICAMENTOS - Ingerir tabelas de banco SQL" +) +rj_smfp_dump_db_sigma_medicamentos_flow.storage = GCS(constants.GCS_FLOWS_BUCKET.value) + +rj_smfp_dump_db_sigma_medicamentos_flow.run_config = KubernetesRun( + image=constants.DOCKER_IMAGE.value, + labels=[ + constants.RJ_SMFP_AGENT_LABEL.value, # label do agente + ], +) + +rj_smfp_dump_db_sigma_medicamentos_default_parameters = { + "db_database": "CP01.SMF", + "db_host": "10.90.31.22", + "db_port": "1521", + "db_type": "oracle", + "dataset_id": "saude_medicamentos_sigma", + "vault_secret_path": "db-sigma", +} + +rj_smfp_dump_db_sigma_medicamentos_flow = set_default_parameters( + rj_smfp_dump_db_sigma_medicamentos_flow, + default_parameters=rj_smfp_dump_db_sigma_medicamentos_default_parameters, +) + +rj_smfp_dump_db_sigma_medicamentos_flow.schedule = sigma_daily_update_schedule diff --git a/pipelines/rj_sms/dump_db_sigma/schedules.py b/pipelines/rj_smfp/dump_db_sigma_medicamentos/schedules.py similarity index 98% rename from pipelines/rj_sms/dump_db_sigma/schedules.py rename to pipelines/rj_smfp/dump_db_sigma_medicamentos/schedules.py index 1a2549a94..0f61241ff 100644 --- a/pipelines/rj_sms/dump_db_sigma/schedules.py +++ b/pipelines/rj_smfp/dump_db_sigma_medicamentos/schedules.py @@ -218,13 +218,13 @@ interval=timedelta(days=1), start_date=datetime(2022, 3, 21, 1, 0, tzinfo=pytz.timezone("America/Sao_Paulo")), labels=[ - constants.RJ_SMS_AGENT_LABEL.value, + constants.RJ_SMFP_AGENT_LABEL.value, ], db_database="CP01.SMF", db_host="10.90.31.22", db_port="1521", db_type="oracle", - dataset_id="saude_estoque_medicamentos_sigma", + dataset_id="saude_medicamentos_sigma", vault_secret_path="db-sigma", table_parameters=_sigma_queries, ) diff --git a/pipelines/rj_sms/__init__.py b/pipelines/rj_sms/__init__.py index eab9c1cc4..e73a968a6 100644 --- a/pipelines/rj_sms/__init__.py +++ b/pipelines/rj_sms/__init__.py @@ -3,6 +3,5 @@ Prefect flows for rj_sms project """ -from pipelines.rj_sms.dump_db_sigma.flows import * from pipelines.rj_sms.dump_db_sivep.flows import * from pipelines.rj_sms.pubsub.flows import * diff --git a/pipelines/rj_sms/dump_db_sigma/flows.py b/pipelines/rj_sms/dump_db_sigma/flows.py deleted file mode 100644 index 8c238a995..000000000 --- a/pipelines/rj_sms/dump_db_sigma/flows.py +++ /dev/null @@ -1,47 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Database dumping flows for SMS SIGMA system -""" - -from copy import deepcopy - -from prefect.run_configs import KubernetesRun -from prefect.storage import GCS - -from pipelines.constants import constants - -# importa o 
schedule -from pipelines.rj_sms.dump_db_sigma.schedules import ( - sigma_daily_update_schedule, -) -from pipelines.utils.dump_db.flows import dump_sql_flow -from pipelines.utils.utils import set_default_parameters - -rj_sms_dump_db_sigma_flow = deepcopy(dump_sql_flow) -rj_sms_dump_db_sigma_flow.name = ( - "SMS: SIGMA - ESTOQUE MEDICAMENTOS - Ingerir tabelas de banco SQL" -) -rj_sms_dump_db_sigma_flow.storage = GCS(constants.GCS_FLOWS_BUCKET.value) - -rj_sms_dump_db_sigma_flow.run_config = KubernetesRun( - image=constants.DOCKER_IMAGE.value, - labels=[ - constants.RJ_SMS_AGENT_LABEL.value, # label do agente - ], -) - -rj_sms_dump_db_sigma_default_parameters = { - "db_database": "CP01.SMF", - "db_host": "10.90.31.22", - "db_port": "1521", - "db_type": "oracle", - "dataset_id": "saude_estoque_medicamentos_sigma", - "vault_secret_path": "db-sigma", -} - -rj_sms_dump_db_sigma_flow = set_default_parameters( - rj_sms_dump_db_sigma_flow, - default_parameters=rj_sms_dump_db_sigma_default_parameters, -) - -rj_sms_dump_db_sigma_flow.schedule = sigma_daily_update_schedule From 67304638e4e70b6fd2018ec08aa737e91e48dee5 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 20 Sep 2023 14:49:04 -0300 Subject: [PATCH 09/41] changing run --- pipelines/rj_escritorio/dump_ftp_inea/flows.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/flows.py b/pipelines/rj_escritorio/dump_ftp_inea/flows.py index 60e9add45..da5d9c0af 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/flows.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/flows.py @@ -7,7 +7,7 @@ from copy import deepcopy from prefect import Parameter -from prefect.run_configs import LocalRun +from prefect.run_configs import KubernetesRun from prefect.storage import GCS from prefect.utilities.edges import unmapped @@ -63,11 +63,17 @@ inea_ftp_radar_flow.storage = GCS(constants.GCS_FLOWS_BUCKET.value) -inea_ftp_radar_flow.run_config = LocalRun(labels=[constants.INEA_AGENT_LABEL.value]) +inea_ftp_radar_flow.run_config = KubernetesRun( + image=constants.DOCKER_IMAGE.value, + labels=[constants.RJ_ESCRITORIO_DEV_AGENT_LABEL.value], +) inea_ftp_radar_flow.schedule = every_5_minutes inea_ftp_radar_flow_mac = deepcopy(inea_ftp_radar_flow) inea_ftp_radar_flow_mac.name = "INEA: Captura FTP dados de radar (Macaé)" inea_ftp_radar_flow_mac.storage = GCS(constants.GCS_FLOWS_BUCKET.value) -inea_ftp_radar_flow_mac.run_config = LocalRun(labels=[constants.INEA_AGENT_LABEL.value]) +inea_ftp_radar_flow_mac.run_config = KubernetesRun( + image=constants.DOCKER_IMAGE.value, + labels=[constants.RJ_ESCRITORIO_DEV_AGENT_LABEL.value], +) inea_ftp_radar_flow_mac.schedule = every_5_minutes_mac From 1f4b1357f94116499b1f099e33796d8f1768d466 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 20 Sep 2023 15:29:38 -0300 Subject: [PATCH 10/41] bugfix --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index f63b0d92c..13c1f9eb1 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -71,6 +71,8 @@ def download_files(client, files, radar): client.download(remote_path=file, local_path=file_path) files_downloaded.append(file_path) log(f"files_downloaded: {files_downloaded}") + file = Path(files_downloaded[0]) + log(f"DEBUGGGG: {file.name}") return files_downloaded @@ -103,7 +105,8 @@ def 
upload_file_to_gcs( # Converted file path is in the format: # /var/opt/edge/.../YYYYMMDD/.nc.gz # We need to get the datetime for the file - date_str = file.parent.name + log(f"DEBUG: {file} e {file.name}") + date_str = file.split("-")[2] date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d") blob_name = f"{prefix}/radar={radar}/produto={product}/data_particao={date}/{file.name}" blob_name = blob_name.replace("//", "/") From 6820a5558a2e9113834386008d035e8f08f87b3a Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 20 Sep 2023 19:44:32 -0300 Subject: [PATCH 11/41] bugfix --- .../rj_escritorio/dump_ftp_inea/tasks.py | 35 +++++++++---------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index 13c1f9eb1..c9a888dbb 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -99,25 +99,22 @@ def upload_file_to_gcs( bucket = storage_client.bucket(bucket_name) - file = Path(file_to_upload) - if file.is_file(): - if task_mode == "partitioned": - # Converted file path is in the format: - # /var/opt/edge/.../YYYYMMDD/.nc.gz - # We need to get the datetime for the file - log(f"DEBUG: {file} e {file.name}") - date_str = file.split("-")[2] - date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d") - blob_name = f"{prefix}/radar={radar}/produto={product}/data_particao={date}/{file.name}" - blob_name = blob_name.replace("//", "/") - elif task_mode == "raw": - blob_name = f"{prefix}/{file.name}" - else: - raise ValueError(f"Invalid task_mode: {task_mode}") - log(f"Uploading file {file} to GCS...") + if task_mode == "partitioned": + # We need to get the datetime for the file + log(f"DEBUG: {file_to_upload} e {file_to_upload.name}") + date_str = file_to_upload.split("-")[2] + date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d") + blob_name = ( + f"{prefix}/radar={radar}/produto={product}/" + f"data_particao={date}/{file_to_upload.name}" + ) + blob_name = blob_name.replace("//", "/") + elif task_mode == "raw": + blob_name = f"{prefix}/{file_to_upload.name}" + log(f"Uploading file {file_to_upload} to GCS...") log(f"Blob name will be {blob_name}") blob = bucket.blob(blob_name) - blob.upload_from_filename(file) - log(f"File {file} uploaded to GCS.") + blob.upload_from_filename(file_to_upload) + log(f"File {file_to_upload} uploaded to GCS.") if unlink: - file.unlink() + file_to_upload.unlink() From b42a3ed15a0cc073947ae589909f186733d982dd Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 20 Sep 2023 21:10:51 -0300 Subject: [PATCH 12/41] bugfix --- .../rj_escritorio/dump_ftp_inea/tasks.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index c9a888dbb..101434d5e 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -72,7 +72,7 @@ def download_files(client, files, radar): files_downloaded.append(file_path) log(f"files_downloaded: {files_downloaded}") file = Path(files_downloaded[0]) - log(f"DEBUGGGG: {file.name}") + log(f"DEBUGGGG: {file.name.split('-')[2]}") return files_downloaded @@ -99,22 +99,21 @@ def upload_file_to_gcs( bucket = storage_client.bucket(bucket_name) + file = Path(file_to_upload) if task_mode == "partitioned": - # We need to get the datetime for the file - log(f"DEBUG: {file_to_upload} e 
{file_to_upload.name}") - date_str = file_to_upload.split("-")[2] + log(f"DEBUG: {file} e {file.name}") + date_str = file.name.split("-")[2] date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d") blob_name = ( - f"{prefix}/radar={radar}/produto={product}/" - f"data_particao={date}/{file_to_upload.name}" + f"{prefix}/radar={radar}/produto={product}/data_particao={date}/{file.name}" ) blob_name = blob_name.replace("//", "/") elif task_mode == "raw": - blob_name = f"{prefix}/{file_to_upload.name}" - log(f"Uploading file {file_to_upload} to GCS...") + blob_name = f"{prefix}/{file.name}" + log(f"Uploading file {file} to GCS...") log(f"Blob name will be {blob_name}") blob = bucket.blob(blob_name) - blob.upload_from_filename(file_to_upload) - log(f"File {file_to_upload} uploaded to GCS.") + blob.upload_from_filename(file) + log(f"File {file} uploaded to GCS.") if unlink: - file_to_upload.unlink() + file.unlink() From cfb370dc031c7c1d684ae793cfd728185d11e8c6 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 20 Sep 2023 21:34:09 -0300 Subject: [PATCH 13/41] bugfix --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index 101434d5e..40fabcdc4 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -110,10 +110,11 @@ def upload_file_to_gcs( blob_name = blob_name.replace("//", "/") elif task_mode == "raw": blob_name = f"{prefix}/{file.name}" - log(f"Uploading file {file} to GCS...") - log(f"Blob name will be {blob_name}") - blob = bucket.blob(blob_name) - blob.upload_from_filename(file) - log(f"File {file} uploaded to GCS.") - if unlink: - file.unlink() + + log(f"Uploading file {file} to GCS...") + log(f"Blob name will be {blob_name}") + blob = bucket.blob(blob_name) + blob.upload_from_filename(file) + log(f"File {file} uploaded to GCS.") + if unlink: + file.unlink() From 885f4f84652d172e82c531e490823e70cd80ad48 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 21 Sep 2023 10:36:06 -0300 Subject: [PATCH 14/41] saving filenames on redis --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index 40fabcdc4..dc76f8051 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -8,6 +8,8 @@ from google.cloud import storage from prefect import task +from prefect.engine.signals import ENDRUN +from prefect.engine.state import Skipped from pipelines.utils.ftp.client import FTPClient from pipelines.utils.utils import ( @@ -38,15 +40,25 @@ def get_ftp_client(wait=None): max_retries=3, retry_delay=timedelta(seconds=30), ) -def get_files_to_download(client, radar): +def get_files_to_download(client, radar, redis_files): """ Get files to download FTP and GCS """ client.connect() files = client.list_files(path=f"./{radar.upper()}/") - files = files[-4:] - log(f"files: {files}") + log(f"\n\nAvailable files on FTP: {files}") + log(f"\nFiles already saved on redis_files: {redis_files}") + files = [file for file in files if file not in redis_files] + log(f"\nFiles to be downloaded: {files}") + files = files[-4:] # remover + log(f"\nFiles to be downloaded: {files}") + + # Skip task if there is no new file + if len(files) == 0: + 
log("No new available files") + skip = Skipped("No new available files") + raise ENDRUN(state=skip) return files From 3e0dac6b0a99c5019cfabdaf6119950d4ea81e34 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 21 Sep 2023 11:04:12 -0300 Subject: [PATCH 15/41] saving filenames on redis --- pipelines/rj_escritorio/dump_ftp_inea/flows.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/flows.py b/pipelines/rj_escritorio/dump_ftp_inea/flows.py index da5d9c0af..5f9bdea3f 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/flows.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/flows.py @@ -22,6 +22,10 @@ every_5_minutes, every_5_minutes_mac, ) +from pipelines.rj_cor.tasks import ( + get_on_redis, + save_on_redis, +) from pipelines.utils.decorators import Flow @@ -41,9 +45,14 @@ client = get_ftp_client() + redis_files = get_on_redis( + dataset_id="meio_ambiente_clima", table_id=radar, mode=mode + ) + files_to_download = get_files_to_download( client=client, radar=radar, + redis_files=redis_files, ) files_to_upload = download_files( @@ -52,7 +61,7 @@ radar=radar, ) - upload_file_to_gcs.map( + upload_files = upload_file_to_gcs.map( file_to_upload=files_to_upload, bucket_name=unmapped(bucket_name), prefix=unmapped(prefix), @@ -61,6 +70,13 @@ product=unmapped(product), ) + save_on_redis( + dataset_id="meio_ambiente_clima", + table_id=radar, + mode=mode, + files=files_to_upload, + wait=upload_files, + ) inea_ftp_radar_flow.storage = GCS(constants.GCS_FLOWS_BUCKET.value) inea_ftp_radar_flow.run_config = KubernetesRun( From 33e571d143a002deba122f11d45abbb4227a3a38 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 21 Sep 2023 16:24:26 -0300 Subject: [PATCH 16/41] adding read files from datalake and choose if wants last file --- .../rj_escritorio/dump_ftp_inea/flows.py | 15 +++ .../rj_escritorio/dump_ftp_inea/tasks.py | 121 +++++++++++++++++- 2 files changed, 132 insertions(+), 4 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/flows.py b/pipelines/rj_escritorio/dump_ftp_inea/flows.py index 5f9bdea3f..fb1ee08c4 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/flows.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/flows.py @@ -14,6 +14,7 @@ from pipelines.constants import constants from pipelines.rj_escritorio.dump_ftp_inea.tasks import ( get_ftp_client, + get_files_datalake, get_files_to_download, download_files, upload_file_to_gcs, @@ -36,6 +37,9 @@ ], ) as inea_ftp_radar_flow: bucket_name = Parameter("bucket_name", default="rj-escritorio-dev", required=False) + date = Parameter("date", default=None, required=False) + get_only_last_file = Parameter("get_only_last_file", default=True, required=False) + greater_than = Parameter("greater_than", default=None, required=False) prefix = Parameter( "prefix", default="raw/meio_ambiente_clima/inea_radar_hdf5", required=False ) @@ -49,10 +53,21 @@ dataset_id="meio_ambiente_clima", table_id=radar, mode=mode ) + datalake_files = get_files_datalake( + bucket_name=bucket_name, + prefix=prefix, + radar=radar, + product=product, + date=date, + mode=mode, + ) + files_to_download = get_files_to_download( client=client, radar=radar, redis_files=redis_files, + datalake_files=datalake_files, + get_only_last_file=get_only_last_file, ) files_to_upload = download_files( diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index dc76f8051..be871d4b2 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ 
b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -5,6 +5,7 @@ # pylint: disable=E0702,E1137,E1136,E1101,C0207,W0613 from datetime import datetime, timedelta from pathlib import Path +from typing import List, Tuple from google.cloud import storage from prefect import task @@ -16,9 +17,96 @@ log, get_credentials_from_env, get_vault_secret, + list_blobs_with_prefix, ) +@task( + nout=2, + max_retries=2, + retry_delay=timedelta(seconds=10), +) +# pylint: disable=too-many-arguments,too-many-locals, too-many-branches +def get_files_datalake( + bucket_name: str, + prefix: str, + radar: str, + product: str, + date: str = None, + greater_than: str = None, + mode: str = "prod", +) -> Tuple[List[str], str]: + """ + List files from INEA server + + Args: + product (str): "ppi" + date (str): Date of the files to be fetched (e.g. 2022-01-25) + greater_than (str): Fetch files with a date greater than this one + less_than (str): Fetch files with a date less than this one + output_directory (str): Directory where the files will be saved + radar (str): Radar name. Must be `gua` or `mac` + get_only_last_file (bool): Treat only the last file available + + How to use: + to get real time data: + let `greater_than` and `date` as None and `get_only_last_file` as True + This will prevent the flow to be stucked treating all files when something happend + and stoped the flow. Otherwise the flow will take a long time to treat all files + and came back to real time. + to fill missing files up to two days ago: + let `greater_than` and `date` as None and `get_only_last_file` as False + for backfill or to fill missing files for dates greather than two days ago: + add a `greater_than` date and let `date` as None and `get_only_last_file` as False + get all files for one day + let `greater_than` as None and `get_only_last_file` as False and fill `date` + """ + search_prefix = f"{prefix}/radar={radar}/produto={product}" + + # Get today's blobs + current_date = datetime.now().date() + current_date_str = current_date.strftime("%Y-%m-%d") + blobs = list_blobs_with_prefix( + bucket_name=bucket_name, + prefix=f"{search_prefix}/data_particao={current_date_str}", + mode=mode, + ) + log( + f"Searched for blobs with prefix {search_prefix}/data_particao={current_date_str}" + ) + + if greater_than is None: + greater_than = current_date - timedelta(days=1) + else: + greater_than = datetime.strptime(greater_than, "%Y-%m-%d") + + # Next, we get past day's blobs + past_date = greater_than.date() + while past_date < current_date: + past_date_str = past_date.strftime("%Y-%m-%d") + past_blobs = list_blobs_with_prefix( + bucket_name=bucket_name, + prefix=f"{search_prefix}/data_particao={past_date_str}", + mode=mode, + ) + log( + f"Searched for blobs with prefix {search_prefix}/data_particao={past_date_str}" + ) + # Then, we merge the two lists + blobs += past_blobs + past_date += timedelta(days=1) + + # Now, we sort it by `blob.name` + blobs.sort(key=lambda blob: blob.name) + # Get only the filenames + datalake_files = [blob.name.split("/")[-1] for blob in blobs] + # Format of the name is 9921GUA-PPIVol-20220930-121010-0004.hdf + # We need remove the last characters to stay with 9921GUA-PPIVol-20220930-121010 + datalake_files = ["-".join(fname.split("-")[:-1]) for fname in datalake_files] + + return datalake_files + + @task def get_ftp_client(wait=None): """ @@ -40,7 +128,13 @@ def get_ftp_client(wait=None): max_retries=3, retry_delay=timedelta(seconds=30), ) -def get_files_to_download(client, radar, redis_files): +def get_files_to_download( 
+ client, + radar, + redis_files, + datalake_files, + get_only_last_file: bool = True, +): """ Get files to download FTP and GCS """ @@ -49,10 +143,19 @@ def get_files_to_download(client, radar, redis_files): files = client.list_files(path=f"./{radar.upper()}/") log(f"\n\nAvailable files on FTP: {files}") log(f"\nFiles already saved on redis_files: {redis_files}") + # Files obtained direct from INEA ends with 0000 as "9915MAC-PPIVol-20230921-123000-0000.hdf" + # Files from FTP ends with an alphanumeric string as "9915MAC-PPIVol-20230921-142000-54d4.hdf" + # We need to be careful when changing one pipeline to other + # Check if files are already on redis files = [file for file in files if file not in redis_files] - log(f"\nFiles to be downloaded: {files}") - files = files[-4:] # remover - log(f"\nFiles to be downloaded: {files}") + + # Check if files are already on datalake + if len(datalake_files) > 0: + files = [ + file + for file in files + if "-".join(file.split("-")[:-1]) not in datalake_files + ] # Skip task if there is no new file if len(files) == 0: @@ -60,6 +163,16 @@ def get_files_to_download(client, radar, redis_files): skip = Skipped("No new available files") raise ENDRUN(state=skip) + files.sort() + + log(f"\nFiles to be downloaded: {files}") + if len(files) > 20: + files = files[-20:] # remover + + if get_only_last_file: + files = files[-1] + log(f"\nFiles to be downloaded: {files}") + return files From 1531bbf3212a1bf206e3a8a1146788c7d4f88cd6 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 21 Sep 2023 16:58:53 -0300 Subject: [PATCH 17/41] bugfix on date --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index be871d4b2..119276668 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -76,12 +76,12 @@ def get_files_datalake( ) if greater_than is None: - greater_than = current_date - timedelta(days=1) + past_date = current_date - timedelta(days=1) else: - greater_than = datetime.strptime(greater_than, "%Y-%m-%d") + past_date = datetime.strptime(greater_than, "%Y-%m-%d") + past_date = past_date.date() # Next, we get past day's blobs - past_date = greater_than.date() while past_date < current_date: past_date_str = past_date.strftime("%Y-%m-%d") past_blobs = list_blobs_with_prefix( From fe532f328405b2ba14b8ff0a1389dd889dc6eff0 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Fri, 22 Sep 2023 11:42:19 -0300 Subject: [PATCH 18/41] bugfix --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index 119276668..ff13b0586 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -170,9 +170,8 @@ def get_files_to_download( files = files[-20:] # remover if get_only_last_file: - files = files[-1] + files = list(files[-1]) log(f"\nFiles to be downloaded: {files}") - return files @@ -191,11 +190,12 @@ def download_files(client, files, radar): client.connect() files_downloaded = [] for file in files: + log(f"Downloading file: {file}") # file_path = save_path / file file_path = file client.download(remote_path=file, local_path=file_path) files_downloaded.append(file_path) - log(f"files_downloaded: {files_downloaded}") + log(f"Downloaded: 
{files_downloaded}") file = Path(files_downloaded[0]) log(f"DEBUGGGG: {file.name.split('-')[2]}") return files_downloaded From 9e356b62fe9861a5c57d92358132ed99d1a922bb Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Fri, 22 Sep 2023 19:27:59 -0300 Subject: [PATCH 19/41] bugfix --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index ff13b0586..ec619333f 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -170,7 +170,7 @@ def get_files_to_download( files = files[-20:] # remover if get_only_last_file: - files = list(files[-1]) + files = [files[-1]] log(f"\nFiles to be downloaded: {files}") return files From 009a099dddec72eb058a821e5f756bf233aec02d Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Mon, 25 Sep 2023 16:52:56 -0300 Subject: [PATCH 20/41] add date and greater_than parameters and flows schedulers --- .../rj_escritorio/dump_ftp_inea/flows.py | 55 ++++-- .../rj_escritorio/dump_ftp_inea/schedules.py | 24 +-- .../rj_escritorio/dump_ftp_inea/tasks.py | 164 ++++++++++-------- 3 files changed, 140 insertions(+), 103 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/flows.py b/pipelines/rj_escritorio/dump_ftp_inea/flows.py index fb1ee08c4..eba999e77 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/flows.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/flows.py @@ -2,7 +2,7 @@ """ Dumping data from INEA FTP to BigQuery """ -# pylint: disable=E1101,C0103 +# pylint: disable=E1101,C0103,bad-continuation from copy import deepcopy @@ -22,24 +22,23 @@ from pipelines.rj_escritorio.dump_ftp_inea.schedules import ( every_5_minutes, every_5_minutes_mac, + every_1_day, + every_1_day_mac, ) -from pipelines.rj_cor.tasks import ( - get_on_redis, - save_on_redis, -) +from pipelines.rj_cor.tasks import get_on_redis, save_on_redis from pipelines.utils.decorators import Flow with Flow( - "INEA: Captura FTP dados de radar (Guaratiba)", - code_owners=[ - "paty", - ], + "INEA: Captura FTP dados de radar (Guaratiba)", code_owners=["paty"] ) as inea_ftp_radar_flow: bucket_name = Parameter("bucket_name", default="rj-escritorio-dev", required=False) date = Parameter("date", default=None, required=False) get_only_last_file = Parameter("get_only_last_file", default=True, required=False) greater_than = Parameter("greater_than", default=None, required=False) + check_datalake_files = Parameter( + "check_datalake_files", default=True, required=False + ) prefix = Parameter( "prefix", default="raw/meio_ambiente_clima/inea_radar_hdf5", required=False ) @@ -59,6 +58,8 @@ radar=radar, product=product, date=date, + greater_than=greater_than, + check_datalake_files=check_datalake_files, mode=mode, ) @@ -71,9 +72,7 @@ ) files_to_upload = download_files( - client=client, - files=files_to_download, - radar=radar, + client=client, files=files_to_download, radar=radar ) upload_files = upload_file_to_gcs.map( @@ -90,6 +89,7 @@ table_id=radar, mode=mode, files=files_to_upload, + keep_last=14400, # last 30 days files wait=upload_files, ) @@ -108,3 +108,34 @@ labels=[constants.RJ_ESCRITORIO_DEV_AGENT_LABEL.value], ) inea_ftp_radar_flow_mac.schedule = every_5_minutes_mac + +inea_ftp_radar_flow_fill_missing = deepcopy(inea_ftp_radar_flow) +inea_ftp_radar_flow_fill_missing.name = ( + "INEA: Captura FTP dados de radar (Guaratiba): preenchimento de arquivos faltantes" +) 
+inea_ftp_radar_flow_fill_missing.storage = GCS(constants.GCS_FLOWS_BUCKET.value) +inea_ftp_radar_flow_fill_missing.run_config = KubernetesRun( + image=constants.DOCKER_IMAGE.value, + labels=[constants.RJ_ESCRITORIO_DEV_AGENT_LABEL.value], +) +inea_ftp_radar_flow_fill_missing.schedule = every_1_day + +inea_ftp_radar_flow_fill_missing_mac = deepcopy(inea_ftp_radar_flow) +inea_ftp_radar_flow_fill_missing_mac.name = ( + "INEA: Captura FTP dados de radar (Macaé): preenchimento de arquivos faltantes" +) +inea_ftp_radar_flow_fill_missing_mac.storage = GCS(constants.GCS_FLOWS_BUCKET.value) +inea_ftp_radar_flow_fill_missing_mac.run_config = KubernetesRun( + image=constants.DOCKER_IMAGE.value, + labels=[constants.RJ_ESCRITORIO_DEV_AGENT_LABEL.value], +) +inea_ftp_radar_flow_fill_missing_mac.schedule = every_1_day_mac + +inea_ftp_backfill_radar_flow = deepcopy(inea_ftp_radar_flow) +inea_ftp_backfill_radar_flow.name = "INEA: Captura dados de radar (backfill)" +inea_ftp_backfill_radar_flow.storage = GCS(constants.GCS_FLOWS_BUCKET.value) +inea_ftp_backfill_radar_flow.run_config = KubernetesRun( + image=constants.DOCKER_IMAGE.value, + labels=[constants.RJ_ESCRITORIO_DEV_AGENT_LABEL.value], +) +inea_ftp_backfill_radar_flow.schedule = None diff --git a/pipelines/rj_escritorio/dump_ftp_inea/schedules.py b/pipelines/rj_escritorio/dump_ftp_inea/schedules.py index ae59322ff..a8db99996 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/schedules.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/schedules.py @@ -17,9 +17,7 @@ IntervalClock( interval=timedelta(minutes=5), start_date=datetime(2021, 1, 1, tzinfo=pytz.timezone("America/Sao_Paulo")), - labels=[ - constants.INEA_AGENT_LABEL.value, - ], + labels=[constants.INEA_AGENT_LABEL.value], parameter_defaults={ "bucket_name": "rj-escritorio-dev", "convert_params": "-k=ODIM2.1 -M=All", @@ -30,7 +28,7 @@ "radar": "gua", "vols_remote_directory": "/var/opt/edge/vols", }, - ), + ) ] ) every_5_minutes_mac = Schedule( @@ -38,9 +36,7 @@ IntervalClock( interval=timedelta(minutes=5), start_date=datetime(2021, 1, 1, tzinfo=pytz.timezone("America/Sao_Paulo")), - labels=[ - constants.INEA_AGENT_LABEL.value, - ], + labels=[constants.INEA_AGENT_LABEL.value], parameter_defaults={ "bucket_name": "rj-escritorio-dev", "convert_params": "-k=ODIM2.1 -M=All", @@ -51,7 +47,7 @@ "radar": "mac", "vols_remote_directory": "/var/opt/edge/vols", }, - ), + ) ] ) every_1_day = Schedule( @@ -59,9 +55,7 @@ IntervalClock( interval=timedelta(days=1), start_date=datetime(2021, 1, 1, tzinfo=pytz.timezone("America/Sao_Paulo")), - labels=[ - constants.INEA_AGENT_LABEL.value, - ], + labels=[constants.INEA_AGENT_LABEL.value], parameter_defaults={ "bucket_name": "rj-escritorio-dev", "convert_params": "-k=ODIM2.1 -M=All", @@ -73,7 +67,7 @@ "get_only_last_file": False, "vols_remote_directory": "/var/opt/edge/vols", }, - ), + ) ] ) every_1_day_mac = Schedule( @@ -81,9 +75,7 @@ IntervalClock( interval=timedelta(days=1), start_date=datetime(2021, 1, 1, tzinfo=pytz.timezone("America/Sao_Paulo")), - labels=[ - constants.INEA_AGENT_LABEL.value, - ], + labels=[constants.INEA_AGENT_LABEL.value], parameter_defaults={ "bucket_name": "rj-escritorio-dev", "convert_params": "-k=ODIM2.1 -M=All", @@ -95,6 +87,6 @@ "get_only_last_file": False, "vols_remote_directory": "/var/opt/edge/vols", }, - ), + ) ] ) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index ec619333f..f90f23ac4 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ 
b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -2,7 +2,7 @@ """ Tasks to dump data from a INEA FTP to BigQuery """ -# pylint: disable=E0702,E1137,E1136,E1101,C0207,W0613 +# pylint: disable=E0702,E1137,E1136,E1101,W0613,bad-continuation from datetime import datetime, timedelta from pathlib import Path from typing import List, Tuple @@ -21,11 +21,7 @@ ) -@task( - nout=2, - max_retries=2, - retry_delay=timedelta(seconds=10), -) +@task(nout=2, max_retries=2, retry_delay=timedelta(seconds=10)) # pylint: disable=too-many-arguments,too-many-locals, too-many-branches def get_files_datalake( bucket_name: str, @@ -34,17 +30,16 @@ def get_files_datalake( product: str, date: str = None, greater_than: str = None, + check_datalake_files: bool = True, mode: str = "prod", ) -> Tuple[List[str], str]: """ - List files from INEA server + List files from INEA saved on datalake Args: product (str): "ppi" date (str): Date of the files to be fetched (e.g. 2022-01-25) - greater_than (str): Fetch files with a date greater than this one - less_than (str): Fetch files with a date less than this one - output_directory (str): Directory where the files will be saved + greater_than (str): Fetch files with a date greater than this one (e.g. 2022-01-25) radar (str): Radar name. Must be `gua` or `mac` get_only_last_file (bool): Treat only the last file available @@ -61,48 +56,48 @@ def get_files_datalake( get all files for one day let `greater_than` as None and `get_only_last_file` as False and fill `date` """ - search_prefix = f"{prefix}/radar={radar}/produto={product}" - - # Get today's blobs - current_date = datetime.now().date() - current_date_str = current_date.strftime("%Y-%m-%d") - blobs = list_blobs_with_prefix( - bucket_name=bucket_name, - prefix=f"{search_prefix}/data_particao={current_date_str}", - mode=mode, - ) - log( - f"Searched for blobs with prefix {search_prefix}/data_particao={current_date_str}" - ) - - if greater_than is None: - past_date = current_date - timedelta(days=1) + + if check_datalake_files: + search_prefix = f"{prefix}/radar={radar}/produto={product}" + + # Get today's blobs + if date: + current_date = datetime.strptime(date, "%Y-%m-%d") + else: + current_date = datetime.now().date() + + if greater_than is None: + past_date = current_date - timedelta(days=1) + else: + past_date = datetime.strptime(greater_than, "%Y-%m-%d") + past_date = past_date.date() + + blobs = [] + # Next, we get past day's blobs + while past_date <= current_date: + past_date_str = past_date.strftime("%Y-%m-%d") + past_blobs = list_blobs_with_prefix( + bucket_name=bucket_name, + prefix=f"{search_prefix}/data_particao={past_date_str}", + mode=mode, + ) + log( + f"Searched for blobs with prefix {search_prefix}/data_particao={past_date_str}" + ) + # Then, we merge the two lists + blobs += past_blobs + past_date += timedelta(days=1) + + # Now, we sort it by `blob.name` + blobs.sort(key=lambda blob: blob.name) + # Get only the filenames + datalake_files = [blob.name.split("/")[-1] for blob in blobs] + # Format of the name is 9921GUA-PPIVol-20220930-121010-0004.hdf + # We need remove the last characters to stay with 9921GUA-PPIVol-20220930-121010 + datalake_files = ["-".join(fname.split("-")[:-1]) for fname in datalake_files] + else: - past_date = datetime.strptime(greater_than, "%Y-%m-%d") - past_date = past_date.date() - - # Next, we get past day's blobs - while past_date < current_date: - past_date_str = past_date.strftime("%Y-%m-%d") - past_blobs = list_blobs_with_prefix( - bucket_name=bucket_name, - 
prefix=f"{search_prefix}/data_particao={past_date_str}", - mode=mode, - ) - log( - f"Searched for blobs with prefix {search_prefix}/data_particao={past_date_str}" - ) - # Then, we merge the two lists - blobs += past_blobs - past_date += timedelta(days=1) - - # Now, we sort it by `blob.name` - blobs.sort(key=lambda blob: blob.name) - # Get only the filenames - datalake_files = [blob.name.split("/")[-1] for blob in blobs] - # Format of the name is 9921GUA-PPIVol-20220930-121010-0004.hdf - # We need remove the last characters to stay with 9921GUA-PPIVol-20220930-121010 - datalake_files = ["-".join(fname.split("-")[:-1]) for fname in datalake_files] + datalake_files = [] return datalake_files @@ -117,35 +112,64 @@ def get_ftp_client(wait=None): username = inea_secret["data"]["username"] password = inea_secret["data"]["password"] - return FTPClient( - hostname=hostname, - username=username, - password=password, - ) + return FTPClient(hostname=hostname, username=username, password=password) -@task( - max_retries=3, - retry_delay=timedelta(seconds=30), -) +@task(max_retries=3, retry_delay=timedelta(seconds=30)) +# pylint: disable=too-many-arguments def get_files_to_download( client, radar, redis_files, datalake_files, + date: str = None, + greater_than: str = None, get_only_last_file: bool = True, ): """ - Get files to download FTP and GCS + List and get files to download FTP + + Args: + radar (str): Radar name. Must be `gua` or `mac` + redis_files (list): List with last files saved on GCP and redis + datalake_files (list): List with filenames saved on GCP + date (str): Date of the files to be fetched (e.g. 2022-01-25) + greater_than (str): Fetch files with a date greater than this one (e.g. 2022-01-25) + get_only_last_file (bool): Treat only the last file available + + How to use: + to get real time data: + let `greater_than` and `date` as None and `get_only_last_file` as True + This will prevent the flow to be stucked treating all files when something happend + and stoped the flow. Otherwise the flow will take a long time to treat all files + and came back to real time. 
+ to fill missing files up to two days ago: + let `greater_than` and `date` as None and `get_only_last_file` as False + for backfill or to fill missing files for dates greather than two days ago: + add a `greater_than` date and let `date` as None and `get_only_last_file` as False + get all files for one day + let `greater_than` as None and `get_only_last_file` as False and fill `date` """ client.connect() files = client.list_files(path=f"./{radar.upper()}/") - log(f"\n\nAvailable files on FTP: {files}") - log(f"\nFiles already saved on redis_files: {redis_files}") + # log(f"\n\nAvailable files on FTP: {files}") + # log(f"\nFiles already saved on redis_files: {redis_files}") + # Files obtained direct from INEA ends with 0000 as "9915MAC-PPIVol-20230921-123000-0000.hdf" # Files from FTP ends with an alphanumeric string as "9915MAC-PPIVol-20230921-142000-54d4.hdf" # We need to be careful when changing one pipeline to other + + # Get specific files based on date and greater_than parameters + if date: + files = [file for file in files if file.split("-")[2] == date.replace("-", "")] + if greater_than: + files = [ + file + for file in files + if file.split("-")[2] >= greater_than.replace("-", "") + ] + # Check if files are already on redis files = [file for file in files if file not in redis_files] @@ -165,20 +189,13 @@ def get_files_to_download( files.sort() - log(f"\nFiles to be downloaded: {files}") - if len(files) > 20: - files = files[-20:] # remover - if get_only_last_file: files = [files[-1]] log(f"\nFiles to be downloaded: {files}") return files -@task( - max_retries=3, - retry_delay=timedelta(seconds=30), -) +@task(max_retries=3, retry_delay=timedelta(seconds=30)) def download_files(client, files, radar): """ Download files from FTP @@ -201,10 +218,7 @@ def download_files(client, files, radar): return files_downloaded -@task( - max_retries=3, - retry_delay=timedelta(seconds=30), -) +@task(max_retries=3, retry_delay=timedelta(seconds=30)) # pylint: disable=too-many-arguments, too-many-locals def upload_file_to_gcs( file_to_upload: str, From 40249a6f250322739c2711906aa27216b7da7112 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Mon, 25 Sep 2023 19:12:32 -0300 Subject: [PATCH 21/41] chancging back parameter --- pipelines/rj_escritorio/dump_ftp_inea/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/flows.py b/pipelines/rj_escritorio/dump_ftp_inea/flows.py index eba999e77..aa2f979b9 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/flows.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/flows.py @@ -43,7 +43,7 @@ "prefix", default="raw/meio_ambiente_clima/inea_radar_hdf5", required=False ) mode = Parameter("mode", default="prod", required=False) - radar = Parameter("radar", default="mac", required=False) + radar = Parameter("radar", default="gua", required=False) product = Parameter("product", default="ppi", required=False) client = get_ftp_client() From f7c9f0fa0477dd9e351d2244ba886018df12fc42 Mon Sep 17 00:00:00 2001 From: Rodrigo Cunha <66736583+eng-rodrigocunha@users.noreply.github.com> Date: Tue, 26 Sep 2023 10:46:37 -0300 Subject: [PATCH 22/41] hotfix: update `bq_upload` data check (#516) * change agent to test * change data check * update agent * change data check * update log * update agents * enrich logs * enrich logs * update tasks * Revert "change agent to test" This reverts commit 36dacb0559b0ef11fc7687321980fef02072cfff. * Revert "update agent" This reverts commit 741a2a6e75d6fecfaea89ee714c8847b48b9a933. 
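A small sketch of the date filtering added above in `get_files_to_download`: radar file names carry the capture date as the third dash-separated token (e.g. 9915MAC-PPIVol-20230921-142000-54d4.hdf), so `date` selects one day and `greater_than` selects that day onwards. The file names below are illustrative.

files = [
    "9915MAC-PPIVol-20230920-235000-54d4.hdf",
    "9915MAC-PPIVol-20230921-142000-a1b2.hdf",
    "9915MAC-PPIVol-20230922-001000-c3d4.hdf",
]
date = "2023-09-21"
greater_than = "2023-09-21"

# keep only files captured exactly on `date`
by_date = [file for file in files if file.split("-")[2] == date.replace("-", "")]

# keep files captured on or after `greater_than`
by_greater_than = [
    file for file in files if file.split("-")[2] >= greater_than.replace("-", "")
]

print(by_date)          # only the 2023-09-21 file
print(by_greater_than)  # the 2023-09-21 and 2023-09-22 files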
* Revert "update agents" This reverts commit 70f0ed33817afa7f259ea5dafefe47fac7ff466f. * update task bq_upload * update rdo agents for testing * comment update_rdo_redis + limit 10 files * Revert "comment update_rdo_redis + limit 10 files" This reverts commit e899c5c2fe8067beb6abe2e98c6a67637d330ad5. * Revert "update rdo agents for testing" This reverts commit 6cb36cbe8efc536c105ab715b10275ee650d81cb. * remove checking empty file --- pipelines/rj_smtr/tasks.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index de52c03df..5b476e8de 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -502,11 +502,8 @@ def bq_upload( if status["error"] is not None: return status["error"] - if len(status["data"]) == 0: - log("Empty dataframe, skipping upload") - return None - error = None + try: # Upload raw to staging if raw_filepath: @@ -848,7 +845,7 @@ def transform_to_nested_structure( # Check empty dataframe if len(status["data"]) == 0: - log("Empty dataframe, skipping transformation") + log("Empty dataframe, skipping transformation...") return {"data": pd.DataFrame(), "error": status["error"]} try: From 9938bbe02cda4cd3810edc100065e1830bbbce1a Mon Sep 17 00:00:00 2001 From: Fernanda Scovino Date: Tue, 26 Sep 2023 11:45:45 -0300 Subject: [PATCH 23/41] hotfix: desativa flow gps stpl (#518) --- pipelines/rj_smtr/br_rj_riodejaneiro_stpl_gps/flows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_stpl_gps/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_stpl_gps/flows.py index 615b9b11f..7d8cf1574 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_stpl_gps/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_stpl_gps/flows.py @@ -106,5 +106,5 @@ image=emd_constants.DOCKER_IMAGE.value, labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) -# Seguindo o padrão de captura adotado pelo BRT -captura_stpl.schedule = every_minute +# Captura descontinuada (sem dados), avaliar quando voltar +# captura_stpl.schedule = every_minute From 323b505c43894fa79ba9aa4f2e7e5796666152ba Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Tue, 26 Sep 2023 12:12:20 -0300 Subject: [PATCH 24/41] bugfix --- pipelines/rj_escritorio/dump_ftp_inea/flows.py | 2 ++ pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 16 +++++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/flows.py b/pipelines/rj_escritorio/dump_ftp_inea/flows.py index aa2f979b9..0f1d87eef 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/flows.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/flows.py @@ -68,6 +68,8 @@ radar=radar, redis_files=redis_files, datalake_files=datalake_files, + date=date, + greater_than=greater_than, get_only_last_file=get_only_last_file, ) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index f90f23ac4..18f123085 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -5,7 +5,7 @@ # pylint: disable=E0702,E1137,E1136,E1101,W0613,bad-continuation from datetime import datetime, timedelta from pathlib import Path -from typing import List, Tuple +from typing import List from google.cloud import storage from prefect import task @@ -32,7 +32,7 @@ def get_files_datalake( greater_than: str = None, check_datalake_files: bool = True, mode: str = "prod", -) -> Tuple[List[str], str]: +) -> List[str]: """ List files from 
INEA saved on datalake @@ -95,9 +95,11 @@ def get_files_datalake( # Format of the name is 9921GUA-PPIVol-20220930-121010-0004.hdf # We need remove the last characters to stay with 9921GUA-PPIVol-20220930-121010 datalake_files = ["-".join(fname.split("-")[:-1]) for fname in datalake_files] + log(f"Last 5 datalake files: {datalake_files[-5:]}") else: datalake_files = [] + log("This run is not considering datalake files") return datalake_files @@ -119,13 +121,13 @@ def get_ftp_client(wait=None): # pylint: disable=too-many-arguments def get_files_to_download( client, - radar, - redis_files, - datalake_files, + radar: str, + redis_files: list, + datalake_files: list, date: str = None, greater_than: str = None, get_only_last_file: bool = True, -): +) -> List[str]: """ List and get files to download FTP @@ -196,7 +198,7 @@ def get_files_to_download( @task(max_retries=3, retry_delay=timedelta(seconds=30)) -def download_files(client, files, radar): +def download_files(client, files, radar) -> List[str]: """ Download files from FTP """ From 3a406a6412620efe3f56024d99f25b4205e1a253 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Tue, 26 Sep 2023 12:29:47 -0300 Subject: [PATCH 25/41] bugfix --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index 18f123085..921b278f6 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -176,11 +176,16 @@ def get_files_to_download( files = [file for file in files if file not in redis_files] # Check if files are already on datalake + # Some datalake files use the pattern "9915MAC-PPIVol-20230921-123000-0000.hdf" + # Files from FTP use the pattern "./MAC/9915MAC-PPIVol-20230921-123000-3f28.hdf" + # We are going to compare "9915MAC-PPIVol-20230921-123000" from both places if len(datalake_files) > 0: + log("Removing files that are already on datalake") files = [ file for file in files - if "-".join(file.split("-")[:-1]) not in datalake_files + if "-".join(file.split("/")[-1].split("-")[:-1]) + not in ["-".join(dfile.split("-")[:-1]) for dfile in datalake_files] ] # Skip task if there is no new file From 6345a78d5e23215a167fee3de8130b021a2ca571 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 27 Sep 2023 14:31:30 -0300 Subject: [PATCH 26/41] bugfix --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index 921b278f6..691d195f2 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -2,7 +2,7 @@ """ Tasks to dump data from a INEA FTP to BigQuery """ -# pylint: disable=E0702,E1137,E1136,E1101,W0613,bad-continuation +# pylint: disable=E0702,E1137,E1136,E1101,W0613 from datetime import datetime, timedelta from pathlib import Path from typing import List @@ -184,8 +184,7 @@ def get_files_to_download( files = [ file for file in files - if "-".join(file.split("/")[-1].split("-")[:-1]) - not in ["-".join(dfile.split("-")[:-1]) for dfile in datalake_files] + if "-".join(file.split("/")[-1].split("-")[:-1]) not in datalake_files ] # Skip task if there is no new file From 27739dd28d6a3301f9c1d7c727021ff117f1fc26 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 27 Sep 2023 14:58:48 -0300 Subject: [PATCH 27/41] 
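A runnable sketch of the comparison described in the comments above: datalake names are stored without the trailing segment, FTP names come with a directory prefix and a random suffix, and both are reduced to the "9915MAC-PPIVol-20230921-123000" stem before comparing. Values are illustrative.

datalake_files = ["9915MAC-PPIVol-20230921-123000"]  # already stripped of the trailing segment
ftp_files = [
    "./MAC/9915MAC-PPIVol-20230921-123000-3f28.hdf",  # same capture, different suffix
    "./MAC/9915MAC-PPIVol-20230921-124000-a1b2.hdf",  # new capture, not on the datalake yet
]


def normalize(name: str) -> str:
    # drop the directory prefix and the trailing "-<suffix>.hdf" segment
    return "-".join(name.split("/")[-1].split("-")[:-1])


to_download = [file for file in ftp_files if normalize(file) not in datalake_files]
print(to_download)  # only the 12:40:00 file remains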
bugfix --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index 691d195f2..04069ca71 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -95,7 +95,7 @@ def get_files_datalake( # Format of the name is 9921GUA-PPIVol-20220930-121010-0004.hdf # We need remove the last characters to stay with 9921GUA-PPIVol-20220930-121010 datalake_files = ["-".join(fname.split("-")[:-1]) for fname in datalake_files] - log(f"Last 5 datalake files: {datalake_files[-5:]}") + log(f"Last 10 datalake files: {datalake_files[-10:]}") else: datalake_files = [] @@ -165,15 +165,21 @@ def get_files_to_download( # Get specific files based on date and greater_than parameters if date: files = [file for file in files if file.split("-")[2] == date.replace("-", "")] + log(f"Last 10 files on FTP for date {date}: {files[-10:]}") + if greater_than: files = [ file for file in files if file.split("-")[2] >= greater_than.replace("-", "") ] + log( + f"Last 10 files on FTP for date greater than {greater_than}: {files[-10:]}" + ) # Check if files are already on redis files = [file for file in files if file not in redis_files] + log(f"Last 10 files on FTP that are not on redis: {files[-10:]}") # Check if files are already on datalake # Some datalake files use the pattern "9915MAC-PPIVol-20230921-123000-0000.hdf" From 13dd1e71055146aaa1417375201418f917a0f232 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 27 Sep 2023 15:13:55 -0300 Subject: [PATCH 28/41] adding no new files on ftp --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index 04069ca71..e2d7121a8 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -155,6 +155,13 @@ def get_files_to_download( client.connect() files = client.list_files(path=f"./{radar.upper()}/") + + # Skip task if there is no new file on FTP + if len(files) == 0: + log("No new available files on FTP") + skip = Skipped("No new available files on FTP") + raise ENDRUN(state=skip) + # log(f"\n\nAvailable files on FTP: {files}") # log(f"\nFiles already saved on redis_files: {redis_files}") From 010d5cdd3597e3d908b28bde33be9970519bf88c Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 27 Sep 2023 15:40:01 -0300 Subject: [PATCH 29/41] bugfix --- .../rj_escritorio/dump_ftp_inea/tasks.py | 36 +++++++++++++------ 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index e2d7121a8..cdf5b63ef 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -119,9 +119,32 @@ def get_ftp_client(wait=None): @task(max_retries=3, retry_delay=timedelta(seconds=30)) # pylint: disable=too-many-arguments -def get_files_to_download( +def get_files_from_ftp( client, radar: str, +) -> List[str]: + """ + List and get files to download FTP + """ + + client.connect() + files = client.list_files(path=f"./{radar.upper()}/") + + # Skip task if there is no new file on FTP + if len(files) == 0: + log("No new available files on FTP") + skip = Skipped("No new available files on FTP") + raise ENDRUN(state=skip) + + log(f"Last 10 
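The early exit added above relies on Prefect 1's ENDRUN signal carrying a Skipped state, so downstream tasks are skipped instead of failing. A minimal sketch, assuming the usual Prefect 1.x import paths (the imports are not shown in this diff):

from prefect import task
from prefect.engine.runner import ENDRUN
from prefect.engine.state import Skipped


@task
def get_files_from_ftp(files):
    # end this task run (and skip everything downstream) when there is nothing to do
    if len(files) == 0:
        raise ENDRUN(state=Skipped("No new available files on FTP"))
    return files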
files on FTP: {files[-10:]} {len(files)}") + + return files + + +@task(max_retries=3, retry_delay=timedelta(seconds=30)) +# pylint: disable=too-many-arguments +def select_files_to_download( + files: list, redis_files: list, datalake_files: list, date: str = None, @@ -129,7 +152,7 @@ def get_files_to_download( get_only_last_file: bool = True, ) -> List[str]: """ - List and get files to download FTP + Select files to download Args: radar (str): Radar name. Must be `gua` or `mac` @@ -153,15 +176,6 @@ def get_files_to_download( let `greater_than` as None and `get_only_last_file` as False and fill `date` """ - client.connect() - files = client.list_files(path=f"./{radar.upper()}/") - - # Skip task if there is no new file on FTP - if len(files) == 0: - log("No new available files on FTP") - skip = Skipped("No new available files on FTP") - raise ENDRUN(state=skip) - # log(f"\n\nAvailable files on FTP: {files}") # log(f"\nFiles already saved on redis_files: {redis_files}") From 55a238028d645e958b6af0ce14d24441662bceae Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 27 Sep 2023 15:53:01 -0300 Subject: [PATCH 30/41] bugfix --- .../rj_escritorio/dump_ftp_inea/flows.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/flows.py b/pipelines/rj_escritorio/dump_ftp_inea/flows.py index 0f1d87eef..b60de657b 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/flows.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/flows.py @@ -15,8 +15,9 @@ from pipelines.rj_escritorio.dump_ftp_inea.tasks import ( get_ftp_client, get_files_datalake, - get_files_to_download, + get_files_from_ftp, download_files, + select_files_to_download, upload_file_to_gcs, ) from pipelines.rj_escritorio.dump_ftp_inea.schedules import ( @@ -48,8 +49,16 @@ client = get_ftp_client() + files = get_files_from_ftp( + client=client, + radar=radar, + ) + redis_files = get_on_redis( - dataset_id="meio_ambiente_clima", table_id=radar, mode=mode + dataset_id="meio_ambiente_clima", + table_id=radar, + mode=mode, + wait=files, ) datalake_files = get_files_datalake( @@ -61,11 +70,11 @@ greater_than=greater_than, check_datalake_files=check_datalake_files, mode=mode, + wait=files, ) - files_to_download = get_files_to_download( - client=client, - radar=radar, + files_to_download = select_files_to_download( + files=files, redis_files=redis_files, datalake_files=datalake_files, date=date, From 372d1fd2fc6ca7656854005d876b8c02666d686e Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 27 Sep 2023 16:16:53 -0300 Subject: [PATCH 31/41] bugfix --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index cdf5b63ef..faa91bee7 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -32,6 +32,7 @@ def get_files_datalake( greater_than: str = None, check_datalake_files: bool = True, mode: str = "prod", + wait=None, # pylint: disable=unused-argument ) -> List[str]: """ List files from INEA saved on datalake From 81b31999bc62a994011eafff213fb77326f77aa4 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 27 Sep 2023 20:16:19 -0300 Subject: [PATCH 32/41] bugfix --- pipelines/rj_escritorio/dump_ftp_inea/tasks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py index 
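In the flow change above, `wait=files` exists only to force execution order: the downstream task ignores the value, but receiving it as an argument makes Prefect run it after the FTP listing. A minimal Prefect 1.x sketch with illustrative task names:

from prefect import Flow, task


@task
def list_ftp_files():
    return ["a.hdf", "b.hdf"]


@task
def get_redis_files(wait=None):  # pylint: disable=unused-argument
    # `wait` is ignored; passing it only creates the dependency edge
    return ["a.hdf"]


with Flow("ordering sketch") as flow:
    files = list_ftp_files()
    redis_files = get_redis_files(wait=files)  # now runs only after list_ftp_files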
faa91bee7..cbdd6864a 100644 --- a/pipelines/rj_escritorio/dump_ftp_inea/tasks.py +++ b/pipelines/rj_escritorio/dump_ftp_inea/tasks.py @@ -138,6 +138,7 @@ def get_files_from_ftp( raise ENDRUN(state=skip) log(f"Last 10 files on FTP: {files[-10:]} {len(files)}") + log(f"files on FTP: {files}") return files From e05c30e2eaee0c0a7e590568c868a07a2744e7fd Mon Sep 17 00:00:00 2001 From: d116626 Date: Thu, 28 Sep 2023 15:11:11 -0300 Subject: [PATCH 33/41] chore: change interval for total_contagem --- pipelines/rj_smfp/dump_db_ergon/flows.py | 2 +- pipelines/rj_smfp/dump_db_ergon/schedules.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smfp/dump_db_ergon/flows.py b/pipelines/rj_smfp/dump_db_ergon/flows.py index dbc04cb08..4e0324338 100644 --- a/pipelines/rj_smfp/dump_db_ergon/flows.py +++ b/pipelines/rj_smfp/dump_db_ergon/flows.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -Database dumping flows for segovi project +Database dumping flows for segovi project. """ from copy import deepcopy diff --git a/pipelines/rj_smfp/dump_db_ergon/schedules.py b/pipelines/rj_smfp/dump_db_ergon/schedules.py index d709e913b..cb5b9b2b0 100644 --- a/pipelines/rj_smfp/dump_db_ergon/schedules.py +++ b/pipelines/rj_smfp/dump_db_ergon/schedules.py @@ -206,6 +206,7 @@ TOTAL_ANOS,DATA_PROXIMO,NOME_PROXIMO,EMP_CODIGO FROM ERGON.TOTAL_CONTA """, + "interval": timedelta(days=15), }, "pre_contagem": { "materialize_after_dump": True, From 1ef01d70161c033498c1b483191dd2beab44ca45 Mon Sep 17 00:00:00 2001 From: Fernanda Scovino Date: Fri, 29 Sep 2023 17:05:34 -0300 Subject: [PATCH 34/41] =?UTF-8?q?Unifica=20tasks=20de=20parti=C3=A7=C3=A3o?= =?UTF-8?q?=20de=20data=20e=20hora=20(#517)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * remove task de particao nao usada * unifica tasks de particao de data e hora * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * corrige condicional --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- pipelines/rj_smtr/constants.py | 9 +++--- pipelines/rj_smtr/flows.py | 14 +++------- pipelines/rj_smtr/tasks.py | 45 +++++------------------------- pipelines/rj_smtr/veiculo/flows.py | 6 ++-- 4 files changed, 18 insertions(+), 56 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 7133b8abe..93303e5b7 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -181,7 +181,6 @@ class constants(Enum): # pylint: disable=c0103 data_processamento """, "primary_key": ["id"], # id column to nest data on - "flag_date_partition": False, }, ] BILHETAGEM_TABLES_PARAMS = [ @@ -199,7 +198,7 @@ class constants(Enum): # pylint: disable=c0103 DT_INCLUSAO """, "primary_key": ["CD_LINHA"], # id column to nest data on - "flag_date_partition": True, + "partition_date_only": True, }, { "table_id": "grupo", @@ -215,7 +214,7 @@ class constants(Enum): # pylint: disable=c0103 DT_INCLUSAO """, "primary_key": ["CD_GRUPO"], - "flag_date_partition": True, + "partition_date_only": True, }, { "table_id": "grupo_linha", @@ -231,7 +230,7 @@ class constants(Enum): # pylint: disable=c0103 DT_INCLUSAO """, "primary_key": ["CD_GRUPO", "CD_LINHA"], # id column to nest data on - "flag_date_partition": True, + "partition_date_only": True, }, { "table_id": "matriz_integracao", @@ -250,7 +249,7 @@ class constants(Enum): # 
pylint: disable=c0103 "cd_versao_matriz", "cd_integracao", ], # id column to nest data on - "flag_date_partition": True, + "partition_date_only": True, }, ] BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index f1d29ed10..87d506813 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -5,8 +5,7 @@ from prefect.run_configs import KubernetesRun from prefect.storage import GCS -from prefect import case, Parameter -from prefect.tasks.control_flow import merge +from prefect import Parameter # EMD Imports # @@ -19,7 +18,6 @@ # SMTR Imports # from pipelines.rj_smtr.tasks import ( - create_date_partition, create_date_hour_partition, create_local_partition_path, get_current_timestamp, @@ -66,13 +64,9 @@ dataset_id=dataset_id, ) - with case(table_params["flag_date_partition"], True): - date_partitions = create_date_partition(timestamp) - - with case(table_params["flag_date_partition"], False): - date_hour_partitions = create_date_hour_partition(timestamp) - - partitions = merge(date_partitions, date_hour_partitions) + partitions = create_date_hour_partition( + timestamp, partition_date_only=table_params["partition_date_only"] + ) filename = parse_timestamp_to_string(timestamp) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 5b476e8de..e8b239957 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -158,19 +158,16 @@ def get_current_timestamp(timestamp=None, truncate_minute: bool = True) -> datet @task -def create_date_hour_partition(timestamp: datetime) -> str: - """ - Get date hour Hive partition structure from timestamp. - """ - return f"data={timestamp.strftime('%Y-%m-%d')}/hora={timestamp.strftime('%H')}" - - -@task -def create_date_partition(timestamp: datetime) -> str: +def create_date_hour_partition( + timestamp: datetime, partition_date_only: bool = False +) -> str: """ Get date hour Hive partition structure from timestamp. """ - return f"data={timestamp.date()}" + partition = f"data={timestamp.strftime('%Y-%m-%d')}" + if not partition_date_only: + partition += f"/hora={timestamp.strftime('%H')}" + return partition @task @@ -181,34 +178,6 @@ def parse_timestamp_to_string(timestamp: datetime, pattern="%Y-%m-%d-%H-%M-%S") return timestamp.strftime(pattern) -@task -def create_current_date_hour_partition(capture_time=None): - """Create partitioned directory structure to save data locally based - on capture time. 
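The unified partition helper above boils down to a pure string builder; a quick sketch of its output (same body as the task, shown here without the @task decorator):

from datetime import datetime


def create_date_hour_partition(timestamp, partition_date_only=False):
    partition = f"data={timestamp.strftime('%Y-%m-%d')}"
    if not partition_date_only:
        partition += f"/hora={timestamp.strftime('%H')}"
    return partition


ts = datetime(2023, 9, 29, 17, 5)
print(create_date_hour_partition(ts))                            # data=2023-09-29/hora=17
print(create_date_hour_partition(ts, partition_date_only=True))  # data=2023-09-29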
- - Args: - capture_time(pendulum.datetime.DateTime, optional): - if recapturing data, will create partitions based - on the failed timestamps being recaptured - - Returns: - dict: "filename" contains the name which to upload the csv, "partitions" contains - the partitioned directory path - """ - if capture_time is None: - capture_time = datetime.now(tz=constants.TIMEZONE.value).replace( - minute=0, second=0, microsecond=0 - ) - date = capture_time.strftime("%Y-%m-%d") - hour = capture_time.strftime("%H") - - return { - "filename": capture_time.strftime("%Y-%m-%d-%H-%M-%S"), - "partitions": f"data={date}/hora={hour}", - "timestamp": capture_time, - } - - @task def create_local_partition_path( dataset_id: str, table_id: str, filename: str, partitions: str = None diff --git a/pipelines/rj_smtr/veiculo/flows.py b/pipelines/rj_smtr/veiculo/flows.py index 28188a129..e1fab515e 100644 --- a/pipelines/rj_smtr/veiculo/flows.py +++ b/pipelines/rj_smtr/veiculo/flows.py @@ -30,7 +30,7 @@ every_day_hour_seven, ) from pipelines.rj_smtr.tasks import ( - create_date_partition, + create_date_hour_partition, create_local_partition_path, get_current_timestamp, get_raw, @@ -71,7 +71,7 @@ ) # SETUP # - partitions = create_date_partition(timestamp) + partitions = create_date_hour_partition(timestamp, partition_date_only=True) filename = parse_timestamp_to_string(timestamp) @@ -140,7 +140,7 @@ ) # SETUP # - partitions = create_date_partition(timestamp) + partitions = create_date_hour_partition(timestamp, partition_date_only=True) filename = parse_timestamp_to_string(timestamp) From c16dc74e78fc4b33c71469ba654ecc0cfaaf19cc Mon Sep 17 00:00:00 2001 From: Rafael Carvalho Pinheiro <74972217+pixuimpou@users.noreply.github.com> Date: Fri, 29 Sep 2023 17:46:08 -0300 Subject: [PATCH 35/41] =?UTF-8?q?Corrige=20par=C3=A2metro=20de=20`partitio?= =?UTF-8?q?n=5Fdate=5Fonly`=20no=20flow=20de=20bilhetagem=20(#521)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 93303e5b7..c9f18f2fd 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -181,6 +181,7 @@ class constants(Enum): # pylint: disable=c0103 data_processamento """, "primary_key": ["id"], # id column to nest data on + "partition_date_only": False, }, ] BILHETAGEM_TABLES_PARAMS = [ From f1fc682256464af418803f5853ab73019fe716c0 Mon Sep 17 00:00:00 2001 From: Rafael Carvalho Pinheiro <74972217+pixuimpou@users.noreply.github.com> Date: Mon, 2 Oct 2023 12:12:07 -0300 Subject: [PATCH 36/41] =?UTF-8?q?Altera=20flow=20de=20captura=20gen=C3=A9r?= =?UTF-8?q?ica=20(#520)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * remove task de particao nao usada * unifica tasks de particao de data e hora * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * corrige condicional * change capture flow * change generic capture flow * atualiza esquema do flow padrao * change default capture flow structure * change generic capture flow * adjust constant structure * change bilhetagem to new capture flow structure * fix get_storage_blob function * fix get_storage_blob call * organize constants order * fix get_raw_from_sources function call * change transform_raw_to_json to read_raw_data * transform transform_raw_data_to_json to read_raw_data * fix nout task parameter * fix 
timedelta instantiation * set upstream tasks * declare raw_filepath * update docstrings * adjust get_raw_from_sources return * fix errors * change agent label to dev * refactore source values * update constants * update agent * update schedule params * update interval * fix get_datetime_range interval * remove order by from queries * fix get_raw_data_api * change json read function * update read_raw_data * update save_raw_local_func * log error * change raw api extraction for json * change read json function * print log traceback * skip pre treatment if empty df * skip save staging if dataframe is empty / save raw * remove skip upload if empty dataframe * update docstring and returned values * reorganize task order * fix tuple * change zip logic * remove skip * create gtfs zip constant * add gtfs zip file name * add csv to save raw / change filetype logic * remove comments * fix csv_args default value * change docstring get raw api * change raw data gcs docstring * remove commented task * change quadro primary key to list * update GTFS constants * change upload folder structure * undo silenciamento de falha de notificação * remove parametros de testes (gtfs) * Update pipelines/rj_smtr/constants.py Co-authored-by: Fernanda Scovino * corrige encadeamento de erros no flow * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove header treatment * mudar agent dev para prd * mudar agent de dev para prd * ajustar retorno das funcoes * Atualiza documentação * adicionar retorno em get_upload_storage_blob * Atualiza documentação * Atualiza string --------- Co-authored-by: fernandascovino Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: eng-rodrigocunha Co-authored-by: Carolina Gomes Co-authored-by: Rodrigo Cunha <66736583+eng-rodrigocunha@users.noreply.github.com> --- .../schedules.py | 22 +- pipelines/rj_smtr/constants.py | 150 ++++--- pipelines/rj_smtr/flows.py | 94 ++-- pipelines/rj_smtr/tasks.py | 423 ++++++++++++------ pipelines/rj_smtr/utils.py | 338 +++++++++++++- pipelines/utils/utils.py | 12 +- 6 files changed, 781 insertions(+), 258 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index 38fca85a9..2f7804811 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -16,27 +16,37 @@ ) bilhetagem_principal_clocks = generate_execute_schedules( - interval=timedelta(days=1), + clock_interval=timedelta( + **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["principal_run_interval"] + ), labels=[ emd_constants.RJ_SMTR_AGENT_LABEL.value, ], - table_parameters=constants.BILHETAGEM_TABLES_PARAMS.value, + table_parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, - runs_interval_minutes=15, + source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], + runs_interval_minutes=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value[ + "principal_runs_interval_minutes" + ], ) bilhetagem_principal_schedule = Schedule(clocks=untuple(bilhetagem_principal_clocks)) bilhetagem_transacao_clocks = generate_execute_schedules( - interval=timedelta(minutes=1), + clock_interval=timedelta( + 
**constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["transacao_run_interval"] + ), labels=[ emd_constants.RJ_SMTR_AGENT_LABEL.value, ], - table_parameters=constants.BILHETAGEM_TRANSACAO_TABLE_PARAMS.value, + table_parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, - runs_interval_minutes=0, + source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], + runs_interval_minutes=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value[ + "transacao_runs_interval_minutes" + ], ) bilhetagem_transacao_schedule = Schedule(clocks=untuple(bilhetagem_transacao_clocks)) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index c9f18f2fd..52e30d9f8 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -165,9 +165,34 @@ class constants(Enum): # pylint: disable=c0103 # BILHETAGEM BILHETAGEM_DATASET_ID = "br_rj_riodejaneiro_bilhetagem" - BILHETAGEM_TRANSACAO_TABLE_PARAMS = [ - { - "table_id": "transacao", + + BILHETAGEM_GENERAL_CAPTURE_PARAMS = { + "databases": { + "principal_db": { + "engine": "mysql", + "host": "principal-database-replica.internal", + }, + "tarifa_db": { + "engine": "postgres", + "host": "tarifa-database-replica.internal", + }, + "transacao_db": { + "engine": "postgres", + "host": "transacao-database-replica.internal", + }, + }, + "vpn_url": "http://vpn-jae.mobilidade.rio/", + "source_type": "api-json", + "transacao_run_interval": {"minutes": 1}, + "principal_run_interval": {"days": 1}, + "transacao_runs_interval_minutes": 0, + "principal_runs_interval_minutes": 5, + } + + BILHETAGEM_TRANSACAO_CAPTURE_PARAMS = { + "table_id": "transacao", + "partition_date_only": False, + "extract_params": { "database": "transacao_db", "query": """ SELECT @@ -177,80 +202,91 @@ class constants(Enum): # pylint: disable=c0103 WHERE data_processamento BETWEEN '{start}' AND '{end}' - ORDER BY - data_processamento """, - "primary_key": ["id"], # id column to nest data on - "partition_date_only": False, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS["transacao_run_interval"], }, - ] - BILHETAGEM_TABLES_PARAMS = [ + "primary_key": ["id"], # id column to nest data on + } + + BILHETAGEM_CAPTURE_PARAMS = [ { "table_id": "linha", - "database": "principal_db", - "query": """ - SELECT - * - FROM - LINHA - WHERE - DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO - """, - "primary_key": ["CD_LINHA"], # id column to nest data on "partition_date_only": True, + "extract_params": { + "database": "principal_db", + "query": """ + SELECT + * + FROM + LINHA + WHERE + DT_INCLUSAO >= '{start}' + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], + }, + "primary_key": ["CD_LINHA"], # id column to nest data on }, { "table_id": "grupo", - "database": "principal_db", - "query": """ - SELECT - * - FROM - GRUPO - WHERE - DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO - """, - "primary_key": ["CD_GRUPO"], "partition_date_only": True, + "extract_params": { + "database": "principal_db", + "query": """ + SELECT + * + FROM + GRUPO + WHERE + DT_INCLUSAO >= '{start}' + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], + }, + "primary_key": ["CD_GRUPO"], # id column to nest data on }, { "table_id": "grupo_linha", - "database": "principal_db", - "query": """ - SELECT - * - FROM - GRUPO_LINHA - WHERE - DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO - """, - "primary_key": 
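A sketch of how the `{start}`/`{end}` placeholders in the capture queries above get filled: the run interval dict becomes a timedelta and the query is formatted with a UTC datetime range. The `get_datetime_range` stand-in below assumes the utils helper keeps the logic of the removed task (UTC strings, window of one run interval) while receiving a timedelta; timestamp and interval values are illustrative.

from datetime import datetime, timedelta

import pytz


def get_datetime_range(timestamp, interval):
    # assumed to mirror the removed task: UTC-formatted start/end of the capture window
    start = (timestamp - interval).astimezone(pytz.utc).strftime("%Y-%m-%d %H:%M:%S")
    end = timestamp.astimezone(pytz.utc).strftime("%Y-%m-%d %H:%M:%S")
    return {"start": start, "end": end}


query = """
    SELECT *
    FROM transacao
    WHERE data_processamento BETWEEN '{start}' AND '{end}'
"""

timestamp = pytz.timezone("America/Sao_Paulo").localize(datetime(2023, 9, 29, 17, 0))
run_interval = {"minutes": 1}
print(query.format(**get_datetime_range(timestamp, timedelta(**run_interval))))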
["CD_GRUPO", "CD_LINHA"], # id column to nest data on "partition_date_only": True, + "extract_params": { + "database": "principal_db", + "query": """ + SELECT + * + FROM + GRUPO_LINHA + WHERE + DT_INCLUSAO >= '{start}' + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], + }, + "primary_key": ["CD_GRUPO", "CD_LINHA"], # id column to nest data on }, { "table_id": "matriz_integracao", - "database": "tarifa_db", - "query": """ - SELECT - * - FROM - matriz_integracao - WHERE - dt_inclusao >= '{start}' - ORDER BY - dt_inclusao - """, + "partition_date_only": True, + "extract_params": { + "database": "tarifa_db", + "query": """ + SELECT + * + FROM + matriz_integracao + WHERE + dt_inclusao >= '{start}' + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], + }, "primary_key": [ "cd_versao_matriz", "cd_integracao", ], # id column to nest data on - "partition_date_only": True, }, ] BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 87d506813..4860c6d07 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -21,18 +21,12 @@ create_date_hour_partition, create_local_partition_path, get_current_timestamp, - get_raw, parse_timestamp_to_string, - save_raw_local, - save_treated_local, - upload_logs_to_bq, - bq_upload, - transform_to_nested_structure, -) - -from pipelines.rj_smtr.tasks import ( + upload_raw_data_to_gcs, + upload_staging_data_to_gcs, + transform_raw_to_nested_structure, + get_raw_from_sources, create_request_params, - get_datetime_range, ) @@ -40,75 +34,79 @@ "SMTR: Captura", code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as default_capture_flow: - # SETUP # + # Configuração # - table_params = Parameter("table_params", default=None) - timestamp_param = Parameter("timestamp", default=None) - interval = Parameter("interval", default=None) + table_id = Parameter("table_id", default=None) + partition_date_only = Parameter("partition_date_only", default=None) + extract_params = Parameter("extract_params", default=None) dataset_id = Parameter("dataset_id", default=None) secret_path = Parameter("secret_path", default=None) + primary_key = Parameter("primary_key", default=None) + source_type = Parameter("source_type", default=None) - timestamp = get_current_timestamp(timestamp_param) - - datetime_range = get_datetime_range(timestamp, interval=interval) + timestamp = get_current_timestamp() rename_flow_run = rename_current_flow_run_now_time( - prefix=default_capture_flow.name + " " + table_params["table_id"] + ": ", + prefix=default_capture_flow.name + " " + table_id + ": ", now_time=timestamp, ) - request_params, request_url = create_request_params( - datetime_range=datetime_range, - table_params=table_params, - secret_path=secret_path, - dataset_id=dataset_id, - ) - partitions = create_date_hour_partition( - timestamp, partition_date_only=table_params["partition_date_only"] + timestamp, partition_date_only=partition_date_only ) filename = parse_timestamp_to_string(timestamp) filepath = create_local_partition_path( dataset_id=dataset_id, - table_id=table_params["table_id"], + table_id=table_id, filename=filename, partitions=partitions, ) - raw_status = get_raw( - url=request_url, - headers=secret_path, - params=request_params, + # Extração # + request_params, request_path = create_request_params( + dataset_id=dataset_id, + extract_params=extract_params, + table_id=table_id, + timestamp=timestamp, ) - raw_filepath = 
save_raw_local(status=raw_status, file_path=filepath) + error, raw_filepath = get_raw_from_sources( + source_type=source_type, + local_filepath=filepath, + source_path=request_path, + dataset_id=dataset_id, + table_id=table_id, + secret_path=secret_path, + request_params=request_params, + ) - # TREAT & CLEAN # - treated_status = transform_to_nested_structure( - status=raw_status, - timestamp=timestamp, - primary_key=table_params["primary_key"], + error = upload_raw_data_to_gcs( + error=error, + raw_filepath=raw_filepath, + table_id=table_id, + dataset_id=dataset_id, + partitions=partitions, ) - treated_filepath = save_treated_local(status=treated_status, file_path=filepath) + # Pré-tratamento # - # LOAD # - error = bq_upload( - dataset_id=dataset_id, - table_id=table_params["table_id"], - filepath=treated_filepath, + error, staging_filepath = transform_raw_to_nested_structure( raw_filepath=raw_filepath, - partitions=partitions, - status=treated_status, + filepath=filepath, + error=error, + timestamp=timestamp, + primary_key=primary_key, ) - upload_logs_to_bq( - dataset_id=dataset_id, - parent_table_id=table_params["table_id"], + STAGING_UPLOADED = upload_staging_data_to_gcs( error=error, + staging_filepath=staging_filepath, timestamp=timestamp, + table_id=table_id, + dataset_id=dataset_id, + partitions=partitions, ) default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index e8b239957..a846851b5 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -8,7 +8,7 @@ import os from pathlib import Path import traceback -from typing import Dict, List +from typing import Dict, List, Union import io from basedosdados import Storage, Table @@ -28,6 +28,13 @@ get_last_run_timestamp, log_critical, data_info_str, + get_raw_data_api, + get_raw_data_gcs, + upload_run_logs_to_bq, + get_datetime_range, + read_raw_data, + save_treated_local_func, + save_raw_local_func, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -162,7 +169,14 @@ def create_date_hour_partition( timestamp: datetime, partition_date_only: bool = False ) -> str: """ - Get date hour Hive partition structure from timestamp. + Create a date (and hour) Hive partition structure from timestamp. + + Args: + timestamp (datetime): timestamp to be used as reference + partition_date_only (bool, optional): whether to add hour partition or not + + Returns: + str: partition string """ partition = f"data={timestamp.strftime('%Y-%m-%d')}" if not partition_date_only: @@ -417,15 +431,123 @@ def get_raw( # pylint: disable=R0912 "Unsupported raw file extension. 
Supported only: json, csv and txt" ) - except Exception as exp: - error = exp - - if error is not None: + except Exception: + error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") return {"data": data, "error": error} +@task(checkpoint=False, nout=2) +def create_request_params( + extract_params: dict, + table_id: str, + dataset_id: str, + timestamp: datetime, +) -> tuple[str, str]: + """ + Task to create request params + + Args: + extract_params (dict): extract parameters + table_id (str): table_id on BigQuery + dataset_id (str): dataset_id on BigQuery + timestamp (datetime): timestamp for flow run + + Returns: + request_params: host, database and query to request data + request_url: url to request data + """ + request_params = None + request_url = None + + if dataset_id == constants.BILHETAGEM_DATASET_ID.value: + database = constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["databases"][ + extract_params["database"] + ] + request_url = ( + constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["vpn_url"] + + database["engine"] + ) + + datetime_range = get_datetime_range( + timestamp=timestamp, interval=timedelta(**extract_params["run_interval"]) + ) + + request_params = { + "host": database["host"], # TODO: exibir no log em ambiente fechado + "database": extract_params["database"], + "query": extract_params["query"].format(**datetime_range), + } + + return request_params, request_url + + +@task(checkpoint=False, nout=2) +def get_raw_from_sources( + source_type: str, + local_filepath: str, + source_path: str = None, + dataset_id: str = None, + table_id: str = None, + secret_path: str = None, + request_params: dict = None, +) -> tuple[str, str]: + """ + Task to get raw data from sources + + Args: + source_type (str): source type + local_filepath (str): local filepath + source_path (str, optional): source path. Defaults to None. + dataset_id (str, optional): dataset_id on BigQuery. Defaults to None. + table_id (str, optional): table_id on BigQuery. Defaults to None. + secret_path (str, optional): secret path. Defaults to None. + request_params (dict, optional): request parameters. Defaults to None. 
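The `source_type` parsing in `get_raw_from_sources` above packs the source and the raw filetype into one string ("api-json", plain "gcs"); a standalone sketch of the same split:

def split_source_type(source_type):
    # "api-json" -> ("api", "json"); "gcs" -> ("gcs", None)
    source_values = source_type.split("-", 1)
    if len(source_values) == 2:
        return tuple(source_values)
    return source_values[0], None


print(split_source_type("api-json"))  # ('api', 'json')
print(split_source_type("gcs"))       # ('gcs', None)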
+ + Returns: + error: error catched from upstream tasks + filepath: filepath to raw data + """ + error = None + filepath = None + data = None + + source_values = source_type.split("-", 1) + + source_type, filetype = ( + source_values if len(source_values) == 2 else (source_values[0], None) + ) + + log(f"Getting raw data from source type: {source_type}") + + try: + if source_type == "api": + error, data, filetype = get_raw_data_api( + url=source_path, + secret_path=secret_path, + api_params=request_params, + filetype=filetype, + ) + elif source_type == "gcs": + error, data, filetype = get_raw_data_gcs( + dataset_id=dataset_id, table_id=table_id, zip_filename=request_params + ) + else: + raise NotImplementedError(f"{source_type} not supported") + + filepath = save_raw_local_func( + data=data, filepath=local_filepath, filetype=filetype + ) + + except NotImplementedError: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + log(f"Raw extraction ended returned values: {error}, {filepath}") + return error, filepath + + ############### # # Load data @@ -599,6 +721,101 @@ def upload_logs_to_bq( # pylint: disable=R0913 raise Exception(f"Pipeline failed with error: {error}") +@task +def upload_raw_data_to_gcs( + error: str, + raw_filepath: str, + table_id: str, + dataset_id: str, + partitions: list, +) -> Union[str, None]: + """ + Upload raw data to GCS. + + Args: + error (str): Error catched from upstream tasks. + raw_filepath (str): Path to the saved raw .json file + table_id (str): table_id on BigQuery + dataset_id (str): dataset_id on BigQuery + partitions (list): list of partition strings + + Returns: + Union[str, None]: if there is an error returns it traceback, otherwise returns None + """ + if error is None: + try: + st_obj = Storage(table_id=table_id, dataset_id=dataset_id) + log( + f"""Uploading raw file to bucket {st_obj.bucket_name} at + {st_obj.bucket_name}/{dataset_id}/{table_id}""" + ) + st_obj.upload( + path=raw_filepath, + partitions=partitions, + mode="raw", + if_exists="replace", + ) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error + + +@task +def upload_staging_data_to_gcs( + error: str, + staging_filepath: str, + timestamp: datetime, + table_id: str, + dataset_id: str, + partitions: list, +) -> Union[str, None]: + """ + Upload staging data to GCS. + + Args: + error (str): Error catched from upstream tasks. + staging_filepath (str): Path to the saved treated .csv file. + timestamp (datetime): timestamp for flow run. + table_id (str): table_id on BigQuery. + dataset_id (str): dataset_id on BigQuery. + partitions (list): list of partition strings. 
+ + Returns: + Union[str, None]: if there is an error returns it traceback, otherwise returns None + """ + if error is None: + try: + # Creates and publish table if it does not exist, append to it otherwise + create_or_append_table( + dataset_id=dataset_id, + table_id=table_id, + path=staging_filepath, + partitions=partitions, + ) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + upload_run_logs_to_bq( + dataset_id=dataset_id, + parent_table_id=table_id, + error=error, + timestamp=timestamp, + mode="staging", + ) + + return error + + +############### +# +# Daterange tasks +# +############### + + @task( checkpoint=False, max_retries=constants.MAX_RETRIES.value, @@ -789,140 +1006,92 @@ def get_previous_date(days): return now.to_date_string() -@task -def transform_to_nested_structure( - status: dict, timestamp: datetime, primary_key: list = None -): - """Transform dataframe to nested structure - - Args: - status (dict): Must contain keys - * `data`: dataframe returned from treatement - * `error`: error catched from data treatement - timestamp (datetime): timestamp of the capture - primary_key (list, optional): List of primary keys to be used for nesting. - - Returns: - dict: Conatining keys - * `data` (json): nested data - * `error` (str): catched error, if any. Otherwise, returns None - """ - - # Check previous error - if status["error"] is not None: - return {"data": pd.DataFrame(), "error": status["error"]} - - # Check empty dataframe - if len(status["data"]) == 0: - log("Empty dataframe, skipping transformation...") - return {"data": pd.DataFrame(), "error": status["error"]} - - try: - if primary_key is None: - primary_key = [] - - error = None - data = pd.DataFrame(status["data"]) - - log( - f""" - Received inputs: - - timestamp:\n{timestamp} - - data:\n{data.head()}""" - ) - - log(f"Raw data:\n{data_info_str(data)}", level="info") - - log("Adding captured timestamp column...", level="info") - data["timestamp_captura"] = timestamp - - log("Striping string columns...", level="info") - for col in data.columns[data.dtypes == "object"].to_list(): - data[col] = data[col].str.strip() - - log(f"Finished cleaning! Data:\n{data_info_str(data)}", level="info") - - log("Creating nested structure...", level="info") - pk_cols = primary_key + ["timestamp_captura"] - data = ( - data.groupby(pk_cols) - .apply( - lambda x: x[data.columns.difference(pk_cols)].to_json(orient="records") - ) - .str.strip("[]") - .reset_index(name="content")[primary_key + ["content", "timestamp_captura"]] - ) - - log( - f"Finished nested structure! 
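The upload tasks above follow the error-handling convention used throughout this refactor: each step returns a traceback string (or None) instead of raising, and the next step only does real work while the error is still None. A generic sketch of that contract (the failing division is purely illustrative):

import traceback


def extract():
    error, data = None, None
    try:
        data = 1 / 0  # illustrative failure
    except Exception:  # pylint: disable=broad-except
        error = traceback.format_exc()
    return error, data


def upload(error, data):
    if error is None:
        print(f"uploading {data}")
    return error


error, data = extract()
error = upload(error, data)
print("error is carried forward, not raised:", error is not None)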
Data:\n{data_info_str(data)}", - level="info", - ) - - except Exception as exp: # pylint: disable=W0703 - error = exp - - if error is not None: - log(f"[CATCHED] Task failed with error: \n{error}", level="error") - - return {"data": data, "error": error} +############### +# +# Pretreat data +# +############### -@task(checkpoint=False) -def get_datetime_range( +@task(nout=2) +def transform_raw_to_nested_structure( + raw_filepath: str, + filepath: str, + error: str, timestamp: datetime, - interval: int, -) -> dict: + primary_key: list = None, +) -> tuple[str, str]: """ - Task to get datetime range in UTC + Task to transform raw data to nested structure Args: - timestamp (datetime): timestamp to get datetime range - interval (int): interval in seconds + raw_filepath (str): Path to the saved raw .json file + filepath (str): Path to the saved treated .csv file + error (str): Error catched from upstream tasks + timestamp (datetime): timestamp for flow run + primary_key (list, optional): Primary key to be used on nested structure Returns: - dict: datetime range - """ - - start = ( - (timestamp - timedelta(seconds=interval)) - .astimezone(tz=timezone("UTC")) - .strftime("%Y-%m-%d %H:%M:%S") - ) - - end = timestamp.astimezone(tz=timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") - - return {"start": start, "end": end} - - -@task(checkpoint=False, nout=2) -def create_request_params( - datetime_range: dict, table_params: dict, secret_path: str, dataset_id: str -) -> tuple: + str: Error traceback + str: Path to the saved treated .csv file """ - Task to create request params + if error is None: + try: + # leitura do dado raw + error, data = read_raw_data(filepath=raw_filepath) - Args: - datetime_range (dict): datetime range to get params - table_params (dict): table params to get params - secret_path (str): secret path to get params - dataset_id (str): dataset id to get params + if primary_key is None: + primary_key = [] - Returns: - request_params: host, database and query to request data - request_url: url to request data - """ + log( + f""" + Received inputs: + - timestamp:\n{timestamp} + - data:\n{data.head()}""" + ) - if dataset_id == constants.BILHETAGEM_DATASET_ID.value: - secrets = get_vault_secret(secret_path)["data"] + # Check empty dataframe + if data.empty: + log("Empty dataframe, skipping transformation...") + else: + log(f"Raw data:\n{data_info_str(data)}", level="info") + + log("Adding captured timestamp column...", level="info") + data["timestamp_captura"] = timestamp + + log("Striping string columns...", level="info") + for col in data.columns[data.dtypes == "object"].to_list(): + data[col] = data[col].str.strip() + + log(f"Finished cleaning! Data:\n{data_info_str(data)}", level="info") + + log("Creating nested structure...", level="info") + pk_cols = primary_key + ["timestamp_captura"] + data = ( + data.groupby(pk_cols) + .apply( + lambda x: x[data.columns.difference(pk_cols)].to_json( + orient="records" + ) + ) + .str.strip("[]") + .reset_index(name="content")[ + primary_key + ["content", "timestamp_captura"] + ] + ) - database_secrets = secrets["databases"][table_params["database"]] + log( + f"Finished nested structure! 
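A runnable sketch of the nesting step above, with illustrative data: rows are grouped by the primary key plus `timestamp_captura`, and the remaining columns are serialized into a JSON `content` column.

import pandas as pd

data = pd.DataFrame(
    {
        "id": [1, 1, 2],
        "timestamp_captura": ["2023-09-29 17:00:00"] * 3,
        "valor": [10, 20, 30],
        "status": ["ok", "ok", "erro"],
    }
)

primary_key = ["id"]
pk_cols = primary_key + ["timestamp_captura"]
nested = (
    data.groupby(pk_cols)
    .apply(lambda x: x[data.columns.difference(pk_cols)].to_json(orient="records"))
    .str.strip("[]")
    .reset_index(name="content")[primary_key + ["content", "timestamp_captura"]]
)
print(nested)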
Data:\n{data_info_str(data)}", + level="info", + ) - request_url = secrets["vpn_url"] + database_secrets["engine"] + # save treated local + filepath = save_treated_local_func( + data=data, error=error, filepath=filepath + ) - request_params = { - "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado - "database": table_params["database"], - "query": table_params["query"].format(**datetime_range), - } + except Exception: # pylint: disable=W0703 + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return request_params, request_url + return error, filepath diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 9ddf7d687..1d71dd3dd 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -8,12 +8,18 @@ from pathlib import Path from datetime import timedelta, datetime -from typing import List +from typing import List, Union +import traceback import io +import json +import zipfile +import pytz +import requests import basedosdados as bd from basedosdados import Table import pandas as pd -import pytz +from google.cloud.storage.blob import Blob + from prefect.schedules.clocks import IntervalClock @@ -398,46 +404,41 @@ def data_info_str(data: pd.DataFrame): def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-locals - interval: timedelta, + clock_interval: timedelta, labels: List[str], - table_parameters: list, - dataset_id: str, - secret_path: str, + table_parameters: Union[list[dict], dict], runs_interval_minutes: int = 15, start_date: datetime = datetime( 2020, 1, 1, tzinfo=pytz.timezone(emd_constants.DEFAULT_TIMEZONE.value) ), + **general_flow_params, ) -> List[IntervalClock]: """ Generates multiple schedules Args: - interval (timedelta): The interval to run the schedule + clock_interval (timedelta): The interval to run the schedule labels (List[str]): The labels to be added to the schedule - table_parameters (list): The table parameters - dataset_id (str): The dataset_id to be used in the schedule - secret_path (str): The secret path to be used in the schedule + table_parameters (list): The table parameters to iterate over runs_interval_minutes (int, optional): The interval between each schedule. Defaults to 15. start_date (datetime, optional): The start date of the schedule. Defaults to datetime(2020, 1, 1, tzinfo=pytz.timezone(emd_constants.DEFAULT_TIMEZONE.value)). - + general_flow_params: Any param that you want to pass to the flow Returns: List[IntervalClock]: The list of schedules """ + if isinstance(table_parameters, dict): + table_parameters = [table_parameters] clocks = [] for count, parameters in enumerate(table_parameters): - parameter_defaults = { - "table_params": parameters, - "dataset_id": dataset_id, - "secret_path": secret_path, - "interval": interval.total_seconds(), - } + parameter_defaults = parameters | general_flow_params + log(f"parameter_defaults: {parameter_defaults}") clocks.append( IntervalClock( - interval=interval, + interval=clock_interval, start_date=start_date + timedelta(minutes=runs_interval_minutes * count), labels=labels, @@ -445,3 +446,304 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l ) ) return clocks + + +def save_raw_local_func( + data: Union[dict, str], filepath: str, mode: str = "raw", filetype: str = "json" +) -> str: + """ + Saves json response from API to .json file. 
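A condensed sketch of what the reworked `generate_execute_schedules` above produces: one IntervalClock per table, all sharing the same interval and labels, each with a start_date staggered by `runs_interval_minutes` and with `parameter_defaults` merged from the table dict and the shared flow params. The label value below is an illustrative placeholder.

from datetime import datetime, timedelta

import pytz
from prefect.schedules.clocks import IntervalClock

table_parameters = [{"table_id": "linha"}, {"table_id": "grupo"}]
general_flow_params = {
    "dataset_id": "br_rj_riodejaneiro_bilhetagem",
    "source_type": "api-json",
}
start_date = datetime(2020, 1, 1, tzinfo=pytz.timezone("America/Sao_Paulo"))
runs_interval_minutes = 5

clocks = [
    IntervalClock(
        interval=timedelta(days=1),
        start_date=start_date + timedelta(minutes=runs_interval_minutes * count),
        labels=["illustrative-agent-label"],
        parameter_defaults=parameters | general_flow_params,
    )
    for count, parameters in enumerate(table_parameters)
]
print(len(clocks))  # one staggered clock per table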
+ Args: + filepath (str): Path which to save raw file + status (dict): Must contain keys + * data: json returned from API + * error: error catched from API request + mode (str, optional): Folder to save locally, later folder which to upload to GCS. + Returns: + str: Path to the saved file + """ + + # diferentes tipos de arquivos para salvar + _filepath = filepath.format(mode=mode, filetype=filetype) + Path(_filepath).parent.mkdir(parents=True, exist_ok=True) + + if filetype == "json": + if isinstance(data, dict): + data = json.loads(data) + json.dump(data, Path(_filepath).open("w", encoding="utf-8")) + + # if filetype == "csv": + # pass + if filetype in ("txt", "csv"): + with open(_filepath, "w", encoding="utf-8") as file: + file.write(data) + + log(f"Raw data saved to: {_filepath}") + return _filepath + + +def get_raw_data_api( # pylint: disable=R0912 + url: str, + secret_path: str = None, + api_params: dict = None, + filetype: str = None, +) -> tuple[str, str, str]: + """ + Request data from URL API + + Args: + url (str): URL to request data + secret_path (str, optional): Secret path to get headers. Defaults to None. + api_params (dict, optional): Parameters to pass to API. Defaults to None. + filetype (str, optional): Filetype to save raw file. Defaults to None. + + Returns: + tuple[str, str, str]: Error, data and filetype + """ + error = None + data = None + try: + if secret_path is None: + headers = secret_path + else: + headers = get_vault_secret(secret_path)["data"] + + response = requests.get( + url, + headers=headers, + timeout=constants.MAX_TIMEOUT_SECONDS.value, + params=api_params, + ) + + response.raise_for_status() + + if filetype == "json": + data = response.json() + else: + data = response.text + + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data, filetype + + +def get_upload_storage_blob( + dataset_id: str, + filename: str, +) -> Blob: + """ + Get a blob from upload zone in storage + + Args: + dataset_id (str): The dataset id on BigQuery. + filename (str): The filename in GCS. + + Returns: + Blob: blob object + """ + bucket = bd.Storage(dataset_id="", table_id="") + blob_list = list( + bucket.client["storage_staging"] + .bucket(bucket.bucket_name) + .list_blobs(prefix=f"upload/{dataset_id}/{filename}.") + ) + return blob_list[0] + + +def get_raw_data_gcs( + dataset_id: str, + table_id: str, + zip_filename: str = None, +) -> tuple[str, str, str]: + """ + Get raw data from GCS + + Args: + dataset_id (str): The dataset id on BigQuery. + table_id (str): The table id on BigQuery. + zip_filename (str, optional): The zip file name. Defaults to None. 
+ + Returns: + tuple[str, str, str]: Error, data and filetype + """ + error = None + data = None + filetype = None + + try: + blob_search_name = zip_filename or table_id + blob = get_upload_storage_blob(dataset_id=dataset_id, filename=blob_search_name) + + filename = blob.name + filetype = filename.split(".")[-1] + + data = blob.download_as_bytes() + + if filetype == "zip": + with zipfile.ZipFile(io.BytesIO(data), "r") as zipped_file: + filenames = zipped_file.namelist() + filename = list( + filter(lambda x: x.split(".")[0] == table_id, filenames) + )[0] + filetype = filename.split(".")[-1] + data = zipped_file.read(filename) + + data = data.decode(encoding="utf-8") + + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data, filetype + + +def save_treated_local_func( + filepath: str, data: pd.DataFrame, error: str, mode: str = "staging" +) -> str: + """ + Save treated file to CSV. + + Args: + filepath (str): Path to save file + data (pd.DataFrame): Dataframe to save + error (str): Error catched during execution + mode (str, optional): Folder to save locally, later folder which to upload to GCS. + + Returns: + str: Path to the saved file + """ + _filepath = filepath.format(mode=mode, filetype="csv") + Path(_filepath).parent.mkdir(parents=True, exist_ok=True) + if error is None: + data.to_csv(_filepath, index=False) + log(f"Treated data saved to: {_filepath}") + return _filepath + + +def upload_run_logs_to_bq( # pylint: disable=R0913 + dataset_id: str, + parent_table_id: str, + timestamp: str, + error: str = None, + previous_error: str = None, + recapture: bool = False, + mode: str = "raw", +): + """ + Upload execution status table to BigQuery. + Table is uploaded to the same dataset, named {parent_table_id}_logs. + If passing status_dict, should not pass timestamp and error. 
+ + Args: + dataset_id (str): dataset_id on BigQuery + parent_table_id (str): table_id on BigQuery + timestamp (str): timestamp to get datetime range + error (str): error catched during execution + previous_error (str): previous error catched during execution + recapture (bool): if the execution was a recapture + mode (str): folder to save locally, later folder which to upload to GCS + + Returns: + None + """ + table_id = parent_table_id + "_logs" + # Create partition directory + filename = f"{table_id}_{timestamp.isoformat()}" + partition = f"data={timestamp.date()}" + filepath = Path( + f"""data/{mode}/{dataset_id}/{table_id}/{partition}/{filename}.csv""" + ) + filepath.parent.mkdir(exist_ok=True, parents=True) + # Create dataframe to be uploaded + if not error and recapture is True: + # if the recapture is succeeded, update the column erro + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [True], + "erro": [f"[recapturado]{previous_error}"], + } + ) + log(f"Recapturing {timestamp} with previous error:\n{error}") + else: + # not recapturing or error during flow execution + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [error is None], + "erro": [error], + } + ) + # Save data local + dataframe.to_csv(filepath, index=False) + # Upload to Storage + create_or_append_table( + dataset_id=dataset_id, + table_id=table_id, + path=filepath.as_posix(), + partitions=partition, + ) + if error is not None: + raise Exception(f"Pipeline failed with error: {error}") + + +def get_datetime_range( + timestamp: datetime, + interval: timedelta, +) -> dict: + """ + Task to get datetime range in UTC + + Args: + timestamp (datetime): timestamp to get datetime range + interval (timedelta): interval to get datetime range + + Returns: + dict: datetime range + """ + + start = ( + (timestamp - interval) + .astimezone(tz=pytz.timezone("UTC")) + .strftime("%Y-%m-%d %H:%M:%S") + ) + + end = timestamp.astimezone(tz=pytz.timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") + + return {"start": start, "end": end} + + +def read_raw_data(filepath: str, csv_args: dict = None) -> tuple[str, pd.DataFrame]: + """ + Read raw data from file + + Args: + filepath (str): filepath to read + csv_args (dict): arguments to pass to pandas.read_csv + + Returns: + tuple[str, pd.DataFrame]: error and data + """ + error = None + data = None + try: + file_type = filepath.split(".")[-1] + + if file_type == "json": + data = pd.read_json(filepath) + + # data = json.loads(data) + elif file_type in ("txt", "csv"): + if csv_args is None: + csv_args = {} + data = pd.read_csv(filepath, **csv_args) + else: + error = "Unsupported raw file extension. Supported only: json, csv and txt" + + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index efc21c133..adf89bc94 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -711,16 +711,24 @@ def get_credentials_from_env( return cred -def get_storage_blobs(dataset_id: str, table_id: str) -> list: +def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> list: """ Get all blobs from a table in a dataset. + + Args: + dataset_id (str): dataset id + table_id (str): table id + mode (str, optional): mode to use. Defaults to "staging". 
+ + Returns: + list: list of blobs """ bd_storage = bd.Storage(dataset_id=dataset_id, table_id=table_id) return list( bd_storage.client["storage_staging"] .bucket(bd_storage.bucket_name) - .list_blobs(prefix=f"staging/{bd_storage.dataset_id}/{bd_storage.table_id}/") + .list_blobs(prefix=f"{mode}/{bd_storage.dataset_id}/{bd_storage.table_id}/") ) From fa29fc38850e11c4058293e1f830cbbe1a0adcc6 Mon Sep 17 00:00:00 2001 From: Gabriel Gazola Milan Date: Mon, 2 Oct 2023 12:41:38 -0300 Subject: [PATCH 37/41] feat: update query --- pipelines/rj_segovi/dump_db_1746/schedules.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_segovi/dump_db_1746/schedules.py b/pipelines/rj_segovi/dump_db_1746/schedules.py index 50e416106..f8f6819ad 100644 --- a/pipelines/rj_segovi/dump_db_1746/schedules.py +++ b/pipelines/rj_segovi/dump_db_1746/schedules.py @@ -374,7 +374,8 @@ case when cv.ic_vinculo = 'O' or cv.ic_vinculo = 'S' then cv.id_chamado_pai_fk end ) as 'reclamacoes', - no_justificativa + no_justificativa, + oc.id_origem_ocorrencia from tb_chamado as ch inner join ( @@ -550,7 +551,8 @@ chs.dt_alvo_finalizacao, chs.dt_alvo_diagnostico, cl.dt_real_diagnostico, - no_justificativa + no_justificativa, + oc.id_origem_ocorrencia """ _1746_queries = { From 533212e34d0ef3d6507fb9037b66b7a9fd3bc57a Mon Sep 17 00:00:00 2001 From: Fernanda Scovino Date: Mon, 2 Oct 2023 19:19:26 -0300 Subject: [PATCH 38/41] =?UTF-8?q?Adiciona=20novos=20`code=5Fowners`=20da?= =?UTF-8?q?=20SMTR=20=F0=9F=AB=82=20=20(#519)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add carol e rafa como code owners * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- code_owners.yaml | 2 ++ pipelines/constants.py | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/code_owners.yaml b/code_owners.yaml index f2f563c5f..775494551 100644 --- a/code_owners.yaml +++ b/code_owners.yaml @@ -20,6 +20,8 @@ pipelines: - fernandascovino - eng-rodrigocunha - borismarinho + - pixuimpou + - lingsv rj_escritorio: owners: - gabriel-milan diff --git a/pipelines/constants.py b/pipelines/constants.py index 309325d35..900e2ebf9 100644 --- a/pipelines/constants.py +++ b/pipelines/constants.py @@ -138,4 +138,12 @@ class constants(Enum): # pylint: disable=c0103 "user_id": "369657115012366336", "type": "user_nickname", }, + "rafaelpinheiro": { + "user_id": "1131538976101109772", + "type": "user_nickname", + }, + "carolinagomes": { + "user_id": "620000269392019469", + "type": "user_nickname", + }, } From c689b4e67531c494476d55f10277a2b863113e50 Mon Sep 17 00:00:00 2001 From: Rafael Carvalho Pinheiro <74972217+pixuimpou@users.noreply.github.com> Date: Thu, 5 Oct 2023 11:26:43 -0300 Subject: [PATCH 39/41] =?UTF-8?q?Cria=20flow=20generico=20de=20materializa?= =?UTF-8?q?=C3=A7=C3=A3o=20+=20Adiciona=20tratamento=20transa=C3=A7=C3=A3o?= =?UTF-8?q?=20Ja=C3=A9=20(#513)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * create default materialization flow * create tasks for default materialization flow * make generate_execute_schedules more generic * create bilhetagem materialization flow * adapt bilhetagem schedules for the new model * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add run config and storage * Update utils.py * fix sub tasks * fix 
fetch_dataset_sha run * add run_date variable to materialization flow * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove discord notifications for testing * add manual date_range / fix flow run name * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix missing table_id logic * fix empty return * fix empty return * add flag_date_range when var_params is blank * change rename logic when has date variables * change return values of create_dbt_run_vars * create dict aux function * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove *args from task * change coalesce task * fix rename task * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix task order * add docstrings * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix line too long * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * pre-commit hook * adjust tasks * mudar estrutura do flow materializacao * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * adicionar schedule de bilhetagem * adicionar schedule no flow de materialização * ajuste nome da coluna de datetime * ajustar nome coluna * mudar coluna de data para datetime_transacao * ajusta variavel date_range manual * mudar nome parametro de variável dbt * cria flow de orquestração materialização * volta notificação do discord * ajusta wait_flow_run * mudar query para teste * reverter query teste * usar copy no dicionario de variaveis de data * adjust constant run interval * remover funcao comentada * alterar padrão de nome dos flows * remove imports comentados * remove schedules nao utilizados * remove task comentada * mudar agent para produção --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: Rodrigo Cunha <66736583+eng-rodrigocunha@users.noreply.github.com> --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 116 +++++++++++++++-- .../schedules.py | 21 +--- pipelines/rj_smtr/constants.py | 32 +++-- pipelines/rj_smtr/flows.py | 84 ++++++++++++- pipelines/rj_smtr/tasks.py | 117 +++++++++++++++++- pipelines/rj_smtr/utils.py | 14 ++- 6 files changed, 344 insertions(+), 40 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index d7f44e3b9..568f96154 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -7,26 +7,46 @@ from prefect.run_configs import KubernetesRun from prefect.storage import GCS +from prefect.tasks.prefect import create_flow_run, wait_for_flow_run +from prefect.utilities.edges import unmapped # EMD Imports # from pipelines.constants import constants as emd_constants +from pipelines.utils.decorators import Flow +from pipelines.utils.tasks import ( + rename_current_flow_run_now_time, + get_current_flow_labels, +) + + +from pipelines.utils.utils import set_default_parameters # SMTR Imports # -from pipelines.rj_smtr.flows import default_capture_flow +from pipelines.rj_smtr.flows import ( + default_capture_flow, + default_materialization_flow, +) + +from pipelines.rj_smtr.tasks import ( + 
get_current_timestamp, +) from pipelines.rj_smtr.br_rj_riodejaneiro_bilhetagem.schedules import ( - bilhetagem_principal_schedule, bilhetagem_transacao_schedule, ) +from pipelines.rj_smtr.constants import constants + +from pipelines.rj_smtr.schedules import every_hour + # Flows # # BILHETAGEM TRANSAÇÃO - CAPTURA A CADA MINUTO # bilhetagem_transacao_captura = deepcopy(default_capture_flow) -bilhetagem_transacao_captura.name = "SMTR: Bilhetagem Transação (captura)" +bilhetagem_transacao_captura.name = "SMTR: Bilhetagem Transação - Captura" bilhetagem_transacao_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, @@ -34,13 +54,91 @@ ) bilhetagem_transacao_captura.schedule = bilhetagem_transacao_schedule -# BILHETAGEM PRINCIPAL - CAPTURA DIÁRIA DE DIVERSAS TABELAS # +# BILHETAGEM AUXILIAR - SUBFLOW PARA RODAR ANTES DE CADA MATERIALIZAÇÃO # + +bilhetagem_auxiliar_captura = deepcopy(default_capture_flow) +bilhetagem_auxiliar_captura.name = "SMTR: Bilhetagem Auxiliar - Captura (subflow)" +bilhetagem_auxiliar_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) +bilhetagem_auxiliar_captura.run_config = KubernetesRun( + image=emd_constants.DOCKER_IMAGE.value, + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], +) + +bilhetagem_auxiliar_captura = set_default_parameters( + flow=bilhetagem_auxiliar_captura, + default_parameters={ + "dataset_id": constants.BILHETAGEM_DATASET_ID.value, + "secret_path": constants.BILHETAGEM_SECRET_PATH.value, + "source_type": constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], + }, +) + +# MATERIALIZAÇÃO - SUBFLOW DE MATERIALIZAÇÃO +bilhetagem_materializacao = deepcopy(default_materialization_flow) +bilhetagem_materializacao.name = "SMTR: Bilhetagem Transação - Materialização (subflow)" +bilhetagem_materializacao.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) +bilhetagem_materializacao.run_config = KubernetesRun( + image=emd_constants.DOCKER_IMAGE.value, + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], +) + +bilhetagem_materializacao_parameters = { + "dataset_id": constants.BILHETAGEM_DATASET_ID.value +} | constants.BILHETAGEM_MATERIALIZACAO_PARAMS.value + +bilhetagem_materializacao = set_default_parameters( + flow=bilhetagem_materializacao, + default_parameters=bilhetagem_materializacao_parameters, +) + +# TRATAMENTO - RODA DE HORA EM HORA, CAPTURA AUXILIAR + MATERIALIZAÇÃO +with Flow( + "SMTR: Bilhetagem Transação - Tratamento", + code_owners=["caio", "fernanda", "boris", "rodrigo"], +) as bilhetagem_transacao_tratamento: + timestamp = get_current_timestamp() + + rename_flow_run = rename_current_flow_run_now_time( + prefix=bilhetagem_transacao_tratamento.name + " ", + now_time=timestamp, + ) + + LABELS = get_current_flow_labels() + + # Captura + runs_captura = create_flow_run.map( + flow_name=unmapped(bilhetagem_auxiliar_captura.name), + project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), + parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, + labels=unmapped(LABELS), + ) + + wait_captura = wait_for_flow_run.map( + runs_captura, + stream_states=unmapped(True), + stream_logs=unmapped(True), + raise_final_state=unmapped(True), + ) + + # Materialização + run_materializacao = create_flow_run( + flow_name=bilhetagem_materializacao.name, + project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + labels=LABELS, + upstream_tasks=[wait_captura], + ) + + wait_materializacao = wait_for_flow_run( + run_materializacao, + 
stream_states=True, + stream_logs=True, + raise_final_state=True, + ) -bilhetagem_principal_captura = deepcopy(default_capture_flow) -bilhetagem_principal_captura.name = "SMTR: Bilhetagem Principal (captura)" -bilhetagem_principal_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) -bilhetagem_principal_captura.run_config = KubernetesRun( +bilhetagem_transacao_tratamento.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) +bilhetagem_transacao_tratamento.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) -bilhetagem_principal_captura.schedule = bilhetagem_principal_schedule +bilhetagem_transacao_tratamento.schedule = every_hour +# bilhetagem_materializacao.schedule = bilhetagem_materializacao_schedule diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index 2f7804811..c2ee21164 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -15,27 +15,10 @@ generate_execute_schedules, ) -bilhetagem_principal_clocks = generate_execute_schedules( - clock_interval=timedelta( - **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["principal_run_interval"] - ), - labels=[ - emd_constants.RJ_SMTR_AGENT_LABEL.value, - ], - table_parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, - dataset_id=constants.BILHETAGEM_DATASET_ID.value, - secret_path=constants.BILHETAGEM_SECRET_PATH.value, - source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], - runs_interval_minutes=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value[ - "principal_runs_interval_minutes" - ], -) - -bilhetagem_principal_schedule = Schedule(clocks=untuple(bilhetagem_principal_clocks)) - +BILHETAGEM_TRANSACAO_INTERVAL = timedelta(minutes=1) bilhetagem_transacao_clocks = generate_execute_schedules( clock_interval=timedelta( - **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["transacao_run_interval"] + **constants.BILHETAGEM_CAPTURE_RUN_INTERVAL.value["transacao_run_interval"] ), labels=[ emd_constants.RJ_SMTR_AGENT_LABEL.value, diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 52e30d9f8..ee8a22cd2 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -183,12 +183,15 @@ class constants(Enum): # pylint: disable=c0103 }, "vpn_url": "http://vpn-jae.mobilidade.rio/", "source_type": "api-json", - "transacao_run_interval": {"minutes": 1}, - "principal_run_interval": {"days": 1}, "transacao_runs_interval_minutes": 0, "principal_runs_interval_minutes": 5, } + BILHETAGEM_CAPTURE_RUN_INTERVAL = { + "transacao_run_interval": {"minutes": 1}, + "principal_run_interval": {"days": 1}, + } + BILHETAGEM_TRANSACAO_CAPTURE_PARAMS = { "table_id": "transacao", "partition_date_only": False, @@ -203,11 +206,13 @@ class constants(Enum): # pylint: disable=c0103 data_processamento BETWEEN '{start}' AND '{end}' """, - "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS["transacao_run_interval"], + "run_interval": BILHETAGEM_CAPTURE_RUN_INTERVAL["transacao_run_interval"], }, "primary_key": ["id"], # id column to nest data on } + BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" + BILHETAGEM_CAPTURE_PARAMS = [ { "table_id": "linha", @@ -222,7 +227,7 @@ class constants(Enum): # pylint: disable=c0103 WHERE DT_INCLUSAO >= '{start}' """, - "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "run_interval": BILHETAGEM_CAPTURE_RUN_INTERVAL[ 
"principal_run_interval" ], }, @@ -241,7 +246,7 @@ class constants(Enum): # pylint: disable=c0103 WHERE DT_INCLUSAO >= '{start}' """, - "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "run_interval": BILHETAGEM_CAPTURE_RUN_INTERVAL[ "principal_run_interval" ], }, @@ -260,7 +265,7 @@ class constants(Enum): # pylint: disable=c0103 WHERE DT_INCLUSAO >= '{start}' """, - "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "run_interval": BILHETAGEM_CAPTURE_RUN_INTERVAL[ "principal_run_interval" ], }, @@ -279,7 +284,7 @@ class constants(Enum): # pylint: disable=c0103 WHERE dt_inclusao >= '{start}' """, - "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "run_interval": BILHETAGEM_CAPTURE_RUN_INTERVAL[ "principal_run_interval" ], }, @@ -289,4 +294,15 @@ class constants(Enum): # pylint: disable=c0103 ], # id column to nest data on }, ] - BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" + + BILHETAGEM_MATERIALIZACAO_PARAMS = { + "table_id": BILHETAGEM_TRANSACAO_CAPTURE_PARAMS["table_id"], + "upstream": True, + "dbt_vars": { + "date_range": { + "table_run_datetime_column_name": "datetime_transacao", + "delay_hours": 1, + }, + "version": {}, + }, + } diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 4860c6d07..0efb69b17 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -5,7 +5,8 @@ from prefect.run_configs import KubernetesRun from prefect.storage import GCS -from prefect import Parameter +from prefect import case, Parameter +from prefect.utilities.edges import unmapped # EMD Imports # @@ -13,7 +14,11 @@ from pipelines.utils.decorators import Flow from pipelines.utils.tasks import ( rename_current_flow_run_now_time, + get_now_time, + get_current_flow_labels, + get_current_flow_mode, ) +from pipelines.utils.execute_dbt_model.tasks import get_k8s_dbt_client # SMTR Imports # @@ -22,13 +27,17 @@ create_local_partition_path, get_current_timestamp, parse_timestamp_to_string, + transform_raw_to_nested_structure, + create_dbt_run_vars, + set_last_run_timestamp, + coalesce_task, upload_raw_data_to_gcs, upload_staging_data_to_gcs, - transform_raw_to_nested_structure, get_raw_from_sources, create_request_params, ) +from pipelines.utils.execute_dbt_model.tasks import run_dbt_model with Flow( "SMTR: Captura", @@ -114,3 +123,74 @@ image=emd_constants.DOCKER_IMAGE.value, labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) + +with Flow( + "SMTR: Materialização", + code_owners=["caio", "fernanda", "boris", "rodrigo"], +) as default_materialization_flow: + # SETUP # + + dataset_id = Parameter("dataset_id", default=None) + table_id = Parameter("table_id", default=None) + raw_table_id = Parameter("raw_table_id", default=None) + dbt_alias = Parameter("dbt_alias", default=False) + upstream = Parameter("upstream", default=None) + downstream = Parameter("downstream", default=None) + exclude = Parameter("exclude", default=None) + flags = Parameter("flags", default=None) + dbt_vars = Parameter("dbt_vars", default=dict()) + + # treated_table_params = treat_dbt_table_params(table_params=table_params) + + LABELS = get_current_flow_labels() + MODE = get_current_flow_mode(LABELS) + + _vars, date_var, flag_date_range = create_dbt_run_vars( + dataset_id=dataset_id, + dbt_vars=dbt_vars, + table_id=table_id, + raw_dataset_id=dataset_id, + raw_table_id=raw_table_id, + mode=MODE, + ) + + # Rename flow run + + flow_name_prefix = coalesce_task([table_id, dataset_id]) + + flow_name_now_time = coalesce_task([date_var, get_now_time()]) + + rename_flow_run = 
rename_current_flow_run_now_time( + prefix=default_materialization_flow.name + " " + flow_name_prefix + ": ", + now_time=flow_name_now_time, + ) + + dbt_client = get_k8s_dbt_client(mode=MODE, wait=rename_flow_run) + + RUNS = run_dbt_model.map( + dbt_client=unmapped(dbt_client), + dataset_id=unmapped(dataset_id), + table_id=unmapped(table_id), + _vars=_vars, + dbt_alias=unmapped(dbt_alias), + upstream=unmapped(upstream), + downstream=unmapped(downstream), + exclude=unmapped(exclude), + flags=unmapped(flags), + ) + + with case(flag_date_range, True): + set_last_run_timestamp( + dataset_id=dataset_id, + table_id=table_id, + timestamp=date_var["date_range_end"], + wait=RUNS, + mode=MODE, + ) + + +default_materialization_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) +default_materialization_flow.run_config = KubernetesRun( + image=emd_constants.DOCKER_IMAGE.value, + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], +) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index a846851b5..f7d687dea 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -8,7 +8,7 @@ import os from pathlib import Path import traceback -from typing import Dict, List, Union +from typing import Dict, List, Union, Iterable import io from basedosdados import Storage, Table @@ -28,6 +28,7 @@ get_last_run_timestamp, log_critical, data_info_str, + dict_contains_keys, get_raw_data_api, get_raw_data_gcs, upload_run_logs_to_bq, @@ -1095,3 +1096,117 @@ def transform_raw_to_nested_structure( log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, filepath + + +@task(checkpoint=False) +def coalesce_task(value_list: Iterable): + """ + Task to get the first non None value of a list + + Args: + value_list (Iterable): a iterable object with the values + Returns: + any: value_list's first non None item + """ + + try: + return next(value for value in value_list if value is not None) + except StopIteration: + return + + +@task(checkpoint=False, nout=3) +def create_dbt_run_vars( + dataset_id: str, + dbt_vars: dict, + table_id: str, + raw_dataset_id: str, + raw_table_id: str, + mode: str, +) -> tuple[list[dict], Union[list[dict], dict, None], bool]: + """ + Create the variables to be used in dbt materialization based on a dict + + Args: + dataset_id (str): the dataset_id to get the variables + dbt_vars (dict): dict containing the parameters + table_id (str): the table_id get the date_range variable + raw_dataset_id (str): the raw_dataset_id get the date_range variable + raw_table_id (str): the raw_table_id get the date_range variable + mode (str): the mode to get the date_range variable + + Returns: + tuple[list[dict]: the variables to be used in DBT + Union[list[dict], dict, None]: the date variable (date_range or run_date) + bool: a flag that indicates if the date_range variable came from Redis + """ + + log(f"Creating DBT variables. Parameter received: {dbt_vars}") + + if (not dbt_vars) or (not table_id): + log("dbt_vars or table_id are blank. 
Skiping task") + return [None], None, False + + final_vars = [] + date_var = None + flag_date_range = False + + if "date_range" in dbt_vars.keys(): + log("Creating date_range variable") + + # Set date_range variable manually + if dict_contains_keys( + dbt_vars["date_range"], ["date_range_start", "date_range_end"] + ): + date_var = { + "date_range_start": dbt_vars["date_range"]["date_range_start"], + "date_range_end": dbt_vars["date_range"]["date_range_end"], + } + # Create date_range using Redis + else: + raw_table_id = raw_table_id or table_id + + date_var = get_materialization_date_range.run( + dataset_id=dataset_id, + table_id=table_id, + raw_dataset_id=raw_dataset_id, + raw_table_id=raw_table_id, + table_run_datetime_column_name=dbt_vars["date_range"].get( + "table_run_datetime_column_name" + ), + mode=mode, + delay_hours=dbt_vars["date_range"].get("delay_hours", 0), + ) + + flag_date_range = True + + final_vars.append(date_var.copy()) + + log(f"date_range created: {date_var}") + + elif "run_date" in dbt_vars.keys(): + log("Creating run_date variable") + + date_var = get_run_dates.run( + dbt_vars["run_date"].get("date_range_start"), + dbt_vars["run_date"].get("date_range_end"), + ) + final_vars.append([d.copy() for d in date_var]) + + log(f"run_date created: {date_var}") + + if "version" in dbt_vars.keys(): + log("Creating version variable") + dataset_sha = fetch_dataset_sha.run(dataset_id=dataset_id) + + # if there are other variables inside the list, update each item adding the version variable + if final_vars: + final_vars = get_join_dict.run(dict_list=final_vars, new_dict=dataset_sha) + else: + final_vars.append(dataset_sha) + + log(f"version created: {dataset_sha}") + + log(f"All variables was created, final value is: {final_vars}") + + return final_vars, date_var, flag_date_range diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 1d71dd3dd..f9b98afab 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -434,7 +434,6 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l clocks = [] for count, parameters in enumerate(table_parameters): parameter_defaults = parameters | general_flow_params - log(f"parameter_defaults: {parameter_defaults}") clocks.append( IntervalClock( @@ -448,6 +447,19 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l return clocks +def dict_contains_keys(input_dict: dict, keys: list[str]) -> bool: + """ + Test if the input dict has all keys present in the list + + Args: + input_dict (dict): the dict to test if has the keys + keys (list[str]): the list containing the keys to check + Returns: + bool: True if the input_dict has all the keys otherwise False + """ + return all(x in input_dict.keys() for x in keys) + + def save_raw_local_func( data: Union[dict, str], filepath: str, mode: str = "raw", filetype: str = "json" ) -> str: From b847649c388670ba6314ac11b791bebad985f396 Mon Sep 17 00:00:00 2001 From: d116626 Date: Thu, 5 Oct 2023 15:55:14 -0300 Subject: [PATCH 40/41] fix: smfp sigma dataset name --- pipelines/rj_smfp/__init__.py | 2 +- .../__init__.py | 0 .../flows.py | 12 ++++++------ .../schedules.py | 5 +++-- 4 files changed, 10 insertions(+), 9 deletions(-) rename pipelines/rj_smfp/{dump_db_sigma_medicamentos => dump_db_sigma_compras_materiais}/__init__.py (100%) rename pipelines/rj_smfp/{dump_db_sigma_medicamentos => dump_db_sigma_compras_materiais}/flows.py (74%) rename pipelines/rj_smfp/{dump_db_sigma_medicamentos => 
dump_db_sigma_compras_materiais}/schedules.py (97%) diff --git a/pipelines/rj_smfp/__init__.py b/pipelines/rj_smfp/__init__.py index ea6519e3b..a8b9019d0 100644 --- a/pipelines/rj_smfp/__init__.py +++ b/pipelines/rj_smfp/__init__.py @@ -6,7 +6,7 @@ from pipelines.rj_smfp.dump_db_ergon_comlurb.flows import * from pipelines.rj_smfp.dump_db_metas.flows import * from pipelines.rj_smfp.dump_db_sigma.flows import * -from pipelines.rj_smfp.dump_db_sigma_medicamentos.flows import * +from pipelines.rj_smfp.dump_db_sigma_compras_materiais.flows import * from pipelines.rj_smfp.dump_inadimplente.flows import * from pipelines.rj_smfp.dump_url_metas.flows import * from pipelines.rj_smfp.goals_dashboard_dbt.flows import * diff --git a/pipelines/rj_smfp/dump_db_sigma_medicamentos/__init__.py b/pipelines/rj_smfp/dump_db_sigma_compras_materiais/__init__.py similarity index 100% rename from pipelines/rj_smfp/dump_db_sigma_medicamentos/__init__.py rename to pipelines/rj_smfp/dump_db_sigma_compras_materiais/__init__.py diff --git a/pipelines/rj_smfp/dump_db_sigma_medicamentos/flows.py b/pipelines/rj_smfp/dump_db_sigma_compras_materiais/flows.py similarity index 74% rename from pipelines/rj_smfp/dump_db_sigma_medicamentos/flows.py rename to pipelines/rj_smfp/dump_db_sigma_compras_materiais/flows.py index d8b6bd62e..8c8c1fdf9 100644 --- a/pipelines/rj_smfp/dump_db_sigma_medicamentos/flows.py +++ b/pipelines/rj_smfp/dump_db_sigma_compras_materiais/flows.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -Database dumping flows for SMFP SIGMA MEDICAMENTOS +Database dumping flows for SMFP SIGMA COMPRAS MATERIAIS """ from copy import deepcopy @@ -11,15 +11,15 @@ from pipelines.constants import constants # importa o schedule -from pipelines.rj_smfp.dump_db_sigma_medicamentos.schedules import ( - sigma_daily_update_schedule, +from pipelines.rj_smfp.dump_db_sigma_compras_materiais.schedules import ( + compras_sigma_daily_update_schedule, ) from pipelines.utils.dump_db.flows import dump_sql_flow from pipelines.utils.utils import set_default_parameters rj_smfp_dump_db_sigma_medicamentos_flow = deepcopy(dump_sql_flow) rj_smfp_dump_db_sigma_medicamentos_flow.name = ( - "SMFP: SIGMA - MEDICAMENTOS - Ingerir tabelas de banco SQL" + "SMFP: COMPRAS MATERIAIS SERVICOS SIGMA - Ingerir tabelas de banco SQL" ) rj_smfp_dump_db_sigma_medicamentos_flow.storage = GCS(constants.GCS_FLOWS_BUCKET.value) @@ -35,7 +35,7 @@ "db_host": "10.90.31.22", "db_port": "1521", "db_type": "oracle", - "dataset_id": "saude_medicamentos_sigma", + "dataset_id": "compras_materiais_servicos_sigma", "vault_secret_path": "db-sigma", } @@ -44,4 +44,4 @@ default_parameters=rj_smfp_dump_db_sigma_medicamentos_default_parameters, ) -rj_smfp_dump_db_sigma_medicamentos_flow.schedule = sigma_daily_update_schedule +rj_smfp_dump_db_sigma_medicamentos_flow.schedule = compras_sigma_daily_update_schedule diff --git a/pipelines/rj_smfp/dump_db_sigma_medicamentos/schedules.py b/pipelines/rj_smfp/dump_db_sigma_compras_materiais/schedules.py similarity index 97% rename from pipelines/rj_smfp/dump_db_sigma_medicamentos/schedules.py rename to pipelines/rj_smfp/dump_db_sigma_compras_materiais/schedules.py index 0f61241ff..dd6847c56 100644 --- a/pipelines/rj_smfp/dump_db_sigma_medicamentos/schedules.py +++ b/pipelines/rj_smfp/dump_db_sigma_compras_materiais/schedules.py @@ -156,6 +156,7 @@ CNPJ_FABRICANTE FROM SIGMA.VW_MOVIMENTACAO """, # noqa + "interval": timedelta(days=7), }, "ramo_atividade": { "biglake_table": True, @@ -224,9 +225,9 @@ db_host="10.90.31.22", db_port="1521", 
db_type="oracle", - dataset_id="saude_medicamentos_sigma", + dataset_id="compras_materiais_servicos_sigma", vault_secret_path="db-sigma", table_parameters=_sigma_queries, ) -sigma_daily_update_schedule = Schedule(clocks=untuple(sigma_infra_clocks)) +compras_sigma_daily_update_schedule = Schedule(clocks=untuple(sigma_infra_clocks)) From 5a95a5aa0c28b47a04586d195d80c464c1c52402 Mon Sep 17 00:00:00 2001 From: Rafael Carvalho Pinheiro <74972217+pixuimpou@users.noreply.github.com> Date: Thu, 5 Oct 2023 16:18:04 -0300 Subject: [PATCH 41/41] Alterar interval tabelas auxiliares bilhetagem (#525) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * create default materialization flow * create tasks for default materialization flow * make generate_execute_schedules more generic * create bilhetagem materialization flow * adapt bilhetagem schedules for the new model * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add run config and storage * Update utils.py * fix sub tasks * fix fetch_dataset_sha run * add run_date variable to materialization flow * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove discord notifications for testing * add manual date_range / fix flow run name * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix missing table_id logic * fix empty return * fix empty return * add flag_date_range when var_params is blank * change rename logic when has date variables * change return values of create_dbt_run_vars * create dict aux function * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove *args from task * change coalesce task * fix rename task * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix task order * add docstrings * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix line too long * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * pre-commit hook * adjust tasks * mudar estrutura do flow materializacao * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * adicionar schedule de bilhetagem * adicionar schedule no flow de materialização * ajuste nome da coluna de datetime * ajustar nome coluna * mudar coluna de data para datetime_transacao * ajusta variavel date_range manual * mudar nome parametro de variável dbt * cria flow de orquestração materialização * volta notificação do discord * ajusta wait_flow_run * mudar query para teste * reverter query teste * usar copy no dicionario de variaveis de data * adjust constant run interval * remover funcao comentada * alterar padrão de nome dos flows * remove imports comentados * remove schedules nao utilizados * remove task comentada * mudar agent para produção * mudar run interval tabelas auxiliares * remove tratamento comentado * ajusta dicionario constante --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: Rodrigo Cunha <66736583+eng-rodrigocunha@users.noreply.github.com> --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py | 4 +--- pipelines/rj_smtr/constants.py | 4 +--- pipelines/rj_smtr/flows.py | 2 -- 3 
files changed, 2 insertions(+), 8 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index c2ee21164..21e13f05b 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -27,9 +27,7 @@ dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], - runs_interval_minutes=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value[ - "transacao_runs_interval_minutes" - ], + runs_interval_minutes=0, ) bilhetagem_transacao_schedule = Schedule(clocks=untuple(bilhetagem_transacao_clocks)) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index ee8a22cd2..0037c6989 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -183,13 +183,11 @@ class constants(Enum): # pylint: disable=c0103 }, "vpn_url": "http://vpn-jae.mobilidade.rio/", "source_type": "api-json", - "transacao_runs_interval_minutes": 0, - "principal_runs_interval_minutes": 5, } BILHETAGEM_CAPTURE_RUN_INTERVAL = { "transacao_run_interval": {"minutes": 1}, - "principal_run_interval": {"days": 1}, + "principal_run_interval": {"hours": 1}, } BILHETAGEM_TRANSACAO_CAPTURE_PARAMS = { diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 0efb69b17..d4292129c 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -140,8 +140,6 @@ flags = Parameter("flags", default=None) dbt_vars = Parameter("dbt_vars", default=dict()) - # treated_table_params = treat_dbt_table_params(table_params=table_params) - LABELS = get_current_flow_labels() MODE = get_current_flow_mode(LABELS)
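
A minimal, self-contained sketch of the nesting step performed by `transform_raw_to_nested_structure` in the patches above. The column names (`id`, `valor`) and the capture timestamp are hypothetical, used only for illustration; the groupby/apply logic mirrors the task's code, collapsing all non-key columns into a JSON `content` column keyed by the primary key plus `timestamp_captura`.

# -*- coding: utf-8 -*-
"""Sketch (not part of any patch) of the nesting used in transform_raw_to_nested_structure."""
import pandas as pd

# Hypothetical raw capture: two records share the primary key id == 1.
data = pd.DataFrame(
    {
        "id": [1, 1, 2],
        "valor": [10, 20, 30],
        "timestamp_captura": ["2023-10-05 12:00:00"] * 3,
    }
)

primary_key = ["id"]
pk_cols = primary_key + ["timestamp_captura"]

# Group by primary key + capture timestamp and serialize the remaining
# columns into a JSON "content" column, as the task does before saving.
nested = (
    data.groupby(pk_cols)
    .apply(lambda x: x[data.columns.difference(pk_cols)].to_json(orient="records"))
    .str.strip("[]")
    .reset_index(name="content")[primary_key + ["content", "timestamp_captura"]]
)

print(nested)
# id 1 -> content '{"valor":10},{"valor":20}'; id 2 -> content '{"valor":30}'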