From b6089fa2bf7fd75fd5abb1fb8affe74eb63f3ff8 Mon Sep 17 00:00:00 2001 From: fernandascovino Date: Mon, 25 Sep 2023 19:09:32 -0300 Subject: [PATCH 001/145] remove task de particao nao usada --- pipelines/rj_smtr/tasks.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index de52c03df..983f93fbf 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -181,34 +181,6 @@ def parse_timestamp_to_string(timestamp: datetime, pattern="%Y-%m-%d-%H-%M-%S") return timestamp.strftime(pattern) -@task -def create_current_date_hour_partition(capture_time=None): - """Create partitioned directory structure to save data locally based - on capture time. - - Args: - capture_time(pendulum.datetime.DateTime, optional): - if recapturing data, will create partitions based - on the failed timestamps being recaptured - - Returns: - dict: "filename" contains the name which to upload the csv, "partitions" contains - the partitioned directory path - """ - if capture_time is None: - capture_time = datetime.now(tz=constants.TIMEZONE.value).replace( - minute=0, second=0, microsecond=0 - ) - date = capture_time.strftime("%Y-%m-%d") - hour = capture_time.strftime("%H") - - return { - "filename": capture_time.strftime("%Y-%m-%d-%H-%M-%S"), - "partitions": f"data={date}/hora={hour}", - "timestamp": capture_time, - } - - @task def create_local_partition_path( dataset_id: str, table_id: str, filename: str, partitions: str = None From dc197ccac6d2be6af8b6025974cbdd6e8c826041 Mon Sep 17 00:00:00 2001 From: fernandascovino Date: Mon, 25 Sep 2023 19:17:54 -0300 Subject: [PATCH 002/145] unifica tasks de particao de data e hora --- pipelines/rj_smtr/constants.py | 11 +++++------ pipelines/rj_smtr/flows.py | 12 ++---------- pipelines/rj_smtr/tasks.py | 15 +++++---------- pipelines/rj_smtr/veiculo/flows.py | 6 +++--- 4 files changed, 15 insertions(+), 29 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 7133b8abe..b22c4a412 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -180,8 +180,7 @@ class constants(Enum): # pylint: disable=c0103 ORDER BY data_processamento """, - "primary_key": ["id"], # id column to nest data on - "flag_date_partition": False, + "primary_key": ["id"] # id column to nest data on }, ] BILHETAGEM_TABLES_PARAMS = [ @@ -199,7 +198,7 @@ class constants(Enum): # pylint: disable=c0103 DT_INCLUSAO """, "primary_key": ["CD_LINHA"], # id column to nest data on - "flag_date_partition": True, + "partition_date_only": True, }, { "table_id": "grupo", @@ -215,7 +214,7 @@ class constants(Enum): # pylint: disable=c0103 DT_INCLUSAO """, "primary_key": ["CD_GRUPO"], - "flag_date_partition": True, + "partition_date_only": True, }, { "table_id": "grupo_linha", @@ -231,7 +230,7 @@ class constants(Enum): # pylint: disable=c0103 DT_INCLUSAO """, "primary_key": ["CD_GRUPO", "CD_LINHA"], # id column to nest data on - "flag_date_partition": True, + "partition_date_only": True, }, { "table_id": "matriz_integracao", @@ -250,7 +249,7 @@ class constants(Enum): # pylint: disable=c0103 "cd_versao_matriz", "cd_integracao", ], # id column to nest data on - "flag_date_partition": True, + "partition_date_only": True, }, ] BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index f1d29ed10..bfe9d86e4 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -5,8 +5,7 @@ from 
prefect.run_configs import KubernetesRun from prefect.storage import GCS -from prefect import case, Parameter -from prefect.tasks.control_flow import merge +from prefect import Parameter # EMD Imports # @@ -19,7 +18,6 @@ # SMTR Imports # from pipelines.rj_smtr.tasks import ( - create_date_partition, create_date_hour_partition, create_local_partition_path, get_current_timestamp, @@ -66,13 +64,7 @@ dataset_id=dataset_id, ) - with case(table_params["flag_date_partition"], True): - date_partitions = create_date_partition(timestamp) - - with case(table_params["flag_date_partition"], False): - date_hour_partitions = create_date_hour_partition(timestamp) - - partitions = merge(date_partitions, date_hour_partitions) + partitions = create_date_hour_partition(timestamp, partition_date_only=table_params["partition_date_only"]) filename = parse_timestamp_to_string(timestamp) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 983f93fbf..a2a5adddc 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -158,19 +158,14 @@ def get_current_timestamp(timestamp=None, truncate_minute: bool = True) -> datet @task -def create_date_hour_partition(timestamp: datetime) -> str: +def create_date_hour_partition(timestamp: datetime, partition_date_only: bool = False) -> str: """ Get date hour Hive partition structure from timestamp. """ - return f"data={timestamp.strftime('%Y-%m-%d')}/hora={timestamp.strftime('%H')}" - - -@task -def create_date_partition(timestamp: datetime) -> str: - """ - Get date hour Hive partition structure from timestamp. - """ - return f"data={timestamp.date()}" + partition = f"data={timestamp.strftime('%Y-%m-%d')}" + if partition_date_only: + parition += f"/hora={timestamp.strftime('%H')}" + return partition @task diff --git a/pipelines/rj_smtr/veiculo/flows.py b/pipelines/rj_smtr/veiculo/flows.py index 28188a129..e1fab515e 100644 --- a/pipelines/rj_smtr/veiculo/flows.py +++ b/pipelines/rj_smtr/veiculo/flows.py @@ -30,7 +30,7 @@ every_day_hour_seven, ) from pipelines.rj_smtr.tasks import ( - create_date_partition, + create_date_hour_partition, create_local_partition_path, get_current_timestamp, get_raw, @@ -71,7 +71,7 @@ ) # SETUP # - partitions = create_date_partition(timestamp) + partitions = create_date_hour_partition(timestamp, partition_date_only=True) filename = parse_timestamp_to_string(timestamp) @@ -140,7 +140,7 @@ ) # SETUP # - partitions = create_date_partition(timestamp) + partitions = create_date_hour_partition(timestamp, partition_date_only=True) filename = parse_timestamp_to_string(timestamp) From 66e84a1e2b2b24ead92842b604c2210238fb037b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 25 Sep 2023 22:22:31 +0000 Subject: [PATCH 003/145] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pipelines/rj_smtr/constants.py | 2 +- pipelines/rj_smtr/flows.py | 4 +++- pipelines/rj_smtr/tasks.py | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index b22c4a412..93303e5b7 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -180,7 +180,7 @@ class constants(Enum): # pylint: disable=c0103 ORDER BY data_processamento """, - "primary_key": ["id"] # id column to nest data on + "primary_key": ["id"], # id column to nest data on }, ] BILHETAGEM_TABLES_PARAMS = [ diff --git a/pipelines/rj_smtr/flows.py 
b/pipelines/rj_smtr/flows.py index bfe9d86e4..87d506813 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -64,7 +64,9 @@ dataset_id=dataset_id, ) - partitions = create_date_hour_partition(timestamp, partition_date_only=table_params["partition_date_only"]) + partitions = create_date_hour_partition( + timestamp, partition_date_only=table_params["partition_date_only"] + ) filename = parse_timestamp_to_string(timestamp) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index a2a5adddc..f35a9db72 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -158,7 +158,9 @@ def get_current_timestamp(timestamp=None, truncate_minute: bool = True) -> datet @task -def create_date_hour_partition(timestamp: datetime, partition_date_only: bool = False) -> str: +def create_date_hour_partition( + timestamp: datetime, partition_date_only: bool = False +) -> str: """ Get date hour Hive partition structure from timestamp. """ From 7cb436bc9d0fc7cf045ca56248ef58a63ed634e7 Mon Sep 17 00:00:00 2001 From: fernandascovino Date: Mon, 25 Sep 2023 19:29:50 -0300 Subject: [PATCH 004/145] corrige condicional --- pipelines/rj_smtr/tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index f35a9db72..e1a0d0c7d 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -165,8 +165,8 @@ def create_date_hour_partition( Get date hour Hive partition structure from timestamp. """ partition = f"data={timestamp.strftime('%Y-%m-%d')}" - if partition_date_only: - parition += f"/hora={timestamp.strftime('%H')}" + if not partition_date_only: + partition += f"/hora={timestamp.strftime('%H')}" return partition From 588fe7d3f3cc02500930d2bd94996152b51a5bce Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 26 Sep 2023 11:20:28 -0300 Subject: [PATCH 005/145] change capture flow --- pipelines/rj_smtr/constants.py | 1 + pipelines/rj_smtr/flows.py | 44 +++++++++----- pipelines/rj_smtr/tasks.py | 45 +++++++++++++++ pipelines/rj_smtr/utils.py | 101 +++++++++++++++++++++++++++++++++ pipelines/utils/custom.py | 10 ++-- pipelines/utils/utils.py | 15 ++++- 6 files changed, 196 insertions(+), 20 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 7133b8abe..34b63781a 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -182,6 +182,7 @@ class constants(Enum): # pylint: disable=c0103 """, "primary_key": ["id"], # id column to nest data on "flag_date_partition": False, + "source": "api", }, ] BILHETAGEM_TABLES_PARAMS = [ diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index f1d29ed10..e36c8e676 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -23,13 +23,13 @@ create_date_hour_partition, create_local_partition_path, get_current_timestamp, - get_raw, parse_timestamp_to_string, save_raw_local, save_treated_local, upload_logs_to_bq, bq_upload, transform_to_nested_structure, + get_raw, ) from pipelines.rj_smtr.tasks import ( @@ -37,6 +37,14 @@ get_datetime_range, ) +with Flow( + "SMTR: Pre-Treatment", + code_owners=["caio", "fernanda", "boris", "rodrigo"], +) as default_pre_treatment_flow: + # SETUP # + table_params = Parameter("table_params", default=None) + dataset_id = Parameter("dataset_id", default=None) + with Flow( "SMTR: Captura", @@ -59,13 +67,6 @@ now_time=timestamp, ) - request_params, request_url = create_request_params( - datetime_range=datetime_range, - 
table_params=table_params, - secret_path=secret_path, - dataset_id=dataset_id, - ) - with case(table_params["flag_date_partition"], True): date_partitions = create_date_partition(timestamp) @@ -83,11 +84,28 @@ partitions=partitions, ) - raw_status = get_raw( - url=request_url, - headers=secret_path, - params=request_params, - ) + raw_status_list = [] + + with case(table_params["source"], "api"): + request_params, request_url = create_request_params( + datetime_range=datetime_range, + table_params=table_params, + secret_path=secret_path, + dataset_id=dataset_id, + ) + + api_raw_status = get_raw( + url=request_url, + headers=secret_path, + params=request_params, + ) + + raw_status_list.append(api_raw_status) + + with case(table_params["source"], "gcs"): + pass + + raw_status = merge(*raw_status_list) raw_filepath = save_raw_local(status=raw_status, file_path=filepath) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index de52c03df..49c745076 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -28,6 +28,7 @@ get_last_run_timestamp, log_critical, data_info_str, + get_raw_data_api, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -960,3 +961,47 @@ def create_request_params( } return request_params, request_url + + +# @task(checkpoint=False) +# def get_raw_from_sources( +# source: str, +# url:str, +# dataset_id:str = None, +# table_id:str = None, +# mode:str = None, +# headers: str = None, +# filetype: str = "json", +# csv_args: dict = None, +# params: dict = None, +# ): +# if source == "api": +# return get_raw_data_api( +# url=url, +# headers=headers, +# filetype=filetype, +# csv_args=csv_args, +# params=params +# ) +# if source == "gcs": +# file = + + +@task(checkpoint=False) +def save_raw_storage( + dataset_id: str, + table_id: str, + raw_filepath: str, + partitions: str = None, +): + st_obj = Storage(table_id=table_id, dataset_id=dataset_id) + log( + f"""Uploading raw file to bucket {st_obj.bucket_name} at + {st_obj.bucket_name}/{dataset_id}/{table_id}""" + ) + st_obj.upload( + path=raw_filepath, + partitions=partitions, + mode="raw", + if_exists="replace", + ) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 9ddf7d687..3b3c7377d 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -14,6 +14,8 @@ from basedosdados import Table import pandas as pd import pytz +import requests +import zipfile from prefect.schedules.clocks import IntervalClock @@ -27,6 +29,8 @@ get_vault_secret, send_discord_message, get_redis_client, + get_storage_blobs, + get_storage_blob, ) @@ -445,3 +449,100 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l ) ) return clocks + + +def get_raw_data_api( # pylint: disable=R0912 + url: str, + headers: str = None, + filetype: str = "json", + csv_args: dict = None, + params: dict = None, +) -> list[dict]: + """ + Request data from URL API + + Args: + url (str): URL to send request + headers (str, optional): Path to headers guardeded on Vault, if needed. + filetype (str, optional): Filetype to be formatted (supported only: json, csv and txt) + csv_args (dict, optional): Arguments for read_csv, if needed + params (dict, optional): Params to be sent on request + + Returns: + dict: Conatining keys + * `data` (json): data result + * `error` (str): catched error, if any. 
Otherwise, returns None + """ + data = None + error = None + + try: + if headers is not None: + headers = get_vault_secret(headers)["data"] + + # remove from headers, if present + remove_headers = ["host", "databases"] + for remove_header in remove_headers: + if remove_header in list(headers.keys()): + del headers[remove_header] + + response = requests.get( + url, + headers=headers, + timeout=constants.MAX_TIMEOUT_SECONDS.value, + params=params, + ) + + if response.ok: # status code is less than 400 + if filetype == "json": + data = response.json() + + # todo: move to data check on specfic API # pylint: disable=W0102 + if isinstance(data, dict) and "DescricaoErro" in data.keys(): + error = data["DescricaoErro"] + + elif filetype in ("txt", "csv"): + if csv_args is None: + csv_args = {} + data = pd.read_csv(io.StringIO(response.text), **csv_args).to_dict( + orient="records" + ) + else: + error = ( + "Unsupported raw file extension. Supported only: json, csv and txt" + ) + + except Exception as exp: + error = exp + + if error is not None: + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return {"data": data, "error": error} + + +def get_raw_data_gcs( + dataset_id: str, table_id: str, file_name: str, mode: str, zip_file_name: str = None +) -> dict: + error = None + data = None + try: + if zip_file_name: + blob = get_storage_blob( + dataset_id=dataset_id, + table_id=table_id, + file_name=zip_file_name, + mode=mode, + ) + compressed_data = blob.download_as_bytes() + with zipfile.ZipFile(io.BytesIO(compressed_data), "r") as zipped_file: + data = zipped_file.read(file_name).decode(encoding="utf-8") + else: + blob = get_storage_blob( + dataset_id=dataset_id, table_id=table_id, file_name=file_name, mode=mode + ) + data = blob.download_as_string() + except Exception as exp: + error = exp + + return {"data": data, "error": error} diff --git a/pipelines/utils/custom.py b/pipelines/utils/custom.py index 13ae82dd5..d91739817 100644 --- a/pipelines/utils/custom.py +++ b/pipelines/utils/custom.py @@ -68,11 +68,11 @@ def __init__( # pylint: disable=too-many-arguments, too-many-locals edges=edges, reference_tasks=reference_tasks, state_handlers=state_handlers, - on_failure=partial( - notify_discord_on_failure, - secret_path=constants.EMD_DISCORD_WEBHOOK_SECRET_PATH.value, - code_owners=code_owners, - ), + # on_failure=partial( + # notify_discord_on_failure, + # secret_path=constants.EMD_DISCORD_WEBHOOK_SECRET_PATH.value, + # code_owners=code_owners, + # ), validate=validate, result=result, terminal_state_handler=terminal_state_handler, diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index efc21c133..7042709e9 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -711,7 +711,7 @@ def get_credentials_from_env( return cred -def get_storage_blobs(dataset_id: str, table_id: str) -> list: +def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> list: """ Get all blobs from a table in a dataset. 
""" @@ -720,7 +720,18 @@ def get_storage_blobs(dataset_id: str, table_id: str) -> list: return list( bd_storage.client["storage_staging"] .bucket(bd_storage.bucket_name) - .list_blobs(prefix=f"staging/{bd_storage.dataset_id}/{bd_storage.table_id}/") + .list_blobs(prefix=f"{mode}/{bd_storage.dataset_id}/{bd_storage.table_id}/") + ) + + +def get_storage_blob( + dataset_id: str, table_id: str, file_name: str, mode: str = "staging" +): + bd_storage = bd.Storage(dataset_id=dataset_id, table_id=table_id) + return ( + bd_storage.client["storage_staging"] + .bucket(bd_storage.bucket_name) + .get_blob(blob_name=f"{mode}/{dataset_id}/{table_id}/{file_name}") ) From 97746e1c34db7410a78a69e0b5ce4e7df4b12ad7 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 26 Sep 2023 15:04:09 -0300 Subject: [PATCH 006/145] change generic capture flow --- pipelines/rj_smtr/constants.py | 39 +++++++++------ pipelines/rj_smtr/flows.py | 72 +++++++++++++-------------- pipelines/rj_smtr/tasks.py | 89 ++++++++++++++++++---------------- pipelines/rj_smtr/utils.py | 52 ++++++++------------ pipelines/utils/utils.py | 15 +++++- 5 files changed, 135 insertions(+), 132 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 34b63781a..caa4a5e23 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -167,23 +167,30 @@ class constants(Enum): # pylint: disable=c0103 BILHETAGEM_DATASET_ID = "br_rj_riodejaneiro_bilhetagem" BILHETAGEM_TRANSACAO_TABLE_PARAMS = [ { - "table_id": "transacao", - "database": "transacao_db", - "query": """ - SELECT - * - FROM - transacao - WHERE - data_processamento BETWEEN '{start}' - AND '{end}' - ORDER BY - data_processamento - """, - "primary_key": ["id"], # id column to nest data on "flag_date_partition": False, - "source": "api", - }, + "flow_run_name": "transacao", + "extraction": { + "table_id": "transacao", + "database": "transacao_db", + "query": """ + SELECT + * + FROM + transacao + WHERE + data_processamento BETWEEN '{start}' + AND '{end}' + ORDER BY + data_processamento + """, + "source": "api", + }, + "pre-treatment": { + "table_id": "transacao", + "file_type": "json", + "primary_key": ["id"], # id column to nest data on + }, + } ] BILHETAGEM_TABLES_PARAMS = [ { diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index e36c8e676..8076633c8 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -7,6 +7,7 @@ from prefect.storage import GCS from prefect import case, Parameter from prefect.tasks.control_flow import merge +from prefect.utilities.collections import DotDict # EMD Imports # @@ -29,22 +30,12 @@ upload_logs_to_bq, bq_upload, transform_to_nested_structure, - get_raw, -) - -from pipelines.rj_smtr.tasks import ( + get_raw_from_sources, + transform_data_to_json, create_request_params, get_datetime_range, ) -with Flow( - "SMTR: Pre-Treatment", - code_owners=["caio", "fernanda", "boris", "rodrigo"], -) as default_pre_treatment_flow: - # SETUP # - table_params = Parameter("table_params", default=None) - dataset_id = Parameter("dataset_id", default=None) - with Flow( "SMTR: Captura", @@ -63,7 +54,7 @@ datetime_range = get_datetime_range(timestamp, interval=interval) rename_flow_run = rename_current_flow_run_now_time( - prefix=default_capture_flow.name + " " + table_params["table_id"] + ": ", + prefix=default_capture_flow.name + " " + table_params["flow_run_name"] + ": ", now_time=timestamp, ) @@ -79,41 +70,44 @@ filepath = create_local_partition_path( dataset_id=dataset_id, - 
table_id=table_params["table_id"], + table_id=table_params["pre-treatment"]["table_id"], filename=filename, partitions=partitions, ) - raw_status_list = [] - - with case(table_params["source"], "api"): - request_params, request_url = create_request_params( - datetime_range=datetime_range, - table_params=table_params, - secret_path=secret_path, - dataset_id=dataset_id, - ) - - api_raw_status = get_raw( - url=request_url, - headers=secret_path, - params=request_params, - ) - - raw_status_list.append(api_raw_status) - - with case(table_params["source"], "gcs"): - pass + # CAPTURA + request_params, request_url = create_request_params( + datetime_range=datetime_range, + table_params=table_params, + secret_path=secret_path, + dataset_id=dataset_id, + ) - raw_status = merge(*raw_status_list) + raw_status = get_raw_from_sources( + source=table_params["extraction"]["source"], + url=request_url, + dataset_id=dataset_id, + table_id=table_params["extraction"]["table_id"], + file_name=table_params["extraction"]["file_name"], + zip_file_name=table_params["extraction"]["zip_file_name"], + mode=table_params["extraction"]["mode"], + headers=secret_path, + params=request_params, + ) raw_filepath = save_raw_local(status=raw_status, file_path=filepath) # TREAT & CLEAN # - treated_status = transform_to_nested_structure( + json_status = transform_data_to_json( status=raw_status, + file_type=table_params["pre-treatment"]["file_type"], + csv_args=table_params["pre-treatment"]["csv_args"], + ) + + treated_status = transform_to_nested_structure( + status=json_status, timestamp=timestamp, - primary_key=table_params["primary_key"], + primary_key=table_params["pre-treatment"]["primary_key"], ) treated_filepath = save_treated_local(status=treated_status, file_path=filepath) @@ -121,7 +115,7 @@ # LOAD # error = bq_upload( dataset_id=dataset_id, - table_id=table_params["table_id"], + table_id=table_params["pre-treatment"]["table_id"], filepath=treated_filepath, raw_filepath=raw_filepath, partitions=partitions, @@ -130,7 +124,7 @@ upload_logs_to_bq( dataset_id=dataset_id, - parent_table_id=table_params["table_id"], + parent_table_id=table_params["pre-treatment"]["table_id"], error=error, timestamp=timestamp, ) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 49c745076..1b9545ca8 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -29,6 +29,7 @@ log_critical, data_info_str, get_raw_data_api, + get_raw_data_gcs, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -950,58 +951,62 @@ def create_request_params( if dataset_id == constants.BILHETAGEM_DATASET_ID.value: secrets = get_vault_secret(secret_path)["data"] - database_secrets = secrets["databases"][table_params["database"]] + database_secrets = secrets["databases"][table_params["extraction"]["database"]] request_url = secrets["vpn_url"] + database_secrets["engine"] request_params = { "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado - "database": table_params["database"], - "query": table_params["query"].format(**datetime_range), + "database": table_params["extraction"]["database"], + "query": table_params["extraction"]["query"].format(**datetime_range), } return request_params, request_url -# @task(checkpoint=False) -# def get_raw_from_sources( -# source: str, -# url:str, -# dataset_id:str = None, -# table_id:str = None, -# mode:str = None, -# headers: str = None, -# filetype: str = "json", -# csv_args: 
dict = None, -# params: dict = None, -# ): -# if source == "api": -# return get_raw_data_api( -# url=url, -# headers=headers, -# filetype=filetype, -# csv_args=csv_args, -# params=params -# ) -# if source == "gcs": -# file = - - @task(checkpoint=False) -def save_raw_storage( - dataset_id: str, - table_id: str, - raw_filepath: str, +def get_raw_from_sources( + source: str, + url: str, + dataset_id: str = None, + table_id: str = None, + file_name: str = None, partitions: str = None, + zip_file_name: str = None, + mode: str = None, + headers: str = None, + params: dict = None, ): - st_obj = Storage(table_id=table_id, dataset_id=dataset_id) - log( - f"""Uploading raw file to bucket {st_obj.bucket_name} at - {st_obj.bucket_name}/{dataset_id}/{table_id}""" - ) - st_obj.upload( - path=raw_filepath, - partitions=partitions, - mode="raw", - if_exists="replace", - ) + if source == "api": + return get_raw_data_api(url=url, headers=headers, params=params) + if source == "gcs": + return get_raw_data_gcs( + dataset_id=dataset_id, + table_id=table_id, + file_name=file_name, + mode=mode, + partitions=partitions, + zip_file_name=zip_file_name, + ) + + +@task(checkpoint=False) +def transform_data_to_json(status: dict, file_type: str, csv_args: dict): + data = status["data"] + error = status["error"] + + if file_type == "json": + pass + + # todo: move to data check on specfic API # pylint: disable=W0102 + # if isinstance(data, dict) and "DescricaoErro" in data.keys(): + # error = data["DescricaoErro"] + + elif file_type in ("txt", "csv"): + if csv_args is None: + csv_args = {} + data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") + else: + error = "Unsupported raw file extension. Supported only: json, csv and txt" + + return {"data": data, "error": error} diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 3b3c7377d..c7b13bfc3 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -454,8 +454,6 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l def get_raw_data_api( # pylint: disable=R0912 url: str, headers: str = None, - filetype: str = "json", - csv_args: dict = None, params: dict = None, ) -> list[dict]: """ @@ -464,8 +462,6 @@ def get_raw_data_api( # pylint: disable=R0912 Args: url (str): URL to send request headers (str, optional): Path to headers guardeded on Vault, if needed. - filetype (str, optional): Filetype to be formatted (supported only: json, csv and txt) - csv_args (dict, optional): Arguments for read_csv, if needed params (dict, optional): Params to be sent on request Returns: @@ -493,24 +489,9 @@ def get_raw_data_api( # pylint: disable=R0912 params=params, ) - if response.ok: # status code is less than 400 - if filetype == "json": - data = response.json() + response.raise_for_status() - # todo: move to data check on specfic API # pylint: disable=W0102 - if isinstance(data, dict) and "DescricaoErro" in data.keys(): - error = data["DescricaoErro"] - - elif filetype in ("txt", "csv"): - if csv_args is None: - csv_args = {} - data = pd.read_csv(io.StringIO(response.text), **csv_args).to_dict( - orient="records" - ) - else: - error = ( - "Unsupported raw file extension. 
Supported only: json, csv and txt" - ) + data = response.text except Exception as exp: error = exp @@ -522,25 +503,30 @@ def get_raw_data_api( # pylint: disable=R0912 def get_raw_data_gcs( - dataset_id: str, table_id: str, file_name: str, mode: str, zip_file_name: str = None + dataset_id: str, + table_id: str, + file_name: str, + mode: str, + partitions: str = None, + zip_extracted_file: str = None, ) -> dict: error = None data = None try: - if zip_file_name: - blob = get_storage_blob( - dataset_id=dataset_id, - table_id=table_id, - file_name=zip_file_name, - mode=mode, - ) + blob = get_storage_blob( + dataset_id=dataset_id, + table_id=table_id, + file_name=file_name, + partitions=partitions, + mode=mode, + ) + + if zip_extracted_file: compressed_data = blob.download_as_bytes() + with zipfile.ZipFile(io.BytesIO(compressed_data), "r") as zipped_file: - data = zipped_file.read(file_name).decode(encoding="utf-8") + data = zipped_file.read(zip_extracted_file).decode(encoding="utf-8") else: - blob = get_storage_blob( - dataset_id=dataset_id, table_id=table_id, file_name=file_name, mode=mode - ) data = blob.download_as_string() except Exception as exp: error = exp diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index 7042709e9..79a264017 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -725,13 +725,24 @@ def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> def get_storage_blob( - dataset_id: str, table_id: str, file_name: str, mode: str = "staging" + dataset_id: str, + table_id: str, + file_name: str, + partitions: str = None, + mode: str = "staging", ): + path = f"{mode}/{dataset_id}/{table_id}/" + + if partitions: + path += f"{partitions}/" + + path += file_name + bd_storage = bd.Storage(dataset_id=dataset_id, table_id=table_id) return ( bd_storage.client["storage_staging"] .bucket(bd_storage.bucket_name) - .get_blob(blob_name=f"{mode}/{dataset_id}/{table_id}/{file_name}") + .get_blob(blob_name=path) ) From 6f12477d14e45a2bb83c817976a597282625a66b Mon Sep 17 00:00:00 2001 From: fernandascovino Date: Tue, 26 Sep 2023 17:18:56 -0300 Subject: [PATCH 007/145] atualiza esquema do flow padrao --- pipelines/rj_smtr/constants.py | 3 + pipelines/rj_smtr/flows.py | 121 +++++++++---------- pipelines/rj_smtr/tasks.py | 206 ++++++++++++++++++++++----------- pipelines/rj_smtr/utils.py | 163 +++++++++++++++++++++----- pipelines/utils/utils.py | 20 +--- 5 files changed, 337 insertions(+), 176 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 3b1b6dc8d..d402bb6e9 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -262,3 +262,6 @@ class constants(Enum): # pylint: disable=c0103 }, ] BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" + + # GTFS + diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index da802d277..fb763cc5a 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -22,15 +22,17 @@ create_local_partition_path, get_current_timestamp, parse_timestamp_to_string, - save_raw_local, - save_treated_local, - upload_logs_to_bq, - bq_upload, - transform_to_nested_structure, + # save_raw_local, + # save_treated_local, + # upload_logs_to_bq, + # bq_upload, + upload_raw_data_to_gcs, + upload_staging_data_to_gcs, + transform_raw_to_nested_structure, get_raw_from_sources, - transform_data_to_json, + # transform_data_to_json, create_request_params, - get_datetime_range, + # get_datetime_range, ) @@ -38,96 +40,87 @@ "SMTR: Captura", 
code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as default_capture_flow: - # SETUP # + + ### Configuração ### - table_params = Parameter("table_params", default=None) - timestamp_param = Parameter("timestamp", default=None) - interval = Parameter("interval", default=None) + table_id = Parameter("table_id", default=None) + partition_date_only = Parameter("partition_date_only", default=None) + request_params = Parameter("request_params", default=None) dataset_id = Parameter("dataset_id", default=None) secret_path = Parameter("secret_path", default=None) + primary_key = Parameter("primary_key", default=None) + source_type = Parameter("source_type", default=None) - timestamp = get_current_timestamp(timestamp_param) - - datetime_range = get_datetime_range(timestamp, interval=interval) + timestamp = get_current_timestamp() rename_flow_run = rename_current_flow_run_now_time( - prefix=default_capture_flow.name + " " + table_params["flow_run_name"] + ": ", + prefix=default_capture_flow.name + " " + table_id + ": ", now_time=timestamp, ) - request_params, request_url = create_request_params( - datetime_range=datetime_range, - table_params=table_params, - secret_path=secret_path, - dataset_id=dataset_id, - ) - partitions = create_date_hour_partition( - timestamp, partition_date_only=table_params["partition_date_only"] + timestamp, partition_date_only=partition_date_only ) filename = parse_timestamp_to_string(timestamp) filepath = create_local_partition_path( dataset_id=dataset_id, - table_id=table_params["pre-treatment"]["table_id"], + table_id=table_id, filename=filename, partitions=partitions, ) - # CAPTURA - request_params, request_url = create_request_params( - datetime_range=datetime_range, - table_params=table_params, + ### Extração ### + # é necessária task ou função dentro da extract_raw_data? + request_params, request_path = create_request_params( secret_path=secret_path, dataset_id=dataset_id, ) - raw_status = get_raw_from_sources( - source=table_params["extraction"]["source"], - url=request_url, - dataset_id=dataset_id, - table_id=table_params["extraction"]["table_id"], - file_name=table_params["extraction"]["file_name"], - zip_file_name=table_params["extraction"]["zip_file_name"], - mode=table_params["extraction"]["mode"], - headers=secret_path, - params=request_params, + error, raw_filepath = get_raw_from_sources( + source_type=source_type, # parametro de extracao, onde ficar? 
+ source_path=request_path, + zip_filename=table_id, + secret_path=secret_path, + request_params=request_params, ) - raw_filepath = save_raw_local(status=raw_status, file_path=filepath) - - # TREAT & CLEAN # - json_status = transform_data_to_json( - status=raw_status, - file_type=table_params["pre-treatment"]["file_type"], - csv_args=table_params["pre-treatment"]["csv_args"], + RAW_UPLOADED = upload_raw_data_to_gcs( + error=error, + filepath=raw_filepath, + timestamp=timestamp, + partitions=partitions ) - treated_status = transform_to_nested_structure( - status=json_status, + ### Pré-tratamento ### + + error, staging_filepath = transform_raw_to_nested_structure( + raw_filepath=raw_filepath, timestamp=timestamp, - primary_key=table_params["pre-treatment"]["primary_key"], + primary_key=primary_key, ) - treated_filepath = save_treated_local(status=treated_status, file_path=filepath) + STAGING_UPLOADED = upload_staging_data_to_gcs(error=error, filepath=staging_filepath, timestamp=timestamp) - # LOAD # - error = bq_upload( - dataset_id=dataset_id, - table_id=table_params["pre-treatment"]["table_id"], - filepath=treated_filepath, - raw_filepath=raw_filepath, - partitions=partitions, - status=treated_status, - ) + # treated_filepath = save_treated_local(status=treated_status, file_path=filepath) - upload_logs_to_bq( - dataset_id=dataset_id, - parent_table_id=table_params["pre-treatment"]["table_id"], - error=error, - timestamp=timestamp, - ) + # LOAD # + # error = bq_upload( + # dataset_id=dataset_id, + # table_id=table_params["pre-treatment"]["table_id"], + # filepath=treated_filepath, + # raw_filepath=raw_filepath, + # partitions=partitions, + # status=treated_status, + # ) + + # upload_logs_to_bq( + # dataset_id=dataset_id, + # parent_table_id=table_params["pre-treatment"]["table_id"], + # error=error, + # timestamp=timestamp, + # ) default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_capture_flow.run_config = KubernetesRun( diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index bf0aec407..b7f484171 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -30,6 +30,7 @@ data_info_str, get_raw_data_api, get_raw_data_gcs, + upload_run_logs_to_bq ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -601,6 +602,69 @@ def upload_logs_to_bq( # pylint: disable=R0913 raise Exception(f"Pipeline failed with error: {error}") +@task +def upload_raw_data_to_gcs( + error: bool, raw_filepath: str, timestamp: datetime, table_id: str, dataset_id: str, partitions: list +): + if not error: + try: + st_obj = Storage(table_id=table_id, dataset_id=dataset_id) + log( + f"""Uploading raw file to bucket {st_obj.bucket_name} at + {st_obj.bucket_name}/{dataset_id}/{table_id}""" + ) + st_obj.upload( + path=raw_filepath, + partitions=partitions, + mode="raw", + if_exists="replace", + ) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + upload_run_logs_to_bq( + dataset_id=dataset_id, + parent_table_id=table_id, + error=error, + timestamp=timestamp, + mode="raw" + ) + + +@task +def upload_staging_data_to_gcs( + error: bool, staging_filepath: str, timestamp: datetime, table_id: str, dataset_id: str, partitions: list +): + if not error: + try: + # Creates and publish table if it does not exist, append to it otherwise + create_or_append_table( + dataset_id=dataset_id, + table_id=table_id, + 
path=staging_filepath, + partitions=partitions + ) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + upload_run_logs_to_bq( + dataset_id=dataset_id, + parent_table_id=table_id, + error=error, + timestamp=timestamp, + mode="staging" + ) + + +############### +# +# Daterange tasks +# +############### + + @task( checkpoint=False, max_retries=constants.MAX_RETRIES.value, @@ -791,9 +855,16 @@ def get_previous_date(days): return now.to_date_string() +############### +# +# Pretreat data +# +############### + + @task -def transform_to_nested_structure( - status: dict, timestamp: datetime, primary_key: list = None +def transform_raw_to_nested_structure( + filepath: str, error: bool, timestamp: datetime, primary_key: list = None ): """Transform dataframe to nested structure @@ -810,21 +881,29 @@ def transform_to_nested_structure( * `error` (str): catched error, if any. Otherwise, returns None """ + # ORGANIZAR: + # json_status = transform_data_to_json( + # status=raw_status, + # file_type=table_params["pre-treatment"]["file_type"], + # csv_args=table_params["pre-treatment"]["csv_args"], + # ) + # Check previous error - if status["error"] is not None: - return {"data": pd.DataFrame(), "error": status["error"]} + if error is not None: + return {"data": pd.DataFrame(), "error": error} # Check empty dataframe - if len(status["data"]) == 0: - log("Empty dataframe, skipping transformation...") - return {"data": pd.DataFrame(), "error": status["error"]} + # if len(status["data"]) == 0: + # log("Empty dataframe, skipping transformation...") + # return {"data": pd.DataFrame(), "error": error} try: if primary_key is None: primary_key = [] error = None - data = pd.DataFrame(status["data"]) + # leitura do dado raw + # data = pd.DataFrame(status["data"]) log( f""" @@ -860,40 +939,43 @@ def transform_to_nested_structure( level="info", ) + # save treated local + filepath = _save_trated_local(data=data, filepath=filepath) + except Exception as exp: # pylint: disable=W0703 error = exp if error is not None: log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return {"data": data, "error": error} + return error, filepath -@task(checkpoint=False) -def get_datetime_range( - timestamp: datetime, - interval: int, -) -> dict: - """ - Task to get datetime range in UTC +# @task(checkpoint=False) +# def get_datetime_range( +# timestamp: datetime, +# interval: int, +# ) -> dict: +# """ +# Task to get datetime range in UTC - Args: - timestamp (datetime): timestamp to get datetime range - interval (int): interval in seconds +# Args: +# timestamp (datetime): timestamp to get datetime range +# interval (int): interval in seconds - Returns: - dict: datetime range - """ +# Returns: +# dict: datetime range +# """ - start = ( - (timestamp - timedelta(seconds=interval)) - .astimezone(tz=timezone("UTC")) - .strftime("%Y-%m-%d %H:%M:%S") - ) +# start = ( +# (timestamp - timedelta(seconds=interval)) +# .astimezone(tz=timezone("UTC")) +# .strftime("%Y-%m-%d %H:%M:%S") +# ) - end = timestamp.astimezone(tz=timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") +# end = timestamp.astimezone(tz=timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") - return {"start": start, "end": end} +# return {"start": start, "end": end} @task(checkpoint=False, nout=2) @@ -916,11 +998,8 @@ def create_request_params( if dataset_id == constants.BILHETAGEM_DATASET_ID.value: secrets = get_vault_secret(secret_path)["data"] - database_secrets = 
secrets["databases"][table_params["extraction"]["database"]] - request_url = secrets["vpn_url"] + database_secrets["engine"] - request_params = { "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado "database": table_params["extraction"]["database"], @@ -932,47 +1011,40 @@ def create_request_params( @task(checkpoint=False) def get_raw_from_sources( - source: str, - url: str, - dataset_id: str = None, - table_id: str = None, - file_name: str = None, - partitions: str = None, - zip_file_name: str = None, - mode: str = None, - headers: str = None, - params: dict = None, + source_type: str, + source_path: str = None, + zip_filename: str = None, + secret_path: str = None, + api_params: dict = None, ): - if source == "api": - return get_raw_data_api(url=url, headers=headers, params=params) - if source == "gcs": + if source_type == "api": + return get_raw_data_api(url=source_path, secret_path=secret_path, params=api_params) + if source_type == "gcs": return get_raw_data_gcs( - dataset_id=dataset_id, - table_id=table_id, - file_name=file_name, - mode=mode, - partitions=partitions, - zip_file_name=zip_file_name, + gcs_path=source_path, + mode="raw", + zip_filename=zip_filename, ) -@task(checkpoint=False) -def transform_data_to_json(status: dict, file_type: str, csv_args: dict): - data = status["data"] - error = status["error"] +# TODO: passar para função para dentro da transform_raw_to_nested_structure +# @task(checkpoint=False) +# def transform_data_to_json(status: dict, file_type: str, csv_args: dict): +# data = status["data"] +# error = status["error"] - if file_type == "json": - pass +# if file_type == "json": +# pass - # todo: move to data check on specfic API # pylint: disable=W0102 - # if isinstance(data, dict) and "DescricaoErro" in data.keys(): - # error = data["DescricaoErro"] +# # todo: move to data check on specfic API # pylint: disable=W0102 +# # if isinstance(data, dict) and "DescricaoErro" in data.keys(): +# # error = data["DescricaoErro"] - elif file_type in ("txt", "csv"): - if csv_args is None: - csv_args = {} - data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") - else: - error = "Unsupported raw file extension. Supported only: json, csv and txt" +# elif file_type in ("txt", "csv"): +# if csv_args is None: +# csv_args = {} +# data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") +# else: +# error = "Unsupported raw file extension. Supported only: json, csv and txt" - return {"data": data, "error": error} +# return {"data": data, "error": error} diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index c7b13bfc3..a4376bb88 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -451,17 +451,47 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l return clocks +def _save_raw_local(data: dict, file_path: str, mode: str = "raw", filetype: str = "json") -> str: + """ + Saves json response from API to .json file. + Args: + file_path (str): Path which to save raw file + status (dict): Must contain keys + * data: json returned from API + * error: error catched from API request + mode (str, optional): Folder to save locally, later folder which to upload to GCS. 
+ Returns: + str: Path to the saved file + """ + + # diferentes tipos de arquivos para salvar + _file_path = file_path.format(mode=mode, filetype=filetype) + Path(_file_path).parent.mkdir(parents=True, exist_ok=True) + + if filetype == "json": + json.dump(data, Path(_file_path).open("w", encoding="utf-8")) + + if filetype == "csv": + pass + if filetype == "txt": + pass + + log(f"Raw data saved to: {_file_path}") + return _file_path + + def get_raw_data_api( # pylint: disable=R0912 url: str, - headers: str = None, - params: dict = None, + secret_path: str = None, + api_params: dict = None, + filepath: str = None ) -> list[dict]: """ Request data from URL API Args: url (str): URL to send request - headers (str, optional): Path to headers guardeded on Vault, if needed. + secret_path (str, optional): Path to secrets guardeded on Vault, if needed. params (dict, optional): Params to be sent on request Returns: @@ -469,58 +499,45 @@ def get_raw_data_api( # pylint: disable=R0912 * `data` (json): data result * `error` (str): catched error, if any. Otherwise, returns None """ - data = None error = None - try: - if headers is not None: - headers = get_vault_secret(headers)["data"] - - # remove from headers, if present - remove_headers = ["host", "databases"] - for remove_header in remove_headers: - if remove_header in list(headers.keys()): - del headers[remove_header] + if secret_path is None: + headers = secret_path + else: + headers = get_vault_secret(secret_path)["data"] response = requests.get( url, headers=headers, timeout=constants.MAX_TIMEOUT_SECONDS.value, - params=params, + params=api_params, ) response.raise_for_status() - - data = response.text + filepath = _save_raw_local(data=response.text, filepath=filepath) except Exception as exp: error = exp - - if error is not None: log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return {"data": data, "error": error} + return error, filepath def get_raw_data_gcs( - dataset_id: str, - table_id: str, - file_name: str, - mode: str, - partitions: str = None, + gcs_path: str, zip_extracted_file: str = None, ) -> dict: + error = None - data = None + try: blob = get_storage_blob( - dataset_id=dataset_id, - table_id=table_id, - file_name=file_name, - partitions=partitions, - mode=mode, + gcs_path=gcs_path, + mode="raw", ) + data = blob.download_as_bytes() + if zip_extracted_file: compressed_data = blob.download_as_bytes() @@ -528,7 +545,93 @@ def get_raw_data_gcs( data = zipped_file.read(zip_extracted_file).decode(encoding="utf-8") else: data = blob.download_as_string() + except Exception as exp: error = exp return {"data": data, "error": error} + + +def _save_treated_local(file_path: str, status: dict, mode: str = "staging") -> str: + """ + Save treated file to CSV. + + Args: + file_path (str): Path which to save treated file + status (dict): Must contain keys + * `data`: dataframe returned from treatement + * `error`: error catched from data treatement + mode (str, optional): Folder to save locally, later folder which to upload to GCS. 
+ + Returns: + str: Path to the saved file + """ + _file_path = file_path.format(mode=mode, filetype="csv") + Path(_file_path).parent.mkdir(parents=True, exist_ok=True) + if status["error"] is None: + status["data"].to_csv(_file_path, index=False) + log(f"Treated data saved to: {_file_path}") + return _file_path + + +def upload_run_logs_to_bq( # pylint: disable=R0913 + dataset_id: str, + parent_table_id: str, + timestamp: str, + error: str = None, + previous_error: str = None, + recapture: bool = False, + mode: str = "raw" +): + """ + Upload execution status table to BigQuery. + Table is uploaded to the same dataset, named {parent_table_id}_logs. + If passing status_dict, should not pass timestamp and error. + + Args: + dataset_id (str): dataset_id on BigQuery + parent_table_id (str): Parent table id related to the status table + timestamp (str): ISO formatted timestamp string + error (str, optional): String associated with error caught during execution + Returns: + None + """ + table_id = parent_table_id + "_logs" + # Create partition directory + filename = f"{table_id}_{timestamp.isoformat()}" + partition = f"data={timestamp.date()}" + filepath = Path( + f"""data/{mode}/{dataset_id}/{table_id}/{partition}/{filename}.csv""" + ) + filepath.parent.mkdir(exist_ok=True, parents=True) + # Create dataframe to be uploaded + if not error and recapture is True: + # if the recapture is succeeded, update the column erro + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [True], + "erro": [f"[recapturado]{previous_error}"], + } + ) + log(f"Recapturing {timestamp} with previous error:\n{error}") + else: + # not recapturing or error during flow execution + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [error is None], + "erro": [error], + } + ) + # Save data local + dataframe.to_csv(filepath, index=False) + # Upload to Storage + create_or_append_table( + dataset_id=dataset_id, + table_id=table_id, + path=filepath.as_posix(), + partitions=partition, + ) + if error is not None: + raise Exception(f"Pipeline failed with error: {error}") \ No newline at end of file diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index 79a264017..147e54f4f 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -725,24 +725,14 @@ def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> def get_storage_blob( - dataset_id: str, - table_id: str, - file_name: str, - partitions: str = None, + gcs_path: str, mode: str = "staging", ): - path = f"{mode}/{dataset_id}/{table_id}/" - - if partitions: - path += f"{partitions}/" - - path += file_name - - bd_storage = bd.Storage(dataset_id=dataset_id, table_id=table_id) + bucket = bd.Storage() return ( - bd_storage.client["storage_staging"] - .bucket(bd_storage.bucket_name) - .get_blob(blob_name=path) + bucket.client["storage_staging"] + .bucket(bucket.bucket_name) + .get_blob(blob_name=gcs_path) ) From 0c3df1b05e8a257a20d9367cb282050a1df74cb9 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 26 Sep 2023 22:41:01 -0300 Subject: [PATCH 008/145] change default capture flow structure --- pipelines/rj_smtr/constants.py | 12 ++++- pipelines/rj_smtr/tasks.py | 87 ++++++++++++++++++++++------------ pipelines/rj_smtr/utils.py | 55 ++++++++++++--------- 3 files changed, 102 insertions(+), 52 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index d402bb6e9..00558f9cc 100644 --- a/pipelines/rj_smtr/constants.py +++ 
b/pipelines/rj_smtr/constants.py @@ -264,4 +264,14 @@ class constants(Enum): # pylint: disable=c0103 BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" # GTFS - + GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" + + GTFS_SOURCE_TYPE = "gcs" + + GTFS_AGENCY_REQUEST_PARAMS = { + "filepath": "development/br_rj_riodejaneiro_gtfs/upload/gtfs.zip" + } + + GTFS_AGENCY_TABLE_ID = "agency" + + GTFS_QUADRO_TABLE_ID = "quadro" diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index b7f484171..0a40dae26 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -30,7 +30,7 @@ data_info_str, get_raw_data_api, get_raw_data_gcs, - upload_run_logs_to_bq + upload_run_logs_to_bq, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -604,7 +604,12 @@ def upload_logs_to_bq( # pylint: disable=R0913 @task def upload_raw_data_to_gcs( - error: bool, raw_filepath: str, timestamp: datetime, table_id: str, dataset_id: str, partitions: list + error: bool, + raw_filepath: str, + timestamp: datetime, + table_id: str, + dataset_id: str, + partitions: list, ): if not error: try: @@ -622,19 +627,24 @@ def upload_raw_data_to_gcs( except Exception: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - + upload_run_logs_to_bq( dataset_id=dataset_id, parent_table_id=table_id, error=error, timestamp=timestamp, - mode="raw" + mode="raw", ) @task def upload_staging_data_to_gcs( - error: bool, staging_filepath: str, timestamp: datetime, table_id: str, dataset_id: str, partitions: list + error: bool, + staging_filepath: str, + timestamp: datetime, + table_id: str, + dataset_id: str, + partitions: list, ): if not error: try: @@ -643,20 +653,20 @@ def upload_staging_data_to_gcs( dataset_id=dataset_id, table_id=table_id, path=staging_filepath, - partitions=partitions + partitions=partitions, ) except Exception: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - + upload_run_logs_to_bq( dataset_id=dataset_id, parent_table_id=table_id, error=error, timestamp=timestamp, - mode="staging" + mode="staging", ) - + ############### # @@ -904,7 +914,7 @@ def transform_raw_to_nested_structure( error = None # leitura do dado raw # data = pd.DataFrame(status["data"]) - + data = None log( f""" Received inputs: @@ -940,7 +950,7 @@ def transform_raw_to_nested_structure( ) # save treated local - filepath = _save_trated_local(data=data, filepath=filepath) + # filepath = _save_trated_local(data=data, filepath=filepath) except Exception as exp: # pylint: disable=W0703 error = exp @@ -980,7 +990,11 @@ def transform_raw_to_nested_structure( @task(checkpoint=False, nout=2) def create_request_params( - datetime_range: dict, table_params: dict, secret_path: str, dataset_id: str + # datetime_range: dict, + # table_params: dict, + table_id: str, + secret_path: str, + dataset_id: str, ) -> tuple: """ Task to create request params @@ -995,16 +1009,28 @@ def create_request_params( request_params: host, database and query to request data request_url: url to request data """ - + request_params = None # TODO: retirar essa linha if dataset_id == constants.BILHETAGEM_DATASET_ID.value: secrets = get_vault_secret(secret_path)["data"] - database_secrets = secrets["databases"][table_params["extraction"]["database"]] - request_url = secrets["vpn_url"] + database_secrets["engine"] - request_params = { - "host": database_secrets["host"], # TODO: exibir no log em 
ambiente fechado - "database": table_params["extraction"]["database"], - "query": table_params["extraction"]["query"].format(**datetime_range), - } + + # TODO: RETIRAR ESSA LINHA + request_params = secrets + + # TODO: mudar modo de pegar os parametros + # database_secrets = secrets["databases"][table_params["extraction"]["database"]] + # request_url = secrets["vpn_url"] + database_secrets["engine"] + # request_params = { + # "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado + # "database": table_params["extraction"]["database"], + # "query": table_params["extraction"]["query"].format(**datetime_range), + # } + + elif dataset_id == constants.GTFS_DATASET_ID.value: + gtfs_base_path = "development/br_rj_riodejaneiro_gtfs/upload" + if table_id == constants.GTFS_QUADRO_ID.value: + request_url = f"{gtfs_base_path}/quadro.csv" + else: + request_url = f"{gtfs_base_path}/gtfs.zip" return request_params, request_url @@ -1013,18 +1039,21 @@ def create_request_params( def get_raw_from_sources( source_type: str, source_path: str = None, - zip_filename: str = None, + table_id: str = None, secret_path: str = None, api_params: dict = None, ): - if source_type == "api": - return get_raw_data_api(url=source_path, secret_path=secret_path, params=api_params) - if source_type == "gcs": - return get_raw_data_gcs( - gcs_path=source_path, - mode="raw", - zip_filename=zip_filename, - ) + pass + # TODO: descomentar linhas abaixo, passando argumentos corretos + # if source_type == "api": + # return get_raw_data_api( + # url=source_path, secret_path=secret_path, params=api_params + # ) + # if source_type == "gcs": + # return get_raw_data_gcs( + # gcs_path=source_path, + # filename_to_unzip=table_id, + # ) # TODO: passar para função para dentro da transform_raw_to_nested_structure diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index a4376bb88..68774c17d 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -10,12 +10,14 @@ from datetime import timedelta, datetime from typing import List import io -import basedosdados as bd -from basedosdados import Table -import pandas as pd +import json import pytz import requests import zipfile +import basedosdados as bd +from basedosdados import Table +import pandas as pd + from prefect.schedules.clocks import IntervalClock @@ -451,7 +453,9 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l return clocks -def _save_raw_local(data: dict, file_path: str, mode: str = "raw", filetype: str = "json") -> str: +def _save_raw_local( + data: dict, file_path: str, mode: str = "raw", filetype: str = "json" +) -> str: """ Saves json response from API to .json file. 
Args: @@ -471,20 +475,18 @@ def _save_raw_local(data: dict, file_path: str, mode: str = "raw", filetype: str if filetype == "json": json.dump(data, Path(_file_path).open("w", encoding="utf-8")) - if filetype == "csv": - pass + # if filetype == "csv": + # pass if filetype == "txt": - pass + with open(_file_path, "w", encoding="utf-8") as file: + file.write(data) log(f"Raw data saved to: {_file_path}") return _file_path def get_raw_data_api( # pylint: disable=R0912 - url: str, - secret_path: str = None, - api_params: dict = None, - filepath: str = None + url: str, secret_path: str = None, api_params: dict = None, filepath: str = None ) -> list[dict]: """ Request data from URL API @@ -525,9 +527,9 @@ def get_raw_data_api( # pylint: disable=R0912 def get_raw_data_gcs( gcs_path: str, - zip_extracted_file: str = None, + local_filepath: str, + filename_to_unzip: str = None, ) -> dict: - error = None try: @@ -538,18 +540,27 @@ def get_raw_data_gcs( data = blob.download_as_bytes() - if zip_extracted_file: - compressed_data = blob.download_as_bytes() - - with zipfile.ZipFile(io.BytesIO(compressed_data), "r") as zipped_file: - data = zipped_file.read(zip_extracted_file).decode(encoding="utf-8") + if filename_to_unzip: + with zipfile.ZipFile(io.BytesIO(data), "r") as zipped_file: + filenames = zipped_file.namelist() + filename = list( + filter(lambda x: x.split(".")[0] == filename_to_unzip, filenames) + )[0] + data = zipped_file.read(filename) else: - data = blob.download_as_string() + filename = blob.name + + raw_filepath = _save_raw_local( + data=data.decode(encoding="utf-8"), + file_path=local_filepath, + filetype=filename.split(".")[-1], + ) except Exception as exp: error = exp + log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return {"data": data, "error": error} + return error, raw_filepath def _save_treated_local(file_path: str, status: dict, mode: str = "staging") -> str: @@ -581,7 +592,7 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 error: str = None, previous_error: str = None, recapture: bool = False, - mode: str = "raw" + mode: str = "raw", ): """ Upload execution status table to BigQuery. @@ -634,4 +645,4 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 partitions=partition, ) if error is not None: - raise Exception(f"Pipeline failed with error: {error}") \ No newline at end of file + raise Exception(f"Pipeline failed with error: {error}") From f6ca7ab8c23ad720e30b00c1862837848ad1fad3 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 10:36:00 -0300 Subject: [PATCH 009/145] change generic capture flow --- pipelines/rj_smtr/flows.py | 53 ++++++++++------------ pipelines/rj_smtr/tasks.py | 80 +++++++++++++++++++-------------- pipelines/rj_smtr/utils.py | 91 +++++++++++++++++++++++++++++--------- 3 files changed, 141 insertions(+), 83 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index fb763cc5a..3dd834a75 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -40,8 +40,7 @@ "SMTR: Captura", code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as default_capture_flow: - - ### Configuração ### + # Configuração # table_id = Parameter("table_id", default=None) partition_date_only = Parameter("partition_date_only", default=None) @@ -71,15 +70,19 @@ partitions=partitions, ) - ### Extração ### + # Extração # # é necessária task ou função dentro da extract_raw_data? 
request_params, request_path = create_request_params( secret_path=secret_path, dataset_id=dataset_id, + request_params=request_params, + table_id=table_id, + timestamp=timestamp, ) error, raw_filepath = get_raw_from_sources( - source_type=source_type, # parametro de extracao, onde ficar? + source_type=source_type, # parametro de extracao, onde ficar? + local_filepath=filepath, source_path=request_path, zip_filename=table_id, secret_path=secret_path, @@ -87,40 +90,32 @@ ) RAW_UPLOADED = upload_raw_data_to_gcs( - error=error, - filepath=raw_filepath, - timestamp=timestamp, - partitions=partitions + error=error, + raw_filepath=raw_filepath, + timestamp=timestamp, + table_id=table_id, + dataset_id=dataset_id, + partitions=partitions, ) - ### Pré-tratamento ### + # Pré-tratamento # error, staging_filepath = transform_raw_to_nested_structure( raw_filepath=raw_filepath, + filepath=filepath, + error=error, timestamp=timestamp, primary_key=primary_key, ) - STAGING_UPLOADED = upload_staging_data_to_gcs(error=error, filepath=staging_filepath, timestamp=timestamp) - - # treated_filepath = save_treated_local(status=treated_status, file_path=filepath) - - # LOAD # - # error = bq_upload( - # dataset_id=dataset_id, - # table_id=table_params["pre-treatment"]["table_id"], - # filepath=treated_filepath, - # raw_filepath=raw_filepath, - # partitions=partitions, - # status=treated_status, - # ) - - # upload_logs_to_bq( - # dataset_id=dataset_id, - # parent_table_id=table_params["pre-treatment"]["table_id"], - # error=error, - # timestamp=timestamp, - # ) + STAGING_UPLOADED = upload_staging_data_to_gcs( + error=error, + staging_filepath=staging_filepath, + timestamp=timestamp, + table_id=table_id, + dataset_id=dataset_id, + partitions=partitions, + ) default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_capture_flow.run_config = KubernetesRun( diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 0a40dae26..89beae6f2 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -31,6 +31,9 @@ get_raw_data_api, get_raw_data_gcs, upload_run_logs_to_bq, + get_datetime_range, + transform_data_to_json, + save_treated_local_func, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -874,7 +877,11 @@ def get_previous_date(days): @task def transform_raw_to_nested_structure( - filepath: str, error: bool, timestamp: datetime, primary_key: list = None + raw_filepath: str, + filepath: str, + error: bool, + timestamp: datetime, + primary_key: list = None, ): """Transform dataframe to nested structure @@ -891,16 +898,18 @@ def transform_raw_to_nested_structure( * `error` (str): catched error, if any. 
Otherwise, returns None """ + with open(raw_filepath, "r", encoding="utf-8") as file: + data = file.read() + # ORGANIZAR: - # json_status = transform_data_to_json( - # status=raw_status, - # file_type=table_params["pre-treatment"]["file_type"], - # csv_args=table_params["pre-treatment"]["csv_args"], - # ) + error, data = transform_data_to_json( + data=data, + file_type=raw_filepath.split(".")[-1], + ) # Check previous error if error is not None: - return {"data": pd.DataFrame(), "error": error} + return error, None # Check empty dataframe # if len(status["data"]) == 0: @@ -913,8 +922,8 @@ def transform_raw_to_nested_structure( error = None # leitura do dado raw - # data = pd.DataFrame(status["data"]) - data = None + data = pd.DataFrame(data) + log( f""" Received inputs: @@ -950,7 +959,7 @@ def transform_raw_to_nested_structure( ) # save treated local - # filepath = _save_trated_local(data=data, filepath=filepath) + filepath = save_treated_local_func(data=data, error=error, filepath=filepath) except Exception as exp: # pylint: disable=W0703 error = exp @@ -992,9 +1001,11 @@ def transform_raw_to_nested_structure( def create_request_params( # datetime_range: dict, # table_params: dict, + request_params: dict, table_id: str, secret_path: str, dataset_id: str, + timestamp: datetime, ) -> tuple: """ Task to create request params @@ -1009,25 +1020,25 @@ def create_request_params( request_params: host, database and query to request data request_url: url to request data """ - request_params = None # TODO: retirar essa linha + if dataset_id == constants.BILHETAGEM_DATASET_ID.value: secrets = get_vault_secret(secret_path)["data"] - # TODO: RETIRAR ESSA LINHA - request_params = secrets + database_secrets = secrets["databases"][request_params["database"]] + request_url = secrets["vpn_url"] + database_secrets["engine"] - # TODO: mudar modo de pegar os parametros - # database_secrets = secrets["databases"][table_params["extraction"]["database"]] - # request_url = secrets["vpn_url"] + database_secrets["engine"] - # request_params = { - # "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado - # "database": table_params["extraction"]["database"], - # "query": table_params["extraction"]["query"].format(**datetime_range), - # } + datetime_range = get_datetime_range( + timestamp=timestamp, interval=request_params["run_interval"] + ) + request_params = { + "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado + "database": request_params["database"], + "query": request_params["query"].format(**datetime_range), + } elif dataset_id == constants.GTFS_DATASET_ID.value: gtfs_base_path = "development/br_rj_riodejaneiro_gtfs/upload" - if table_id == constants.GTFS_QUADRO_ID.value: + if table_id == constants.GTFS_QUADRO_TABLE_ID.value: request_url = f"{gtfs_base_path}/quadro.csv" else: request_url = f"{gtfs_base_path}/gtfs.zip" @@ -1038,22 +1049,25 @@ def create_request_params( @task(checkpoint=False) def get_raw_from_sources( source_type: str, + local_filepath: str, source_path: str = None, table_id: str = None, secret_path: str = None, api_params: dict = None, ): - pass - # TODO: descomentar linhas abaixo, passando argumentos corretos - # if source_type == "api": - # return get_raw_data_api( - # url=source_path, secret_path=secret_path, params=api_params - # ) - # if source_type == "gcs": - # return get_raw_data_gcs( - # gcs_path=source_path, - # filename_to_unzip=table_id, - # ) + if source_type == "api": + return get_raw_data_api( + url=source_path, + 
secret_path=secret_path, + api_params=api_params, + filepath=local_filepath, + ) + if source_type == "gcs": + return get_raw_data_gcs( + gcs_path=source_path, + filename_to_unzip=table_id, + local_filepath=local_filepath, + ) # TODO: passar para função para dentro da transform_raw_to_nested_structure diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 68774c17d..184a93df7 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -11,9 +11,9 @@ from typing import List import io import json +import zipfile import pytz import requests -import zipfile import basedosdados as bd from basedosdados import Table import pandas as pd @@ -453,13 +453,13 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l return clocks -def _save_raw_local( - data: dict, file_path: str, mode: str = "raw", filetype: str = "json" +def save_raw_local_func( + data: dict, filepath: str, mode: str = "raw", filetype: str = "json" ) -> str: """ Saves json response from API to .json file. Args: - file_path (str): Path which to save raw file + filepath (str): Path which to save raw file status (dict): Must contain keys * data: json returned from API * error: error catched from API request @@ -469,20 +469,20 @@ def _save_raw_local( """ # diferentes tipos de arquivos para salvar - _file_path = file_path.format(mode=mode, filetype=filetype) - Path(_file_path).parent.mkdir(parents=True, exist_ok=True) + _filepath = filepath.format(mode=mode, filetype=filetype) + Path(_filepath).parent.mkdir(parents=True, exist_ok=True) if filetype == "json": - json.dump(data, Path(_file_path).open("w", encoding="utf-8")) + json.dump(data, Path(_filepath).open("w", encoding="utf-8")) # if filetype == "csv": # pass if filetype == "txt": - with open(_file_path, "w", encoding="utf-8") as file: + with open(_filepath, "w", encoding="utf-8") as file: file.write(data) - log(f"Raw data saved to: {_file_path}") - return _file_path + log(f"Raw data saved to: {_filepath}") + return _filepath def get_raw_data_api( # pylint: disable=R0912 @@ -516,7 +516,9 @@ def get_raw_data_api( # pylint: disable=R0912 ) response.raise_for_status() - filepath = _save_raw_local(data=response.text, filepath=filepath) + filepath = save_raw_local_func( + data=response.text, filepath=filepath + ) # TODO: mudar filetype except Exception as exp: error = exp @@ -550,9 +552,9 @@ def get_raw_data_gcs( else: filename = blob.name - raw_filepath = _save_raw_local( + raw_filepath = save_raw_local_func( data=data.decode(encoding="utf-8"), - file_path=local_filepath, + filepath=local_filepath, filetype=filename.split(".")[-1], ) @@ -563,12 +565,14 @@ def get_raw_data_gcs( return error, raw_filepath -def _save_treated_local(file_path: str, status: dict, mode: str = "staging") -> str: +def save_treated_local_func( + filepath: str, data: pd.DataFrame, error: str, mode: str = "staging" +) -> str: """ Save treated file to CSV. 
Args: - file_path (str): Path which to save treated file + filepath (str): Path which to save treated file status (dict): Must contain keys * `data`: dataframe returned from treatement * `error`: error catched from data treatement @@ -577,12 +581,12 @@ def _save_treated_local(file_path: str, status: dict, mode: str = "staging") -> Returns: str: Path to the saved file """ - _file_path = file_path.format(mode=mode, filetype="csv") - Path(_file_path).parent.mkdir(parents=True, exist_ok=True) - if status["error"] is None: - status["data"].to_csv(_file_path, index=False) - log(f"Treated data saved to: {_file_path}") - return _file_path + _filepath = filepath.format(mode=mode, filetype="csv") + Path(_filepath).parent.mkdir(parents=True, exist_ok=True) + if error is None: + data.to_csv(_filepath, index=False) + log(f"Treated data saved to: {_filepath}") + return _filepath def upload_run_logs_to_bq( # pylint: disable=R0913 @@ -646,3 +650,48 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 ) if error is not None: raise Exception(f"Pipeline failed with error: {error}") + + +def get_datetime_range( + timestamp: datetime, + interval: int, +) -> dict: + """ + Task to get datetime range in UTC + + Args: + timestamp (datetime): timestamp to get datetime range + interval (int): interval in seconds + + Returns: + dict: datetime range + """ + + start = ( + (timestamp - timedelta(seconds=interval)) + .astimezone(tz=pytz.timezone("UTC")) + .strftime("%Y-%m-%d %H:%M:%S") + ) + + end = timestamp.astimezone(tz=pytz.timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") + + return {"start": start, "end": end} + + +def transform_data_to_json(data: str, file_type: str, csv_args: dict = {}): + try: + if file_type == "json": + data = json.loads(data) + + elif file_type in ("txt", "csv"): + if csv_args is None: + csv_args = {} + data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") + else: + error = "Unsupported raw file extension. 
Supported only: json, csv and txt" + + except Exception as exp: + error = exp + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data From fa17be21b41769895fb4154b78d86d373652d368 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 11:20:15 -0300 Subject: [PATCH 010/145] adjust constant structure --- pipelines/rj_smtr/constants.py | 36 +++++++++++++++++++++++++------- pipelines/rj_smtr/flows.py | 6 ++---- pipelines/rj_smtr/tasks.py | 38 +++++++++++++++------------------- 3 files changed, 48 insertions(+), 32 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 00558f9cc..7eb18ef85 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -165,6 +165,18 @@ class constants(Enum): # pylint: disable=c0103 # BILHETAGEM BILHETAGEM_DATASET_ID = "br_rj_riodejaneiro_bilhetagem" + BILHETAGEM_DATABASES = { + "principal_db": { + "engine": "mysql", + "host": "principal-database-replica.internal", + }, + "tarifa_db": {"engine": "postgres", "host": "tarifa-database-replica.internal"}, + "transacao_db": { + "engine": "postgres", + "host": "transacao-database-replica.internal", + }, + } + BILHETAGEM_VPN_URL = "http://vpn-jae.mobilidade.rio/" BILHETAGEM_TRANSACAO_TABLE_PARAMS = [ { "partition_date_only": False, @@ -264,14 +276,24 @@ class constants(Enum): # pylint: disable=c0103 BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" # GTFS - GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" + GTFS_CAPTURE_PARAMS = [ + {"table_id": "agency", "primary_key": ["agency_id"]}, + {"table_id": "calendar_dates", "primary_key": ["service_id"]}, + {"table_id": "calendar", "primary_key": ["service_id"]}, + {"table_id": "feed_info", "primary_key": ["feed_publisher_name"]}, + {"table_id": "frequencies", "primary_key": ["trip_id"]}, + {"table_id": "routes", "primary_key": ["route_id"]}, + {"table_id": "shapes", "primary_key": ["shape_id"]}, + {"table_id": "stops", "primary_key": ["stop_id"]}, + {"table_id": "trips", "primary_key": ["trip_id"]}, + {"table_id": "fare_attributes", "primary_key": ["fare_id"]}, + {"table_id": "fare_rules", "primary_key": ["fare_id"]}, + ] - GTFS_SOURCE_TYPE = "gcs" + GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} - GTFS_AGENCY_REQUEST_PARAMS = { - "filepath": "development/br_rj_riodejaneiro_gtfs/upload/gtfs.zip" - } + GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} - GTFS_AGENCY_TABLE_ID = "agency" + GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" - GTFS_QUADRO_TABLE_ID = "quadro" + GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 3dd834a75..94a3ffb93 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -44,7 +44,7 @@ table_id = Parameter("table_id", default=None) partition_date_only = Parameter("partition_date_only", default=None) - request_params = Parameter("request_params", default=None) + extract_params = Parameter("extract_params", default=None) dataset_id = Parameter("dataset_id", default=None) secret_path = Parameter("secret_path", default=None) primary_key = Parameter("primary_key", default=None) @@ -71,11 +71,9 @@ ) # Extração # - # é necessária task ou função dentro da extract_raw_data? 
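    # Illustrative trace (values abbreviated from constants.py) of how `extract_params`
    # is consumed by the tasks called below, as wired in the later commits of this series;
    # a hypothetical sketch, not flow code:
    #
    #   extract_params = {
    #       "database": "transacao_db",
    #       "query": "SELECT * FROM transacao WHERE data_processamento BETWEEN '{start}' AND '{end}'",
    #       "run_interval": {"minutes": 1},
    #   }
    #   datetime_range = get_datetime_range(timestamp, timedelta(**extract_params["run_interval"]))
    #   query = extract_params["query"].format(**datetime_range)  # start/end rendered in UTC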
request_params, request_path = create_request_params( - secret_path=secret_path, dataset_id=dataset_id, - request_params=request_params, + extract_params=extract_params, table_id=table_id, timestamp=timestamp, ) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 89beae6f2..a134dd966 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -879,7 +879,7 @@ def get_previous_date(days): def transform_raw_to_nested_structure( raw_filepath: str, filepath: str, - error: bool, + error: str, timestamp: datetime, primary_key: list = None, ): @@ -898,6 +898,10 @@ def transform_raw_to_nested_structure( * `error` (str): catched error, if any. Otherwise, returns None """ + # Check previous error + if error is not None: + return error, None + with open(raw_filepath, "r", encoding="utf-8") as file: data = file.read() @@ -907,10 +911,6 @@ def transform_raw_to_nested_structure( file_type=raw_filepath.split(".")[-1], ) - # Check previous error - if error is not None: - return error, None - # Check empty dataframe # if len(status["data"]) == 0: # log("Empty dataframe, skipping transformation...") @@ -999,11 +999,8 @@ def transform_raw_to_nested_structure( @task(checkpoint=False, nout=2) def create_request_params( - # datetime_range: dict, - # table_params: dict, - request_params: dict, + extract_params: dict, table_id: str, - secret_path: str, dataset_id: str, timestamp: datetime, ) -> tuple: @@ -1020,28 +1017,27 @@ def create_request_params( request_params: host, database and query to request data request_url: url to request data """ + request_params = None if dataset_id == constants.BILHETAGEM_DATASET_ID.value: - secrets = get_vault_secret(secret_path)["data"] - - database_secrets = secrets["databases"][request_params["database"]] - request_url = secrets["vpn_url"] + database_secrets["engine"] + database = constants.BILHETAGEM_DATABASES.value[extract_params["database"]] + request_url = constants.BILHETAGEM_VPN_URL.value + database["engine"] datetime_range = get_datetime_range( - timestamp=timestamp, interval=request_params["run_interval"] + timestamp=timestamp, interval=extract_params["run_interval"] ) + request_params = { - "host": database_secrets["host"], # TODO: exibir no log em ambiente fechado - "database": request_params["database"], - "query": request_params["query"].format(**datetime_range), + "host": database["host"], # TODO: exibir no log em ambiente fechado + "database": extract_params["database"], + "query": extract_params["query"].format(**datetime_range), } elif dataset_id == constants.GTFS_DATASET_ID.value: - gtfs_base_path = "development/br_rj_riodejaneiro_gtfs/upload" - if table_id == constants.GTFS_QUADRO_TABLE_ID.value: - request_url = f"{gtfs_base_path}/quadro.csv" + if table_id == constants.GTFS_QUADRO_CAPTURE_PARAMS.value["table_id"]: + request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/{table_id}.csv" else: - request_url = f"{gtfs_base_path}/gtfs.zip" + request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/gtfs.zip" return request_params, request_url From bdc3881cde88840b62175e1ce8ac66a596e37feb Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 13:27:11 -0300 Subject: [PATCH 011/145] change bilhetagem to new capture flow structure --- .../schedules.py | 18 +- pipelines/rj_smtr/constants.py | 186 ++++++++++-------- pipelines/rj_smtr/tasks.py | 14 +- pipelines/rj_smtr/utils.py | 40 ++-- 4 files changed, 145 insertions(+), 113 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py 
b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index 38fca85a9..538e5b816 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -16,26 +16,32 @@ ) bilhetagem_principal_clocks = generate_execute_schedules( - interval=timedelta(days=1), + clock_interval=timedelta( + **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["principal_run_interval"] + ), labels=[ - emd_constants.RJ_SMTR_AGENT_LABEL.value, + emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, ], - table_parameters=constants.BILHETAGEM_TABLES_PARAMS.value, + table_parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, + source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], runs_interval_minutes=15, ) bilhetagem_principal_schedule = Schedule(clocks=untuple(bilhetagem_principal_clocks)) bilhetagem_transacao_clocks = generate_execute_schedules( - interval=timedelta(minutes=1), + clock_interval=timedelta( + constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["transacao_run_interval"] + ), labels=[ - emd_constants.RJ_SMTR_AGENT_LABEL.value, + emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, ], - table_parameters=constants.BILHETAGEM_TRANSACAO_TABLE_PARAMS.value, + table_parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, + source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], runs_interval_minutes=0, ) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 7eb18ef85..969ccd871 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -165,117 +165,142 @@ class constants(Enum): # pylint: disable=c0103 # BILHETAGEM BILHETAGEM_DATASET_ID = "br_rj_riodejaneiro_bilhetagem" - BILHETAGEM_DATABASES = { - "principal_db": { - "engine": "mysql", - "host": "principal-database-replica.internal", + + BILHETAGEM_GENERAL_CAPTURE_PARAMS = { + "databases": { + "principal_db": { + "engine": "mysql", + "host": "principal-database-replica.internal", + }, + "tarifa_db": { + "engine": "postgres", + "host": "tarifa-database-replica.internal", + }, + "transacao_db": { + "engine": "postgres", + "host": "transacao-database-replica.internal", + }, }, - "tarifa_db": {"engine": "postgres", "host": "tarifa-database-replica.internal"}, - "transacao_db": { - "engine": "postgres", - "host": "transacao-database-replica.internal", + "vpn_url": "http://vpn-jae.mobilidade.rio/", + "source_type": "api-json", + "transacao_run_interval": {"minutes": 1}, + "principal_run_interval": {"days": 1}, + } + + BILHETAGEM_TRANSACAO_CAPTURE_PARAMS = { + "table_id": "transacao", + "partition_date_only": False, + "extract_params": { + "database": "transacao_db", + "query": """ + SELECT + * + FROM + transacao + WHERE + data_processamento BETWEEN '{start}' + AND '{end}' + ORDER BY + data_processamento + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS["transacao_run_interval"], }, + "primary_key": ["id"], } - BILHETAGEM_VPN_URL = "http://vpn-jae.mobilidade.rio/" - BILHETAGEM_TRANSACAO_TABLE_PARAMS = [ + + BILHETAGEM_CAPTURE_PARAMS = [ { - "partition_date_only": False, - "flow_run_name": "transacao", - "extraction": { - "table_id": "transacao", - "database": "transacao_db", + "table_id": "linha", + "partition_date_only": True, + "extract_params": { + "database": "principal_db", 
"query": """ SELECT * FROM - transacao + LINHA WHERE - data_processamento BETWEEN '{start}' - AND '{end}' + DT_INCLUSAO >= '{start}' ORDER BY - data_processamento + DT_INCLUSAO """, - "source": "api", - }, - "pre-treatment": { - "table_id": "transacao", - "file_type": "json", - "primary_key": ["id"], # id column to nest data on + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], }, - } - ] - BILHETAGEM_TABLES_PARAMS = [ - { - "table_id": "linha", - "database": "principal_db", - "query": """ - SELECT - * - FROM - LINHA - WHERE - DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO - """, "primary_key": ["CD_LINHA"], # id column to nest data on - "partition_date_only": True, }, { "table_id": "grupo", - "database": "principal_db", - "query": """ - SELECT - * - FROM - GRUPO - WHERE - DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO - """, - "primary_key": ["CD_GRUPO"], "partition_date_only": True, + "extract_params": { + "database": "principal_db", + "query": """ + SELECT + * + FROM + GRUPO + WHERE + DT_INCLUSAO >= '{start}' + ORDER BY + DT_INCLUSAO + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], + }, + "primary_key": ["CD_GRUPO"], }, { "table_id": "grupo_linha", - "database": "principal_db", - "query": """ - SELECT - * - FROM - GRUPO_LINHA - WHERE - DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO - """, - "primary_key": ["CD_GRUPO", "CD_LINHA"], # id column to nest data on "partition_date_only": True, + "extract_params": { + "database": "principal_db", + "query": """ + SELECT + * + FROM + GRUPO_LINHA + WHERE + DT_INCLUSAO >= '{start}' + ORDER BY + DT_INCLUSAO + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], + }, + "primary_key": ["CD_GRUPO", "CD_LINHA"], # id column to nest data on }, { "table_id": "matriz_integracao", - "database": "tarifa_db", - "query": """ - SELECT - * - FROM - matriz_integracao - WHERE - dt_inclusao >= '{start}' - ORDER BY - dt_inclusao - """, + "partition_date_only": True, + "extract_params": { + "database": "tarifa_db", + "query": """ + SELECT + * + FROM + matriz_integracao + WHERE + dt_inclusao >= '{start}' + ORDER BY + dt_inclusao + """, + "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ + "principal_run_interval" + ], + }, "primary_key": [ "cd_versao_matriz", "cd_integracao", ], # id column to nest data on - "partition_date_only": True, }, ] BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" # GTFS + GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" GTFS_CAPTURE_PARAMS = [ {"table_id": "agency", "primary_key": ["agency_id"]}, {"table_id": "calendar_dates", "primary_key": ["service_id"]}, @@ -289,11 +314,6 @@ class constants(Enum): # pylint: disable=c0103 {"table_id": "fare_attributes", "primary_key": ["fare_id"]}, {"table_id": "fare_rules", "primary_key": ["fare_id"]}, ] - GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} - GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} - - GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" - GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index a134dd966..e414f1c70 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1020,11 +1020,16 @@ def create_request_params( request_params = None if dataset_id == constants.BILHETAGEM_DATASET_ID.value: - database = constants.BILHETAGEM_DATABASES.value[extract_params["database"]] - request_url = 
constants.BILHETAGEM_VPN_URL.value + database["engine"] + database = constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["databases"][ + extract_params["database"] + ] + request_url = ( + constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["vpn_url"] + + database["engine"] + ) datetime_range = get_datetime_range( - timestamp=timestamp, interval=extract_params["run_interval"] + timestamp=timestamp, interval=timedelta(**extract_params["run_interval"]) ) request_params = { @@ -1051,12 +1056,15 @@ def get_raw_from_sources( secret_path: str = None, api_params: dict = None, ): + source_type, filetype = source_type.split("-", maxsplit=1) + if source_type == "api": return get_raw_data_api( url=source_path, secret_path=secret_path, api_params=api_params, filepath=local_filepath, + filetype=filetype, ) if source_type == "gcs": return get_raw_data_gcs( diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 184a93df7..d354ae6ab 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -8,7 +8,7 @@ from pathlib import Path from datetime import timedelta, datetime -from typing import List +from typing import List, Union import io import json import zipfile @@ -31,7 +31,6 @@ get_vault_secret, send_discord_message, get_redis_client, - get_storage_blobs, get_storage_blob, ) @@ -404,46 +403,41 @@ def data_info_str(data: pd.DataFrame): def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-locals - interval: timedelta, + clock_interval: timedelta, labels: List[str], - table_parameters: list, - dataset_id: str, - secret_path: str, + table_parameters: Union[list[dict], dict], runs_interval_minutes: int = 15, start_date: datetime = datetime( 2020, 1, 1, tzinfo=pytz.timezone(emd_constants.DEFAULT_TIMEZONE.value) ), + **general_flow_params, ) -> List[IntervalClock]: """ Generates multiple schedules Args: - interval (timedelta): The interval to run the schedule + clock_interval (timedelta): The interval to run the schedule labels (List[str]): The labels to be added to the schedule - table_parameters (list): The table parameters - dataset_id (str): The dataset_id to be used in the schedule - secret_path (str): The secret path to be used in the schedule + table_parameters (list): The table parameters to iterate over runs_interval_minutes (int, optional): The interval between each schedule. Defaults to 15. start_date (datetime, optional): The start date of the schedule. Defaults to datetime(2020, 1, 1, tzinfo=pytz.timezone(emd_constants.DEFAULT_TIMEZONE.value)). 
- + general_flow_params: Any param that you want to pass to the flow Returns: List[IntervalClock]: The list of schedules """ + if isinstance(table_parameters, dict): + table_parameters = [table_parameters] clocks = [] for count, parameters in enumerate(table_parameters): - parameter_defaults = { - "table_params": parameters, - "dataset_id": dataset_id, - "secret_path": secret_path, - "interval": interval.total_seconds(), - } + parameter_defaults = parameters | general_flow_params + log(f"parameter_defaults: {parameter_defaults}") clocks.append( IntervalClock( - interval=interval, + interval=clock_interval, start_date=start_date + timedelta(minutes=runs_interval_minutes * count), labels=labels, @@ -486,7 +480,11 @@ def save_raw_local_func( def get_raw_data_api( # pylint: disable=R0912 - url: str, secret_path: str = None, api_params: dict = None, filepath: str = None + url: str, + secret_path: str = None, + api_params: dict = None, + filepath: str = None, + filetype: str = None, ) -> list[dict]: """ Request data from URL API @@ -517,8 +515,8 @@ def get_raw_data_api( # pylint: disable=R0912 response.raise_for_status() filepath = save_raw_local_func( - data=response.text, filepath=filepath - ) # TODO: mudar filetype + data=response.text, filepath=filepath, filetype=filetype + ) except Exception as exp: error = exp From fc61c4762c7a416872ba6fbbfa5a064a43e846a4 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:24:48 -0300 Subject: [PATCH 012/145] fix get_storage_blob function --- pipelines/rj_smtr/constants.py | 2 +- pipelines/utils/utils.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 969ccd871..2faeccb25 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -316,4 +316,4 @@ class constants(Enum): # pylint: disable=c0103 ] GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} - GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" + GTFS_BASE_GCS_PATH = "raw/development/br_rj_riodejaneiro_gtfs/upload" diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index 147e54f4f..57384f8f4 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -726,9 +726,8 @@ def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> def get_storage_blob( gcs_path: str, - mode: str = "staging", ): - bucket = bd.Storage() + bucket = bd.Storage(dataset_id="", table_id="") return ( bucket.client["storage_staging"] .bucket(bucket.bucket_name) From 0fc26cbc9d786fd28b369ab35784b636c3ecdc12 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:25:27 -0300 Subject: [PATCH 013/145] fix get_storage_blob call --- pipelines/rj_smtr/utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index d354ae6ab..55abfc9cf 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -533,10 +533,7 @@ def get_raw_data_gcs( error = None try: - blob = get_storage_blob( - gcs_path=gcs_path, - mode="raw", - ) + blob = get_storage_blob(gcs_path=gcs_path) data = blob.download_as_bytes() From 634df851e41bff549fe5f9daab4801f0eb6e0858 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:45:26 -0300 Subject: [PATCH 014/145] organize constants order --- pipelines/rj_smtr/constants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
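A minimal usage sketch for the GCS extraction helpers touched above (a hypothetical local call: it assumes gtfs.zip under constants.GTFS_BASE_GCS_PATH contains an agency.txt member, and the staging bucket layout may differ in production):

    # Fetch the GTFS zip from the staging bucket and extract the "agency" member;
    # {mode} and {filetype} in the local path template are filled by save_raw_local_func.
    error, raw_filepath = get_raw_data_gcs(
        gcs_path="development/br_rj_riodejaneiro_gtfs/upload/gtfs.zip",
        filename_to_unzip="agency",
        local_filepath="/tmp/gtfs/agency/{mode}/data.{filetype}",
    )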
diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 2faeccb25..722d7e9e1 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -301,6 +301,7 @@ class constants(Enum): # pylint: disable=c0103 # GTFS GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" + GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} GTFS_CAPTURE_PARAMS = [ {"table_id": "agency", "primary_key": ["agency_id"]}, {"table_id": "calendar_dates", "primary_key": ["service_id"]}, @@ -314,6 +315,5 @@ class constants(Enum): # pylint: disable=c0103 {"table_id": "fare_attributes", "primary_key": ["fare_id"]}, {"table_id": "fare_rules", "primary_key": ["fare_id"]}, ] - GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} - GTFS_BASE_GCS_PATH = "raw/development/br_rj_riodejaneiro_gtfs/upload" + GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" From bda52aa6eedb6eedec2c6334f0843e2a80edcd4a Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:46:45 -0300 Subject: [PATCH 015/145] fix get_raw_from_sources function call --- pipelines/rj_smtr/flows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 94a3ffb93..19ac776b7 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -82,9 +82,9 @@ source_type=source_type, # parametro de extracao, onde ficar? local_filepath=filepath, source_path=request_path, - zip_filename=table_id, + table_id=table_id, secret_path=secret_path, - request_params=request_params, + api_params=request_params, ) RAW_UPLOADED = upload_raw_data_to_gcs( From b2548d6b8cd1f56bf9dbd4676e52011ce5fdfa16 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:47:35 -0300 Subject: [PATCH 016/145] change transform_raw_to_json to read_raw_data --- pipelines/rj_smtr/tasks.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index e414f1c70..ee99ff654 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -32,7 +32,7 @@ get_raw_data_gcs, upload_run_logs_to_bq, get_datetime_range, - transform_data_to_json, + read_raw_data, save_treated_local_func, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client @@ -899,17 +899,11 @@ def transform_raw_to_nested_structure( """ # Check previous error + if error is not None: return error, None - with open(raw_filepath, "r", encoding="utf-8") as file: - data = file.read() - # ORGANIZAR: - error, data = transform_data_to_json( - data=data, - file_type=raw_filepath.split(".")[-1], - ) # Check empty dataframe # if len(status["data"]) == 0: @@ -917,13 +911,12 @@ def transform_raw_to_nested_structure( # return {"data": pd.DataFrame(), "error": error} try: + # leitura do dado raw + error, data = read_raw_data(filepath=raw_filepath) + if primary_key is None: primary_key = [] - error = None - # leitura do dado raw - data = pd.DataFrame(data) - log( f""" Received inputs: From 307863a1d381cefeeb5a9001fb8f4ef235923cbb Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:48:30 -0300 Subject: [PATCH 017/145] transform transform_raw_data_to_json to read_raw_data --- pipelines/rj_smtr/utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 55abfc9cf..3f4281a2c 100644 
--- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -673,15 +673,18 @@ def get_datetime_range( return {"start": start, "end": end} -def transform_data_to_json(data: str, file_type: str, csv_args: dict = {}): +def read_raw_data(filepath: str, csv_args: dict = {}) -> tuple[str, pd.DataFrame]: try: + file_type = filepath.split(".")[-1] + if file_type == "json": - data = json.loads(data) + data = pd.read_json(filepath) + # data = json.loads(data) elif file_type in ("txt", "csv"): if csv_args is None: csv_args = {} - data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") + data = pd.read_csv(filepath, **csv_args) else: error = "Unsupported raw file extension. Supported only: json, csv and txt" From 7f2c1e3fe3db535868943404e945b5b44eefad74 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 14:59:43 -0300 Subject: [PATCH 018/145] fix nout task parameter --- pipelines/rj_smtr/tasks.py | 4 ++-- pipelines/rj_smtr/utils.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index ee99ff654..9beb5a87e 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -875,7 +875,7 @@ def get_previous_date(days): ############### -@task +@task(nout=2) def transform_raw_to_nested_structure( raw_filepath: str, filepath: str, @@ -1040,7 +1040,7 @@ def create_request_params( return request_params, request_url -@task(checkpoint=False) +@task(checkpoint=False, nout=2) def get_raw_from_sources( source_type: str, local_filepath: str, diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 3f4281a2c..8a8804474 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -485,7 +485,7 @@ def get_raw_data_api( # pylint: disable=R0912 api_params: dict = None, filepath: str = None, filetype: str = None, -) -> list[dict]: +) -> tuple[str, str]: """ Request data from URL API @@ -529,7 +529,7 @@ def get_raw_data_gcs( gcs_path: str, local_filepath: str, filename_to_unzip: str = None, -) -> dict: +) -> tuple[str, str]: error = None try: @@ -673,7 +673,7 @@ def get_datetime_range( return {"start": start, "end": end} -def read_raw_data(filepath: str, csv_args: dict = {}) -> tuple[str, pd.DataFrame]: +def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataFrame]: try: file_type = filepath.split(".")[-1] From 51977c10621d34ea3643004cba5bc4f990d249db Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 15:16:38 -0300 Subject: [PATCH 019/145] fix timedelta instantiation --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index 538e5b816..f19f0d8ad 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -33,7 +33,7 @@ bilhetagem_transacao_clocks = generate_execute_schedules( clock_interval=timedelta( - constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["transacao_run_interval"] + **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["transacao_run_interval"] ), labels=[ emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, From 8ef0b5df7c31ebb7f59ff719c338e029e34cf031 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 15:58:05 -0300 Subject: [PATCH 020/145] set upstream tasks --- pipelines/rj_smtr/flows.py | 1 + pipelines/rj_smtr/tasks.py | 10 
+++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 19ac776b7..a4044933a 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -104,6 +104,7 @@ error=error, timestamp=timestamp, primary_key=primary_key, + upstream_tasks=[RAW_UPLOADED], ) STAGING_UPLOADED = upload_staging_data_to_gcs( diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 9beb5a87e..269ee73eb 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1052,19 +1052,23 @@ def get_raw_from_sources( source_type, filetype = source_type.split("-", maxsplit=1) if source_type == "api": - return get_raw_data_api( + error, filepath = get_raw_data_api( url=source_path, secret_path=secret_path, api_params=api_params, filepath=local_filepath, filetype=filetype, ) - if source_type == "gcs": - return get_raw_data_gcs( + elif source_type == "gcs": + error, filepath = get_raw_data_gcs( gcs_path=source_path, filename_to_unzip=table_id, local_filepath=local_filepath, ) + else: + raise NotImplementedError(f"{source_type} not supported") + + return error, filepath # TODO: passar para função para dentro da transform_raw_to_nested_structure From 4f21f0af7fff375354538c868e7b4cedd7943f4d Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 16:02:09 -0300 Subject: [PATCH 021/145] declare raw_filepath --- pipelines/rj_smtr/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 8a8804474..0fd5c7d6c 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -531,6 +531,7 @@ def get_raw_data_gcs( filename_to_unzip: str = None, ) -> tuple[str, str]: error = None + raw_filepath = None try: blob = get_storage_blob(gcs_path=gcs_path) From 11b973581c7ccc103d16bccc09dccd41f86f68da Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Wed, 27 Sep 2023 16:19:43 -0300 Subject: [PATCH 022/145] update docstrings --- pipelines/rj_smtr/tasks.py | 76 +++++++++++++++++++++++++++++++------- pipelines/rj_smtr/utils.py | 50 ++++++++++++++++++------- pipelines/utils/utils.py | 17 +++++++++ 3 files changed, 116 insertions(+), 27 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 269ee73eb..b12f0604c 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -168,7 +168,14 @@ def create_date_hour_partition( timestamp: datetime, partition_date_only: bool = False ) -> str: """ - Get date hour Hive partition structure from timestamp. + Generate partition string for date and hour. + + Args: + timestamp (datetime): timestamp to be used as reference + partition_date_only (bool, optional): whether to add hour partition or not + + Returns: + str: partition string """ partition = f"data={timestamp.strftime('%Y-%m-%d')}" if not partition_date_only: @@ -614,6 +621,20 @@ def upload_raw_data_to_gcs( dataset_id: str, partitions: list, ): + """ + Upload raw data to GCS. + + Args: + error (bool): whether the upstream tasks failed or not + raw_filepath (str): Path to the saved raw .json file + timestamp (datetime): timestamp for flow run + table_id (str): table_id on BigQuery + dataset_id (str): dataset_id on BigQuery + partitions (list): list of partition strings + + Returns: + None + """ if not error: try: st_obj = Storage(table_id=table_id, dataset_id=dataset_id) @@ -649,6 +670,20 @@ def upload_staging_data_to_gcs( dataset_id: str, partitions: list, ): + """ + Upload staging data to GCS. 
+ + Args: + error (bool): whether the upstream tasks failed or not + staging_filepath (str): Path to the saved treated .csv file + timestamp (datetime): timestamp for flow run + table_id (str): table_id on BigQuery + dataset_id (str): dataset_id on BigQuery + partitions (list): list of partition strings + + Returns: + None + """ if not error: try: # Creates and publish table if it does not exist, append to it otherwise @@ -883,19 +918,18 @@ def transform_raw_to_nested_structure( timestamp: datetime, primary_key: list = None, ): - """Transform dataframe to nested structure + """ + Task to transform raw data to nested structure Args: - status (dict): Must contain keys - * `data`: dataframe returned from treatement - * `error`: error catched from data treatement - timestamp (datetime): timestamp of the capture - primary_key (list, optional): List of primary keys to be used for nesting. + raw_filepath (str): Path to the saved raw .json file + filepath (str): Path to the saved treated .csv file + error (str): Error catched from upstream tasks + timestamp (datetime): timestamp for flow run + primary_key (list, optional): Primary key to be used on nested structure Returns: - dict: Conatining keys - * `data` (json): nested data - * `error` (str): catched error, if any. Otherwise, returns None + str: Path to the saved treated .csv file """ # Check previous error @@ -1001,10 +1035,10 @@ def create_request_params( Task to create request params Args: - datetime_range (dict): datetime range to get params - table_params (dict): table params to get params - secret_path (str): secret path to get params - dataset_id (str): dataset id to get params + extract_params (dict): extract parameters + table_id (str): table_id on BigQuery + dataset_id (str): dataset_id on BigQuery + timestamp (datetime): timestamp for flow run Returns: request_params: host, database and query to request data @@ -1049,6 +1083,20 @@ def get_raw_from_sources( secret_path: str = None, api_params: dict = None, ): + """ + Task to get raw data from sources + + Args: + source_type (str): source type + local_filepath (str): local filepath + source_path (str, optional): source path. Defaults to None. + table_id (str, optional): table_id on BigQuery. Defaults to None. + secret_path (str, optional): secret path. Defaults to None. + api_params (dict, optional): api parameters. Defaults to None. + + Returns: + error: error + """ source_type, filetype = source_type.split("-", maxsplit=1) if source_type == "api": diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 0fd5c7d6c..801c8d336 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -490,14 +490,14 @@ def get_raw_data_api( # pylint: disable=R0912 Request data from URL API Args: - url (str): URL to send request - secret_path (str, optional): Path to secrets guardeded on Vault, if needed. - params (dict, optional): Params to be sent on request + url (str): URL to request data + secret_path (str, optional): Secret path to get headers. Defaults to None. + api_params (dict, optional): Parameters to pass to API. Defaults to None. + filepath (str, optional): Path to save raw file. Defaults to None. + filetype (str, optional): Filetype to save raw file. Defaults to None. Returns: - dict: Conatining keys - * `data` (json): data result - * `error` (str): catched error, if any. 
Otherwise, returns None + tuple[str, str]: Error and filepath """ error = None try: @@ -530,6 +530,17 @@ def get_raw_data_gcs( local_filepath: str, filename_to_unzip: str = None, ) -> tuple[str, str]: + """ + Get raw data from GCS + + Args: + gcs_path (str): GCS path to get data + local_filepath (str): Local filepath to save raw data + filename_to_unzip (str, optional): Filename to unzip. Defaults to None. + + Returns: + tuple[str, str]: Error and filepath + """ error = None raw_filepath = None @@ -568,10 +579,9 @@ def save_treated_local_func( Save treated file to CSV. Args: - filepath (str): Path which to save treated file - status (dict): Must contain keys - * `data`: dataframe returned from treatement - * `error`: error catched from data treatement + filepath (str): Path to save file + data (pd.DataFrame): Dataframe to save + error (str): Error catched during execution mode (str, optional): Folder to save locally, later folder which to upload to GCS. Returns: @@ -601,9 +611,13 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 Args: dataset_id (str): dataset_id on BigQuery - parent_table_id (str): Parent table id related to the status table - timestamp (str): ISO formatted timestamp string - error (str, optional): String associated with error caught during execution + parent_table_id (str): table_id on BigQuery + timestamp (str): timestamp to get datetime range + error (str): error catched during execution + previous_error (str): previous error catched during execution + recapture (bool): if the execution was a recapture + mode (str): folder to save locally, later folder which to upload to GCS + Returns: None """ @@ -675,6 +689,16 @@ def get_datetime_range( def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataFrame]: + """ + Read raw data from file + + Args: + filepath (str): filepath to read + csv_args (dict): arguments to pass to pandas.read_csv + + Returns: + tuple[str, pd.DataFrame]: error and data + """ try: file_type = filepath.split(".")[-1] diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index 57384f8f4..e37a88d8b 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -714,6 +714,14 @@ def get_credentials_from_env( def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> list: """ Get all blobs from a table in a dataset. + + Args: + dataset_id (str): dataset id + table_id (str): table id + mode (str, optional): mode to use. Defaults to "staging". + + Returns: + list: list of blobs """ bd_storage = bd.Storage(dataset_id=dataset_id, table_id=table_id) @@ -727,6 +735,15 @@ def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> def get_storage_blob( gcs_path: str, ): + """ + Get a blob from a path. 
+ + Args: + gcs_path (str): path to blob + + Returns: + Blob: blob object + """ bucket = bd.Storage(dataset_id="", table_id="") return ( bucket.client["storage_staging"] From f484b880d54367e375a2ce72b02d9835f20fe4d1 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 16:20:42 -0300 Subject: [PATCH 023/145] adjust get_raw_from_sources return --- pipelines/rj_smtr/tasks.py | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 269ee73eb..023ea2796 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1051,22 +1051,28 @@ def get_raw_from_sources( ): source_type, filetype = source_type.split("-", maxsplit=1) - if source_type == "api": - error, filepath = get_raw_data_api( - url=source_path, - secret_path=secret_path, - api_params=api_params, - filepath=local_filepath, - filetype=filetype, - ) - elif source_type == "gcs": - error, filepath = get_raw_data_gcs( - gcs_path=source_path, - filename_to_unzip=table_id, - local_filepath=local_filepath, - ) - else: - raise NotImplementedError(f"{source_type} not supported") + log(f"Source type: {source_type}") + + try: + if source_type == "api": + error, filepath = get_raw_data_api( + url=source_path, + secret_path=secret_path, + api_params=api_params, + filepath=local_filepath, + filetype=filetype, + ) + elif source_type == "gcs": + error, filepath = get_raw_data_gcs( + gcs_path=source_path, + filename_to_unzip=table_id, + local_filepath=local_filepath, + ) + else: + raise NotImplementedError(f"{source_type} not supported") + except NotImplementedError as exp: + error = exp + filepath = None return error, filepath From 2df4318dc407b58ca6a6c4bf5a3bfad8db7fab37 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 16:41:00 -0300 Subject: [PATCH 024/145] fix errors --- pipelines/rj_smtr/tasks.py | 13 +++++++++++-- pipelines/rj_smtr/utils.py | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 7ff9ee637..9c2ae3be0 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1097,7 +1097,15 @@ def get_raw_from_sources( Returns: error: error """ - source_type, filetype = source_type.split("-", maxsplit=1) + error = None + filepath = None + + source_values = source_type.split("-", maxsplit=1) + source_type = source_values[0] + try: + filetype = source_values[1] + except IndexError: + filetype = None log(f"Source type: {source_type}") @@ -1120,8 +1128,9 @@ def get_raw_from_sources( raise NotImplementedError(f"{source_type} not supported") except NotImplementedError as exp: error = exp - filepath = None + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log(f"Raw extraction ended returned values: {error}, {filepath}") return error, filepath diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 801c8d336..743e955e1 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -699,6 +699,7 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF Returns: tuple[str, pd.DataFrame]: error and data """ + error = None try: file_type = filepath.split(".")[-1] From df6525ac9e946f5a3d3709b768e02f2c26aae1c8 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 27 Sep 2023 16:45:37 -0300 Subject: [PATCH 025/145] change agent label to dev --- pipelines/rj_smtr/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index a4044933a..27eaa76a4 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -119,5 +119,5 @@ default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_capture_flow.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) From 2983b687fb1910cc1086cb875367493706ed905e Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 10:54:51 -0300 Subject: [PATCH 026/145] refactore source values --- pipelines/rj_smtr/tasks.py | 36 ++++++------------------------------ 1 file changed, 6 insertions(+), 30 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 9c2ae3be0..4a7182daf 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1100,14 +1100,13 @@ def get_raw_from_sources( error = None filepath = None - source_values = source_type.split("-", maxsplit=1) - source_type = source_values[0] - try: - filetype = source_values[1] - except IndexError: - filetype = None + source_values = source_type.split("-", 1) + + source_type, filetype = ( + source_values if len(source_values) == 2 else (source_values[0], None) + ) - log(f"Source type: {source_type}") + log(f"Getting raw data from source type: {source_type}") try: if source_type == "api": @@ -1132,26 +1131,3 @@ def get_raw_from_sources( log(f"Raw extraction ended returned values: {error}, {filepath}") return error, filepath - - -# TODO: passar para função para dentro da transform_raw_to_nested_structure -# @task(checkpoint=False) -# def transform_data_to_json(status: dict, file_type: str, csv_args: dict): -# data = status["data"] -# error = status["error"] - -# if file_type == "json": -# pass - -# # todo: move to data check on specfic API # pylint: disable=W0102 -# # if isinstance(data, dict) and "DescricaoErro" in data.keys(): -# # error = data["DescricaoErro"] - -# elif file_type in ("txt", "csv"): -# if csv_args is None: -# csv_args = {} -# data = pd.read_csv(io.StringIO(data), **csv_args).to_dict(orient="records") -# else: -# error = "Unsupported raw file extension. 
Supported only: json, csv and txt" - -# return {"data": data, "error": error} From 2c78b09404680d561a5afe5096428cb44a3b8032 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 11:27:23 -0300 Subject: [PATCH 027/145] update constants --- pipelines/rj_smtr/constants.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 722d7e9e1..3afb0b1cd 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -185,6 +185,8 @@ class constants(Enum): # pylint: disable=c0103 "source_type": "api-json", "transacao_run_interval": {"minutes": 1}, "principal_run_interval": {"days": 1}, + "transacao_runs_interval_minutes": 0, + "principal_runs_interval_minutes": 15, } BILHETAGEM_TRANSACAO_CAPTURE_PARAMS = { @@ -205,7 +207,7 @@ class constants(Enum): # pylint: disable=c0103 """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS["transacao_run_interval"], }, - "primary_key": ["id"], + "primary_key": ["id"], # id column to nest data on } BILHETAGEM_CAPTURE_PARAMS = [ @@ -249,7 +251,7 @@ class constants(Enum): # pylint: disable=c0103 "principal_run_interval" ], }, - "primary_key": ["CD_GRUPO"], + "primary_key": ["CD_GRUPO"], # id column to nest data on }, { "table_id": "grupo_linha", From 1f3c2fc307e21e77de206f5ded612a690e8108cf Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 11:28:23 -0300 Subject: [PATCH 028/145] update agent --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index d7f44e3b9..793d37c0d 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -30,7 +30,7 @@ bilhetagem_transacao_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_transacao_captura.schedule = bilhetagem_transacao_schedule @@ -41,6 +41,6 @@ bilhetagem_principal_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_principal_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_principal_captura.schedule = bilhetagem_principal_schedule From 702e70d6ae1341889e333e2d07fc0fec70dd6cef Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 11:30:21 -0300 Subject: [PATCH 029/145] update schedule params --- .../rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index f19f0d8ad..e897286b0 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -26,7 +26,9 @@ dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], - runs_interval_minutes=15, + runs_interval_minutes=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value[ + 
"principal_runs_interval_minutes" + ], ) bilhetagem_principal_schedule = Schedule(clocks=untuple(bilhetagem_principal_clocks)) @@ -42,7 +44,9 @@ dataset_id=constants.BILHETAGEM_DATASET_ID.value, secret_path=constants.BILHETAGEM_SECRET_PATH.value, source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], - runs_interval_minutes=0, + runs_interval_minutes=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value[ + "transacao_runs_interval_minutes" + ], ) bilhetagem_transacao_schedule = Schedule(clocks=untuple(bilhetagem_transacao_clocks)) From b5712d2746675c4925231382f2cf436da339be94 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 11:42:25 -0300 Subject: [PATCH 030/145] update interval --- pipelines/rj_smtr/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 743e955e1..0972a22c8 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -664,21 +664,21 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 def get_datetime_range( timestamp: datetime, - interval: int, + interval: timedelta, ) -> dict: """ Task to get datetime range in UTC Args: timestamp (datetime): timestamp to get datetime range - interval (int): interval in seconds + interval (timedelta): interval to get datetime range Returns: dict: datetime range """ start = ( - (timestamp - timedelta(seconds=interval)) + (timestamp - timedelta(interval)) .astimezone(tz=pytz.timezone("UTC")) .strftime("%Y-%m-%d %H:%M:%S") ) From e3df22cc2cec64b6fcc7e0258caafdf542c8ab86 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 11:44:39 -0300 Subject: [PATCH 031/145] fix get_datetime_range interval --- pipelines/rj_smtr/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 0972a22c8..7b32e2831 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -678,7 +678,7 @@ def get_datetime_range( """ start = ( - (timestamp - timedelta(interval)) + (timestamp - interval) .astimezone(tz=pytz.timezone("UTC")) .strftime("%Y-%m-%d %H:%M:%S") ) From 6ed06dad2772cb2d4ff32e6a19393d2e24cfe47f Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 12:21:35 -0300 Subject: [PATCH 032/145] remove order by from queries --- pipelines/rj_smtr/constants.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 3afb0b1cd..4f2b1c95a 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -202,8 +202,6 @@ class constants(Enum): # pylint: disable=c0103 WHERE data_processamento BETWEEN '{start}' AND '{end}' - ORDER BY - data_processamento """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS["transacao_run_interval"], }, @@ -223,8 +221,6 @@ class constants(Enum): # pylint: disable=c0103 LINHA WHERE DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ "principal_run_interval" @@ -244,8 +240,6 @@ class constants(Enum): # pylint: disable=c0103 GRUPO WHERE DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ "principal_run_interval" @@ -265,8 +259,6 @@ class constants(Enum): # pylint: disable=c0103 GRUPO_LINHA WHERE DT_INCLUSAO >= '{start}' - ORDER BY - DT_INCLUSAO """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ "principal_run_interval" @@ -286,8 +278,6 @@ class constants(Enum): # pylint: 
disable=c0103 matriz_integracao WHERE dt_inclusao >= '{start}' - ORDER BY - dt_inclusao """, "run_interval": BILHETAGEM_GENERAL_CAPTURE_PARAMS[ "principal_run_interval" From 822c59f256d4e4ff900486a6472145bcbea4b08a Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Thu, 28 Sep 2023 12:22:30 -0300 Subject: [PATCH 033/145] fix get_raw_data_api --- pipelines/rj_smtr/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 7b32e2831..445389340 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -506,6 +506,13 @@ def get_raw_data_api( # pylint: disable=R0912 else: headers = get_vault_secret(secret_path)["data"] + # remove from headers, if present + # TODO: remove this before merge to master + remove_headers = ["host", "databases"] + for remove_header in remove_headers: + if remove_header in list(headers.keys()): + del headers[remove_header] + response = requests.get( url, headers=headers, From c58ea9639bcb2812484dd899de6bfd33a776aec9 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 15:41:42 -0300 Subject: [PATCH 034/145] change json read function --- pipelines/rj_smtr/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 445389340..be8ed7bbd 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -707,11 +707,14 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF tuple[str, pd.DataFrame]: error and data """ error = None + data = None try: file_type = filepath.split(".")[-1] if file_type == "json": - data = pd.read_json(filepath) + with open(filepath, "r") as file: + data = json.load(file) + data = pd.DataFrame(data) # data = json.loads(data) elif file_type in ("txt", "csv"): From 045a42368562263938b90a25feffaaed4c83318d Mon Sep 17 00:00:00 2001 From: Carolina Gomes Date: Thu, 28 Sep 2023 16:01:10 -0300 Subject: [PATCH 035/145] update read_raw_data --- pipelines/rj_smtr/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index be8ed7bbd..c0c203dcd 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -713,8 +713,8 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF if file_type == "json": with open(filepath, "r") as file: - data = json.load(file) - data = pd.DataFrame(data) + data = pd.DataFrame.from_dict(json.load(file), orient="records") + # data = json.loads(data) elif file_type in ("txt", "csv"): From d2d188f7491de19ac2554eb465e46829d04e572c Mon Sep 17 00:00:00 2001 From: Carolina Gomes Date: Thu, 28 Sep 2023 16:09:27 -0300 Subject: [PATCH 036/145] update save_raw_local_func --- pipelines/rj_smtr/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index c0c203dcd..20168b039 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -448,7 +448,7 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l def save_raw_local_func( - data: dict, filepath: str, mode: str = "raw", filetype: str = "json" + data: Union[dict, str], filepath: str, mode: str = "raw", filetype: str = "json" ) -> str: """ Saves json response from API to .json file. 
@@ -467,6 +467,8 @@ def save_raw_local_func( Path(_filepath).parent.mkdir(parents=True, exist_ok=True) if filetype == "json": + if isinstance(data, dict): + data = json.loads(data) json.dump(data, Path(_filepath).open("w", encoding="utf-8")) # if filetype == "csv": From b7c4e2fe39b2e0d3a613a68ecab8a155787f2292 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 16:18:03 -0300 Subject: [PATCH 037/145] log error --- pipelines/rj_smtr/utils.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 20168b039..6219aaa78 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -9,6 +9,8 @@ from datetime import timedelta, datetime from typing import List, Union +import traceback +import sys import io import json import zipfile @@ -52,6 +54,19 @@ def log_critical(message: str, secret_path: str = constants.CRITICAL_SECRET_PATH return send_discord_message(message=message, webhook_url=url) +def log_error(error: str): + tb = sys.exc_info()[-1] + frame = traceback.extract_tb(tb, 1)[0] + file_name = frame[0] + function_name = frame[2] + line_no = frame[1] + + log( + f"[CATCHED] Task failed in file {file_name} - ({function_name}) line: {line_no} with error: \n{error}", + level="error", + ) + + def create_or_append_table( dataset_id: str, table_id: str, path: str, partitions: str = None ): @@ -728,6 +743,7 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF except Exception as exp: error = exp - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) + # log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, data From 2bedf890ee42187088bfa645d61a0af08598f4f7 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 16:44:41 -0300 Subject: [PATCH 038/145] change raw api extraction for json --- pipelines/rj_smtr/tasks.py | 7 ++++--- pipelines/rj_smtr/utils.py | 14 +++++++++----- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 4a7182daf..be878db21 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -34,6 +34,7 @@ get_datetime_range, read_raw_data, save_treated_local_func, + log_error, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -434,7 +435,7 @@ def get_raw( # pylint: disable=R0912 error = exp if error is not None: - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) return {"data": data, "error": error} @@ -992,7 +993,7 @@ def transform_raw_to_nested_structure( error = exp if error is not None: - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) return error, filepath @@ -1127,7 +1128,7 @@ def get_raw_from_sources( raise NotImplementedError(f"{source_type} not supported") except NotImplementedError as exp: error = exp - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) log(f"Raw extraction ended returned values: {error}, {filepath}") return error, filepath diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 6219aaa78..41b29d41e 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -538,13 +538,17 @@ def get_raw_data_api( # pylint: disable=R0912 ) response.raise_for_status() - filepath = save_raw_local_func( - data=response.text, 
filepath=filepath, filetype=filetype - ) + + if filetype == "json": + data = response.json() + else: + data = response.text + + filepath = save_raw_local_func(data=data, filepath=filepath, filetype=filetype) except Exception as exp: error = exp - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) return error, filepath @@ -591,7 +595,7 @@ def get_raw_data_gcs( except Exception as exp: error = exp - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log_error(error=error) return error, raw_filepath From 20b48dfb2950ba513c049e922b8768da9ab03e57 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 16:53:26 -0300 Subject: [PATCH 039/145] change read json function --- pipelines/rj_smtr/utils.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 41b29d41e..9c04ed701 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -55,14 +55,9 @@ def log_critical(message: str, secret_path: str = constants.CRITICAL_SECRET_PATH def log_error(error: str): - tb = sys.exc_info()[-1] - frame = traceback.extract_tb(tb, 1)[0] - file_name = frame[0] - function_name = frame[2] - line_no = frame[1] - + error = traceback.format_exc() log( - f"[CATCHED] Task failed in file {file_name} - ({function_name}) line: {line_no} with error: \n{error}", + f"[CATCHED] Task failed with error: \n{error}", level="error", ) @@ -733,11 +728,9 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF file_type = filepath.split(".")[-1] if file_type == "json": - with open(filepath, "r") as file: - data = pd.DataFrame.from_dict(json.load(file), orient="records") + data = pd.read_json(filepath) # data = json.loads(data) - elif file_type in ("txt", "csv"): if csv_args is None: csv_args = {} From 42c6ac008e6e8f569993c9b0a40958941c0750a0 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 17:45:44 -0300 Subject: [PATCH 040/145] print log traceback --- pipelines/rj_smtr/tasks.py | 23 +++++++++-------------- pipelines/rj_smtr/utils.py | 21 ++++++--------------- 2 files changed, 15 insertions(+), 29 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index be878db21..dd48d2c64 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -34,7 +34,6 @@ get_datetime_range, read_raw_data, save_treated_local_func, - log_error, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -431,11 +430,9 @@ def get_raw( # pylint: disable=R0912 "Unsupported raw file extension. 
Supported only: json, csv and txt" ) - except Exception as exp: - error = exp - - if error is not None: - log_error(error=error) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return {"data": data, "error": error} @@ -989,11 +986,9 @@ def transform_raw_to_nested_structure( # save treated local filepath = save_treated_local_func(data=data, error=error, filepath=filepath) - except Exception as exp: # pylint: disable=W0703 - error = exp - - if error is not None: - log_error(error=error) + except Exception: # pylint: disable=W0703 + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, filepath @@ -1126,9 +1121,9 @@ def get_raw_from_sources( ) else: raise NotImplementedError(f"{source_type} not supported") - except NotImplementedError as exp: - error = exp - log_error(error=error) + except NotImplementedError: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") log(f"Raw extraction ended returned values: {error}, {filepath}") return error, filepath diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 9c04ed701..553bd860a 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -54,14 +54,6 @@ def log_critical(message: str, secret_path: str = constants.CRITICAL_SECRET_PATH return send_discord_message(message=message, webhook_url=url) -def log_error(error: str): - error = traceback.format_exc() - log( - f"[CATCHED] Task failed with error: \n{error}", - level="error", - ) - - def create_or_append_table( dataset_id: str, table_id: str, path: str, partitions: str = None ): @@ -542,8 +534,8 @@ def get_raw_data_api( # pylint: disable=R0912 filepath = save_raw_local_func(data=data, filepath=filepath, filetype=filetype) except Exception as exp: - error = exp - log_error(error=error) + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, filepath @@ -589,8 +581,8 @@ def get_raw_data_gcs( ) except Exception as exp: - error = exp - log_error(error=error) + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, raw_filepath @@ -739,8 +731,7 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF error = "Unsupported raw file extension. 
Supported only: json, csv and txt" except Exception as exp: - error = exp - log_error(error=error) - # log(f"[CATCHED] Task failed with error: \n{error}", level="error") + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, data From 25276040630a11950a1c5f556ccff944e2202a39 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 28 Sep 2023 23:08:40 -0300 Subject: [PATCH 041/145] skip pre treatment if empty df --- pipelines/rj_smtr/tasks.py | 53 ++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index dd48d2c64..8a24934ce 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -937,11 +937,6 @@ def transform_raw_to_nested_structure( # ORGANIZAR: - # Check empty dataframe - # if len(status["data"]) == 0: - # log("Empty dataframe, skipping transformation...") - # return {"data": pd.DataFrame(), "error": error} - try: # leitura do dado raw error, data = read_raw_data(filepath=raw_filepath) @@ -956,32 +951,40 @@ def transform_raw_to_nested_structure( - data:\n{data.head()}""" ) - log(f"Raw data:\n{data_info_str(data)}", level="info") + # Check empty dataframe + if data.empty: + log("Empty dataframe, skipping transformation...") + else: + log(f"Raw data:\n{data_info_str(data)}", level="info") - log("Adding captured timestamp column...", level="info") - data["timestamp_captura"] = timestamp + log("Adding captured timestamp column...", level="info") + data["timestamp_captura"] = timestamp - log("Striping string columns...", level="info") - for col in data.columns[data.dtypes == "object"].to_list(): - data[col] = data[col].str.strip() + log("Striping string columns...", level="info") + for col in data.columns[data.dtypes == "object"].to_list(): + data[col] = data[col].str.strip() - log(f"Finished cleaning! Data:\n{data_info_str(data)}", level="info") + log(f"Finished cleaning! Data:\n{data_info_str(data)}", level="info") - log("Creating nested structure...", level="info") - pk_cols = primary_key + ["timestamp_captura"] - data = ( - data.groupby(pk_cols) - .apply( - lambda x: x[data.columns.difference(pk_cols)].to_json(orient="records") + log("Creating nested structure...", level="info") + pk_cols = primary_key + ["timestamp_captura"] + data = ( + data.groupby(pk_cols) + .apply( + lambda x: x[data.columns.difference(pk_cols)].to_json( + orient="records" + ) + ) + .str.strip("[]") + .reset_index(name="content")[ + primary_key + ["content", "timestamp_captura"] + ] ) - .str.strip("[]") - .reset_index(name="content")[primary_key + ["content", "timestamp_captura"]] - ) - log( - f"Finished nested structure! Data:\n{data_info_str(data)}", - level="info", - ) + log( + f"Finished nested structure! 
Data:\n{data_info_str(data)}", + level="info", + ) # save treated local filepath = save_treated_local_func(data=data, error=error, filepath=filepath) From 0f907b977b075949373c790fabf221409bae1ca6 Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 06:47:53 -0300 Subject: [PATCH 042/145] skip save staging if dataframe is empty / save raw --- pipelines/rj_smtr/flows.py | 3 ++- pipelines/rj_smtr/tasks.py | 27 ++++++++++++++++++--------- pipelines/rj_smtr/utils.py | 27 +++++++++++---------------- 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 27eaa76a4..f1b30335a 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -98,7 +98,7 @@ # Pré-tratamento # - error, staging_filepath = transform_raw_to_nested_structure( + error, staging_filepath, flag_empty_data = transform_raw_to_nested_structure( raw_filepath=raw_filepath, filepath=filepath, error=error, @@ -114,6 +114,7 @@ table_id=table_id, dataset_id=dataset_id, partitions=partitions, + flag_empty_data=flag_empty_data, ) default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 8a24934ce..ee6d5bfa1 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -34,6 +34,7 @@ get_datetime_range, read_raw_data, save_treated_local_func, + save_raw_local_func, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -667,6 +668,7 @@ def upload_staging_data_to_gcs( table_id: str, dataset_id: str, partitions: list, + flag_empty_data: bool, ): """ Upload staging data to GCS. @@ -682,7 +684,9 @@ def upload_staging_data_to_gcs( Returns: None """ - if not error: + if flag_empty_data: + log("Empty dataframe, skipping upload") + elif not error: try: # Creates and publish table if it does not exist, append to it otherwise create_or_append_table( @@ -908,7 +912,7 @@ def get_previous_date(days): ############### -@task(nout=2) +@task(nout=3) def transform_raw_to_nested_structure( raw_filepath: str, filepath: str, @@ -931,9 +935,9 @@ def transform_raw_to_nested_structure( """ # Check previous error - + flag_empty_data = False if error is not None: - return error, None + return error, None, flag_empty_data # ORGANIZAR: @@ -953,6 +957,7 @@ def transform_raw_to_nested_structure( # Check empty dataframe if data.empty: + flag_empty_data = True log("Empty dataframe, skipping transformation...") else: log(f"Raw data:\n{data_info_str(data)}", level="info") @@ -993,7 +998,7 @@ def transform_raw_to_nested_structure( error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return error, filepath + return error, filepath, flag_empty_data # @task(checkpoint=False) @@ -1098,6 +1103,7 @@ def get_raw_from_sources( """ error = None filepath = None + data = None source_values = source_type.split("-", 1) @@ -1109,21 +1115,24 @@ def get_raw_from_sources( try: if source_type == "api": - error, filepath = get_raw_data_api( + error, data, filetype = get_raw_data_api( url=source_path, secret_path=secret_path, api_params=api_params, - filepath=local_filepath, filetype=filetype, ) elif source_type == "gcs": - error, filepath = get_raw_data_gcs( + error, data, filetype = get_raw_data_gcs( gcs_path=source_path, filename_to_unzip=table_id, - local_filepath=local_filepath, ) else: raise NotImplementedError(f"{source_type} not supported") + + filepath = 
save_raw_local_func( + data=data, filepath=local_filepath, filetype=filetype + ) + except NotImplementedError: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 553bd860a..f3ff410c4 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -487,9 +487,8 @@ def get_raw_data_api( # pylint: disable=R0912 url: str, secret_path: str = None, api_params: dict = None, - filepath: str = None, filetype: str = None, -) -> tuple[str, str]: +) -> tuple[str, str, str]: """ Request data from URL API @@ -504,6 +503,7 @@ def get_raw_data_api( # pylint: disable=R0912 tuple[str, str]: Error and filepath """ error = None + data = None try: if secret_path is None: headers = secret_path @@ -531,20 +531,17 @@ def get_raw_data_api( # pylint: disable=R0912 else: data = response.text - filepath = save_raw_local_func(data=data, filepath=filepath, filetype=filetype) - - except Exception as exp: + except Exception: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return error, filepath + return error, data, filetype def get_raw_data_gcs( gcs_path: str, - local_filepath: str, filename_to_unzip: str = None, -) -> tuple[str, str]: +) -> tuple[str, str, str]: """ Get raw data from GCS @@ -557,7 +554,8 @@ def get_raw_data_gcs( tuple[str, str]: Error and filepath """ error = None - raw_filepath = None + data = None + filetype = None try: blob = get_storage_blob(gcs_path=gcs_path) @@ -574,17 +572,14 @@ def get_raw_data_gcs( else: filename = blob.name - raw_filepath = save_raw_local_func( - data=data.decode(encoding="utf-8"), - filepath=local_filepath, - filetype=filename.split(".")[-1], - ) + data = data.decode(encoding="utf-8") + filetype = filename.split(".")[-1] - except Exception as exp: + except Exception: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return error, raw_filepath + return error, data, filetype def save_treated_local_func( From ba1dad2654709c11c0fedd9d88d7a5eb6a969c60 Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 11:30:15 -0300 Subject: [PATCH 043/145] remove skip upload if empty dataframe --- pipelines/rj_smtr/flows.py | 3 +-- pipelines/rj_smtr/tasks.py | 8 +++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index f1b30335a..27eaa76a4 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -98,7 +98,7 @@ # Pré-tratamento # - error, staging_filepath, flag_empty_data = transform_raw_to_nested_structure( + error, staging_filepath = transform_raw_to_nested_structure( raw_filepath=raw_filepath, filepath=filepath, error=error, @@ -114,7 +114,6 @@ table_id=table_id, dataset_id=dataset_id, partitions=partitions, - flag_empty_data=flag_empty_data, ) default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index ee6d5bfa1..2fe9e27ed 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -912,7 +912,7 @@ def get_previous_date(days): ############### -@task(nout=3) +@task(nout=2) def transform_raw_to_nested_structure( raw_filepath: str, filepath: str, @@ -935,9 +935,8 @@ def transform_raw_to_nested_structure( """ # Check previous error - flag_empty_data = False if error is not None: - return error, None, flag_empty_data + return error, None # ORGANIZAR: @@ -957,7 
+956,6 @@ def transform_raw_to_nested_structure( # Check empty dataframe if data.empty: - flag_empty_data = True log("Empty dataframe, skipping transformation...") else: log(f"Raw data:\n{data_info_str(data)}", level="info") @@ -998,7 +996,7 @@ def transform_raw_to_nested_structure( error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - return error, filepath, flag_empty_data + return error, filepath # @task(checkpoint=False) From 4c3d1cffa53f376a8aa4ed3493db531e89bbc378 Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 11:31:37 -0300 Subject: [PATCH 044/145] update docstring and returned values --- pipelines/rj_smtr/tasks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 2fe9e27ed..14da15069 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -919,7 +919,7 @@ def transform_raw_to_nested_structure( error: str, timestamp: datetime, primary_key: list = None, -): +) -> tuple(str, str): """ Task to transform raw data to nested structure @@ -931,6 +931,7 @@ def transform_raw_to_nested_structure( primary_key (list, optional): Primary key to be used on nested structure Returns: + str: Error traceback str: Path to the saved treated .csv file """ From 39e8606ffba05a096cc05f672085c05d5b4bd091 Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 11:33:28 -0300 Subject: [PATCH 045/145] reorganize task order --- pipelines/rj_smtr/tasks.py | 226 ++++++++++++++++++------------------- 1 file changed, 113 insertions(+), 113 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 14da15069..9c372e213 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -438,6 +438,119 @@ def get_raw( # pylint: disable=R0912 return {"data": data, "error": error} +@task(checkpoint=False, nout=2) +def create_request_params( + extract_params: dict, + table_id: str, + dataset_id: str, + timestamp: datetime, +) -> tuple: + """ + Task to create request params + + Args: + extract_params (dict): extract parameters + table_id (str): table_id on BigQuery + dataset_id (str): dataset_id on BigQuery + timestamp (datetime): timestamp for flow run + + Returns: + request_params: host, database and query to request data + request_url: url to request data + """ + request_params = None + + if dataset_id == constants.BILHETAGEM_DATASET_ID.value: + database = constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["databases"][ + extract_params["database"] + ] + request_url = ( + constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["vpn_url"] + + database["engine"] + ) + + datetime_range = get_datetime_range( + timestamp=timestamp, interval=timedelta(**extract_params["run_interval"]) + ) + + request_params = { + "host": database["host"], # TODO: exibir no log em ambiente fechado + "database": extract_params["database"], + "query": extract_params["query"].format(**datetime_range), + } + + elif dataset_id == constants.GTFS_DATASET_ID.value: + if table_id == constants.GTFS_QUADRO_CAPTURE_PARAMS.value["table_id"]: + request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/{table_id}.csv" + else: + request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/gtfs.zip" + + return request_params, request_url + + +@task(checkpoint=False, nout=2) +def get_raw_from_sources( + source_type: str, + local_filepath: str, + source_path: str = None, + table_id: str = None, + secret_path: str = None, + api_params: dict = None, +): + """ + Task to get raw data 
from sources + + Args: + source_type (str): source type + local_filepath (str): local filepath + source_path (str, optional): source path. Defaults to None. + table_id (str, optional): table_id on BigQuery. Defaults to None. + secret_path (str, optional): secret path. Defaults to None. + api_params (dict, optional): api parameters. Defaults to None. + + Returns: + error: error + """ + error = None + filepath = None + data = None + + source_values = source_type.split("-", 1) + + source_type, filetype = ( + source_values if len(source_values) == 2 else (source_values[0], None) + ) + + log(f"Getting raw data from source type: {source_type}") + + try: + if source_type == "api": + error, data, filetype = get_raw_data_api( + url=source_path, + secret_path=secret_path, + api_params=api_params, + filetype=filetype, + ) + elif source_type == "gcs": + error, data, filetype = get_raw_data_gcs( + gcs_path=source_path, + filename_to_unzip=table_id, + ) + else: + raise NotImplementedError(f"{source_type} not supported") + + filepath = save_raw_local_func( + data=data, filepath=local_filepath, filetype=filetype + ) + + except NotImplementedError: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + log(f"Raw extraction ended returned values: {error}, {filepath}") + return error, filepath + + ############### # # Load data @@ -1025,116 +1138,3 @@ def transform_raw_to_nested_structure( # end = timestamp.astimezone(tz=timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") # return {"start": start, "end": end} - - -@task(checkpoint=False, nout=2) -def create_request_params( - extract_params: dict, - table_id: str, - dataset_id: str, - timestamp: datetime, -) -> tuple: - """ - Task to create request params - - Args: - extract_params (dict): extract parameters - table_id (str): table_id on BigQuery - dataset_id (str): dataset_id on BigQuery - timestamp (datetime): timestamp for flow run - - Returns: - request_params: host, database and query to request data - request_url: url to request data - """ - request_params = None - - if dataset_id == constants.BILHETAGEM_DATASET_ID.value: - database = constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["databases"][ - extract_params["database"] - ] - request_url = ( - constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["vpn_url"] - + database["engine"] - ) - - datetime_range = get_datetime_range( - timestamp=timestamp, interval=timedelta(**extract_params["run_interval"]) - ) - - request_params = { - "host": database["host"], # TODO: exibir no log em ambiente fechado - "database": extract_params["database"], - "query": extract_params["query"].format(**datetime_range), - } - - elif dataset_id == constants.GTFS_DATASET_ID.value: - if table_id == constants.GTFS_QUADRO_CAPTURE_PARAMS.value["table_id"]: - request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/{table_id}.csv" - else: - request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/gtfs.zip" - - return request_params, request_url - - -@task(checkpoint=False, nout=2) -def get_raw_from_sources( - source_type: str, - local_filepath: str, - source_path: str = None, - table_id: str = None, - secret_path: str = None, - api_params: dict = None, -): - """ - Task to get raw data from sources - - Args: - source_type (str): source type - local_filepath (str): local filepath - source_path (str, optional): source path. Defaults to None. - table_id (str, optional): table_id on BigQuery. Defaults to None. - secret_path (str, optional): secret path. Defaults to None. 
- api_params (dict, optional): api parameters. Defaults to None. - - Returns: - error: error - """ - error = None - filepath = None - data = None - - source_values = source_type.split("-", 1) - - source_type, filetype = ( - source_values if len(source_values) == 2 else (source_values[0], None) - ) - - log(f"Getting raw data from source type: {source_type}") - - try: - if source_type == "api": - error, data, filetype = get_raw_data_api( - url=source_path, - secret_path=secret_path, - api_params=api_params, - filetype=filetype, - ) - elif source_type == "gcs": - error, data, filetype = get_raw_data_gcs( - gcs_path=source_path, - filename_to_unzip=table_id, - ) - else: - raise NotImplementedError(f"{source_type} not supported") - - filepath = save_raw_local_func( - data=data, filepath=local_filepath, filetype=filetype - ) - - except NotImplementedError: - error = traceback.format_exc() - log(f"[CATCHED] Task failed with error: \n{error}", level="error") - - log(f"Raw extraction ended returned values: {error}, {filepath}") - return error, filepath From 465ee525648dae41c1e938986f4acc08ec5bac18 Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 11:47:24 -0300 Subject: [PATCH 046/145] fix tuple --- pipelines/rj_smtr/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 9c372e213..f5fb79ede 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1032,7 +1032,7 @@ def transform_raw_to_nested_structure( error: str, timestamp: datetime, primary_key: list = None, -) -> tuple(str, str): +) -> tuple[str, str]: """ Task to transform raw data to nested structure From 67a1056a3e363b01fa8573c247ab41aa453ffd2c Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:01:42 -0300 Subject: [PATCH 047/145] change zip logic --- pipelines/rj_smtr/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index f3ff410c4..338f2e07b 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -560,9 +560,11 @@ def get_raw_data_gcs( try: blob = get_storage_blob(gcs_path=gcs_path) + blob_type = blob.name.split(".")[-1] + data = blob.download_as_bytes() - if filename_to_unzip: + if blob_type == "zip": with zipfile.ZipFile(io.BytesIO(data), "r") as zipped_file: filenames = zipped_file.namelist() filename = list( From 3f5f34cabc75a05f424d0e3ed8c1443915f9656c Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:03:15 -0300 Subject: [PATCH 048/145] remove skip --- pipelines/rj_smtr/tasks.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index f5fb79ede..4d4088866 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -781,7 +781,6 @@ def upload_staging_data_to_gcs( table_id: str, dataset_id: str, partitions: list, - flag_empty_data: bool, ): """ Upload staging data to GCS. 
@@ -797,9 +796,7 @@ def upload_staging_data_to_gcs( Returns: None """ - if flag_empty_data: - log("Empty dataframe, skipping upload") - elif not error: + if not error: try: # Creates and publish table if it does not exist, append to it otherwise create_or_append_table( From 7860a4bdd58e1f35bd0260ce0088db7049a67a3f Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:09:20 -0300 Subject: [PATCH 049/145] create gtfs zip constant --- pipelines/rj_smtr/constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 4f2b1c95a..009c241e1 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -309,3 +309,4 @@ class constants(Enum): # pylint: disable=c0103 ] GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" + GTFS_ZIP_FILENAME = "gtfs.zip" From 2d7c9cb8b12f4e2ca71fcffd8b2896952acbc11a Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:13:06 -0300 Subject: [PATCH 050/145] add gtfs zip file name --- pipelines/rj_smtr/constants.py | 2 +- pipelines/rj_smtr/tasks.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 009c241e1..4975a246f 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -309,4 +309,4 @@ class constants(Enum): # pylint: disable=c0103 ] GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" - GTFS_ZIP_FILENAME = "gtfs.zip" + GTFS_ZIP_NAME = "gtfs.zip" diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 4d4088866..8863f6405 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -483,7 +483,9 @@ def create_request_params( if table_id == constants.GTFS_QUADRO_CAPTURE_PARAMS.value["table_id"]: request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/{table_id}.csv" else: - request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/gtfs.zip" + request_url = ( + f"{constants.GTFS_BASE_GCS_PATH.value}/{constants.GTFS_ZIP_NAME.value}" + ) return request_params, request_url From bfa62739a46ce2d8f287e4989755aa4e2b6505e9 Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:18:11 -0300 Subject: [PATCH 051/145] add csv to save raw / change filetype logic --- pipelines/rj_smtr/utils.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 338f2e07b..f3362bd91 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -475,7 +475,7 @@ def save_raw_local_func( # if filetype == "csv": # pass - if filetype == "txt": + if filetype in ("txt", "csv"): with open(_filepath, "w", encoding="utf-8") as file: file.write(data) @@ -560,22 +560,21 @@ def get_raw_data_gcs( try: blob = get_storage_blob(gcs_path=gcs_path) - blob_type = blob.name.split(".")[-1] + filename = blob.name + filetype = filename.split(".")[-1] data = blob.download_as_bytes() - if blob_type == "zip": + if filetype == "zip": with zipfile.ZipFile(io.BytesIO(data), "r") as zipped_file: filenames = zipped_file.namelist() filename = list( filter(lambda x: x.split(".")[0] == filename_to_unzip, filenames) )[0] + filetype = filename.split(".")[-1] data = zipped_file.read(filename) - else: - filename = blob.name data = data.decode(encoding="utf-8") - filetype = filename.split(".")[-1] 
except Exception: error = traceback.format_exc() From 524cd07363cc5866db94a1c67d4b32f04ceac87f Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:18:32 -0300 Subject: [PATCH 052/145] remove comments --- pipelines/rj_smtr/flows.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 27eaa76a4..3fc18b7b0 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -22,17 +22,11 @@ create_local_partition_path, get_current_timestamp, parse_timestamp_to_string, - # save_raw_local, - # save_treated_local, - # upload_logs_to_bq, - # bq_upload, upload_raw_data_to_gcs, upload_staging_data_to_gcs, transform_raw_to_nested_structure, get_raw_from_sources, - # transform_data_to_json, create_request_params, - # get_datetime_range, ) @@ -79,7 +73,7 @@ ) error, raw_filepath = get_raw_from_sources( - source_type=source_type, # parametro de extracao, onde ficar? + source_type=source_type, local_filepath=filepath, source_path=request_path, table_id=table_id, From 3477a2c53306bacff80f08fb2d94a7863381e61e Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:21:00 -0300 Subject: [PATCH 053/145] fix csv_args default value --- pipelines/rj_smtr/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index f3362bd91..b60d8d8ac 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -699,7 +699,7 @@ def get_datetime_range( return {"start": start, "end": end} -def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataFrame]: +def read_raw_data(filepath: str, csv_args: dict = None) -> tuple[str, pd.DataFrame]: """ Read raw data from file @@ -726,7 +726,7 @@ def read_raw_data(filepath: str, csv_args: dict = dict()) -> tuple[str, pd.DataF else: error = "Unsupported raw file extension. Supported only: json, csv and txt" - except Exception as exp: + except Exception: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") From 16e61c879172bce4679d696884ba486294963e5c Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:24:43 -0300 Subject: [PATCH 054/145] change docstring get raw api --- pipelines/rj_smtr/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index b60d8d8ac..33378f049 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -496,7 +496,6 @@ def get_raw_data_api( # pylint: disable=R0912 url (str): URL to request data secret_path (str, optional): Secret path to get headers. Defaults to None. api_params (dict, optional): Parameters to pass to API. Defaults to None. - filepath (str, optional): Path to save raw file. Defaults to None. filetype (str, optional): Filetype to save raw file. Defaults to None. Returns: From 4bdaa4fe4ed5b576aa3a04af44fed9a792b274ea Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:27:33 -0300 Subject: [PATCH 055/145] change raw data gcs docstring --- pipelines/rj_smtr/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 33378f049..aeafa8ae1 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -546,7 +546,6 @@ def get_raw_data_gcs( Args: gcs_path (str): GCS path to get data - local_filepath (str): Local filepath to save raw data filename_to_unzip (str, optional): Filename to unzip. Defaults to None. 
Returns: From e3b7c140db20a62c4d8be850abab2087406c890f Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:29:21 -0300 Subject: [PATCH 056/145] remove commented task --- pipelines/rj_smtr/tasks.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 8863f6405..65ab9505e 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1110,30 +1110,3 @@ def transform_raw_to_nested_structure( log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, filepath - - -# @task(checkpoint=False) -# def get_datetime_range( -# timestamp: datetime, -# interval: int, -# ) -> dict: -# """ -# Task to get datetime range in UTC - -# Args: -# timestamp (datetime): timestamp to get datetime range -# interval (int): interval in seconds - -# Returns: -# dict: datetime range -# """ - -# start = ( -# (timestamp - timedelta(seconds=interval)) -# .astimezone(tz=timezone("UTC")) -# .strftime("%Y-%m-%d %H:%M:%S") -# ) - -# end = timestamp.astimezone(tz=timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") - -# return {"start": start, "end": end} From 0935cbd46f0fd720f3675b991f4c6fa64e5b58ef Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 12:30:08 -0300 Subject: [PATCH 057/145] change quadro primary key to list --- pipelines/rj_smtr/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 4975a246f..b1072b607 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -307,6 +307,6 @@ class constants(Enum): # pylint: disable=c0103 {"table_id": "fare_attributes", "primary_key": ["fare_id"]}, {"table_id": "fare_rules", "primary_key": ["fare_id"]}, ] - GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": "servico"} + GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": ["servico"]} GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" GTFS_ZIP_NAME = "gtfs.zip" From e5bad98594e931bf06c4ed6aec3b8887490729c8 Mon Sep 17 00:00:00 2001 From: Carolina Gomes Date: Fri, 29 Sep 2023 13:59:40 -0300 Subject: [PATCH 058/145] update GTFS constants --- pipelines/rj_smtr/constants.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index b1072b607..d9dd7055c 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -296,16 +296,16 @@ class constants(Enum): # pylint: disable=c0103 GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"} GTFS_CAPTURE_PARAMS = [ {"table_id": "agency", "primary_key": ["agency_id"]}, - {"table_id": "calendar_dates", "primary_key": ["service_id"]}, + {"table_id": "calendar_dates", "primary_key": ["service_id", "date"]}, {"table_id": "calendar", "primary_key": ["service_id"]}, {"table_id": "feed_info", "primary_key": ["feed_publisher_name"]}, - {"table_id": "frequencies", "primary_key": ["trip_id"]}, + {"table_id": "frequencies", "primary_key": ["trip_id", "start_time"]}, {"table_id": "routes", "primary_key": ["route_id"]}, - {"table_id": "shapes", "primary_key": ["shape_id"]}, + {"table_id": "shapes", "primary_key": ["shape_id", "shape_pt_sequence"]}, {"table_id": "stops", "primary_key": ["stop_id"]}, {"table_id": "trips", "primary_key": ["trip_id"]}, {"table_id": "fare_attributes", "primary_key": ["fare_id"]}, - {"table_id": "fare_rules", "primary_key": ["fare_id"]}, + 
{"table_id": "fare_rules", "primary_key": []}, ] GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": ["servico"]} GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" From d4230bb0c860e542fc066bf768ff93056c0e1dc5 Mon Sep 17 00:00:00 2001 From: Rafael Date: Fri, 29 Sep 2023 15:59:04 -0300 Subject: [PATCH 059/145] change upload folder structure --- pipelines/rj_smtr/constants.py | 3 +-- pipelines/rj_smtr/flows.py | 3 ++- pipelines/rj_smtr/tasks.py | 20 ++++++++--------- pipelines/rj_smtr/utils.py | 41 +++++++++++++++++++++++++++------- pipelines/utils/utils.py | 20 ----------------- 5 files changed, 45 insertions(+), 42 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index b1072b607..eece75525 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -308,5 +308,4 @@ class constants(Enum): # pylint: disable=c0103 {"table_id": "fare_rules", "primary_key": ["fare_id"]}, ] GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": ["servico"]} - GTFS_BASE_GCS_PATH = "development/br_rj_riodejaneiro_gtfs/upload" - GTFS_ZIP_NAME = "gtfs.zip" + GTFS_ZIP_NAME = "gtfs" diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 3fc18b7b0..c53a3f7d8 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -76,9 +76,10 @@ source_type=source_type, local_filepath=filepath, source_path=request_path, + dataset_id=dataset_id, table_id=table_id, secret_path=secret_path, - api_params=request_params, + request_params=request_params, ) RAW_UPLOADED = upload_raw_data_to_gcs( diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 65ab9505e..65e3e95b1 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -459,6 +459,7 @@ def create_request_params( request_url: url to request data """ request_params = None + request_url = None if dataset_id == constants.BILHETAGEM_DATASET_ID.value: database = constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["databases"][ @@ -480,12 +481,8 @@ def create_request_params( } elif dataset_id == constants.GTFS_DATASET_ID.value: - if table_id == constants.GTFS_QUADRO_CAPTURE_PARAMS.value["table_id"]: - request_url = f"{constants.GTFS_BASE_GCS_PATH.value}/{table_id}.csv" - else: - request_url = ( - f"{constants.GTFS_BASE_GCS_PATH.value}/{constants.GTFS_ZIP_NAME.value}" - ) + if table_id != constants.GTFS_QUADRO_CAPTURE_PARAMS.value["table_id"]: + request_params = constants.GTFS_ZIP_NAME.value return request_params, request_url @@ -495,9 +492,10 @@ def get_raw_from_sources( source_type: str, local_filepath: str, source_path: str = None, + dataset_id: str = None, table_id: str = None, secret_path: str = None, - api_params: dict = None, + request_params: dict = None, ): """ Task to get raw data from sources @@ -506,9 +504,10 @@ def get_raw_from_sources( source_type (str): source type local_filepath (str): local filepath source_path (str, optional): source path. Defaults to None. + dataset_id (str, optional): dataset_id on BigQuery. Defaults to None. table_id (str, optional): table_id on BigQuery. Defaults to None. secret_path (str, optional): secret path. Defaults to None. - api_params (dict, optional): api parameters. Defaults to None. + request_params (dict, optional): request parameters. Defaults to None. 
Returns: error: error @@ -530,13 +529,12 @@ def get_raw_from_sources( error, data, filetype = get_raw_data_api( url=source_path, secret_path=secret_path, - api_params=api_params, + api_params=request_params, filetype=filetype, ) elif source_type == "gcs": error, data, filetype = get_raw_data_gcs( - gcs_path=source_path, - filename_to_unzip=table_id, + dataset_id=dataset_id, table_id=table_id, zip_filename=request_params ) else: raise NotImplementedError(f"{source_type} not supported") diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index aeafa8ae1..9265e1a59 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -10,7 +10,6 @@ from datetime import timedelta, datetime from typing import List, Union import traceback -import sys import io import json import zipfile @@ -33,7 +32,6 @@ get_vault_secret, send_discord_message, get_redis_client, - get_storage_blob, ) @@ -537,16 +535,42 @@ def get_raw_data_api( # pylint: disable=R0912 return error, data, filetype +def get_upload_storage_blob( + dataset_id: str, + filename: str, +): + """ + Get a blob from upload zone in storage + + Args: + dataset_id (str): The dataset id on BigQuery. + filename (str): The filename in GCS. + + + Returns: + Blob: blob object + """ + bucket = bd.Storage(dataset_id="", table_id="") + blob_list = list( + bucket.client["storage_staging"] + .bucket(bucket.bucket_name) + .list_blobs(prefix=f"upload/{dataset_id}/{filename}.") + ) + return blob_list[0] + + def get_raw_data_gcs( - gcs_path: str, - filename_to_unzip: str = None, + dataset_id: str, + table_id: str, + zip_filename: str = None, ) -> tuple[str, str, str]: """ Get raw data from GCS Args: - gcs_path (str): GCS path to get data - filename_to_unzip (str, optional): Filename to unzip. Defaults to None. + dataset_id (str): The dataset id on BigQuery. + table_id (str): The table id on BigQuery. + zip_filename (str, optional): The zip file name. Defaults to None. Returns: tuple[str, str]: Error and filepath @@ -556,7 +580,8 @@ def get_raw_data_gcs( filetype = None try: - blob = get_storage_blob(gcs_path=gcs_path) + blob_search_name = zip_filename or table_id + blob = get_upload_storage_blob(dataset_id=dataset_id, filename=blob_search_name) filename = blob.name filetype = filename.split(".")[-1] @@ -567,7 +592,7 @@ def get_raw_data_gcs( with zipfile.ZipFile(io.BytesIO(data), "r") as zipped_file: filenames = zipped_file.namelist() filename = list( - filter(lambda x: x.split(".")[0] == filename_to_unzip, filenames) + filter(lambda x: x.split(".")[0] == table_id, filenames) )[0] filetype = filename.split(".")[-1] data = zipped_file.read(filename) diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index e37a88d8b..adf89bc94 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -732,26 +732,6 @@ def get_storage_blobs(dataset_id: str, table_id: str, mode: str = "staging") -> ) -def get_storage_blob( - gcs_path: str, -): - """ - Get a blob from a path. 
-
-    Args:
-        gcs_path (str): path to blob
-
-    Returns:
-        Blob: blob object
-    """
-    bucket = bd.Storage(dataset_id="", table_id="")
-    return (
-        bucket.client["storage_staging"]
-        .bucket(bucket.bucket_name)
-        .get_blob(blob_name=gcs_path)
-    )
-
-
 def list_blobs_with_prefix(
     bucket_name: str, prefix: str, mode: str = "prod"
 ) -> List[Blob]:

From 7c43d1d0ec5dfc8abdb6f2c17e7c56606b894eb7 Mon Sep 17 00:00:00 2001
From: fernandascovino
Date: Fri, 29 Sep 2023 17:24:31 -0300
Subject: [PATCH 060/145] undo silencing of notification failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pipelines/utils/custom.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pipelines/utils/custom.py b/pipelines/utils/custom.py
index d91739817..13ae82dd5 100644
--- a/pipelines/utils/custom.py
+++ b/pipelines/utils/custom.py
@@ -68,11 +68,11 @@ def __init__( # pylint: disable=too-many-arguments, too-many-locals
             edges=edges,
             reference_tasks=reference_tasks,
             state_handlers=state_handlers,
-            # on_failure=partial(
-            #     notify_discord_on_failure,
-            #     secret_path=constants.EMD_DISCORD_WEBHOOK_SECRET_PATH.value,
-            #     code_owners=code_owners,
-            # ),
+            on_failure=partial(
+                notify_discord_on_failure,
+                secret_path=constants.EMD_DISCORD_WEBHOOK_SECRET_PATH.value,
+                code_owners=code_owners,
+            ),
             validate=validate,
             result=result,
             terminal_state_handler=terminal_state_handler,

From 089e9334300798660f6a2bde67be6e06112e4c6d Mon Sep 17 00:00:00 2001
From: Rafael
Date: Fri, 29 Sep 2023 17:40:58 -0300
Subject: [PATCH 061/145] add partition date only to transacao

---
 pipelines/rj_smtr/constants.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py
index 93303e5b7..c9f18f2fd 100644
--- a/pipelines/rj_smtr/constants.py
+++ b/pipelines/rj_smtr/constants.py
@@ -181,6 +181,7 @@ class constants(Enum): # pylint: disable=c0103
                 data_processamento
             """,
             "primary_key": ["id"],  # id column to nest data on
+            "partition_date_only": False,
         },
     ]
     BILHETAGEM_TABLES_PARAMS = [

From 685aae52143ec88eecefbe6fab61fe01953b0fe2 Mon Sep 17 00:00:00 2001
From: fernandascovino
Date: Fri, 29 Sep 2023 18:33:40 -0300
Subject: [PATCH 062/145] remove test parameters (gtfs)

---
 pipelines/rj_smtr/constants.py | 19 -------------------
 pipelines/rj_smtr/tasks.py | 4 ----
 2 files changed, 23 deletions(-)

diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py
index 943304191..d7cf3e771 100644
--- a/pipelines/rj_smtr/constants.py
+++ b/pipelines/rj_smtr/constants.py
@@ -290,22 +290,3 @@ class constants(Enum): # pylint: disable=c0103
         },
     ]
     BILHETAGEM_SECRET_PATH = "smtr_jae_access_data"
-
-    # GTFS
-    GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs"
-    GTFS_GENERAL_CAPTURE_PARAMS = {"partition_date_only": True, "source_type": "gcs"}
-    GTFS_CAPTURE_PARAMS = [
-        {"table_id": "agency", "primary_key": ["agency_id"]},
-        {"table_id": "calendar_dates", "primary_key": ["service_id", "date"]},
-        {"table_id": "calendar", "primary_key": ["service_id"]},
-        {"table_id": "feed_info", "primary_key": ["feed_publisher_name"]},
-        {"table_id": "frequencies", "primary_key": ["trip_id", "start_time"]},
-        {"table_id": "routes", "primary_key": ["route_id"]},
-        {"table_id": "shapes", "primary_key": ["shape_id", "shape_pt_sequence"]},
-        {"table_id": "stops", "primary_key": ["stop_id"]},
-        {"table_id": "trips", "primary_key": ["trip_id"]},
-        {"table_id": "fare_attributes", "primary_key":
["fare_id"]}, - {"table_id": "fare_rules", "primary_key": []}, - ] - GTFS_QUADRO_CAPTURE_PARAMS = {"table_id": "quadro", "primary_key": ["servico"]} - GTFS_ZIP_NAME = "gtfs" diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 916721a74..c5dae7741 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -480,10 +480,6 @@ def create_request_params( "query": extract_params["query"].format(**datetime_range), } - elif dataset_id == constants.GTFS_DATASET_ID.value: - if table_id != constants.GTFS_QUADRO_CAPTURE_PARAMS.value["table_id"]: - request_params = constants.GTFS_ZIP_NAME.value - return request_params, request_url From cd5048e56e26eae6cbf571d262c0278be8c6ef7e Mon Sep 17 00:00:00 2001 From: Rodrigo Cunha <66736583+eng-rodrigocunha@users.noreply.github.com> Date: Fri, 29 Sep 2023 18:36:15 -0300 Subject: [PATCH 063/145] Update pipelines/rj_smtr/constants.py Co-authored-by: Fernanda Scovino --- pipelines/rj_smtr/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index d7cf3e771..52e30d9f8 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -186,7 +186,7 @@ class constants(Enum): # pylint: disable=c0103 "transacao_run_interval": {"minutes": 1}, "principal_run_interval": {"days": 1}, "transacao_runs_interval_minutes": 0, - "principal_runs_interval_minutes": 15, + "principal_runs_interval_minutes": 5, } BILHETAGEM_TRANSACAO_CAPTURE_PARAMS = { From d17d16127ed3012b0377a7f7b4311af72a3dd911 Mon Sep 17 00:00:00 2001 From: fernandascovino Date: Fri, 29 Sep 2023 18:53:23 -0300 Subject: [PATCH 064/145] corrige encadeamento de erros no flow --- pipelines/rj_smtr/flows.py | 3 +- pipelines/rj_smtr/tasks.py | 110 +++++++++++++++++-------------------- 2 files changed, 51 insertions(+), 62 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index c53a3f7d8..0cac7769f 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -82,7 +82,7 @@ request_params=request_params, ) - RAW_UPLOADED = upload_raw_data_to_gcs( + error = upload_raw_data_to_gcs( error=error, raw_filepath=raw_filepath, timestamp=timestamp, @@ -99,7 +99,6 @@ error=error, timestamp=timestamp, primary_key=primary_key, - upstream_tasks=[RAW_UPLOADED], ) STAGING_UPLOADED = upload_staging_data_to_gcs( diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index c5dae7741..86948899f 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -743,7 +743,7 @@ def upload_raw_data_to_gcs( Returns: None """ - if not error: + if error is None: try: st_obj = Storage(table_id=table_id, dataset_id=dataset_id) log( @@ -759,14 +759,8 @@ def upload_raw_data_to_gcs( except Exception: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - - upload_run_logs_to_bq( - dataset_id=dataset_id, - parent_table_id=table_id, - error=error, - timestamp=timestamp, - mode="raw", - ) + + return error @task @@ -792,7 +786,7 @@ def upload_staging_data_to_gcs( Returns: None """ - if not error: + if error is None: try: # Creates and publish table if it does not exist, append to it otherwise create_or_append_table( @@ -813,6 +807,8 @@ def upload_staging_data_to_gcs( mode="staging", ) + return error + ############### # @@ -1040,67 +1036,61 @@ def transform_raw_to_nested_structure( str: Error traceback str: Path to the saved treated .csv file """ + if error is None: + try: + # leitura do dado raw + 
error, data = read_raw_data(filepath=raw_filepath) - # Check previous error - if error is not None: - return error, None - - # ORGANIZAR: - - try: - # leitura do dado raw - error, data = read_raw_data(filepath=raw_filepath) - - if primary_key is None: - primary_key = [] + if primary_key is None: + primary_key = [] - log( - f""" - Received inputs: - - timestamp:\n{timestamp} - - data:\n{data.head()}""" - ) + log( + f""" + Received inputs: + - timestamp:\n{timestamp} + - data:\n{data.head()}""" + ) - # Check empty dataframe - if data.empty: - log("Empty dataframe, skipping transformation...") - else: - log(f"Raw data:\n{data_info_str(data)}", level="info") + # Check empty dataframe + if data.empty: + log("Empty dataframe, skipping transformation...") + else: + log(f"Raw data:\n{data_info_str(data)}", level="info") - log("Adding captured timestamp column...", level="info") - data["timestamp_captura"] = timestamp + log("Adding captured timestamp column...", level="info") + data["timestamp_captura"] = timestamp - log("Striping string columns...", level="info") - for col in data.columns[data.dtypes == "object"].to_list(): - data[col] = data[col].str.strip() + log("Striping string columns...", level="info") + for col in data.columns[data.dtypes == "object"].to_list(): + data[col] = data[col].str.strip() - log(f"Finished cleaning! Data:\n{data_info_str(data)}", level="info") + log(f"Finished cleaning! Data:\n{data_info_str(data)}", level="info") - log("Creating nested structure...", level="info") - pk_cols = primary_key + ["timestamp_captura"] - data = ( - data.groupby(pk_cols) - .apply( - lambda x: x[data.columns.difference(pk_cols)].to_json( - orient="records" + log("Creating nested structure...", level="info") + pk_cols = primary_key + ["timestamp_captura"] + data = ( + data.groupby(pk_cols) + .apply( + lambda x: x[data.columns.difference(pk_cols)].to_json( + orient="records" + ) ) + .str.strip("[]") + .reset_index(name="content")[ + primary_key + ["content", "timestamp_captura"] + ] ) - .str.strip("[]") - .reset_index(name="content")[ - primary_key + ["content", "timestamp_captura"] - ] - ) - log( - f"Finished nested structure! Data:\n{data_info_str(data)}", - level="info", - ) + log( + f"Finished nested structure! 
Data:\n{data_info_str(data)}", + level="info", + ) - # save treated local - filepath = save_treated_local_func(data=data, error=error, filepath=filepath) + # save treated local + filepath = save_treated_local_func(data=data, error=error, filepath=filepath) - except Exception: # pylint: disable=W0703 - error = traceback.format_exc() - log(f"[CATCHED] Task failed with error: \n{error}", level="error") + except Exception: # pylint: disable=W0703 + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") return error, filepath From 02b948a66a7a11515dabb7995dc76ecd7d3f2c3c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 29 Sep 2023 21:53:43 +0000 Subject: [PATCH 065/145] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pipelines/rj_smtr/tasks.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 86948899f..1a6c1e876 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -759,7 +759,7 @@ def upload_raw_data_to_gcs( except Exception: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - + return error @@ -1087,7 +1087,9 @@ def transform_raw_to_nested_structure( ) # save treated local - filepath = save_treated_local_func(data=data, error=error, filepath=filepath) + filepath = save_treated_local_func( + data=data, error=error, filepath=filepath + ) except Exception: # pylint: disable=W0703 error = traceback.format_exc() From fac7821be88db6d07982591ef3793f1810430d24 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 2 Oct 2023 11:24:44 -0300 Subject: [PATCH 066/145] remove header treatment --- pipelines/rj_smtr/utils.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 9265e1a59..e2fffe8dc 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -507,13 +507,6 @@ def get_raw_data_api( # pylint: disable=R0912 else: headers = get_vault_secret(secret_path)["data"] - # remove from headers, if present - # TODO: remove this before merge to master - remove_headers = ["host", "databases"] - for remove_header in remove_headers: - if remove_header in list(headers.keys()): - del headers[remove_header] - response = requests.get( url, headers=headers, From e291e514f2b08063ba92e4112d116e4c86821392 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 2 Oct 2023 11:25:14 -0300 Subject: [PATCH 067/145] mudar agent dev para prd --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 4 ++-- pipelines/rj_smtr/flows.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 793d37c0d..d7f44e3b9 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -30,7 +30,7 @@ bilhetagem_transacao_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_transacao_captura.schedule = bilhetagem_transacao_schedule @@ -41,6 +41,6 @@ bilhetagem_principal_captura.storage = 
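The refactor in patch 064 above keeps the whole pre-treatment under a single error guard and builds the nested structure with a groupby/apply over the primary key plus timestamp_captura. A minimal, self-contained sketch of that reshaping on toy data (the column names and values below are illustrative only, not taken from the pipeline):

    # Sketch of the nesting step in transform_raw_to_nested_structure; the toy
    # columns ("valor", "status") are hypothetical, only the pattern mirrors the task.
    import pandas as pd

    data = pd.DataFrame(
        {
            "id": [1, 1, 2],
            "valor": [10.5, 7.0, 3.2],
            "status": ["ok", "ok", "erro"],
        }
    )
    data["timestamp_captura"] = "2023-09-29 18:00:00"

    primary_key = ["id"]
    pk_cols = primary_key + ["timestamp_captura"]

    nested = (
        data.groupby(pk_cols)
        .apply(lambda x: x[data.columns.difference(pk_cols)].to_json(orient="records"))
        .str.strip("[]")
        .reset_index(name="content")[primary_key + ["content", "timestamp_captura"]]
    )
    # `nested` now has one row per (id, timestamp_captura); every non-key column is
    # packed as a JSON object inside the "content" column.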
GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_principal_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_principal_captura.schedule = bilhetagem_principal_schedule diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 0cac7769f..c7638676b 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -113,5 +113,5 @@ default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_capture_flow.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) From e57d4576ddb729f3871dd190d61be7c37ae9b9a3 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 2 Oct 2023 11:33:33 -0300 Subject: [PATCH 068/145] mudar agent de dev para prd --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index e897286b0..2f7804811 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -20,7 +20,7 @@ **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["principal_run_interval"] ), labels=[ - emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, + emd_constants.RJ_SMTR_AGENT_LABEL.value, ], table_parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, @@ -38,7 +38,7 @@ **constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["transacao_run_interval"] ), labels=[ - emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, + emd_constants.RJ_SMTR_AGENT_LABEL.value, ], table_parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, From 3767a5622ae6beb9a7758070e71b6830a967a84f Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 2 Oct 2023 11:33:59 -0300 Subject: [PATCH 069/145] ajustar retorno das funcoes --- pipelines/rj_smtr/flows.py | 1 - pipelines/rj_smtr/tasks.py | 30 ++++++++++++++---------------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index c7638676b..4860c6d07 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -85,7 +85,6 @@ error = upload_raw_data_to_gcs( error=error, raw_filepath=raw_filepath, - timestamp=timestamp, table_id=table_id, dataset_id=dataset_id, partitions=partitions, diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 1a6c1e876..5edee3f7c 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -8,7 +8,7 @@ import os from pathlib import Path import traceback -from typing import Dict, List +from typing import Dict, List, Union import io from basedosdados import Storage, Table @@ -722,26 +722,24 @@ def upload_logs_to_bq( # pylint: disable=R0913 @task def upload_raw_data_to_gcs( - error: bool, + error: str, raw_filepath: str, - timestamp: datetime, table_id: str, dataset_id: str, partitions: list, -): +) -> Union[str, None]: """ Upload raw data to GCS. Args: - error (bool): whether the upstream tasks failed or not + error (str): Error catched from upstream tasks. 
raw_filepath (str): Path to the saved raw .json file - timestamp (datetime): timestamp for flow run table_id (str): table_id on BigQuery dataset_id (str): dataset_id on BigQuery partitions (list): list of partition strings Returns: - None + Union[str, None]: if there is an error returns it traceback, otherwise returns None """ if error is None: try: @@ -765,26 +763,26 @@ def upload_raw_data_to_gcs( @task def upload_staging_data_to_gcs( - error: bool, + error: str, staging_filepath: str, timestamp: datetime, table_id: str, dataset_id: str, partitions: list, -): +) -> Union[str, None]: """ Upload staging data to GCS. Args: - error (bool): whether the upstream tasks failed or not - staging_filepath (str): Path to the saved treated .csv file - timestamp (datetime): timestamp for flow run - table_id (str): table_id on BigQuery - dataset_id (str): dataset_id on BigQuery - partitions (list): list of partition strings + error (str): Error catched from upstream tasks. + staging_filepath (str): Path to the saved treated .csv file. + timestamp (datetime): timestamp for flow run. + table_id (str): table_id on BigQuery. + dataset_id (str): dataset_id on BigQuery. + partitions (list): list of partition strings. Returns: - None + Union[str, None]: if there is an error returns it traceback, otherwise returns None """ if error is None: try: From 6564ea663204adb01d9b13852746da9e3eebd97f Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 2 Oct 2023 11:39:17 -0300 Subject: [PATCH 070/145] =?UTF-8?q?Atualiza=20documenta=C3=A7=C3=A3o?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/tasks.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 5edee3f7c..9d46d49ac 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -444,7 +444,7 @@ def create_request_params( table_id: str, dataset_id: str, timestamp: datetime, -) -> tuple: +) -> tuple[str, str]: """ Task to create request params @@ -492,7 +492,7 @@ def get_raw_from_sources( table_id: str = None, secret_path: str = None, request_params: dict = None, -): +) -> tuple[str, str]: """ Task to get raw data from sources @@ -506,7 +506,8 @@ def get_raw_from_sources( request_params (dict, optional): request parameters. Defaults to None. 
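Patches 064 and 069 settle the error-handling contract for the capture tasks: nothing raises; each task receives the error produced upstream, skips its own work when that error is not None, and returns it (or a fresh traceback) so the next task can do the same. A stripped-down sketch of that contract outside Prefect, with hypothetical stage names:

    # Sketch of the error-passthrough contract; stage names and bodies are hypothetical,
    # only the control flow mirrors the capture flow.
    import traceback
    from typing import Optional, Tuple


    def stage_upload_raw(error: Optional[str], raw_filepath: str) -> Optional[str]:
        if error is None:  # run only if every upstream step succeeded
            try:
                print(f"uploading {raw_filepath}")
            except Exception:
                error = traceback.format_exc()
        return error


    def stage_transform(error: Optional[str], raw_filepath: str) -> Tuple[Optional[str], Optional[str]]:
        filepath = None
        if error is None:
            try:
                filepath = raw_filepath.replace(".json", ".csv")
            except Exception:
                error = traceback.format_exc()
        return error, filepath


    error = None
    error = stage_upload_raw(error, "raw/data.json")
    error, staging_filepath = stage_transform(error, "raw/data.json")
    # A non-None error short-circuits the later stages and is ultimately written to the
    # logs table by upload_run_logs_to_bq inside upload_staging_data_to_gcs.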
Returns: - error: error + error: error catched from upstream tasks + filepath: filepath to raw data """ error = None filepath = None From 19bb0bedde0b7d7e0f14fbe6bff70196e8da5679 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 2 Oct 2023 11:52:34 -0300 Subject: [PATCH 071/145] adicionar retorno em get_upload_storage_blob --- pipelines/rj_smtr/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index e2fffe8dc..a89a95541 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -18,6 +18,7 @@ import basedosdados as bd from basedosdados import Table import pandas as pd +from google.cloud.storage.blob import Blob from prefect.schedules.clocks import IntervalClock @@ -531,7 +532,7 @@ def get_raw_data_api( # pylint: disable=R0912 def get_upload_storage_blob( dataset_id: str, filename: str, -): +) -> Blob: """ Get a blob from upload zone in storage From bc87f44aa2a0202790255a8658d14ad14454ae76 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 2 Oct 2023 11:55:31 -0300 Subject: [PATCH 072/145] =?UTF-8?q?Atualiza=20documenta=C3=A7=C3=A3o?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index a89a95541..1d71dd3dd 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -498,7 +498,7 @@ def get_raw_data_api( # pylint: disable=R0912 filetype (str, optional): Filetype to save raw file. Defaults to None. Returns: - tuple[str, str]: Error and filepath + tuple[str, str, str]: Error, data and filetype """ error = None data = None @@ -540,7 +540,6 @@ def get_upload_storage_blob( dataset_id (str): The dataset id on BigQuery. filename (str): The filename in GCS. - Returns: Blob: blob object """ @@ -567,7 +566,7 @@ def get_raw_data_gcs( zip_filename (str, optional): The zip file name. Defaults to None. 
Returns: - tuple[str, str]: Error and filepath + tuple[str, str, str]: Error, data and filetype """ error = None data = None From 185d695ff4dbcd647e93e938639721a6120276d6 Mon Sep 17 00:00:00 2001 From: eng-rodrigocunha Date: Mon, 2 Oct 2023 11:58:30 -0300 Subject: [PATCH 073/145] Atualiza string --- pipelines/rj_smtr/tasks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 9d46d49ac..a846851b5 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1045,9 +1045,9 @@ def transform_raw_to_nested_structure( log( f""" - Received inputs: - - timestamp:\n{timestamp} - - data:\n{data.head()}""" + Received inputs: + - timestamp:\n{timestamp} + - data:\n{data.head()}""" ) # Check empty dataframe From 4a975d52e2a3cd7a6a41f772320d0e6b890d4eda Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 10 Oct 2023 12:06:09 -0300 Subject: [PATCH 074/145] adiciona recaptura no flow generico --- pipelines/rj_smtr/flows.py | 101 +++++++++++++++++++++---------------- pipelines/rj_smtr/tasks.py | 4 ++ 2 files changed, 61 insertions(+), 44 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 4860c6d07..f1433f52c 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -5,15 +5,14 @@ from prefect.run_configs import KubernetesRun from prefect.storage import GCS -from prefect import Parameter +from prefect import Parameter, case, unmapped +from prefect.tasks.control_flow import merge # EMD Imports # from pipelines.constants import constants as emd_constants from pipelines.utils.decorators import Flow -from pipelines.utils.tasks import ( - rename_current_flow_run_now_time, -) +from pipelines.utils.tasks import rename_current_flow_run_now_time, get_now_time # SMTR Imports # @@ -27,6 +26,7 @@ transform_raw_to_nested_structure, get_raw_from_sources, create_request_params, + query_logs, ) @@ -43,74 +43,87 @@ secret_path = Parameter("secret_path", default=None) primary_key = Parameter("primary_key", default=None) source_type = Parameter("source_type", default=None) + recapture = Parameter("recapture", default=False) + + with case(recapture, True): + _, recapture_timestamps, previous_errors = query_logs( + dataset_id=dataset_id, + table_id=table_id, + ) - timestamp = get_current_timestamp() + with case(recapture, False): + capture_timestamp = [get_current_timestamp()] + previous_errors = [None] + + timestamps = merge(recapture_timestamps, capture_timestamp) rename_flow_run = rename_current_flow_run_now_time( prefix=default_capture_flow.name + " " + table_id + ": ", - now_time=timestamp, + now_time=get_now_time(), ) - partitions = create_date_hour_partition( - timestamp, partition_date_only=partition_date_only + partitions = create_date_hour_partition.map( + timestamps, partition_date_only=unmapped(partition_date_only) ) - filename = parse_timestamp_to_string(timestamp) + filenames = parse_timestamp_to_string.map(timestamps) - filepath = create_local_partition_path( - dataset_id=dataset_id, - table_id=table_id, - filename=filename, + filepaths = create_local_partition_path.map( + dataset_id=unmapped(dataset_id), + table_id=unmapped(table_id), + filename=filenames, partitions=partitions, ) # Extração # - request_params, request_path = create_request_params( - dataset_id=dataset_id, - extract_params=extract_params, - table_id=table_id, - timestamp=timestamp, + request_params, request_paths = create_request_params.map( + dataset_id=unmapped(dataset_id), + 
extract_params=unmapped(extract_params), + table_id=unmapped(table_id), + timestamp=timestamps, ) - error, raw_filepath = get_raw_from_sources( - source_type=source_type, - local_filepath=filepath, - source_path=request_path, - dataset_id=dataset_id, - table_id=table_id, - secret_path=secret_path, + errors, raw_filepaths = get_raw_from_sources.map( + source_type=unmapped(source_type), + local_filepath=unmapped(filepaths), + source_path=request_paths, + dataset_id=unmapped(dataset_id), + table_id=unmapped(table_id), + secret_path=unmapped(secret_path), request_params=request_params, ) - error = upload_raw_data_to_gcs( - error=error, - raw_filepath=raw_filepath, - table_id=table_id, - dataset_id=dataset_id, - partitions=partitions, + errors = upload_raw_data_to_gcs.map( + error=errors, + raw_filepath=raw_filepaths, + table_id=unmapped(table_id), + dataset_id=unmapped(dataset_id), + partitions=unmapped(partitions), ) # Pré-tratamento # - error, staging_filepath = transform_raw_to_nested_structure( - raw_filepath=raw_filepath, - filepath=filepath, - error=error, - timestamp=timestamp, - primary_key=primary_key, + errors, staging_filepaths = transform_raw_to_nested_structure.map( + raw_filepath=raw_filepaths, + filepath=filepaths, + error=errors, + timestamp=timestamps, + primary_key=unmapped(primary_key), ) - STAGING_UPLOADED = upload_staging_data_to_gcs( - error=error, - staging_filepath=staging_filepath, - timestamp=timestamp, - table_id=table_id, - dataset_id=dataset_id, + STAGING_UPLOADED = upload_staging_data_to_gcs.map( + error=errors, + staging_filepath=staging_filepaths, + timestamp=timestamps, + table_id=unmapped(table_id), + dataset_id=unmapped(dataset_id), partitions=partitions, + previous_error=previous_errors, + recapture=recapture, ) default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_capture_flow.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index a846851b5..71a1e2891 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -770,6 +770,8 @@ def upload_staging_data_to_gcs( table_id: str, dataset_id: str, partitions: list, + previous_error: str = None, + recapture: bool = False, ) -> Union[str, None]: """ Upload staging data to GCS. 
@@ -803,6 +805,8 @@ def upload_staging_data_to_gcs( parent_table_id=table_id, error=error, timestamp=timestamp, + previous_error=previous_error, + recapture=recapture, mode="staging", ) From 33a23e5b7c1a732d5b23cdc551edbbb3fac0b28b Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 11 Oct 2023 10:57:22 -0300 Subject: [PATCH 075/145] alterar labels para dev --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 568f96154..fb6e67594 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -50,7 +50,7 @@ bilhetagem_transacao_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_transacao_captura.schedule = bilhetagem_transacao_schedule @@ -61,7 +61,7 @@ bilhetagem_auxiliar_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_auxiliar_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_auxiliar_captura = set_default_parameters( @@ -79,7 +79,7 @@ bilhetagem_materializacao.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_materializacao.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_materializacao_parameters = { @@ -91,6 +91,7 @@ default_parameters=bilhetagem_materializacao_parameters, ) + # TRATAMENTO - RODA DE HORA EM HORA, CAPTURA AUXILIAR + MATERIALIZAÇÃO with Flow( "SMTR: Bilhetagem Transação - Tratamento", @@ -138,7 +139,7 @@ bilhetagem_transacao_tratamento.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_tratamento.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_transacao_tratamento.schedule = every_hour # bilhetagem_materializacao.schedule = bilhetagem_materializacao_schedule From 0eb4e92b2640fc67e3c8296addb17c51ed25b6d3 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 11 Oct 2023 10:57:39 -0300 Subject: [PATCH 076/145] adicionar logica de recaptura --- pipelines/rj_smtr/flows.py | 95 ++++++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 40 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 419a6e6a1..6012aae22 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -7,6 +7,7 @@ from prefect.storage import GCS from prefect import case, Parameter from prefect.utilities.edges import unmapped +from prefect.tasks.control_flow import merge # EMD Imports # @@ -35,6 +36,7 @@ upload_staging_data_to_gcs, get_raw_from_sources, create_request_params, + query_logs, ) from pipelines.utils.execute_dbt_model.tasks import run_dbt_model @@ -52,70 +54,83 @@ secret_path = Parameter("secret_path", default=None) primary_key = Parameter("primary_key", default=None) source_type = Parameter("source_type", default=None) + recapture = Parameter("recapture", 
default=False) - timestamp = get_current_timestamp() + with case(recapture, True): + _, recapture_timestamps, previous_errors = query_logs( + dataset_id=dataset_id, + table_id=table_id, + ) + + with case(recapture, False): + capture_timestamp = [get_current_timestamp()] + previous_errors = [None] + + timestamps = merge(recapture_timestamps, capture_timestamp) rename_flow_run = rename_current_flow_run_now_time( prefix=default_capture_flow.name + " " + table_id + ": ", - now_time=timestamp, + now_time=get_now_time(), ) - partitions = create_date_hour_partition( - timestamp, partition_date_only=partition_date_only + partitions = create_date_hour_partition.map( + timestamps, partition_date_only=unmapped(partition_date_only) ) - filename = parse_timestamp_to_string(timestamp) + filenames = parse_timestamp_to_string.map(timestamps) - filepath = create_local_partition_path( - dataset_id=dataset_id, - table_id=table_id, - filename=filename, + filepaths = create_local_partition_path.map( + dataset_id=unmapped(dataset_id), + table_id=unmapped(table_id), + filename=filenames, partitions=partitions, ) # Extração # - request_params, request_path = create_request_params( - dataset_id=dataset_id, - extract_params=extract_params, - table_id=table_id, - timestamp=timestamp, + request_params, request_paths = create_request_params.map( + dataset_id=unmapped(dataset_id), + extract_params=unmapped(extract_params), + table_id=unmapped(table_id), + timestamp=timestamps, ) - error, raw_filepath = get_raw_from_sources( - source_type=source_type, - local_filepath=filepath, - source_path=request_path, - dataset_id=dataset_id, - table_id=table_id, - secret_path=secret_path, + errors, raw_filepaths = get_raw_from_sources.map( + source_type=unmapped(source_type), + local_filepath=unmapped(filepaths), + source_path=request_paths, + dataset_id=unmapped(dataset_id), + table_id=unmapped(table_id), + secret_path=unmapped(secret_path), request_params=request_params, ) - error = upload_raw_data_to_gcs( - error=error, - raw_filepath=raw_filepath, - table_id=table_id, - dataset_id=dataset_id, - partitions=partitions, + errors = upload_raw_data_to_gcs.map( + error=errors, + raw_filepath=raw_filepaths, + table_id=unmapped(table_id), + dataset_id=unmapped(dataset_id), + partitions=unmapped(partitions), ) # Pré-tratamento # - error, staging_filepath = transform_raw_to_nested_structure( - raw_filepath=raw_filepath, - filepath=filepath, - error=error, - timestamp=timestamp, - primary_key=primary_key, + errors, staging_filepaths = transform_raw_to_nested_structure.map( + raw_filepath=raw_filepaths, + filepath=filepaths, + error=errors, + timestamp=timestamps, + primary_key=unmapped(primary_key), ) - STAGING_UPLOADED = upload_staging_data_to_gcs( - error=error, - staging_filepath=staging_filepath, - timestamp=timestamp, - table_id=table_id, - dataset_id=dataset_id, + STAGING_UPLOADED = upload_staging_data_to_gcs.map( + error=errors, + staging_filepath=staging_filepaths, + timestamp=timestamps, + table_id=unmapped(table_id), + dataset_id=unmapped(dataset_id), partitions=partitions, + previous_error=previous_errors, + recapture=recapture, ) default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) @@ -259,5 +274,5 @@ default_materialization_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_materialization_flow.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) From 
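Patches 074 and 076 turn the generic capture flow into a mapped flow: with recapture on, query_logs supplies the failed timestamps (and their previous errors); otherwise a single current timestamp is used, and the two branches are reconciled with merge before every downstream task is mapped over the resulting list. A minimal Prefect 1.x sketch of that switch, with placeholder tasks standing in for query_logs and get_current_timestamp:

    # Minimal Prefect 1.x sketch of the recapture switch; the tasks below are
    # placeholders, not the real query_logs / get_current_timestamp.
    from datetime import datetime, timedelta

    from prefect import Flow, Parameter, case, task, unmapped
    from prefect.tasks.control_flow import merge


    @task(nout=2)
    def fake_query_logs():
        now = datetime(2023, 10, 11, 10, 0)
        return [now - timedelta(hours=2), now - timedelta(hours=1)], ["timeout", "timeout"]


    @task
    def fake_current_timestamp():
        return [datetime(2023, 10, 11, 12, 0)]


    @task
    def capture(timestamp, table_id):
        print(f"capturing {table_id} @ {timestamp}")


    with Flow("recapture sketch") as flow:
        recapture = Parameter("recapture", default=False)
        table_id = Parameter("table_id", default="transacao")

        with case(recapture, True):
            # in the real flow previous_errors is also forwarded to upload_staging_data_to_gcs
            recapture_timestamps, previous_errors = fake_query_logs()

        with case(recapture, False):
            capture_timestamps = fake_current_timestamp()

        # merge keeps whichever branch actually ran; downstream tasks map over the list
        timestamps = merge(recapture_timestamps, capture_timestamps)
        capture.map(timestamp=timestamps, table_id=unmapped(table_id))

    flow.run(parameters={"recapture": True})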
ecc67d14e525b1697a46da3aabcf06d95294b7b6 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 11 Oct 2023 10:58:20 -0300 Subject: [PATCH 077/145] =?UTF-8?q?criar=20conex=C3=A3o=20com=20banco=20de?= =?UTF-8?q?=20dados?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/tasks.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index f7d687dea..bd4b45680 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -31,6 +31,7 @@ dict_contains_keys, get_raw_data_api, get_raw_data_gcs, + get_raw_data_db, upload_run_logs_to_bq, get_datetime_range, read_raw_data, @@ -534,6 +535,10 @@ def get_raw_from_sources( error, data, filetype = get_raw_data_gcs( dataset_id=dataset_id, table_id=table_id, zip_filename=request_params ) + elif source_type == "db": + error, data, filetype = get_raw_data_db( + host=source_path, secret_path=secret_path, **request_params + ) else: raise NotImplementedError(f"{source_type} not supported") @@ -771,6 +776,8 @@ def upload_staging_data_to_gcs( table_id: str, dataset_id: str, partitions: list, + previous_error: str = None, + recapture: bool = False, ) -> Union[str, None]: """ Upload staging data to GCS. @@ -805,6 +812,8 @@ def upload_staging_data_to_gcs( error=error, timestamp=timestamp, mode="staging", + previous_error=previous_error, + recapture=recapture, ) return error From 2a882865254857d5451f1666a4564e736b082df5 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 11 Oct 2023 10:58:35 -0300 Subject: [PATCH 078/145] =?UTF-8?q?criar=20conex=C3=A3o=20com=20banco=20de?= =?UTF-8?q?=20dados?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/utils.py | 58 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index f9b98afab..6a6c70ee5 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -19,6 +19,7 @@ from basedosdados import Table import pandas as pd from google.cloud.storage.blob import Blob +import pymysql from prefect.schedules.clocks import IntervalClock @@ -480,9 +481,10 @@ def save_raw_local_func( Path(_filepath).parent.mkdir(parents=True, exist_ok=True) if filetype == "json": - if isinstance(data, dict): + if isinstance(data, str): data = json.loads(data) - json.dump(data, Path(_filepath).open("w", encoding="utf-8")) + with Path(_filepath).open("w", encoding="utf-8") as fi: + json.dump(data, fi) # if filetype == "csv": # pass @@ -611,6 +613,58 @@ def get_raw_data_gcs( return error, data, filetype +def get_raw_data_db( + sql: str, dbms: str, host: str, secret_path: str, database: str +) -> tuple[str, str, str]: + """ + Get data from Databases + + Args: + sql (str): the SQL Query to execute + dbms (str): The datase management system + host (str): The database host + secret_path (str): Secret path to get credentials + database (str): The database to connect + + Returns: + tuple[str, str, str]: Error, data and filetype + """ + connection_mapping = { + # 'postgresql': {'connector': psycopg2.connect, 'port': '5432', 'cursor':{'cursor_factory': psycopg2.extras.RealDictCursor}}, + "mysql": { + "connector": pymysql.connect, + "port": "3306", + "cursor": {"cursor": pymysql.cursors.DictCursor}, + } + } + + data = None + error = None + filetype = "json" + + try: + credentials = get_vault_secret(secret_path)["data"] + + connection = connection_mapping[dbms]( + 
host=host, + user=credentials["user"], + password=credentials["password"], + database=database, + ) + + with connection: + with connection.cursor(**connection_mapping[dbms]["cursor"]) as cursor: + cursor.execute(sql) + data = cursor.fetchall() + + data = [dict(d) for d in data] + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data, filetype + + def save_treated_local_func( filepath: str, data: pd.DataFrame, error: str, mode: str = "staging" ) -> str: From ae1c88201635cf629bb2ad76ee0122cbbe0bce21 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 11 Oct 2023 12:48:36 -0300 Subject: [PATCH 079/145] =?UTF-8?q?cria=20fun=C3=A7=C3=A3o=20para=20map=20?= =?UTF-8?q?de=20multiplos=20retornos?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/flows.py | 19 +++- pipelines/rj_smtr/tasks.py | 208 +++++++++++++++++++++---------------- 2 files changed, 133 insertions(+), 94 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 6012aae22..d408c75f7 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -37,6 +37,7 @@ get_raw_from_sources, create_request_params, query_logs, + unpack_mapped_results_nout2, ) from pipelines.utils.execute_dbt_model.tasks import run_dbt_model @@ -87,14 +88,18 @@ ) # Extração # - request_params, request_paths = create_request_params.map( + create_request_params_returns = create_request_params.map( dataset_id=unmapped(dataset_id), extract_params=unmapped(extract_params), table_id=unmapped(table_id), timestamp=timestamps, ) - errors, raw_filepaths = get_raw_from_sources.map( + request_params, request_paths = unpack_mapped_results_nout2( + mapped_results=create_request_params_returns + ) + + get_raw_from_sources_returns = get_raw_from_sources.map( source_type=unmapped(source_type), local_filepath=unmapped(filepaths), source_path=request_paths, @@ -104,6 +109,10 @@ request_params=request_params, ) + errors, raw_filepaths = unpack_mapped_results_nout2( + mapped_results=get_raw_from_sources_returns + ) + errors = upload_raw_data_to_gcs.map( error=errors, raw_filepath=raw_filepaths, @@ -114,7 +123,7 @@ # Pré-tratamento # - errors, staging_filepaths = transform_raw_to_nested_structure.map( + nested_structure_returns = transform_raw_to_nested_structure.map( raw_filepath=raw_filepaths, filepath=filepaths, error=errors, @@ -122,6 +131,10 @@ primary_key=unmapped(primary_key), ) + errors, staging_filepaths = unpack_mapped_results_nout2( + mapped_results=nested_structure_returns + ) + STAGING_UPLOADED = upload_staging_data_to_gcs.map( error=errors, staging_filepath=staging_filepaths, diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index bd4b45680..5d2083e8f 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -8,7 +8,7 @@ import os from pathlib import Path import traceback -from typing import Dict, List, Union, Iterable +from typing import Dict, List, Union, Iterable, Any import io from basedosdados import Storage, Table @@ -138,6 +138,103 @@ def build_incremental_model( # pylint: disable=too-many-arguments return False +@task(checkpoint=False, nout=3) +def create_dbt_run_vars( + dataset_id: str, + dbt_vars: dict, + table_id: str, + raw_dataset_id: str, + raw_table_id: str, + mode: str, +) -> tuple[list[dict], Union[list[dict], dict, None], bool]: + """ + Create the variables to be used in dbt materialization based on a dict + + Args: + 
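Patch 078 dispatches database captures through a connection_mapping dict keyed by the DBMS name. As committed, get_raw_data_db calls connection_mapping[dbms](...) directly, but the mapped value is a dict, so the connector presumably has to be taken from its "connector" key first. A hedged sketch of that lookup with pymysql (which the patch already imports); host, credentials and query are placeholders, not real access data:

    # Hedged sketch of the DBMS dispatch used by get_raw_data_db; host, credentials
    # and query are placeholders.
    import pymysql

    connection_mapping = {
        "mysql": {
            "connector": pymysql.connect,
            "port": "3306",
            "cursor": {"cursor": pymysql.cursors.DictCursor},
        },
    }


    def fetch_rows(sql: str, dbms: str, host: str, user: str, password: str, database: str) -> list:
        config = connection_mapping[dbms]
        connection = config["connector"](  # the dict itself is not callable
            host=host,
            user=user,
            password=password,
            database=database,
        )
        with connection:
            with connection.cursor(**config["cursor"]) as cursor:  # DictCursor -> rows as dicts
                cursor.execute(sql)
                data = cursor.fetchall()
        return [dict(row) for row in data]


    # e.g. fetch_rows("SELECT 1 AS ok", "mysql", "localhost", "user", "pass", "db")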
dataset_id (str): the dataset_id to get the variables + dbt_vars (dict): dict containing the parameters + table_id (str): the table_id get the date_range variable + raw_dataset_id (str): the raw_dataset_id get the date_range variable + raw_table_id (str): the raw_table_id get the date_range variable + mode (str): the mode to get the date_range variable + + Returns: + tuple[list[dict]: the variables to be used in DBT + Union[list[dict], dict, None]: the date variable (date_range or run_date) + bool: a flag that indicates if the date_range variable came from Redis + """ + + log(f"Creating DBT variables. Parameter received: {dbt_vars}") + + if (not dbt_vars) or (not table_id): + log("dbt_vars or table_id are blank. Skiping task") + return [None], None, False + + final_vars = [] + date_var = None + flag_date_range = False + + if "date_range" in dbt_vars.keys(): + log("Creating date_range variable") + + # Set date_range variable manually + if dict_contains_keys( + dbt_vars["date_range"], ["date_range_start", "date_range_end"] + ): + date_var = { + "date_range_start": dbt_vars["date_range"]["date_range_start"], + "date_range_end": dbt_vars["date_range"]["date_range_end"], + } + # Create date_range using Redis + else: + raw_table_id = raw_table_id or table_id + + date_var = get_materialization_date_range.run( + dataset_id=dataset_id, + table_id=table_id, + raw_dataset_id=raw_dataset_id, + raw_table_id=raw_table_id, + table_run_datetime_column_name=dbt_vars["date_range"].get( + "table_run_datetime_column_name" + ), + mode=mode, + delay_hours=dbt_vars["date_range"].get("delay_hours", 0), + ) + + flag_date_range = True + + final_vars.append(date_var.copy()) + + log(f"date_range created: {date_var}") + + elif "run_date" in dbt_vars.keys(): + log("Creating run_date variable") + + date_var = get_run_dates.run( + dbt_vars["run_date"].get("date_range_start"), + dbt_vars["run_date"].get("date_range_end"), + ) + final_vars.append([d.copy() for d in date_var]) + + log(f"run_date created: {date_var}") + + if "version" in dbt_vars.keys(): + log("Creating version variable") + dataset_sha = fetch_dataset_sha.run(dataset_id=dataset_id) + + # if there are other variables inside the list, update each item adding the version variable + if final_vars: + final_vars = get_join_dict.run(dict_list=final_vars, new_dict=dataset_sha) + else: + final_vars.append(dataset_sha) + + log(f"version created: {dataset_sha}") + + log(f"All variables was created, final value is: {final_vars}") + + return final_vars, date_var, flag_date_range + + ############### # # Local file managment @@ -1107,6 +1204,13 @@ def transform_raw_to_nested_structure( return error, filepath +############### +# +# Utilitary tasks +# +############### + + @task(checkpoint=False) def coalesce_task(value_list: Iterable): """ @@ -1121,101 +1225,23 @@ def coalesce_task(value_list: Iterable): try: return next(value for value in value_list if value is not None) except StopIteration: - return + return None -@task(checkpoint=False, nout=3) -def create_dbt_run_vars( - dataset_id: str, - dbt_vars: dict, - table_id: str, - raw_dataset_id: str, - raw_table_id: str, - mode: str, -) -> tuple[list[dict], Union[list[dict], dict, None], bool]: +@task(checkpoint=False, nout=2) +def unpack_mapped_results_nout2( + mapped_results: Iterable, +) -> tuple[list[Any], list[Any]]: """ - Create the variables to be used in dbt materialization based on a dict + Task to unpack the results from an nout=2 tasks in 2 lists when it is mapped Args: - dataset_id (str): the dataset_id to get the 
variables - dbt_vars (dict): dict containing the parameters - table_id (str): the table_id get the date_range variable - raw_dataset_id (str): the raw_dataset_id get the date_range variable - raw_table_id (str): the raw_table_id get the date_range variable - mode (str): the mode to get the date_range variable + mapped_results (Iterable): The mapped task return Returns: - tuple[list[dict]: the variables to be used in DBT - Union[list[dict], dict, None]: the date variable (date_range or run_date) - bool: a flag that indicates if the date_range variable came from Redis - """ - - log(f"Creating DBT variables. Parameter received: {dbt_vars}") - - if (not dbt_vars) or (not table_id): - log("dbt_vars or table_id are blank. Skiping task") - return [None], None, False - - final_vars = [] - date_var = None - flag_date_range = False + tuple[list[Any], list[Any]]: The task original return splited in 2 lists: + - 1st list being all the first return + - 2nd list being all the second return - if "date_range" in dbt_vars.keys(): - log("Creating date_range variable") - - # Set date_range variable manually - if dict_contains_keys( - dbt_vars["date_range"], ["date_range_start", "date_range_end"] - ): - date_var = { - "date_range_start": dbt_vars["date_range"]["date_range_start"], - "date_range_end": dbt_vars["date_range"]["date_range_end"], - } - # Create date_range using Redis - else: - raw_table_id = raw_table_id or table_id - - date_var = get_materialization_date_range.run( - dataset_id=dataset_id, - table_id=table_id, - raw_dataset_id=raw_dataset_id, - raw_table_id=raw_table_id, - table_run_datetime_column_name=dbt_vars["date_range"].get( - "table_run_datetime_column_name" - ), - mode=mode, - delay_hours=dbt_vars["date_range"].get("delay_hours", 0), - ) - - flag_date_range = True - - final_vars.append(date_var.copy()) - - log(f"date_range created: {date_var}") - - elif "run_date" in dbt_vars.keys(): - log("Creating run_date variable") - - date_var = get_run_dates.run( - dbt_vars["run_date"].get("date_range_start"), - dbt_vars["run_date"].get("date_range_end"), - ) - final_vars.append([d.copy() for d in date_var]) - - log(f"run_date created: {date_var}") - - if "version" in dbt_vars.keys(): - log("Creating version variable") - dataset_sha = fetch_dataset_sha.run(dataset_id=dataset_id) - - # if there are other variables inside the list, update each item adding the version variable - if final_vars: - final_vars = get_join_dict.run(dict_list=final_vars, new_dict=dataset_sha) - else: - final_vars.append(dataset_sha) - - log(f"version created: {dataset_sha}") - - log(f"All variables was created, final value is: {final_vars}") - - return final_vars, date_var, flag_date_range + """ + return [r[0] for r in mapped_results], [r[1] for r in mapped_results] From 22bb4ce20f2a9bfa59f078921d449e1d3689defd Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 11 Oct 2023 13:19:29 -0300 Subject: [PATCH 080/145] remover unmapped dos filepaths --- pipelines/rj_smtr/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index d408c75f7..7636ce081 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -101,7 +101,7 @@ get_raw_from_sources_returns = get_raw_from_sources.map( source_type=unmapped(source_type), - local_filepath=unmapped(filepaths), + local_filepath=filepaths, source_path=request_paths, dataset_id=unmapped(dataset_id), table_id=unmapped(table_id), From 8cb440459fe77d0b016d5f67fe1e352cf047da4f Mon Sep 17 00:00:00 2001 From: 
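unpack_mapped_results_nout2 exists because a mapped task declared with nout=2 hands downstream tasks a list of 2-tuples, while the rest of the flow wants two parallel lists (for example errors and filepaths). A tiny illustration of the reshaping it performs, with made-up values:

    # What unpack_mapped_results_nout2 does to a mapped nout=2 result (made-up values).
    mapped_results = [
        (None, "raw/2023-10-11-10.json"),
        ("Traceback ...", None),
        (None, "raw/2023-10-11-12.json"),
    ]

    errors = [r[0] for r in mapped_results]
    filepaths = [r[1] for r in mapped_results]

    assert errors == [None, "Traceback ...", None]
    assert filepaths == ["raw/2023-10-11-10.json", None, "raw/2023-10-11-12.json"]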
Rafael Date: Wed, 11 Oct 2023 13:19:41 -0300 Subject: [PATCH 081/145] log para debbug --- pipelines/rj_smtr/tasks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 5d2083e8f..eea38f3f2 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -903,6 +903,8 @@ def upload_staging_data_to_gcs( error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") + log(f"previous_error = {previous_error}") + upload_run_logs_to_bq( dataset_id=dataset_id, parent_table_id=table_id, From e8d9fb7be62464b5160e13fbc7b2a65f95e4953a Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 11 Oct 2023 14:58:09 -0300 Subject: [PATCH 082/145] =?UTF-8?q?retirar=20unmapped=20das=20parti=C3=A7?= =?UTF-8?q?=C3=B5es?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 7636ce081..5000c6d57 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -118,7 +118,7 @@ raw_filepath=raw_filepaths, table_id=unmapped(table_id), dataset_id=unmapped(dataset_id), - partitions=unmapped(partitions), + partitions=partitions, ) # Pré-tratamento # From cb7e7e5d52b38b8285c8c3089e0575a0176f10fe Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 11 Oct 2023 15:16:51 -0300 Subject: [PATCH 083/145] adicionar unmapped no parametro recapture --- pipelines/rj_smtr/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 5000c6d57..e95eae285 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -143,7 +143,7 @@ dataset_id=unmapped(dataset_id), partitions=partitions, previous_error=previous_errors, - recapture=recapture, + recapture=unmapped(recapture), ) default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) From 59789abafd22c19d31c62e2c555c58cf493990f9 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 16 Oct 2023 15:43:29 -0300 Subject: [PATCH 084/145] adicionar psycopg2 --- poetry.lock | 121 +++++++++++++++++++++++++++++++++++++++++++------ pyproject.toml | 1 + 2 files changed, 108 insertions(+), 14 deletions(-) diff --git a/poetry.lock b/poetry.lock index f106de89b..330ce7b4b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. 
[[package]] name = "adal" @@ -2483,6 +2483,7 @@ files = [ {file = "greenlet-2.0.2-cp27-cp27m-win32.whl", hash = "sha256:6c3acb79b0bfd4fe733dff8bc62695283b57949ebcca05ae5c129eb606ff2d74"}, {file = "greenlet-2.0.2-cp27-cp27m-win_amd64.whl", hash = "sha256:283737e0da3f08bd637b5ad058507e578dd462db259f7f6e4c5c365ba4ee9343"}, {file = "greenlet-2.0.2-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:d27ec7509b9c18b6d73f2f5ede2622441de812e7b1a80bbd446cb0633bd3d5ae"}, + {file = "greenlet-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d967650d3f56af314b72df7089d96cda1083a7fc2da05b375d2bc48c82ab3f3c"}, {file = "greenlet-2.0.2-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:30bcf80dda7f15ac77ba5af2b961bdd9dbc77fd4ac6105cee85b0d0a5fcf74df"}, {file = "greenlet-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:26fbfce90728d82bc9e6c38ea4d038cba20b7faf8a0ca53a9c07b67318d46088"}, {file = "greenlet-2.0.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9190f09060ea4debddd24665d6804b995a9c122ef5917ab26e1566dcc712ceeb"}, @@ -2491,6 +2492,7 @@ files = [ {file = "greenlet-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:76ae285c8104046b3a7f06b42f29c7b73f77683df18c49ab5af7983994c2dd91"}, {file = "greenlet-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:2d4686f195e32d36b4d7cf2d166857dbd0ee9f3d20ae349b6bf8afc8485b3645"}, {file = "greenlet-2.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c4302695ad8027363e96311df24ee28978162cdcdd2006476c43970b384a244c"}, + {file = "greenlet-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d4606a527e30548153be1a9f155f4e283d109ffba663a15856089fb55f933e47"}, {file = "greenlet-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c48f54ef8e05f04d6eff74b8233f6063cb1ed960243eacc474ee73a2ea8573ca"}, {file = "greenlet-2.0.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a1846f1b999e78e13837c93c778dcfc3365902cfb8d1bdb7dd73ead37059f0d0"}, {file = "greenlet-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a06ad5312349fec0ab944664b01d26f8d1f05009566339ac6f63f56589bc1a2"}, @@ -2520,6 +2522,7 @@ files = [ {file = "greenlet-2.0.2-cp37-cp37m-win32.whl", hash = "sha256:3f6ea9bd35eb450837a3d80e77b517ea5bc56b4647f5502cd28de13675ee12f7"}, {file = "greenlet-2.0.2-cp37-cp37m-win_amd64.whl", hash = "sha256:7492e2b7bd7c9b9916388d9df23fa49d9b88ac0640db0a5b4ecc2b653bf451e3"}, {file = "greenlet-2.0.2-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:b864ba53912b6c3ab6bcb2beb19f19edd01a6bfcbdfe1f37ddd1778abfe75a30"}, + {file = "greenlet-2.0.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1087300cf9700bbf455b1b97e24db18f2f77b55302a68272c56209d5587c12d1"}, {file = "greenlet-2.0.2-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:ba2956617f1c42598a308a84c6cf021a90ff3862eddafd20c3333d50f0edb45b"}, {file = "greenlet-2.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc3a569657468b6f3fb60587e48356fe512c1754ca05a564f11366ac9e306526"}, {file = "greenlet-2.0.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8eab883b3b2a38cc1e050819ef06a7e6344d4a990d24d45bc6f2cf959045a45b"}, @@ -2528,6 +2531,7 @@ files = [ {file = "greenlet-2.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b0ef99cdbe2b682b9ccbb964743a6aca37905fda5e0452e5ee239b1654d37f2a"}, {file = "greenlet-2.0.2-cp38-cp38-win32.whl", hash = 
"sha256:b80f600eddddce72320dbbc8e3784d16bd3fb7b517e82476d8da921f27d4b249"}, {file = "greenlet-2.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:4d2e11331fc0c02b6e84b0d28ece3a36e0548ee1a1ce9ddde03752d9b79bba40"}, + {file = "greenlet-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8512a0c38cfd4e66a858ddd1b17705587900dd760c6003998e9472b77b56d417"}, {file = "greenlet-2.0.2-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:88d9ab96491d38a5ab7c56dd7a3cc37d83336ecc564e4e8816dbed12e5aaefc8"}, {file = "greenlet-2.0.2-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:561091a7be172ab497a3527602d467e2b3fbe75f9e783d8b8ce403fa414f71a6"}, {file = "greenlet-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:971ce5e14dc5e73715755d0ca2975ac88cfdaefcaab078a284fea6cfabf866df"}, @@ -3533,10 +3537,7 @@ packaging = "<24" pandas = "<3" prometheus-flask-exporter = {version = "*", optional = true, markers = "extra == \"extras\""} protobuf = ">=3.12.0,<5" -pyarrow = [ - {version = ">=4.0.0,<13"}, - {version = "*", optional = true, markers = "extra == \"extras\""}, -] +pyarrow = ">=4.0.0,<13" pysftp = {version = "*", optional = true, markers = "extra == \"extras\""} pytz = "<2024" pyyaml = ">=5.1,<7" @@ -3945,12 +3946,11 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.21.0", markers = "python_version <= \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, - {version = ">=1.19.3", markers = "python_version >= \"3.6\" and platform_system == \"Linux\" and platform_machine == \"aarch64\" or python_version >= \"3.9\""}, - {version = ">=1.17.0", markers = "python_version >= \"3.7\""}, - {version = ">=1.17.3", markers = "python_version >= \"3.8\""}, - {version = ">=1.21.2", markers = "python_version >= \"3.10\""}, + {version = ">=1.21.0", markers = "python_version <= \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\" and python_version >= \"3.8\""}, + {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, + {version = ">=1.17.3", markers = "(platform_system != \"Darwin\" and platform_system != \"Linux\") and python_version >= \"3.8\" and python_version < \"3.9\" or platform_system != \"Darwin\" and python_version >= \"3.8\" and python_version < \"3.9\" and platform_machine != \"aarch64\" or platform_machine != \"arm64\" and python_version >= \"3.8\" and python_version < \"3.9\" and platform_system != \"Linux\" or (platform_machine != \"arm64\" and platform_machine != \"aarch64\") and python_version >= \"3.8\" and python_version < \"3.9\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\""}, + {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\""}, ] [[package]] @@ -3968,6 +3968,7 @@ files = [ {file = "orjson-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a39c2529d75373b7167bf84c814ef9b8f3737a339c225ed6c0df40736df8748"}, {file = "orjson-3.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:84ebd6fdf138eb0eb4280045442331ee71c0aab5e16397ba6645f32f911bfb37"}, {file = "orjson-3.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:5a60a1cfcfe310547a1946506dd4f1ed0a7d5bd5b02c8697d9d5dcd8d2e9245e"}, + {file = "orjson-3.9.2-cp310-none-win32.whl", hash = "sha256:2ae61f5d544030a6379dbc23405df66fea0777c48a0216d2d83d3e08b69eb676"}, {file = "orjson-3.9.2-cp310-none-win_amd64.whl", hash = "sha256:c290c4f81e8fd0c1683638802c11610b2f722b540f8e5e858b6914b495cf90c8"}, {file = "orjson-3.9.2-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:02ef014f9a605e84b675060785e37ec9c0d2347a04f1307a9d6840ab8ecd6f55"}, {file = "orjson-3.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:992af54265ada1c1579500d6594ed73fe333e726de70d64919cf37f93defdd06"}, @@ -3977,6 +3978,7 @@ files = [ {file = "orjson-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:275b5a18fd9ed60b2720543d3ddac170051c43d680e47d04ff5203d2c6d8ebf1"}, {file = "orjson-3.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b9aea6dcb99fcbc9f6d1dd84fca92322fda261da7fb014514bb4689c7c2097a8"}, {file = "orjson-3.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7d74ae0e101d17c22ef67b741ba356ab896fc0fa64b301c2bf2bb0a4d874b190"}, + {file = "orjson-3.9.2-cp311-none-win32.whl", hash = "sha256:a9a7d618f99b2d67365f2b3a588686195cb6e16666cd5471da603a01315c17cc"}, {file = "orjson-3.9.2-cp311-none-win_amd64.whl", hash = "sha256:6320b28e7bdb58c3a3a5efffe04b9edad3318d82409e84670a9b24e8035a249d"}, {file = "orjson-3.9.2-cp37-cp37m-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:368e9cc91ecb7ac21f2aa475e1901204110cf3e714e98649c2502227d248f947"}, {file = "orjson-3.9.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58e9e70f0dcd6a802c35887f306b555ff7a214840aad7de24901fc8bd9cf5dde"}, @@ -3986,6 +3988,7 @@ files = [ {file = "orjson-3.9.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e46e9c5b404bb9e41d5555762fd410d5466b7eb1ec170ad1b1609cbebe71df21"}, {file = "orjson-3.9.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:8170157288714678ffd64f5de33039e1164a73fd8b6be40a8a273f80093f5c4f"}, {file = "orjson-3.9.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e3e2f087161947dafe8319ea2cfcb9cea4bb9d2172ecc60ac3c9738f72ef2909"}, + {file = "orjson-3.9.2-cp37-none-win32.whl", hash = "sha256:373b7b2ad11975d143556fdbd2c27e1150b535d2c07e0b48dc434211ce557fe6"}, {file = "orjson-3.9.2-cp37-none-win_amd64.whl", hash = "sha256:d7de3dbbe74109ae598692113cec327fd30c5a30ebca819b21dfa4052f7b08ef"}, {file = "orjson-3.9.2-cp38-cp38-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:8cd4385c59bbc1433cad4a80aca65d2d9039646a9c57f8084897549b55913b17"}, {file = "orjson-3.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a74036aab1a80c361039290cdbc51aa7adc7ea13f56e5ef94e9be536abd227bd"}, @@ -3995,6 +3998,7 @@ files = [ {file = "orjson-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1882a70bb69595b9ec5aac0040a819e94d2833fe54901e2b32f5e734bc259a8b"}, {file = "orjson-3.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:fc05e060d452145ab3c0b5420769e7356050ea311fc03cb9d79c481982917cca"}, {file = "orjson-3.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f8bc2c40d9bb26efefb10949d261a47ca196772c308babc538dd9f4b73e8d386"}, + {file = "orjson-3.9.2-cp38-none-win32.whl", hash = "sha256:302d80198d8d5b658065627da3a356cbe5efa082b89b303f162f030c622e0a17"}, {file = "orjson-3.9.2-cp38-none-win_amd64.whl", hash = 
"sha256:3164fc20a585ec30a9aff33ad5de3b20ce85702b2b2a456852c413e3f0d7ab09"}, {file = "orjson-3.9.2-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:7a6ccadf788531595ed4728aa746bc271955448d2460ff0ef8e21eb3f2a281ba"}, {file = "orjson-3.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3245d230370f571c945f69aab823c279a868dc877352817e22e551de155cb06c"}, @@ -4004,6 +4008,7 @@ files = [ {file = "orjson-3.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03fb36f187a0c19ff38f6289418863df8b9b7880cdbe279e920bef3a09d8dab1"}, {file = "orjson-3.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:20925d07a97c49c6305bff1635318d9fc1804aa4ccacb5fb0deb8a910e57d97a"}, {file = "orjson-3.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:eebfed53bec5674e981ebe8ed2cf00b3f7bcda62d634733ff779c264307ea505"}, + {file = "orjson-3.9.2-cp39-none-win32.whl", hash = "sha256:ba60f09d735f16593950c6adf033fbb526faa94d776925579a87b777db7d0838"}, {file = "orjson-3.9.2-cp39-none-win_amd64.whl", hash = "sha256:869b961df5fcedf6c79f4096119b35679b63272362e9b745e668f0391a892d39"}, {file = "orjson-3.9.2.tar.gz", hash = "sha256:24257c8f641979bf25ecd3e27251b5cc194cdd3a6e96004aac8446f5e63d9664"}, ] @@ -4594,6 +4599,84 @@ files = [ [package.extras] test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] +[[package]] +name = "psycopg2-binary" +version = "2.9.9" +description = "psycopg2 - Python-PostgreSQL Database Adapter" +optional = false +python-versions = ">=3.7" +files = [ + {file = "psycopg2-binary-2.9.9.tar.gz", hash = "sha256:7f01846810177d829c7692f1f5ada8096762d9172af1b1a28d4ab5b77c923c1c"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c2470da5418b76232f02a2fcd2229537bb2d5a7096674ce61859c3229f2eb202"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c6af2a6d4b7ee9615cbb162b0738f6e1fd1f5c3eda7e5da17861eacf4c717ea7"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75723c3c0fbbf34350b46a3199eb50638ab22a0228f93fb472ef4d9becc2382b"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:83791a65b51ad6ee6cf0845634859d69a038ea9b03d7b26e703f94c7e93dbcf9"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0ef4854e82c09e84cc63084a9e4ccd6d9b154f1dbdd283efb92ecd0b5e2b8c84"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed1184ab8f113e8d660ce49a56390ca181f2981066acc27cf637d5c1e10ce46e"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d2997c458c690ec2bc6b0b7ecbafd02b029b7b4283078d3b32a852a7ce3ddd98"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:b58b4710c7f4161b5e9dcbe73bb7c62d65670a87df7bcce9e1faaad43e715245"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0c009475ee389757e6e34611d75f6e4f05f0cf5ebb76c6037508318e1a1e0d7e"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8dbf6d1bc73f1d04ec1734bae3b4fb0ee3cb2a493d35ede9badbeb901fb40f6f"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-win32.whl", hash = "sha256:3f78fd71c4f43a13d342be74ebbc0666fe1f555b8837eb113cb7416856c79682"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-win_amd64.whl", hash = 
"sha256:876801744b0dee379e4e3c38b76fc89f88834bb15bf92ee07d94acd06ec890a0"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ee825e70b1a209475622f7f7b776785bd68f34af6e7a46e2e42f27b659b5bc26"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1ea665f8ce695bcc37a90ee52de7a7980be5161375d42a0b6c6abedbf0d81f0f"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:143072318f793f53819048fdfe30c321890af0c3ec7cb1dfc9cc87aa88241de2"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c332c8d69fb64979ebf76613c66b985414927a40f8defa16cf1bc028b7b0a7b0"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7fc5a5acafb7d6ccca13bfa8c90f8c51f13d8fb87d95656d3950f0158d3ce53"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:977646e05232579d2e7b9c59e21dbe5261f403a88417f6a6512e70d3f8a046be"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b6356793b84728d9d50ead16ab43c187673831e9d4019013f1402c41b1db9b27"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:bc7bb56d04601d443f24094e9e31ae6deec9ccb23581f75343feebaf30423359"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:77853062a2c45be16fd6b8d6de2a99278ee1d985a7bd8b103e97e41c034006d2"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:78151aa3ec21dccd5cdef6c74c3e73386dcdfaf19bced944169697d7ac7482fc"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e6f98446430fdf41bd36d4faa6cb409f5140c1c2cf58ce0bbdaf16af7d3f119"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c77e3d1862452565875eb31bdb45ac62502feabbd53429fdc39a1cc341d681ba"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = 
"sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8359bf4791968c5a78c56103702000105501adb557f3cf772b2c207284273984"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:275ff571376626195ab95a746e6a04c7df8ea34638b99fc11160de91f2fef503"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:f9b5571d33660d5009a8b3c25dc1db560206e2d2f89d3df1cb32d72c0d117d52"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:420f9bbf47a02616e8554e825208cb947969451978dceb77f95ad09c37791dae"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:4154ad09dac630a0f13f37b583eae260c6aa885d67dfbccb5b02c33f31a6d420"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a148c5d507bb9b4f2030a2025c545fccb0e1ef317393eaba42e7eabd28eb6041"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-win32.whl", hash = "sha256:68fc1f1ba168724771e38bee37d940d2865cb0f562380a1fb1ffb428b75cb692"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-win_amd64.whl", hash = "sha256:281309265596e388ef483250db3640e5f414168c5a67e9c665cafce9492eda2f"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:60989127da422b74a04345096c10d416c2b41bd7bf2a380eb541059e4e999980"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:246b123cc54bb5361588acc54218c8c9fb73068bf227a4a531d8ed56fa3ca7d6"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34eccd14566f8fe14b2b95bb13b11572f7c7d5c36da61caf414d23b91fcc5d94"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18d0ef97766055fec15b5de2c06dd8e7654705ce3e5e5eed3b6651a1d2a9a152"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d3f82c171b4ccd83bbaf35aa05e44e690113bd4f3b7b6cc54d2219b132f3ae55"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ead20f7913a9c1e894aebe47cccf9dc834e1618b7aa96155d2091a626e59c972"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ca49a8119c6cbd77375ae303b0cfd8c11f011abbbd64601167ecca18a87e7cdd"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:323ba25b92454adb36fa425dc5cf6f8f19f78948cbad2e7bc6cdf7b0d7982e59"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:1236ed0952fbd919c100bc839eaa4a39ebc397ed1c08a97fc45fee2a595aa1b3"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:729177eaf0aefca0994ce4cffe96ad3c75e377c7b6f4efa59ebf003b6d398716"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-win32.whl", hash = "sha256:804d99b24ad523a1fe18cc707bf741670332f7c7412e9d49cb5eab67e886b9b5"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-win_amd64.whl", hash = 
"sha256:a6cdcc3ede532f4a4b96000b6362099591ab4a3e913d70bcbac2b56c872446f7"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:72dffbd8b4194858d0941062a9766f8297e8868e1dd07a7b36212aaa90f49472"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:30dcc86377618a4c8f3b72418df92e77be4254d8f89f14b8e8f57d6d43603c0f"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:31a34c508c003a4347d389a9e6fcc2307cc2150eb516462a7a17512130de109e"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:15208be1c50b99203fe88d15695f22a5bed95ab3f84354c494bcb1d08557df67"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1873aade94b74715be2246321c8650cabf5a0d098a95bab81145ffffa4c13876"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a58c98a7e9c021f357348867f537017057c2ed7f77337fd914d0bedb35dace7"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4686818798f9194d03c9129a4d9a702d9e113a89cb03bffe08c6cf799e053291"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ebdc36bea43063116f0486869652cb2ed7032dbc59fbcb4445c4862b5c1ecf7f"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:ca08decd2697fdea0aea364b370b1249d47336aec935f87b8bbfd7da5b2ee9c1"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ac05fb791acf5e1a3e39402641827780fe44d27e72567a000412c648a85ba860"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-win32.whl", hash = "sha256:9dba73be7305b399924709b91682299794887cbbd88e38226ed9f6712eabee90"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-win_amd64.whl", hash = "sha256:f7ae5d65ccfbebdfa761585228eb4d0df3a8b15cfb53bd953e713e09fbb12957"}, +] + [[package]] name = "ptyprocess" version = "0.7.0" @@ -5296,6 +5379,7 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -5303,8 +5387,15 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", 
hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -5321,6 +5412,7 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -5328,6 +5420,7 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -6088,7 +6181,7 @@ files = [ ] [package.dependencies] -greenlet = {version = "!=0.4.17", markers = "platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\""} +greenlet = {version = "!=0.4.17", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""} typing-extensions = ">=4.2.0" [package.extras] @@ -6248,8 +6341,8 @@ packaging = ">=21.3" pandas = ">=0.25" patsy = ">=0.5.2" scipy = [ - {version = ">=1.3", markers = "(python_version > \"3.9\" or platform_system != \"Windows\" or platform_machine != \"x86\") and python_version < \"3.12\""}, - {version = ">=1.3,<1.9", markers = "(python_version == \"3.8\" or python_version == \"3.9\") and platform_system == \"Windows\" and platform_machine == \"x86\""}, + {version = ">=1.3", markers = "python_version > \"3.9\" and python_version < \"3.12\" or platform_system != \"Windows\" and python_version < \"3.12\" or platform_machine != \"x86\" and python_version < \"3.12\""}, + {version = ">=1.3,<1.9", markers = "python_version == \"3.8\" and platform_system == \"Windows\" and platform_machine == \"x86\" or python_version == \"3.9\" and platform_system == \"Windows\" and platform_machine == \"x86\""}, ] [package.extras] @@ -7119,4 +7212,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.11" -content-hash = "ed25c76ba0aeea3d6fc6c59725c127160d13c12b527a3cf3900cb58db177750c" +content-hash = "44c47c0f926f2494ef43ed357af82aa10b2ce5d1c5a46197a594ed94ec1e8b6a" diff --git a/pyproject.toml b/pyproject.toml index 0c8318999..36a66722a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ statsmodels = "^0.13.0" tweepy = "4.4" xarray = "^2022.6.0" xgboost = "^1.7.4" +psycopg2-binary = "^2.9.9" [tool.poetry.dev-dependencies] pylint = "^2.12.2" From 60b1a93339085bfc9ca7a6122b952e544a9e6605 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 16 Oct 2023 15:43:44 -0300 Subject: [PATCH 085/145] =?UTF-8?q?coment=C3=A1rios=20dos=20parametros?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/flows.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index e95eae285..e08bb919e 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -48,15 +48,20 @@ ) as default_capture_flow: # Configuração # + # Parâmetros Gerais # table_id = Parameter("table_id", default=None) + dataset_id = Parameter("dataset_id", default=None) partition_date_only = Parameter("partition_date_only", default=None) + + # Parâmetros Captura # extract_params = Parameter("extract_params", default=None) - dataset_id = Parameter("dataset_id", default=None) 
secret_path = Parameter("secret_path", default=None) - primary_key = Parameter("primary_key", default=None) source_type = Parameter("source_type", default=None) recapture = Parameter("recapture", default=False) + # Parâmetros Pré-tratamento # + primary_key = Parameter("primary_key", default=None) + with case(recapture, True): _, recapture_timestamps, previous_errors = query_logs( dataset_id=dataset_id, From ff7797355e8dc90db951e23eee9e7c10c32b29e0 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 16 Oct 2023 15:44:02 -0300 Subject: [PATCH 086/145] =?UTF-8?q?adicionar=20conex=C3=A3o=20com=20postgr?= =?UTF-8?q?esql?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/utils.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 6a6c70ee5..8775ca9b8 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -20,6 +20,8 @@ import pandas as pd from google.cloud.storage.blob import Blob import pymysql +import psycopg2 +import psycopg2.extras from prefect.schedules.clocks import IntervalClock @@ -435,7 +437,6 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l clocks = [] for count, parameters in enumerate(table_parameters): parameter_defaults = parameters | general_flow_params - log(f"parameter_defaults: {parameter_defaults}") clocks.append( IntervalClock( interval=clock_interval, @@ -630,12 +631,16 @@ def get_raw_data_db( tuple[str, str, str]: Error, data and filetype """ connection_mapping = { - # 'postgresql': {'connector': psycopg2.connect, 'port': '5432', 'cursor':{'cursor_factory': psycopg2.extras.RealDictCursor}}, + "postgresql": { + "connector": psycopg2.connect, + "port": "5432", + "cursor": {"cursor_factory": psycopg2.extras.RealDictCursor}, + }, "mysql": { "connector": pymysql.connect, "port": "3306", "cursor": {"cursor": pymysql.cursors.DictCursor}, - } + }, } data = None From b0843876f6c4f61094b3481bdb0da546555094d3 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 16 Oct 2023 16:19:05 -0300 Subject: [PATCH 087/145] mudar bilhetagem para extrair do db --- pipelines/rj_smtr/constants.py | 9 ++++----- pipelines/rj_smtr/tasks.py | 7 ++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 0037c6989..ee96e8a79 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -170,19 +170,18 @@ class constants(Enum): # pylint: disable=c0103 "databases": { "principal_db": { "engine": "mysql", - "host": "principal-database-replica.internal", + "host": "10.5.114.121", }, "tarifa_db": { "engine": "postgres", - "host": "tarifa-database-replica.internal", + "host": "10.5.113.254", }, "transacao_db": { "engine": "postgres", - "host": "transacao-database-replica.internal", + "host": "10.5.114.65", }, }, - "vpn_url": "http://vpn-jae.mobilidade.rio/", - "source_type": "api-json", + "source_type": "db", } BILHETAGEM_CAPTURE_RUN_INTERVAL = { diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index eea38f3f2..3969f28b9 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -564,18 +564,15 @@ def create_request_params( database = constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["databases"][ extract_params["database"] ] - request_url = ( - constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["vpn_url"] - + database["engine"] - ) + request_url = database["host"] 
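# Condensed sketch (not the repository implementation) of how the engine-keyed
# connector mapping added to get_raw_data_db above can be exercised together with
# the request_params dict built by create_request_params ("database", "engine",
# "query"). Host, credentials and the query are placeholders.
from typing import List

import pandas as pd
import psycopg2
import pymysql

CONNECTORS = {
    "postgresql": psycopg2.connect,
    "mysql": pymysql.connect,
}


def fetch_records(request_params: dict, host: str, user: str, password: str) -> List[dict]:
    """Open the connection matching request_params['engine'] and return rows as dicts."""
    connector = CONNECTORS[request_params["engine"]]
    connection = connector(
        host=host,
        user=user,
        password=password,
        database=request_params["database"],
    )
    try:
        # pandas can read directly from a DBAPI connection and return records as dicts
        return pd.read_sql(sql=request_params["query"], con=connection).to_dict(orient="records")
    finally:
        connection.close()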
datetime_range = get_datetime_range( timestamp=timestamp, interval=timedelta(**extract_params["run_interval"]) ) request_params = { - "host": database["host"], # TODO: exibir no log em ambiente fechado "database": extract_params["database"], + "engine": database["engine"], "query": extract_params["query"].format(**datetime_range), } From 032763c6f65fcf9310775e2d70a5ecba2bbfafba Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 16 Oct 2023 16:19:32 -0300 Subject: [PATCH 088/145] padronizar nomenclatura dos argumentos --- pipelines/rj_smtr/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 8775ca9b8..cf69edc2c 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -615,14 +615,14 @@ def get_raw_data_gcs( def get_raw_data_db( - sql: str, dbms: str, host: str, secret_path: str, database: str + query: str, engine: str, host: str, secret_path: str, database: str ) -> tuple[str, str, str]: """ Get data from Databases Args: - sql (str): the SQL Query to execute - dbms (str): The datase management system + query (str): the SQL Query to execute + engine (str): The datase management system host (str): The database host secret_path (str): Secret path to get credentials database (str): The database to connect @@ -650,7 +650,7 @@ def get_raw_data_db( try: credentials = get_vault_secret(secret_path)["data"] - connection = connection_mapping[dbms]( + connection = connection_mapping[engine]( host=host, user=credentials["user"], password=credentials["password"], @@ -658,8 +658,8 @@ def get_raw_data_db( ) with connection: - with connection.cursor(**connection_mapping[dbms]["cursor"]) as cursor: - cursor.execute(sql) + with connection.cursor(**connection_mapping[engine]["cursor"]) as cursor: + cursor.execute(query) data = cursor.fetchall() data = [dict(d) for d in data] From ffb205176bed69b67f5578392ac4951a40483691 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 16 Oct 2023 16:43:41 -0300 Subject: [PATCH 089/145] mudar label schedule para dev --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py index 21e13f05b..6cb4b0724 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py @@ -21,7 +21,7 @@ **constants.BILHETAGEM_CAPTURE_RUN_INTERVAL.value["transacao_run_interval"] ), labels=[ - emd_constants.RJ_SMTR_AGENT_LABEL.value, + emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, ], table_parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, dataset_id=constants.BILHETAGEM_DATASET_ID.value, From 10911c60af21122883c0c6557678128b6397a595 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 16 Oct 2023 18:19:37 -0300 Subject: [PATCH 090/145] corrigir constante db bilhetagem postgresql --- pipelines/rj_smtr/constants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index ee96e8a79..5dde5c55a 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -173,11 +173,11 @@ class constants(Enum): # pylint: disable=c0103 "host": "10.5.114.121", }, "tarifa_db": { - "engine": "postgres", + "engine": "postgresql", "host": "10.5.113.254", }, "transacao_db": { - "engine": "postgres", + "engine": "postgresql", "host": 
"10.5.114.65", }, }, From 7e51e69ce7189306683ee4142c461bd1ee1a5068 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 16 Oct 2023 18:24:07 -0300 Subject: [PATCH 091/145] =?UTF-8?q?alterar=20nomea=C3=A7=C3=A3o=20para=20r?= =?UTF-8?q?uns=20de=20recaptura?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/flows.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index e08bb919e..14d8a1207 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -67,15 +67,18 @@ dataset_id=dataset_id, table_id=table_id, ) + RECAPTURE_RUNNAME_SUFIX = " Recaptura" with case(recapture, False): capture_timestamp = [get_current_timestamp()] previous_errors = [None] + CAPTURE_RUNNAME_SUFIX = "" timestamps = merge(recapture_timestamps, capture_timestamp) + runname_sufix = merge(RECAPTURE_RUNNAME_SUFIX, CAPTURE_RUNNAME_SUFIX) rename_flow_run = rename_current_flow_run_now_time( - prefix=default_capture_flow.name + " " + table_id + ": ", + prefix=default_capture_flow.name + " " + table_id + runname_sufix + ": ", now_time=get_now_time(), ) From e256f120f52b128367301b39ce8c34541c6423a7 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 16 Oct 2023 19:15:03 -0300 Subject: [PATCH 092/145] ajuste connector --- pipelines/rj_smtr/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index cf69edc2c..89bd2a3c7 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -650,7 +650,7 @@ def get_raw_data_db( try: credentials = get_vault_secret(secret_path)["data"] - connection = connection_mapping[engine]( + connection = connection_mapping[engine]["connector"]( host=host, user=credentials["user"], password=credentials["password"], From c67a93e970450b0a2316bda6c66ba1b2055c2a5f Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 09:06:48 -0300 Subject: [PATCH 093/145] alterar IP para DNS --- pipelines/rj_smtr/constants.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 5dde5c55a..9369bb465 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -170,15 +170,15 @@ class constants(Enum): # pylint: disable=c0103 "databases": { "principal_db": { "engine": "mysql", - "host": "10.5.114.121", + "host": "principal-database-replica.internal", }, "tarifa_db": { "engine": "postgresql", - "host": "10.5.113.254", + "host": "tarifa-database-replica.internal", }, "transacao_db": { "engine": "postgresql", - "host": "10.5.114.65", + "host": "transacao-database-replica.internal", }, }, "source_type": "db", From a5d342c98fb4d531ed77f2c4e3cf986dc8b85dcb Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 10:28:32 -0300 Subject: [PATCH 094/145] Serialize datetime objects / read sql with pandas --- pipelines/rj_smtr/utils.py | 64 +++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 89bd2a3c7..16ed538d3 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -7,8 +7,8 @@ from ftplib import FTP from pathlib import Path -from datetime import timedelta, datetime -from typing import List, Union +from datetime import timedelta, datetime, date +from typing import List, Union, Any import traceback import io import json @@ -462,17 +462,40 @@ def 
dict_contains_keys(input_dict: dict, keys: list[str]) -> bool: return all(x in input_dict.keys() for x in keys) +def custom_serialization(obj: Any) -> Any: + """ + Function to serialize not JSON serializable objects + + Args: + obj (Any): Object to serialize + + Returns: + Any: Serialized object + """ + if isinstance(obj, (datetime, date, pd.Timestamp)): + if obj.tzinfo is None: + obj = obj.tz_localize(emd_constants.DEFAULT_TIMEZONE.value) + else: + obj = obj.tz_convert(emd_constants.DEFAULT_TIMEZONE.value) + + return obj.isoformat() + + raise TypeError(f"Object of type {type(obj)} is not JSON serializable") + + def save_raw_local_func( - data: Union[dict, str], filepath: str, mode: str = "raw", filetype: str = "json" + data: Union[dict, str], + filepath: str, + mode: str = "raw", + filetype: str = "json", ) -> str: """ Saves json response from API to .json file. Args: + data (Union[dict, str]): Raw data to save filepath (str): Path which to save raw file - status (dict): Must contain keys - * data: json returned from API - * error: error catched from API request mode (str, optional): Folder to save locally, later folder which to upload to GCS. + filetype (str, optional): The file format Returns: str: Path to the saved file """ @@ -485,10 +508,8 @@ def save_raw_local_func( if isinstance(data, str): data = json.loads(data) with Path(_filepath).open("w", encoding="utf-8") as fi: - json.dump(data, fi) + json.dump(data, fi, default=custom_serialization) - # if filetype == "csv": - # pass if filetype in ("txt", "csv"): with open(_filepath, "w", encoding="utf-8") as file: file.write(data) @@ -630,17 +651,9 @@ def get_raw_data_db( Returns: tuple[str, str, str]: Error, data and filetype """ - connection_mapping = { - "postgresql": { - "connector": psycopg2.connect, - "port": "5432", - "cursor": {"cursor_factory": psycopg2.extras.RealDictCursor}, - }, - "mysql": { - "connector": pymysql.connect, - "port": "3306", - "cursor": {"cursor": pymysql.cursors.DictCursor}, - }, + connector_mapping = { + "postgresql": psycopg2.connect, + "mysql": pymysql.connect, } data = None @@ -650,19 +663,14 @@ def get_raw_data_db( try: credentials = get_vault_secret(secret_path)["data"] - connection = connection_mapping[engine]["connector"]( + with connector_mapping[engine]( host=host, user=credentials["user"], password=credentials["password"], database=database, - ) - - with connection: - with connection.cursor(**connection_mapping[engine]["cursor"]) as cursor: - cursor.execute(query) - data = cursor.fetchall() + ) as connection: + data = pd.read_sql(sql=query, con=connection).to_dict(orient="records") - data = [dict(d) for d in data] except Exception: error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") From 16ffff35eee7cc10d56334174602ced25b2e4461 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 12:20:37 -0300 Subject: [PATCH 095/145] mudar logica do nome da run --- pipelines/rj_smtr/flows.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 14d8a1207..db72bfd3d 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -5,7 +5,7 @@ from prefect.run_configs import KubernetesRun from prefect.storage import GCS -from prefect import case, Parameter +from prefect import case, Parameter, task from prefect.utilities.edges import unmapped from prefect.tasks.control_flow import merge @@ -62,23 +62,26 @@ # Parâmetros Pré-tratamento # primary_key = 
Parameter("primary_key", default=None) + get_run_name_prefix = task( + lambda recap: "Recaptura" if recap else "Captura", + name="get_run_name_prefix", + checkpoint=False, + ) + with case(recapture, True): _, recapture_timestamps, previous_errors = query_logs( dataset_id=dataset_id, table_id=table_id, ) - RECAPTURE_RUNNAME_SUFIX = " Recaptura" with case(recapture, False): capture_timestamp = [get_current_timestamp()] previous_errors = [None] - CAPTURE_RUNNAME_SUFIX = "" timestamps = merge(recapture_timestamps, capture_timestamp) - runname_sufix = merge(RECAPTURE_RUNNAME_SUFIX, CAPTURE_RUNNAME_SUFIX) rename_flow_run = rename_current_flow_run_now_time( - prefix=default_capture_flow.name + " " + table_id + runname_sufix + ": ", + prefix="SMTR: " + get_run_name_prefix(recap=recapture) + " " + table_id + ": ", now_time=get_now_time(), ) From 55fbe34f12bbe6d26264e269de0e4b82244caa67 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 12:21:30 -0300 Subject: [PATCH 096/145] cria recaptura bilhetagem --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 51 ++++++++++++++++--- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index fb6e67594..0a1e29ba9 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -39,7 +39,14 @@ from pipelines.rj_smtr.constants import constants -from pipelines.rj_smtr.schedules import every_hour +from pipelines.rj_smtr.schedules import every_hour, every_minute + + +GENERAL_CAPTURE_DEFAULT_PARAMS = { + "dataset_id": constants.BILHETAGEM_DATASET_ID.value, + "secret_path": constants.BILHETAGEM_SECRET_PATH.value, + "source_type": constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], +} # Flows # @@ -52,7 +59,25 @@ image=emd_constants.DOCKER_IMAGE.value, labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) -bilhetagem_transacao_captura.schedule = bilhetagem_transacao_schedule + +bilhetagem_transacao_captura = set_default_parameters( + flow=bilhetagem_transacao_captura, + default_parameters=GENERAL_CAPTURE_DEFAULT_PARAMS + | constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, +) + +bilhetagem_transacao_captura.schedule = every_minute + +bilhetagem_transacao_recaptura = deepcopy(default_capture_flow) +bilhetagem_transacao_recaptura.name = "SMTR: Bilhetagem Transação - Recaptura (subflow)" +bilhetagem_transacao_recaptura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) +bilhetagem_transacao_recaptura = set_default_parameters( + flow=bilhetagem_transacao_recaptura, + default_parameters=GENERAL_CAPTURE_DEFAULT_PARAMS + | constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value + | {"recapture": True}, +) + # BILHETAGEM AUXILIAR - SUBFLOW PARA RODAR ANTES DE CADA MATERIALIZAÇÃO # @@ -66,11 +91,7 @@ bilhetagem_auxiliar_captura = set_default_parameters( flow=bilhetagem_auxiliar_captura, - default_parameters={ - "dataset_id": constants.BILHETAGEM_DATASET_ID.value, - "secret_path": constants.BILHETAGEM_SECRET_PATH.value, - "source_type": constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], - }, + default_parameters=GENERAL_CAPTURE_DEFAULT_PARAMS, ) # MATERIALIZAÇÃO - SUBFLOW DE MATERIALIZAÇÃO @@ -106,12 +127,28 @@ LABELS = get_current_flow_labels() + # Recaptura Transações + + run_recaptura = create_flow_run( + flow_name=bilhetagem_transacao_recaptura.name, + project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + labels=LABELS, + ) + + 
wait_recaptura = wait_for_flow_run( + run_recaptura, + stream_states=True, + stream_logs=True, + raise_final_state=True, + ) + # Captura runs_captura = create_flow_run.map( flow_name=unmapped(bilhetagem_auxiliar_captura.name), project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), + upstream_tasks=[wait_recaptura], ) wait_captura = wait_for_flow_run.map( From db6e6d9ada18bb35f27d1b8ee46ac473dc17bea7 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 14:32:59 -0300 Subject: [PATCH 097/145] mudar host para IP / adiciona interval_minutes --- pipelines/rj_smtr/constants.py | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 9369bb465..ccf1c6c44 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -170,25 +170,20 @@ class constants(Enum): # pylint: disable=c0103 "databases": { "principal_db": { "engine": "mysql", - "host": "principal-database-replica.internal", + "host": "10.5.114.121", }, "tarifa_db": { "engine": "postgresql", - "host": "tarifa-database-replica.internal", + "host": "10.5.113.254", }, "transacao_db": { "engine": "postgresql", - "host": "transacao-database-replica.internal", + "host": "10.5.115.1", }, }, "source_type": "db", } - BILHETAGEM_CAPTURE_RUN_INTERVAL = { - "transacao_run_interval": {"minutes": 1}, - "principal_run_interval": {"hours": 1}, - } - BILHETAGEM_TRANSACAO_CAPTURE_PARAMS = { "table_id": "transacao", "partition_date_only": False, @@ -203,9 +198,9 @@ class constants(Enum): # pylint: disable=c0103 data_processamento BETWEEN '{start}' AND '{end}' """, - "run_interval": BILHETAGEM_CAPTURE_RUN_INTERVAL["transacao_run_interval"], }, - "primary_key": ["id"], # id column to nest data on + "primary_key": ["id"], + "interval_minutes": 1, } BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" @@ -224,11 +219,9 @@ class constants(Enum): # pylint: disable=c0103 WHERE DT_INCLUSAO >= '{start}' """, - "run_interval": BILHETAGEM_CAPTURE_RUN_INTERVAL[ - "principal_run_interval" - ], }, "primary_key": ["CD_LINHA"], # id column to nest data on + "interval_minutes": 60, }, { "table_id": "grupo", @@ -243,11 +236,9 @@ class constants(Enum): # pylint: disable=c0103 WHERE DT_INCLUSAO >= '{start}' """, - "run_interval": BILHETAGEM_CAPTURE_RUN_INTERVAL[ - "principal_run_interval" - ], }, "primary_key": ["CD_GRUPO"], # id column to nest data on + "interval_minutes": 60, }, { "table_id": "grupo_linha", @@ -262,11 +253,9 @@ class constants(Enum): # pylint: disable=c0103 WHERE DT_INCLUSAO >= '{start}' """, - "run_interval": BILHETAGEM_CAPTURE_RUN_INTERVAL[ - "principal_run_interval" - ], }, - "primary_key": ["CD_GRUPO", "CD_LINHA"], # id column to nest data on + "primary_key": ["CD_GRUPO", "CD_LINHA"], + "interval_minutes": 60, }, { "table_id": "matriz_integracao", @@ -281,14 +270,12 @@ class constants(Enum): # pylint: disable=c0103 WHERE dt_inclusao >= '{start}' """, - "run_interval": BILHETAGEM_CAPTURE_RUN_INTERVAL[ - "principal_run_interval" - ], }, "primary_key": [ "cd_versao_matriz", "cd_integracao", ], # id column to nest data on + "interval_minutes": 60, }, ] From d115126bcf7ab550c8a7ac271b52c356264b0be6 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 14:33:16 -0300 Subject: [PATCH 098/145] adiciona parametro interval minutes --- pipelines/rj_smtr/flows.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index db72bfd3d..f9c40bd23 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -58,6 +58,7 @@ secret_path = Parameter("secret_path", default=None) source_type = Parameter("source_type", default=None) recapture = Parameter("recapture", default=False) + interval_minutes = Parameter("interval_minutes", default=None) # Parâmetros Pré-tratamento # primary_key = Parameter("primary_key", default=None) @@ -70,8 +71,7 @@ with case(recapture, True): _, recapture_timestamps, previous_errors = query_logs( - dataset_id=dataset_id, - table_id=table_id, + dataset_id=dataset_id, table_id=table_id, interval_minutes=interval_minutes ) with case(recapture, False): @@ -104,6 +104,7 @@ extract_params=unmapped(extract_params), table_id=unmapped(table_id), timestamp=timestamps, + interval_minutes=interval_minutes, ) request_params, request_paths = unpack_mapped_results_nout2( From 97c865ae23b2fbc83ec9d88c274cc698379db037 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 14:33:27 -0300 Subject: [PATCH 099/145] remove linha comentada --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 0a1e29ba9..df47f17eb 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -179,4 +179,3 @@ labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_transacao_tratamento.schedule = every_hour -# bilhetagem_materializacao.schedule = bilhetagem_materializacao_schedule From 0bf3ade6cc89ebc1714421ecb3207524dd969cbb Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 14:34:01 -0300 Subject: [PATCH 100/145] remove arquivo de schedules da bilhetagem --- .../schedules.py | 33 ------------------- 1 file changed, 33 deletions(-) delete mode 100644 pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py deleted file mode 100644 index 6cb4b0724..000000000 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/schedules.py +++ /dev/null @@ -1,33 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Schedules for br_rj_riodejaneiro_bilhetagem -""" - -from datetime import timedelta - -from prefect.schedules import Schedule - -from pipelines.constants import constants as emd_constants -from pipelines.utils.utils import untuple_clocks as untuple - -from pipelines.rj_smtr.constants import constants -from pipelines.rj_smtr.utils import ( - generate_execute_schedules, -) - -BILHETAGEM_TRANSACAO_INTERVAL = timedelta(minutes=1) -bilhetagem_transacao_clocks = generate_execute_schedules( - clock_interval=timedelta( - **constants.BILHETAGEM_CAPTURE_RUN_INTERVAL.value["transacao_run_interval"] - ), - labels=[ - emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value, - ], - table_parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, - dataset_id=constants.BILHETAGEM_DATASET_ID.value, - secret_path=constants.BILHETAGEM_SECRET_PATH.value, - source_type=constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["source_type"], - runs_interval_minutes=0, -) - -bilhetagem_transacao_schedule = Schedule(clocks=untuple(bilhetagem_transacao_clocks)) From 35c80d4532adc2c4ee12b300d4de201ee934bd7a Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 14:34:31 -0300 Subject: [PATCH 
101/145] =?UTF-8?q?generaliza=20fun=C3=A7=C3=A3o=20query?= =?UTF-8?q?=20logs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/tasks.py | 99 +++++------------------------ pipelines/rj_smtr/utils.py | 123 +++++++++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 83 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 3969f28b9..2b733aef0 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -26,7 +26,6 @@ bq_project, get_table_min_max_value, get_last_run_timestamp, - log_critical, data_info_str, dict_contains_keys, get_raw_data_api, @@ -37,6 +36,7 @@ read_raw_data, save_treated_local_func, save_raw_local_func, + query_logs_func, ) from pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -370,6 +370,7 @@ def query_logs( table_id: str, datetime_filter=None, max_recaptures: int = 60, + interval_minutes: int = 1, ): """ Queries capture logs to check for errors @@ -380,92 +381,22 @@ def query_logs( datetime_filter (pendulum.datetime.DateTime, optional): filter passed to query. This task will query the logs table for the last 1 day before datetime_filter + max_recaptures (int, optional): maximum number of recaptures to be done + interval_minutes (int, optional): interval in minutes between each recapture Returns: - list: containing timestamps for which the capture failed + lists: errors (bool), + timestamps (list of pendulum.datetime.DateTime), + previous_errors (list of previous errors) """ - if not datetime_filter: - datetime_filter = pendulum.now(constants.TIMEZONE.value).replace( - second=0, microsecond=0 - ) - elif isinstance(datetime_filter, str): - datetime_filter = datetime.fromisoformat(datetime_filter).replace( - second=0, microsecond=0 - ) - - query = f""" - with t as ( - select - datetime(timestamp_array) as timestamp_array - from - unnest(GENERATE_TIMESTAMP_ARRAY( - timestamp_sub('{datetime_filter.strftime('%Y-%m-%d %H:%M:%S')}', interval 1 day), - timestamp('{datetime_filter.strftime('%Y-%m-%d %H:%M:%S')}'), - interval 1 minute) - ) as timestamp_array - where timestamp_array < '{datetime_filter.strftime('%Y-%m-%d %H:%M:%S')}' - ), - logs as ( - select - *, - timestamp_trunc(timestamp_captura, minute) as timestamp_array - from - rj-smtr.{dataset_id}.{table_id}_logs - where - data between - date(datetime_sub('{datetime_filter.strftime('%Y-%m-%d %H:%M:%S')}', - interval 1 day)) - and date('{datetime_filter.strftime('%Y-%m-%d %H:%M:%S')}') - and - timestamp_captura between - datetime_sub('{datetime_filter.strftime('%Y-%m-%d %H:%M:%S')}', interval 1 day) - and '{datetime_filter.strftime('%Y-%m-%d %H:%M:%S')}' - order by timestamp_captura + return query_logs_func( + dataset_id=dataset_id, + table_id=table_id, + datetime_filter=datetime_filter, + max_recaptures=max_recaptures, + interval_minutes=interval_minutes, ) - select - case - when logs.timestamp_captura is not null then logs.timestamp_captura - else t.timestamp_array - end as timestamp_captura, - logs.erro - from - t - left join - logs - on - logs.timestamp_array = t.timestamp_array - where - logs.sucesso is not True - order by - timestamp_captura - """ - log(f"Run query to check logs:\n{query}") - results = bd.read_sql(query=query, billing_project_id=bq_project()) - if len(results) > 0: - results["timestamp_captura"] = ( - pd.to_datetime(results["timestamp_captura"]) - .dt.tz_localize(constants.TIMEZONE.value) - 
.to_list() - ) - log(f"Recapture data for the following {len(results)} timestamps:\n{results}") - if len(results) > max_recaptures: - message = f""" - [SPPO - Recaptures] - Encontradas {len(results)} timestamps para serem recapturadas. - Essa run processará as seguintes: - ##### - {results[:max_recaptures]} - ##### - Sobraram as seguintes para serem recapturadas na próxima run: - ##### - {results[max_recaptures:]} - ##### - """ - log_critical(message) - results = results[:max_recaptures] - return True, results["timestamp_captura"].to_list(), results["erro"].to_list() - return False, [], [] @task @@ -543,6 +474,7 @@ def create_request_params( table_id: str, dataset_id: str, timestamp: datetime, + interval_minutes: int, ) -> tuple[str, str]: """ Task to create request params @@ -552,6 +484,7 @@ def create_request_params( table_id (str): table_id on BigQuery dataset_id (str): dataset_id on BigQuery timestamp (datetime): timestamp for flow run + interval_minutes (int): interval in minutes between each capture Returns: request_params: host, database and query to request data @@ -567,7 +500,7 @@ def create_request_params( request_url = database["host"] datetime_range = get_datetime_range( - timestamp=timestamp, interval=timedelta(**extract_params["run_interval"]) + timestamp=timestamp, interval=timedelta(minutes=interval_minutes) ) request_params = { diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 16ed538d3..41dc1dd02 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -8,6 +8,7 @@ from pathlib import Path from datetime import timedelta, datetime, date +import pendulum from typing import List, Union, Any import traceback import io @@ -449,6 +450,128 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l return clocks +def query_logs_func( + dataset_id: str, + table_id: str, + datetime_filter=None, + max_recaptures: int = 60, + interval_minutes: int = 1, +): + """ + Queries capture logs to check for errors + + Args: + dataset_id (str): dataset_id on BigQuery + table_id (str): table_id on BigQuery + datetime_filter (pendulum.datetime.DateTime, optional): + filter passed to query. 
This task will query the logs table + for the last 1 day before datetime_filter + max_recaptures (int, optional): maximum number of recaptures to be done + interval_minutes (int, optional): interval in minutes between each recapture + + Returns: + lists: errors (bool), + timestamps (list of pendulum.datetime.DateTime), + previous_errors (list of previous errors) + """ + + if not datetime_filter: + datetime_filter = pendulum.now(constants.TIMEZONE.value).replace( + second=0, microsecond=0 + ) + elif isinstance(datetime_filter, str): + datetime_filter = datetime.fromisoformat(datetime_filter).replace( + second=0, microsecond=0 + ) + + datetime_filter = datetime_filter.strftime("%Y-%m-%d %H:%M:%S") + + query = f""" + WITH + t AS ( + SELECT + DATETIME(timestamp_array) AS timestamp_array + FROM + UNNEST( + GENERATE_TIMESTAMP_ARRAY( + TIMESTAMP_SUB('{datetime_filter}', INTERVAL 1 day), + TIMESTAMP('{datetime_filter}'), + INTERVAL {interval_minutes} minute) ) + AS timestamp_array + WHERE + timestamp_array < '{datetime_filter}' ), + logs_table AS ( + SELECT + SAFE_CAST(DATETIME(TIMESTAMP(timestamp_captura), + "America/Sao_Paulo") AS DATETIME) timestamp_captura, + SAFE_CAST(sucesso AS BOOLEAN) sucesso, + SAFE_CAST(erro AS STRING) erro, + SAFE_CAST(DATA AS DATE) DATA + FROM + rj-smtr-staging.{dataset_id}_staging.{table_id}_logs AS t + ), + logs AS ( + SELECT + *, + TIMESTAMP_TRUNC(timestamp_captura, minute) AS timestamp_array + FROM + logs_table + WHERE + DATA BETWEEN DATE(DATETIME_SUB('{datetime_filter}', + INTERVAL 1 day)) + AND DATE('{datetime_filter}') + AND timestamp_captura BETWEEN + DATETIME_SUB('{datetime_filter}', INTERVAL 1 day) + AND '{datetime_filter}' + ORDER BY + timestamp_captura ) + SELECT + CASE + WHEN logs.timestamp_captura IS NOT NULL THEN logs.timestamp_captura + ELSE + t.timestamp_array + END + AS timestamp_captura, + logs.erro + FROM + t + LEFT JOIN + logs + ON + logs.timestamp_array = t.timestamp_array + WHERE + logs.sucesso IS NOT TRUE + ORDER BY + timestamp_captura + """ + log(f"Run query to check logs:\n{query}") + results = bd.read_sql(query=query, billing_project_id=bq_project()) + if len(results) > 0: + results["timestamp_captura"] = ( + pd.to_datetime(results["timestamp_captura"]) + .dt.tz_localize(constants.TIMEZONE.value) + .to_list() + ) + log(f"Recapture data for the following {len(results)} timestamps:\n{results}") + if len(results) > max_recaptures: + message = f""" + [SPPO - Recaptures] + Encontradas {len(results)} timestamps para serem recapturadas. 
+ Essa run processará as seguintes: + ##### + {results[:max_recaptures]} + ##### + Sobraram as seguintes para serem recapturadas na próxima run: + ##### + {results[max_recaptures:]} + ##### + """ + log_critical(message) + results = results[:max_recaptures] + return True, results["timestamp_captura"].to_list(), results["erro"].to_list() + return False, [], [] + + def dict_contains_keys(input_dict: dict, keys: list[str]) -> bool: """ Test if the input dict has all keys present in the list From a59e353ff63699695542ce63e72871a9282412b6 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 14:45:44 -0300 Subject: [PATCH 102/145] ajuste remove schedule personalizado --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index df47f17eb..4c54424ba 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -33,10 +33,6 @@ get_current_timestamp, ) -from pipelines.rj_smtr.br_rj_riodejaneiro_bilhetagem.schedules import ( - bilhetagem_transacao_schedule, -) - from pipelines.rj_smtr.constants import constants from pipelines.rj_smtr.schedules import every_hour, every_minute From 2616565cae881262abe6eef88444f32ca2f07a81 Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 15:05:06 -0300 Subject: [PATCH 103/145] unmap interval_minutes --- pipelines/rj_smtr/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index f9c40bd23..b951a18a4 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -104,7 +104,7 @@ extract_params=unmapped(extract_params), table_id=unmapped(table_id), timestamp=timestamps, - interval_minutes=interval_minutes, + interval_minutes=unmapped(interval_minutes), ) request_params, request_paths = unpack_mapped_results_nout2( From 0696626dc6057901b2344b145137cc94bc5d3e5d Mon Sep 17 00:00:00 2001 From: Rafael Date: Tue, 17 Oct 2023 17:50:43 -0300 Subject: [PATCH 104/145] =?UTF-8?q?altera=C3=A7=C3=A3o=20de=20pasta=20de?= =?UTF-8?q?=20grava=C3=A7=C3=A3o=20para=20teste?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/tasks.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 2b733aef0..9e80873fa 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -309,7 +309,7 @@ def create_local_partition_path( either to save raw or staging files. 
""" data_folder = os.getenv("DATA_FOLDER", "data") - file_path = f"{os.getcwd()}/{data_folder}/{{mode}}/{dataset_id}/{table_id}" + file_path = f"{os.getcwd()}/{data_folder}/{{mode}}/{dataset_id}_dev/{table_id}" file_path += f"/{partitions}/{filename}.{{filetype}}" log(f"Creating file path: {file_path}") return file_path @@ -780,7 +780,7 @@ def upload_raw_data_to_gcs( st_obj = Storage(table_id=table_id, dataset_id=dataset_id) log( f"""Uploading raw file to bucket {st_obj.bucket_name} at - {st_obj.bucket_name}/{dataset_id}/{table_id}""" + {st_obj.bucket_name}/{dataset_id}_dev/{table_id}""" ) st_obj.upload( path=raw_filepath, @@ -824,7 +824,7 @@ def upload_staging_data_to_gcs( try: # Creates and publish table if it does not exist, append to it otherwise create_or_append_table( - dataset_id=dataset_id, + dataset_id=f"{dataset_id}_dev", table_id=table_id, path=staging_filepath, partitions=partitions, @@ -836,7 +836,7 @@ def upload_staging_data_to_gcs( log(f"previous_error = {previous_error}") upload_run_logs_to_bq( - dataset_id=dataset_id, + dataset_id=f"{dataset_id}_dev", parent_table_id=table_id, error=error, timestamp=timestamp, From ee0c4408abbc5c3bc81f3fdda95605a51bad2b43 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 08:04:51 -0300 Subject: [PATCH 105/145] teste retirar timezone --- pipelines/rj_smtr/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 41dc1dd02..2ea0a8b57 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -596,10 +596,10 @@ def custom_serialization(obj: Any) -> Any: Any: Serialized object """ if isinstance(obj, (datetime, date, pd.Timestamp)): - if obj.tzinfo is None: - obj = obj.tz_localize(emd_constants.DEFAULT_TIMEZONE.value) - else: - obj = obj.tz_convert(emd_constants.DEFAULT_TIMEZONE.value) + # if obj.tzinfo is None: + # obj = obj.tz_localize(emd_constants.DEFAULT_TIMEZONE.value) + # else: + # obj = obj.tz_convert(emd_constants.DEFAULT_TIMEZONE.value) return obj.isoformat() From a8bb7f1e864fee561e9fb9e8b35d350abb8d1358 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 08:28:27 -0300 Subject: [PATCH 106/145] mudar timezone --- pipelines/rj_smtr/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 2ea0a8b57..f33e93759 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -595,7 +595,11 @@ def custom_serialization(obj: Any) -> Any: Returns: Any: Serialized object """ - if isinstance(obj, (datetime, date, pd.Timestamp)): + if isinstance(obj, pd.Timestamp): + if obj.tzinfo is None: + obj = obj.tz_localize("UTC").tz_convert( + emd_constants.DEFAULT_TIMEZONE.value + ) # if obj.tzinfo is None: # obj = obj.tz_localize(emd_constants.DEFAULT_TIMEZONE.value) # else: From d956a5317b6d75b4f8e0fe12101f59759fde73c5 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 09:15:58 -0300 Subject: [PATCH 107/145] corrigir logica de recaptura --- pipelines/rj_smtr/flows.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index b951a18a4..a63d8681d 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -76,7 +76,9 @@ with case(recapture, False): capture_timestamp = [get_current_timestamp()] - previous_errors = [None] + previous_errors = task( + lambda: [None], checkpoint=False, name="assign_none_to_previous_errors" + )() timestamps = 
merge(recapture_timestamps, capture_timestamp) From 2261952e40f7728025fd913b55588a946cf5c5cd Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 09:31:39 -0300 Subject: [PATCH 108/145] adicionar possibilidade de recapturar mais dias --- pipelines/rj_smtr/flows.py | 8 ++++++-- pipelines/rj_smtr/utils.py | 10 ++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index a63d8681d..b9ad0252f 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -57,8 +57,9 @@ extract_params = Parameter("extract_params", default=None) secret_path = Parameter("secret_path", default=None) source_type = Parameter("source_type", default=None) - recapture = Parameter("recapture", default=False) interval_minutes = Parameter("interval_minutes", default=None) + recapture = Parameter("recapture", default=False) + recapture_window_days = Parameter("recapture_window_days", default=None) # Parâmetros Pré-tratamento # primary_key = Parameter("primary_key", default=None) @@ -71,7 +72,10 @@ with case(recapture, True): _, recapture_timestamps, previous_errors = query_logs( - dataset_id=dataset_id, table_id=table_id, interval_minutes=interval_minutes + dataset_id=dataset_id, + table_id=table_id, + interval_minutes=interval_minutes, + recapture_window_days=recapture_window_days, ) with case(recapture, False): diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index f33e93759..45b7de7d8 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -456,6 +456,7 @@ def query_logs_func( datetime_filter=None, max_recaptures: int = 60, interval_minutes: int = 1, + recapture_window_days: int = 1, ): """ Queries capture logs to check for errors @@ -465,9 +466,10 @@ def query_logs_func( table_id (str): table_id on BigQuery datetime_filter (pendulum.datetime.DateTime, optional): filter passed to query. 
This task will query the logs table - for the last 1 day before datetime_filter + for the last n (n = recapture_window_days) days before datetime_filter max_recaptures (int, optional): maximum number of recaptures to be done interval_minutes (int, optional): interval in minutes between each recapture + recapture_window_days (int, optional): Number of days to query for erros Returns: lists: errors (bool), @@ -494,7 +496,7 @@ def query_logs_func( FROM UNNEST( GENERATE_TIMESTAMP_ARRAY( - TIMESTAMP_SUB('{datetime_filter}', INTERVAL 1 day), + TIMESTAMP_SUB('{datetime_filter}', INTERVAL {recapture_window_days} day), TIMESTAMP('{datetime_filter}'), INTERVAL {interval_minutes} minute) ) AS timestamp_array @@ -518,10 +520,10 @@ def query_logs_func( logs_table WHERE DATA BETWEEN DATE(DATETIME_SUB('{datetime_filter}', - INTERVAL 1 day)) + INTERVAL {recapture_window_days} day)) AND DATE('{datetime_filter}') AND timestamp_captura BETWEEN - DATETIME_SUB('{datetime_filter}', INTERVAL 1 day) + DATETIME_SUB('{datetime_filter}', INTERVAL {recapture_window_days} day) AND '{datetime_filter}' ORDER BY timestamp_captura ) From b8ac6b8465018add18986ce411d6d4241b774312 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 09:33:17 -0300 Subject: [PATCH 109/145] ajustar recapture_window_days default --- pipelines/rj_smtr/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index b9ad0252f..5017b226e 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -59,7 +59,7 @@ source_type = Parameter("source_type", default=None) interval_minutes = Parameter("interval_minutes", default=None) recapture = Parameter("recapture", default=False) - recapture_window_days = Parameter("recapture_window_days", default=None) + recapture_window_days = Parameter("recapture_window_days", default=1) # Parâmetros Pré-tratamento # primary_key = Parameter("primary_key", default=None) From 7f0c3098bd3a7f6e516c1bd2a1e72b6e19c3d3b5 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 10:18:34 -0300 Subject: [PATCH 110/145] adicionae recapture_window na task query_logs --- pipelines/rj_smtr/tasks.py | 5 ++++- pipelines/rj_smtr/utils.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 9e80873fa..40129e4fa 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -371,6 +371,7 @@ def query_logs( datetime_filter=None, max_recaptures: int = 60, interval_minutes: int = 1, + recapture_window_days: int = 1, ): """ Queries capture logs to check for errors @@ -380,9 +381,10 @@ def query_logs( table_id (str): table_id on BigQuery datetime_filter (pendulum.datetime.DateTime, optional): filter passed to query. 
This task will query the logs table - for the last 1 day before datetime_filter + for the last n (n = recapture_window_days) days before datetime_filter max_recaptures (int, optional): maximum number of recaptures to be done interval_minutes (int, optional): interval in minutes between each recapture + recapture_window_days (int, optional): Number of days to query for erros Returns: lists: errors (bool), @@ -396,6 +398,7 @@ def query_logs( datetime_filter=datetime_filter, max_recaptures=max_recaptures, interval_minutes=interval_minutes, + recapture_window_days=recapture_window_days, ) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 45b7de7d8..4d7ac329e 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -7,13 +7,13 @@ from ftplib import FTP from pathlib import Path -from datetime import timedelta, datetime, date -import pendulum +from datetime import timedelta, datetime from typing import List, Union, Any import traceback import io import json import zipfile +import pendulum import pytz import requests import basedosdados as bd From ae1774657822387e9a897d9cfdd56d1e3976bd97 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 10:54:14 -0300 Subject: [PATCH 111/145] merge previous_errors --- pipelines/rj_smtr/flows.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 5017b226e..63acc1497 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -71,7 +71,7 @@ ) with case(recapture, True): - _, recapture_timestamps, previous_errors = query_logs( + _, recapture_timestamps, recapture_previous_errors = query_logs( dataset_id=dataset_id, table_id=table_id, interval_minutes=interval_minutes, @@ -80,11 +80,12 @@ with case(recapture, False): capture_timestamp = [get_current_timestamp()] - previous_errors = task( + capture_previous_errors = task( lambda: [None], checkpoint=False, name="assign_none_to_previous_errors" )() timestamps = merge(recapture_timestamps, capture_timestamp) + previous_errors = merge(recapture_previous_errors, capture_previous_errors) rename_flow_run = rename_current_flow_run_now_time( prefix="SMTR: " + get_run_name_prefix(recap=recapture) + " " + table_id + ": ", From b172a63ff9638352dbc8c01765d3591b933855fd Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 11:24:44 -0300 Subject: [PATCH 112/145] remover log de teste --- pipelines/rj_smtr/tasks.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 40129e4fa..5f5daf1a1 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -836,8 +836,6 @@ def upload_staging_data_to_gcs( error = traceback.format_exc() log(f"[CATCHED] Task failed with error: \n{error}", level="error") - log(f"previous_error = {previous_error}") - upload_run_logs_to_bq( dataset_id=f"{dataset_id}_dev", parent_table_id=table_id, From 0bfe9cf6ad9f6c67403d3a528434c10c107d52b2 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 11:24:57 -0300 Subject: [PATCH 113/145] ajustar log recaptura --- pipelines/rj_smtr/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 4d7ac329e..386053e66 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -874,7 +874,7 @@ def upload_run_logs_to_bq( # pylint: disable=R0913 "erro": [f"[recapturado]{previous_error}"], } ) - log(f"Recapturing {timestamp} with previous 
error:\n{error}") + log(f"Recapturing {timestamp} with previous error:\n{previous_error}") else: # not recapturing or error during flow execution dataframe = pd.DataFrame( From 7ca3764070eeb8af8be68fbcefe523ebc4d8408c Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 13:19:34 -0300 Subject: [PATCH 114/145] adicionar recaptura auxiliar --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 4c54424ba..7799c3eac 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -9,6 +9,7 @@ from prefect.storage import GCS from prefect.tasks.prefect import create_flow_run, wait_for_flow_run from prefect.utilities.edges import unmapped +from prefect import Parameter # EMD Imports # @@ -109,11 +110,14 @@ ) -# TRATAMENTO - RODA DE HORA EM HORA, CAPTURA AUXILIAR + MATERIALIZAÇÃO +# TRATAMENTO - RODA DE HORA EM HORA, RECAPTURAS + CAPTURA AUXILIAR + MATERIALIZAÇÃO with Flow( "SMTR: Bilhetagem Transação - Tratamento", code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as bilhetagem_transacao_tratamento: + # Configuração # + recapture_window_days = Parameter("recapture_window_days", default=1) + timestamp = get_current_timestamp() rename_flow_run = rename_current_flow_run_now_time( @@ -123,28 +127,45 @@ LABELS = get_current_flow_labels() - # Recaptura Transações + # Recapturas - run_recaptura = create_flow_run( + run_recaptura_trasacao = create_flow_run( flow_name=bilhetagem_transacao_recaptura.name, project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, labels=LABELS, + parameters={"recapture_window_days": recapture_window_days}, ) - wait_recaptura = wait_for_flow_run( - run_recaptura, + wait_recaptura_trasacao = wait_for_flow_run( + run_recaptura_trasacao, stream_states=True, stream_logs=True, raise_final_state=True, ) + runs_recaptura_auxiliar = create_flow_run.map( + flow_name=unmapped(bilhetagem_auxiliar_captura.name), + project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), + parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value + | {"recapture": True, "recapture_window_days": recapture_window_days}, + labels=unmapped(LABELS), + upstream_tasks=[wait_recaptura_trasacao], + ) + + wait_recaptura_auxiliar = wait_for_flow_run.map( + runs_recaptura_auxiliar, + stream_states=unmapped(True), + stream_logs=unmapped(True), + raise_final_state=unmapped(True), + ) + # Captura runs_captura = create_flow_run.map( flow_name=unmapped(bilhetagem_auxiliar_captura.name), project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), - upstream_tasks=[wait_recaptura], + upstream_tasks=[wait_recaptura_auxiliar], ) wait_captura = wait_for_flow_run.map( From c5f369f9375c7c74ac1595645f91c9d703801c22 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 13:46:56 -0300 Subject: [PATCH 115/145] criar parametros recaptura tabelas auxiliares --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 48 +++++++++++-------- pipelines/rj_smtr/tasks.py | 15 ++++++ 2 files changed, 44 insertions(+), 19 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 7799c3eac..2545ee04e 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ 
b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -30,9 +30,7 @@ default_materialization_flow, ) -from pipelines.rj_smtr.tasks import ( - get_current_timestamp, -) +from pipelines.rj_smtr.tasks import get_current_timestamp, merge_dict_with_dict_list from pipelines.rj_smtr.constants import constants @@ -65,16 +63,6 @@ bilhetagem_transacao_captura.schedule = every_minute -bilhetagem_transacao_recaptura = deepcopy(default_capture_flow) -bilhetagem_transacao_recaptura.name = "SMTR: Bilhetagem Transação - Recaptura (subflow)" -bilhetagem_transacao_recaptura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) -bilhetagem_transacao_recaptura = set_default_parameters( - flow=bilhetagem_transacao_recaptura, - default_parameters=GENERAL_CAPTURE_DEFAULT_PARAMS - | constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value - | {"recapture": True}, -) - # BILHETAGEM AUXILIAR - SUBFLOW PARA RODAR ANTES DE CADA MATERIALIZAÇÃO # @@ -109,6 +97,17 @@ default_parameters=bilhetagem_materializacao_parameters, ) +# RECAPTURA + +bilhetagem_transacao_recaptura = deepcopy(default_capture_flow) +bilhetagem_transacao_recaptura.name = "SMTR: Bilhetagem Transação - Recaptura (subflow)" +bilhetagem_transacao_recaptura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) +bilhetagem_transacao_recaptura = set_default_parameters( + flow=bilhetagem_transacao_recaptura, + default_parameters=GENERAL_CAPTURE_DEFAULT_PARAMS + | constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value + | {"recapture": True}, +) # TRATAMENTO - RODA DE HORA EM HORA, RECAPTURAS + CAPTURA AUXILIAR + MATERIALIZAÇÃO with Flow( @@ -131,7 +130,8 @@ run_recaptura_trasacao = create_flow_run( flow_name=bilhetagem_transacao_recaptura.name, - project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + project_name="staging", labels=LABELS, parameters={"recapture_window_days": recapture_window_days}, ) @@ -143,11 +143,19 @@ raise_final_state=True, ) + recaptura_auxiliar_params = merge_dict_with_dict_list( + dict_list=constants.BILHETAGEM_CAPTURE_PARAMS.value, + dict_to_merge={ + "recapture": True, + "recapture_window_days": recapture_window_days, + }, + ) + runs_recaptura_auxiliar = create_flow_run.map( flow_name=unmapped(bilhetagem_auxiliar_captura.name), - project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), - parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value - | {"recapture": True, "recapture_window_days": recapture_window_days}, + # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), + project_name=unmapped("staging"), + parameters=recaptura_auxiliar_params, labels=unmapped(LABELS), upstream_tasks=[wait_recaptura_trasacao], ) @@ -162,7 +170,8 @@ # Captura runs_captura = create_flow_run.map( flow_name=unmapped(bilhetagem_auxiliar_captura.name), - project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), + # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), + project_name=unmapped("staging"), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), upstream_tasks=[wait_recaptura_auxiliar], @@ -178,7 +187,8 @@ # Materialização run_materializacao = create_flow_run( flow_name=bilhetagem_materializacao.name, - project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + project_name="staging", labels=LABELS, upstream_tasks=[wait_captura], ) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 5f5daf1a1..fa930a893 100644 --- 
a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1178,3 +1178,18 @@ def unpack_mapped_results_nout2( """ return [r[0] for r in mapped_results], [r[1] for r in mapped_results] + + +@task(checkpoint=False) +def merge_dict_with_dict_list(dict_list: list[dict], dict_to_merge: dict) -> list[dict]: + """ + Task to merge a dict with every dict inside a list + + Args: + dict_list (list[dict]): A list of dictionaries to update + dict_to_merge (dict): The dict that will be merged in every dict inside the list + + Returns: + list[dict]: The updated list + """ + return [inside_dict | dict_to_merge for inside_dict in dict_list] From e75e7a64b9e21bba495cf208cce33ff6fbc73e67 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 14:06:38 -0300 Subject: [PATCH 116/145] comentar materializacao --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 2545ee04e..1638817ef 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -185,20 +185,20 @@ ) # Materialização - run_materializacao = create_flow_run( - flow_name=bilhetagem_materializacao.name, - # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, - project_name="staging", - labels=LABELS, - upstream_tasks=[wait_captura], - ) - - wait_materializacao = wait_for_flow_run( - run_materializacao, - stream_states=True, - stream_logs=True, - raise_final_state=True, - ) + # run_materializacao = create_flow_run( + # flow_name=bilhetagem_materializacao.name, + # # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + # project_name="staging", + # labels=LABELS, + # upstream_tasks=[wait_captura], + # ) + + # wait_materializacao = wait_for_flow_run( + # run_materializacao, + # stream_states=True, + # stream_logs=True, + # raise_final_state=True, + # ) bilhetagem_transacao_tratamento.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_tratamento.run_config = KubernetesRun( From 6b6d0cbce8b341e6c4d93ef368caf1b26b2f85c4 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 15:19:21 -0300 Subject: [PATCH 117/145] teste log --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 1638817ef..9b28676da 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -11,6 +11,9 @@ from prefect.utilities.edges import unmapped from prefect import Parameter +from prefect import task +from pipelines.utils.utils import log + # EMD Imports # from pipelines.constants import constants as emd_constants @@ -151,6 +154,8 @@ }, ) + task(lambda x: log(x))(x=recaptura_auxiliar_params) + runs_recaptura_auxiliar = create_flow_run.map( flow_name=unmapped(bilhetagem_auxiliar_captura.name), # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), From ec23cf62abbd1523b8a309b57ca144c734f304a8 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 15:56:47 -0300 Subject: [PATCH 118/145] muda logica recaptura bilhetagem --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 38 ++++++------------- pipelines/rj_smtr/tasks.py | 15 -------- 2 files changed, 11 insertions(+), 42 deletions(-) diff --git 
a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 9b28676da..6e2c976c1 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -9,10 +9,7 @@ from prefect.storage import GCS from prefect.tasks.prefect import create_flow_run, wait_for_flow_run from prefect.utilities.edges import unmapped -from prefect import Parameter -from prefect import task -from pipelines.utils.utils import log # EMD Imports # @@ -33,7 +30,7 @@ default_materialization_flow, ) -from pipelines.rj_smtr.tasks import get_current_timestamp, merge_dict_with_dict_list +from pipelines.rj_smtr.tasks import get_current_timestamp from pipelines.rj_smtr.constants import constants @@ -102,14 +99,12 @@ # RECAPTURA -bilhetagem_transacao_recaptura = deepcopy(default_capture_flow) -bilhetagem_transacao_recaptura.name = "SMTR: Bilhetagem Transação - Recaptura (subflow)" -bilhetagem_transacao_recaptura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) -bilhetagem_transacao_recaptura = set_default_parameters( - flow=bilhetagem_transacao_recaptura, - default_parameters=GENERAL_CAPTURE_DEFAULT_PARAMS - | constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value - | {"recapture": True}, +bilhetagem_recaptura = deepcopy(default_capture_flow) +bilhetagem_recaptura.name = "SMTR: Bilhetagem - Recaptura (subflow)" +bilhetagem_recaptura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) +bilhetagem_recaptura = set_default_parameters( + flow=bilhetagem_recaptura, + default_parameters=GENERAL_CAPTURE_DEFAULT_PARAMS | {"recapture": True}, ) # TRATAMENTO - RODA DE HORA EM HORA, RECAPTURAS + CAPTURA AUXILIAR + MATERIALIZAÇÃO @@ -118,7 +113,6 @@ code_owners=["caio", "fernanda", "boris", "rodrigo"], ) as bilhetagem_transacao_tratamento: # Configuração # - recapture_window_days = Parameter("recapture_window_days", default=1) timestamp = get_current_timestamp() @@ -132,11 +126,11 @@ # Recapturas run_recaptura_trasacao = create_flow_run( - flow_name=bilhetagem_transacao_recaptura.name, + flow_name=bilhetagem_recaptura.name, # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, project_name="staging", labels=LABELS, - parameters={"recapture_window_days": recapture_window_days}, + parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, ) wait_recaptura_trasacao = wait_for_flow_run( @@ -146,21 +140,11 @@ raise_final_state=True, ) - recaptura_auxiliar_params = merge_dict_with_dict_list( - dict_list=constants.BILHETAGEM_CAPTURE_PARAMS.value, - dict_to_merge={ - "recapture": True, - "recapture_window_days": recapture_window_days, - }, - ) - - task(lambda x: log(x))(x=recaptura_auxiliar_params) - runs_recaptura_auxiliar = create_flow_run.map( - flow_name=unmapped(bilhetagem_auxiliar_captura.name), + flow_name=unmapped(bilhetagem_recaptura.name), # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), project_name=unmapped("staging"), - parameters=recaptura_auxiliar_params, + parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), upstream_tasks=[wait_recaptura_trasacao], ) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index fa930a893..5f5daf1a1 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1178,18 +1178,3 @@ def unpack_mapped_results_nout2( """ return [r[0] for r in mapped_results], [r[1] for r in mapped_results] - - -@task(checkpoint=False) -def merge_dict_with_dict_list(dict_list: list[dict], dict_to_merge: dict) 
-> list[dict]: - """ - Task to merge a dict with every dict inside a list - - Args: - dict_list (list[dict]): A list of dictionaries to update - dict_to_merge (dict): The dict that will be merged in every dict inside the list - - Returns: - list[dict]: The updated list - """ - return [inside_dict | dict_to_merge for inside_dict in dict_list] From 1644b7204f0e4bd77a18248e18010d7989d7d64a Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 16:31:44 -0300 Subject: [PATCH 119/145] unmapped upstream tasks --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 6e2c976c1..787ff13e2 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -146,7 +146,7 @@ project_name=unmapped("staging"), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), - upstream_tasks=[wait_recaptura_trasacao], + upstream_tasks=unmapped([wait_recaptura_trasacao]), ) wait_recaptura_auxiliar = wait_for_flow_run.map( @@ -163,7 +163,7 @@ project_name=unmapped("staging"), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), - upstream_tasks=[wait_recaptura_auxiliar], + upstream_tasks=unmapped([wait_recaptura_auxiliar]), ) wait_captura = wait_for_flow_run.map( From a33a4b8f09007aabeebc0bd8c07731762f2dcd16 Mon Sep 17 00:00:00 2001 From: Rafael Date: Wed, 18 Oct 2023 16:42:18 -0300 Subject: [PATCH 120/145] mudar forma de upstream --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 787ff13e2..02b89a155 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -146,9 +146,10 @@ project_name=unmapped("staging"), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), - upstream_tasks=unmapped([wait_recaptura_trasacao]), ) + runs_recaptura_auxiliar.set_upstream(wait_recaptura_trasacao) + wait_recaptura_auxiliar = wait_for_flow_run.map( runs_recaptura_auxiliar, stream_states=unmapped(True), @@ -163,9 +164,10 @@ project_name=unmapped("staging"), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), - upstream_tasks=unmapped([wait_recaptura_auxiliar]), ) + runs_captura.set_upstream(wait_recaptura_auxiliar) + wait_captura = wait_for_flow_run.map( runs_captura, stream_states=unmapped(True), From e47ff2d303e34e5618e50cc44eddc111c2724a3e Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 07:37:12 -0300 Subject: [PATCH 121/145] =?UTF-8?q?remover=20altera=C3=A7=C3=B5es=20de=20t?= =?UTF-8?q?este?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 28 +++++++++---------- pipelines/rj_smtr/tasks.py | 8 +++--- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 02b89a155..6cb5b47ae 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -176,20 +176,20 @@ ) # Materialização - # 
run_materializacao = create_flow_run( - # flow_name=bilhetagem_materializacao.name, - # # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, - # project_name="staging", - # labels=LABELS, - # upstream_tasks=[wait_captura], - # ) - - # wait_materializacao = wait_for_flow_run( - # run_materializacao, - # stream_states=True, - # stream_logs=True, - # raise_final_state=True, - # ) + run_materializacao = create_flow_run( + flow_name=bilhetagem_materializacao.name, + # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + project_name="staging", + labels=LABELS, + upstream_tasks=[wait_captura], + ) + + wait_materializacao = wait_for_flow_run( + run_materializacao, + stream_states=True, + stream_logs=True, + raise_final_state=True, + ) bilhetagem_transacao_tratamento.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_tratamento.run_config = KubernetesRun( diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 5f5daf1a1..671d6171a 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -309,7 +309,7 @@ def create_local_partition_path( either to save raw or staging files. """ data_folder = os.getenv("DATA_FOLDER", "data") - file_path = f"{os.getcwd()}/{data_folder}/{{mode}}/{dataset_id}_dev/{table_id}" + file_path = f"{os.getcwd()}/{data_folder}/{{mode}}/{dataset_id}/{table_id}" file_path += f"/{partitions}/{filename}.{{filetype}}" log(f"Creating file path: {file_path}") return file_path @@ -783,7 +783,7 @@ def upload_raw_data_to_gcs( st_obj = Storage(table_id=table_id, dataset_id=dataset_id) log( f"""Uploading raw file to bucket {st_obj.bucket_name} at - {st_obj.bucket_name}/{dataset_id}_dev/{table_id}""" + {st_obj.bucket_name}/{dataset_id}/{table_id}""" ) st_obj.upload( path=raw_filepath, @@ -827,7 +827,7 @@ def upload_staging_data_to_gcs( try: # Creates and publish table if it does not exist, append to it otherwise create_or_append_table( - dataset_id=f"{dataset_id}_dev", + dataset_id=dataset_id, table_id=table_id, path=staging_filepath, partitions=partitions, @@ -837,7 +837,7 @@ def upload_staging_data_to_gcs( log(f"[CATCHED] Task failed with error: \n{error}", level="error") upload_run_logs_to_bq( - dataset_id=f"{dataset_id}_dev", + dataset_id=dataset_id, parent_table_id=table_id, error=error, timestamp=timestamp, From ba730a4d4c453e9cf585282df4372c078c55bb55 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 07:38:48 -0300 Subject: [PATCH 122/145] mudar agent para prd --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 8 ++++---- pipelines/rj_smtr/flows.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 6cb5b47ae..66e834170 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -52,7 +52,7 @@ bilhetagem_transacao_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_transacao_captura = set_default_parameters( @@ -71,7 +71,7 @@ bilhetagem_auxiliar_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_auxiliar_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + 
labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_auxiliar_captura = set_default_parameters( @@ -85,7 +85,7 @@ bilhetagem_materializacao.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_materializacao.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_materializacao_parameters = { @@ -194,6 +194,6 @@ bilhetagem_transacao_tratamento.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_tratamento.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_transacao_tratamento.schedule = every_hour diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 63acc1497..18a7fb1a3 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -168,7 +168,7 @@ default_capture_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_capture_flow.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) with Flow( @@ -306,5 +306,5 @@ default_materialization_flow.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) default_materialization_flow.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) From 4421043e7915528f7760266e57749c24b76e7c73 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 13:02:59 -0300 Subject: [PATCH 123/145] corrigir project_name --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 66e834170..5f4a82f75 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -142,8 +142,7 @@ runs_recaptura_auxiliar = create_flow_run.map( flow_name=unmapped(bilhetagem_recaptura.name), - # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), - project_name=unmapped("staging"), + project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), ) @@ -160,8 +159,7 @@ # Captura runs_captura = create_flow_run.map( flow_name=unmapped(bilhetagem_auxiliar_captura.name), - # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), - project_name=unmapped("staging"), + project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), ) From f1a3bbd0dec741f314d1e1fb7ec636eeb1753715 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 13:34:42 -0300 Subject: [PATCH 124/145] passar tirar query_logs_func --- pipelines/rj_smtr/tasks.py | 107 +++++++++++++++++++++++++++++++++---- 1 file changed, 97 insertions(+), 10 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 671d6171a..e329069af 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -36,7 +36,7 @@ read_raw_data, save_treated_local_func, save_raw_local_func, - query_logs_func, + log_critical, ) from 
pipelines.utils.execute_dbt_model.utils import get_dbt_client from pipelines.utils.utils import log, get_redis_client, get_vault_secret @@ -159,7 +159,7 @@ def create_dbt_run_vars( mode (str): the mode to get the date_range variable Returns: - tuple[list[dict]: the variables to be used in DBT + list[dict]: the variables to be used in DBT Union[list[dict], dict, None]: the date variable (date_range or run_date) bool: a flag that indicates if the date_range variable came from Redis """ @@ -392,14 +392,101 @@ def query_logs( previous_errors (list of previous errors) """ - return query_logs_func( - dataset_id=dataset_id, - table_id=table_id, - datetime_filter=datetime_filter, - max_recaptures=max_recaptures, - interval_minutes=interval_minutes, - recapture_window_days=recapture_window_days, - ) + if not datetime_filter: + datetime_filter = pendulum.now(constants.TIMEZONE.value).replace( + second=0, microsecond=0 + ) + elif isinstance(datetime_filter, str): + datetime_filter = datetime.fromisoformat(datetime_filter).replace( + second=0, microsecond=0 + ) + + datetime_filter = datetime_filter.strftime("%Y-%m-%d %H:%M:%S") + + query = f""" + WITH + t AS ( + SELECT + DATETIME(timestamp_array) AS timestamp_array + FROM + UNNEST( + GENERATE_TIMESTAMP_ARRAY( + TIMESTAMP_SUB('{datetime_filter}', INTERVAL {recapture_window_days} day), + TIMESTAMP('{datetime_filter}'), + INTERVAL {interval_minutes} minute) ) + AS timestamp_array + WHERE + timestamp_array < '{datetime_filter}' ), + logs_table AS ( + SELECT + SAFE_CAST(DATETIME(TIMESTAMP(timestamp_captura), + "America/Sao_Paulo") AS DATETIME) timestamp_captura, + SAFE_CAST(sucesso AS BOOLEAN) sucesso, + SAFE_CAST(erro AS STRING) erro, + SAFE_CAST(DATA AS DATE) DATA + FROM + rj-smtr-staging.{dataset_id}_staging.{table_id}_logs AS t + ), + logs AS ( + SELECT + *, + TIMESTAMP_TRUNC(timestamp_captura, minute) AS timestamp_array + FROM + logs_table + WHERE + DATA BETWEEN DATE(DATETIME_SUB('{datetime_filter}', + INTERVAL {recapture_window_days} day)) + AND DATE('{datetime_filter}') + AND timestamp_captura BETWEEN + DATETIME_SUB('{datetime_filter}', INTERVAL {recapture_window_days} day) + AND '{datetime_filter}' + ORDER BY + timestamp_captura ) + SELECT + CASE + WHEN logs.timestamp_captura IS NOT NULL THEN logs.timestamp_captura + ELSE + t.timestamp_array + END + AS timestamp_captura, + logs.erro + FROM + t + LEFT JOIN + logs + ON + logs.timestamp_array = t.timestamp_array + WHERE + logs.sucesso IS NOT TRUE + ORDER BY + timestamp_captura + """ + log(f"Run query to check logs:\n{query}") + results = bd.read_sql(query=query, billing_project_id=bq_project()) + if len(results) > 0: + results["timestamp_captura"] = ( + pd.to_datetime(results["timestamp_captura"]) + .dt.tz_localize(constants.TIMEZONE.value) + .to_list() + ) + log(f"Recapture data for the following {len(results)} timestamps:\n{results}") + if len(results) > max_recaptures: + message = f""" + [SPPO - Recaptures] + Encontradas {len(results)} timestamps para serem recapturadas. 
+ Essa run processará as seguintes: + ##### + {results[:max_recaptures]} + ##### + Sobraram as seguintes para serem recapturadas na próxima run: + ##### + {results[max_recaptures:]} + ##### + """ + log_critical(message) + results = results[:max_recaptures] + return True, results["timestamp_captura"].to_list(), results["erro"].to_list() + return False, [], [] @task From 517a9a2c88f15ce6f0ff298d988be11821d6c349 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 13:35:04 -0300 Subject: [PATCH 125/145] corrigir project_name --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 5f4a82f75..096e5d3e3 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -127,8 +127,7 @@ run_recaptura_trasacao = create_flow_run( flow_name=bilhetagem_recaptura.name, - # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, - project_name="staging", + project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, labels=LABELS, parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, ) @@ -176,8 +175,7 @@ # Materialização run_materializacao = create_flow_run( flow_name=bilhetagem_materializacao.name, - # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, - project_name="staging", + project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, labels=LABELS, upstream_tasks=[wait_captura], ) From df6ee96a5dad1f0d49fa9988a53f2edda7d9d12f Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 13:35:21 -0300 Subject: [PATCH 126/145] =?UTF-8?q?remover=20coment=C3=A1rios?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/rj_smtr/utils.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index 386053e66..bf9fb6778 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -602,10 +602,6 @@ def custom_serialization(obj: Any) -> Any: obj = obj.tz_localize("UTC").tz_convert( emd_constants.DEFAULT_TIMEZONE.value ) - # if obj.tzinfo is None: - # obj = obj.tz_localize(emd_constants.DEFAULT_TIMEZONE.value) - # else: - # obj = obj.tz_convert(emd_constants.DEFAULT_TIMEZONE.value) return obj.isoformat() From 7b2e1dfce2e8f353fef7dadf7453552a49b54f78 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 14:12:52 -0300 Subject: [PATCH 127/145] remover query_logs_func --- pipelines/rj_smtr/utils.py | 124 ------------------------------------- 1 file changed, 124 deletions(-) diff --git a/pipelines/rj_smtr/utils.py b/pipelines/rj_smtr/utils.py index bf9fb6778..0d05ffb09 100644 --- a/pipelines/rj_smtr/utils.py +++ b/pipelines/rj_smtr/utils.py @@ -450,130 +450,6 @@ def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-l return clocks -def query_logs_func( - dataset_id: str, - table_id: str, - datetime_filter=None, - max_recaptures: int = 60, - interval_minutes: int = 1, - recapture_window_days: int = 1, -): - """ - Queries capture logs to check for errors - - Args: - dataset_id (str): dataset_id on BigQuery - table_id (str): table_id on BigQuery - datetime_filter (pendulum.datetime.DateTime, optional): - filter passed to query. 
This task will query the logs table - for the last n (n = recapture_window_days) days before datetime_filter - max_recaptures (int, optional): maximum number of recaptures to be done - interval_minutes (int, optional): interval in minutes between each recapture - recapture_window_days (int, optional): Number of days to query for erros - - Returns: - lists: errors (bool), - timestamps (list of pendulum.datetime.DateTime), - previous_errors (list of previous errors) - """ - - if not datetime_filter: - datetime_filter = pendulum.now(constants.TIMEZONE.value).replace( - second=0, microsecond=0 - ) - elif isinstance(datetime_filter, str): - datetime_filter = datetime.fromisoformat(datetime_filter).replace( - second=0, microsecond=0 - ) - - datetime_filter = datetime_filter.strftime("%Y-%m-%d %H:%M:%S") - - query = f""" - WITH - t AS ( - SELECT - DATETIME(timestamp_array) AS timestamp_array - FROM - UNNEST( - GENERATE_TIMESTAMP_ARRAY( - TIMESTAMP_SUB('{datetime_filter}', INTERVAL {recapture_window_days} day), - TIMESTAMP('{datetime_filter}'), - INTERVAL {interval_minutes} minute) ) - AS timestamp_array - WHERE - timestamp_array < '{datetime_filter}' ), - logs_table AS ( - SELECT - SAFE_CAST(DATETIME(TIMESTAMP(timestamp_captura), - "America/Sao_Paulo") AS DATETIME) timestamp_captura, - SAFE_CAST(sucesso AS BOOLEAN) sucesso, - SAFE_CAST(erro AS STRING) erro, - SAFE_CAST(DATA AS DATE) DATA - FROM - rj-smtr-staging.{dataset_id}_staging.{table_id}_logs AS t - ), - logs AS ( - SELECT - *, - TIMESTAMP_TRUNC(timestamp_captura, minute) AS timestamp_array - FROM - logs_table - WHERE - DATA BETWEEN DATE(DATETIME_SUB('{datetime_filter}', - INTERVAL {recapture_window_days} day)) - AND DATE('{datetime_filter}') - AND timestamp_captura BETWEEN - DATETIME_SUB('{datetime_filter}', INTERVAL {recapture_window_days} day) - AND '{datetime_filter}' - ORDER BY - timestamp_captura ) - SELECT - CASE - WHEN logs.timestamp_captura IS NOT NULL THEN logs.timestamp_captura - ELSE - t.timestamp_array - END - AS timestamp_captura, - logs.erro - FROM - t - LEFT JOIN - logs - ON - logs.timestamp_array = t.timestamp_array - WHERE - logs.sucesso IS NOT TRUE - ORDER BY - timestamp_captura - """ - log(f"Run query to check logs:\n{query}") - results = bd.read_sql(query=query, billing_project_id=bq_project()) - if len(results) > 0: - results["timestamp_captura"] = ( - pd.to_datetime(results["timestamp_captura"]) - .dt.tz_localize(constants.TIMEZONE.value) - .to_list() - ) - log(f"Recapture data for the following {len(results)} timestamps:\n{results}") - if len(results) > max_recaptures: - message = f""" - [SPPO - Recaptures] - Encontradas {len(results)} timestamps para serem recapturadas. 
- Essa run processará as seguintes: - ##### - {results[:max_recaptures]} - ##### - Sobraram as seguintes para serem recapturadas na próxima run: - ##### - {results[max_recaptures:]} - ##### - """ - log_critical(message) - results = results[:max_recaptures] - return True, results["timestamp_captura"].to_list(), results["erro"].to_list() - return False, [], [] - - def dict_contains_keys(input_dict: dict, keys: list[str]) -> bool: """ Test if the input dict has all keys present in the list From 5f303849ea483f5f37115238a5084c9aa207f2fd Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 16:12:44 -0300 Subject: [PATCH 128/145] aumentar max_recaptures --- pipelines/rj_smtr/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index e329069af..79cd84751 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -369,7 +369,7 @@ def query_logs( dataset_id: str, table_id: str, datetime_filter=None, - max_recaptures: int = 60, + max_recaptures: int = 360, interval_minutes: int = 1, recapture_window_days: int = 1, ): From 431f0047af35e41eaf03fddc67bb8a62d852a4cf Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 20:16:21 -0300 Subject: [PATCH 129/145] adiciona extracao tracking --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 18 +++++++++++++++++ pipelines/rj_smtr/constants.py | 20 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 096e5d3e3..11c66ba69 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -63,6 +63,24 @@ bilhetagem_transacao_captura.schedule = every_minute +# BILHETAGEM GPS + +bilhetagem_tracking_captura = deepcopy(default_capture_flow) +bilhetagem_tracking_captura.name = "SMTR: Bilhetagem GPS VALIDADOR - Captura" +bilhetagem_tracking_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) +bilhetagem_tracking_captura.run_config = KubernetesRun( + image=emd_constants.DOCKER_IMAGE.value, + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], +) + +bilhetagem_tracking_captura = set_default_parameters( + flow=bilhetagem_tracking_captura, + default_parameters=GENERAL_CAPTURE_DEFAULT_PARAMS + | constants.BILHETAGEM_TRACKING_CAPTURE_PARAMS.value, +) + +bilhetagem_tracking_captura.schedule = every_minute + # BILHETAGEM AUXILIAR - SUBFLOW PARA RODAR ANTES DE CADA MATERIALIZAÇÃO # diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index ccf1c6c44..923383d04 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -180,6 +180,7 @@ class constants(Enum): # pylint: disable=c0103 "engine": "postgresql", "host": "10.5.115.1", }, + "tracking_db": {"engine": "postgresql", "host": "10.5.15.25"}, }, "source_type": "db", } @@ -203,6 +204,25 @@ class constants(Enum): # pylint: disable=c0103 "interval_minutes": 1, } + BILHETAGEM_TRACKING_CAPTURE_PARAMS = { + "table_id": "gps_validador", + "partition_date_only": False, + "extract_params": { + "database": "tracking_db", + "query": """ + SELECT + * + FROM + tracking_detalhe + WHERE + data_tracking BETWEEN '{start}' + AND '{end}' + """, + }, + "primary_key": ["id"], + "interval_minutes": 1, + } + BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" BILHETAGEM_CAPTURE_PARAMS = [ From e7ca572108773525a4002fab189c078a6d2c1465 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 19 Oct 2023 23:21:24 +0000 Subject: [PATCH 130/145] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pipelines/rj_smtr/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 222440ca0..9d7465e9e 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -219,7 +219,7 @@ class constants(Enum): # pylint: disable=c0103 """, }, "primary_key": ["id"], - "interval_minutes": 1, + "interval_minutes": 1, } BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" From dae77e84160a57fb4e17803d3cf2f0cc6922032f Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 20:22:06 -0300 Subject: [PATCH 131/145] muda agent para dev --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 11c66ba69..cf8e39644 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -70,7 +70,7 @@ bilhetagem_tracking_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_tracking_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_tracking_captura = set_default_parameters( From 5fb96e6fd09077dc9f36c1fe1b3e2662af51eec9 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 20:23:41 -0300 Subject: [PATCH 132/145] corrige constante --- pipelines/rj_smtr/constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 9d7465e9e..923383d04 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -202,6 +202,7 @@ class constants(Enum): # pylint: disable=c0103 }, "primary_key": ["id"], "interval_minutes": 1, + } BILHETAGEM_TRACKING_CAPTURE_PARAMS = { "table_id": "gps_validador", From 7c3de1a436e3412fc3b5133a38bc01414307fae6 Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 20:26:24 -0300 Subject: [PATCH 133/145] formatar constante database --- pipelines/rj_smtr/constants.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 923383d04..79aa91b5d 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -180,7 +180,10 @@ class constants(Enum): # pylint: disable=c0103 "engine": "postgresql", "host": "10.5.115.1", }, - "tracking_db": {"engine": "postgresql", "host": "10.5.15.25"}, + "tracking_db": { + "engine": "postgresql", + "host": "10.5.15.25", + }, }, "source_type": "db", } From de245398ddd95f937f056750dccd4b1bae4d182c Mon Sep 17 00:00:00 2001 From: Rafael Date: Thu, 19 Oct 2023 20:36:44 -0300 Subject: [PATCH 134/145] altera nome do flow --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 61cec24dd..a9489c7d8 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -66,7 +66,7 @@ # 
BILHETAGEM GPS bilhetagem_tracking_captura = deepcopy(default_capture_flow) -bilhetagem_tracking_captura.name = "SMTR: Bilhetagem GPS VALIDADOR - Captura" +bilhetagem_tracking_captura.name = "SMTR: Bilhetagem GPS Validador - Captura" bilhetagem_tracking_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_tracking_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, From ccddeead4099e172c2e4445d97f63f149cc91ba4 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 23 Oct 2023 08:41:20 -0300 Subject: [PATCH 135/145] alterar queries bilhetagem auxiliar --- pipelines/rj_smtr/constants.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index ccf1c6c44..0f1c8dff0 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -217,7 +217,8 @@ class constants(Enum): # pylint: disable=c0103 FROM LINHA WHERE - DT_INCLUSAO >= '{start}' + DT_INCLUSAO BETWEEN '{start}' + AND '{end}' """, }, "primary_key": ["CD_LINHA"], # id column to nest data on @@ -234,7 +235,8 @@ class constants(Enum): # pylint: disable=c0103 FROM GRUPO WHERE - DT_INCLUSAO >= '{start}' + DT_INCLUSAO BETWEEN '{start}' + AND '{end}' """, }, "primary_key": ["CD_GRUPO"], # id column to nest data on @@ -251,7 +253,8 @@ class constants(Enum): # pylint: disable=c0103 FROM GRUPO_LINHA WHERE - DT_INCLUSAO >= '{start}' + DT_INCLUSAO BETWEEN '{start}' + AND '{end}' """, }, "primary_key": ["CD_GRUPO", "CD_LINHA"], @@ -268,7 +271,8 @@ class constants(Enum): # pylint: disable=c0103 FROM matriz_integracao WHERE - dt_inclusao >= '{start}' + dt_inclusao BETWEEN '{start}' + AND '{end}' """, }, "primary_key": [ From 830c52f98179b800a816981a367c7f4ab3c6da10 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 23 Oct 2023 11:53:08 -0300 Subject: [PATCH 136/145] ajuste na logica de recaptura bilhetagem auxiliar --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 57 +++++++++------ pipelines/rj_smtr/constants.py | 4 + pipelines/rj_smtr/flows.py | 10 ++- pipelines/rj_smtr/tasks.py | 73 ++++++++++++++++++- 4 files changed, 117 insertions(+), 27 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 096e5d3e3..dd1c9865d 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -30,7 +30,7 @@ default_materialization_flow, ) -from pipelines.rj_smtr.tasks import get_current_timestamp +from pipelines.rj_smtr.tasks import get_rounded_timestamp, join_dicts from pipelines.rj_smtr.constants import constants @@ -114,7 +114,7 @@ ) as bilhetagem_transacao_tratamento: # Configuração # - timestamp = get_current_timestamp() + timestamp = get_rounded_timestamp(interval_minutes=60) rename_flow_run = rename_current_flow_run_now_time( prefix=bilhetagem_transacao_tratamento.name + " ", @@ -123,13 +123,17 @@ LABELS = get_current_flow_labels() - # Recapturas - + # Recaptura Transação + transacao_recapture_params = join_dicts( + original_dict=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, + dict_to_join={"timestamp": timestamp}, + ) run_recaptura_trasacao = create_flow_run( flow_name=bilhetagem_recaptura.name, - project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + project_name="staging", labels=LABELS, - parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, + 
parameters=transacao_recapture_params, ) wait_recaptura_trasacao = wait_for_flow_run( @@ -139,34 +143,41 @@ raise_final_state=True, ) - runs_recaptura_auxiliar = create_flow_run.map( - flow_name=unmapped(bilhetagem_recaptura.name), - project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), - parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, + # Captura Auxiliar + + auxiliar_capture_params = join_dicts( + original_dict=constants.BILHETAGEM_CAPTURE_PARAMS.value, + dict_to_join={"timestamp": timestamp}, + ) + runs_captura = create_flow_run.map( + flow_name=unmapped(bilhetagem_auxiliar_captura.name), + # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), + project_name=unmapped("staging"), + parameters=auxiliar_capture_params, labels=unmapped(LABELS), ) - runs_recaptura_auxiliar.set_upstream(wait_recaptura_trasacao) - - wait_recaptura_auxiliar = wait_for_flow_run.map( - runs_recaptura_auxiliar, + wait_captura = wait_for_flow_run.map( + runs_captura, stream_states=unmapped(True), stream_logs=unmapped(True), raise_final_state=unmapped(True), ) - # Captura - runs_captura = create_flow_run.map( - flow_name=unmapped(bilhetagem_auxiliar_captura.name), - project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), - parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, + # Recaptura Auxiliar + + runs_recaptura_auxiliar = create_flow_run.map( + flow_name=unmapped(bilhetagem_recaptura.name), + # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), + project_name=unmapped("staging"), + parameters=auxiliar_capture_params, labels=unmapped(LABELS), ) - runs_captura.set_upstream(wait_recaptura_auxiliar) + runs_recaptura_auxiliar.set_upstream(wait_captura) - wait_captura = wait_for_flow_run.map( - runs_captura, + wait_recaptura_auxiliar = wait_for_flow_run.map( + runs_recaptura_auxiliar, stream_states=unmapped(True), stream_logs=unmapped(True), raise_final_state=unmapped(True), @@ -190,6 +201,6 @@ bilhetagem_transacao_tratamento.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_tratamento.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], ) bilhetagem_transacao_tratamento.schedule = every_hour diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 0f1c8dff0..63a959419 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -223,6 +223,7 @@ class constants(Enum): # pylint: disable=c0103 }, "primary_key": ["CD_LINHA"], # id column to nest data on "interval_minutes": 60, + "truncate_hour": True, }, { "table_id": "grupo", @@ -241,6 +242,7 @@ class constants(Enum): # pylint: disable=c0103 }, "primary_key": ["CD_GRUPO"], # id column to nest data on "interval_minutes": 60, + "truncate_hour": True, }, { "table_id": "grupo_linha", @@ -259,6 +261,7 @@ class constants(Enum): # pylint: disable=c0103 }, "primary_key": ["CD_GRUPO", "CD_LINHA"], "interval_minutes": 60, + "truncate_hour": True, }, { "table_id": "matriz_integracao", @@ -280,6 +283,7 @@ class constants(Enum): # pylint: disable=c0103 "cd_integracao", ], # id column to nest data on "interval_minutes": 60, + "truncate_hour": True, }, ] diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 18a7fb1a3..7340e6a3b 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -26,7 +26,7 @@ from pipelines.rj_smtr.tasks import ( create_date_hour_partition, 
create_local_partition_path, - get_current_timestamp, + get_rounded_timestamp, parse_timestamp_to_string, transform_raw_to_nested_structure, create_dbt_run_vars, @@ -52,6 +52,7 @@ table_id = Parameter("table_id", default=None) dataset_id = Parameter("dataset_id", default=None) partition_date_only = Parameter("partition_date_only", default=None) + timestamp = Parameter("timestamp", default=None) # Parâmetros Captura # extract_params = Parameter("extract_params", default=None) @@ -70,16 +71,21 @@ checkpoint=False, ) + current_timestamp = get_rounded_timestamp( + timestamp=timestamp, interval_minutes=interval_minutes + ) + with case(recapture, True): _, recapture_timestamps, recapture_previous_errors = query_logs( dataset_id=dataset_id, table_id=table_id, + datetime_filter=current_timestamp, interval_minutes=interval_minutes, recapture_window_days=recapture_window_days, ) with case(recapture, False): - capture_timestamp = [get_current_timestamp()] + capture_timestamp = [current_timestamp] capture_previous_errors = task( lambda: [None], checkpoint=False, name="assign_none_to_previous_errors" )() diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 79cd84751..4bb7e481d 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -243,13 +243,54 @@ def create_dbt_run_vars( @task -def get_current_timestamp(timestamp=None, truncate_minute: bool = True) -> datetime: +def get_rounded_timestamp( + timestamp: Union[str, datetime, None] = None, + interval_minutes: Union[int, None] = None, +) -> datetime: + """ + Calculate rounded timestamp for flow run. + + Args: + timestamp (Union[str, datetime, None]): timestamp to be used as reference + interval_minutes (Union[int, None], optional): interval in minutes between each recapture + + Returns: + datetime: timestamp for flow run + """ + if isinstance(timestamp, str): + timestamp = datetime.fromisoformat(timestamp) + + if not timestamp: + timestamp = datetime.now(tz=timezone(constants.TIMEZONE.value)) + + timestamp = timestamp.replace(second=0, microsecond=0) + + if interval_minutes: + if interval_minutes >= 60: + hours = interval_minutes / 60 + interval_minutes = round(((hours) % 1) * 60) + + if interval_minutes == 0: + rounded_minutes = interval_minutes + else: + rounded_minutes = (timestamp.minute // interval_minutes) * interval_minutes + + timestamp = timestamp.replace(minute=rounded_minutes) + + return timestamp + + +@task +def get_current_timestamp( + timestamp=None, truncate_minute: bool = True, truncate_hour: bool = False +) -> datetime: """ Get current timestamp for flow run. 
Args: timestamp: timestamp to be used as reference (optionally, it can be a string) truncate_minute: whether to truncate the timestamp to the minute or not + truncate_hour: whether to truncate the timestamp to the hour or not Returns: datetime: timestamp for flow run @@ -259,7 +300,9 @@ def get_current_timestamp(timestamp=None, truncate_minute: bool = True) -> datet if not timestamp: timestamp = datetime.now(tz=timezone(constants.TIMEZONE.value)) if truncate_minute: - return timestamp.replace(second=0, microsecond=0) + timestamp = timestamp.replace(second=0, microsecond=0) + if truncate_hour: + timestamp = timestamp.replace(minute=0) return timestamp @@ -385,6 +428,7 @@ def query_logs( max_recaptures (int, optional): maximum number of recaptures to be done interval_minutes (int, optional): interval in minutes between each recapture recapture_window_days (int, optional): Number of days to query for erros + truncate_hour: whether to truncate the timestamp to the hour or not Returns: lists: errors (bool), @@ -1265,3 +1309,28 @@ def unpack_mapped_results_nout2( """ return [r[0] for r in mapped_results], [r[1] for r in mapped_results] + + +@task(checkpoint=False) +def join_dicts( + original_dict: Union[dict, list[dict]], dict_to_join: dict +) -> Union[dict, list[dict]]: + """ + Task to join a dict or list of dicts with another dict + + Args: + original_dict (Union[dict, list[dict]]): The input dict or list of dicts + dict_to_join (dict): The dict to be joined with original_dict + + Returns: + Union[dict, list[dict]]: The joined value + """ + + if isinstance(original_dict, list): + return [d | dict_to_join for d in original_dict] + elif isinstance(original_dict, dict): + return original_dict | dict_to_join + else: + raise ValueError( + f"original_dict must be dict or list, received: {type(original_dict)}" + ) From d4e16db9e7ddaf87156fd9142d944643a28a5077 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 23 Oct 2023 12:59:22 -0300 Subject: [PATCH 137/145] remover parametro timestamp --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 17 ++++--------- pipelines/rj_smtr/flows.py | 5 +--- pipelines/rj_smtr/tasks.py | 25 ------------------- 3 files changed, 6 insertions(+), 41 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 1c8c5c934..8dcfe80cd 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -30,7 +30,7 @@ default_materialization_flow, ) -from pipelines.rj_smtr.tasks import get_rounded_timestamp, join_dicts +from pipelines.rj_smtr.tasks import get_rounded_timestamp from pipelines.rj_smtr.constants import constants @@ -141,16 +141,13 @@ LABELS = get_current_flow_labels() # Recaptura Transação - transacao_recapture_params = join_dicts( - original_dict=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, - dict_to_join={"timestamp": timestamp}, - ) + run_recaptura_trasacao = create_flow_run( flow_name=bilhetagem_recaptura.name, # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, project_name="staging", labels=LABELS, - parameters=transacao_recapture_params, + parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, ) wait_recaptura_trasacao = wait_for_flow_run( @@ -162,15 +159,11 @@ # Captura Auxiliar - auxiliar_capture_params = join_dicts( - original_dict=constants.BILHETAGEM_CAPTURE_PARAMS.value, - dict_to_join={"timestamp": timestamp}, - ) runs_captura = create_flow_run.map( 
flow_name=unmapped(bilhetagem_auxiliar_captura.name), # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), project_name=unmapped("staging"), - parameters=auxiliar_capture_params, + parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), ) @@ -187,7 +180,7 @@ flow_name=unmapped(bilhetagem_recaptura.name), # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), project_name=unmapped("staging"), - parameters=auxiliar_capture_params, + parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), ) diff --git a/pipelines/rj_smtr/flows.py b/pipelines/rj_smtr/flows.py index 7340e6a3b..0dddf166b 100644 --- a/pipelines/rj_smtr/flows.py +++ b/pipelines/rj_smtr/flows.py @@ -52,7 +52,6 @@ table_id = Parameter("table_id", default=None) dataset_id = Parameter("dataset_id", default=None) partition_date_only = Parameter("partition_date_only", default=None) - timestamp = Parameter("timestamp", default=None) # Parâmetros Captura # extract_params = Parameter("extract_params", default=None) @@ -71,9 +70,7 @@ checkpoint=False, ) - current_timestamp = get_rounded_timestamp( - timestamp=timestamp, interval_minutes=interval_minutes - ) + current_timestamp = get_rounded_timestamp(interval_minutes=interval_minutes) with case(recapture, True): _, recapture_timestamps, recapture_previous_errors = query_logs( diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 4bb7e481d..887e2c29f 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -1309,28 +1309,3 @@ def unpack_mapped_results_nout2( """ return [r[0] for r in mapped_results], [r[1] for r in mapped_results] - - -@task(checkpoint=False) -def join_dicts( - original_dict: Union[dict, list[dict]], dict_to_join: dict -) -> Union[dict, list[dict]]: - """ - Task to join a dict or list of dicts with another dict - - Args: - original_dict (Union[dict, list[dict]]): The input dict or list of dicts - dict_to_join (dict): The dict to be joined with original_dict - - Returns: - Union[dict, list[dict]]: The joined value - """ - - if isinstance(original_dict, list): - return [d | dict_to_join for d in original_dict] - elif isinstance(original_dict, dict): - return original_dict | dict_to_join - else: - raise ValueError( - f"original_dict must be dict or list, received: {type(original_dict)}" - ) From af89e2fe07244b9878c429fa7b0606b8a852c26b Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 23 Oct 2023 13:09:46 -0300 Subject: [PATCH 138/145] remove truncate hour --- pipelines/rj_smtr/tasks.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 887e2c29f..2c7e60c16 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -281,16 +281,13 @@ def get_rounded_timestamp( @task -def get_current_timestamp( - timestamp=None, truncate_minute: bool = True, truncate_hour: bool = False -) -> datetime: +def get_current_timestamp(timestamp=None, truncate_minute: bool = True) -> datetime: """ Get current timestamp for flow run. 
Args: timestamp: timestamp to be used as reference (optionally, it can be a string) truncate_minute: whether to truncate the timestamp to the minute or not - truncate_hour: whether to truncate the timestamp to the hour or not Returns: datetime: timestamp for flow run @@ -301,8 +298,7 @@ def get_current_timestamp( timestamp = datetime.now(tz=timezone(constants.TIMEZONE.value)) if truncate_minute: timestamp = timestamp.replace(second=0, microsecond=0) - if truncate_hour: - timestamp = timestamp.replace(minute=0) + return timestamp From ddffd6cdf69f20eae8c92775ab4d19bd16dc4cfe Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 23 Oct 2023 13:32:44 -0300 Subject: [PATCH 139/145] mudar agent para prd --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 8dcfe80cd..899ad3127 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -211,6 +211,6 @@ bilhetagem_transacao_tratamento.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_transacao_tratamento.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_transacao_tratamento.schedule = every_hour From a4660d169b4fcc60c0fa8af176d26ca034fcb55b Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 23 Oct 2023 15:25:28 -0300 Subject: [PATCH 140/145] mudar project name --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 899ad3127..4ca7bc6ec 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -144,8 +144,7 @@ run_recaptura_trasacao = create_flow_run( flow_name=bilhetagem_recaptura.name, - # project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, - project_name="staging", + project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, labels=LABELS, parameters=constants.BILHETAGEM_TRANSACAO_CAPTURE_PARAMS.value, ) @@ -161,8 +160,7 @@ runs_captura = create_flow_run.map( flow_name=unmapped(bilhetagem_auxiliar_captura.name), - # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), - project_name=unmapped("staging"), + project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), ) @@ -178,8 +176,7 @@ runs_recaptura_auxiliar = create_flow_run.map( flow_name=unmapped(bilhetagem_recaptura.name), - # project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), - project_name=unmapped("staging"), + project_name=unmapped(emd_constants.PREFECT_DEFAULT_PROJECT.value), parameters=constants.BILHETAGEM_CAPTURE_PARAMS.value, labels=unmapped(LABELS), ) From 5f3596b40afe8e078b6ae037fb7fe47f940b9c89 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 23 Oct 2023 15:33:21 -0300 Subject: [PATCH 141/145] criar constante interval --- .../rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 4 +++- pipelines/rj_smtr/constants.py | 10 ++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py 
b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 4ca7bc6ec..03293ca0c 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -131,7 +131,9 @@ ) as bilhetagem_transacao_tratamento: # Configuração # - timestamp = get_rounded_timestamp(interval_minutes=60) + timestamp = get_rounded_timestamp( + interval_minutes=constants.BILHETAGEM_AUXILIAR_INTERVAL.value + ) rename_flow_run = rename_current_flow_run_now_time( prefix=bilhetagem_transacao_tratamento.name + " ", diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index 0c8c5e134..a85e2af73 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -228,6 +228,8 @@ class constants(Enum): # pylint: disable=c0103 BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" + BILHETAGEM_AUXILIAR_INTERVAL = 60 + BILHETAGEM_CAPTURE_PARAMS = [ { "table_id": "linha", @@ -245,7 +247,7 @@ class constants(Enum): # pylint: disable=c0103 """, }, "primary_key": ["CD_LINHA"], # id column to nest data on - "interval_minutes": 60, + "interval_minutes": BILHETAGEM_AUXILIAR_INTERVAL, "truncate_hour": True, }, { @@ -264,7 +266,7 @@ class constants(Enum): # pylint: disable=c0103 """, }, "primary_key": ["CD_GRUPO"], # id column to nest data on - "interval_minutes": 60, + "interval_minutes": BILHETAGEM_AUXILIAR_INTERVAL, "truncate_hour": True, }, { @@ -283,7 +285,7 @@ class constants(Enum): # pylint: disable=c0103 """, }, "primary_key": ["CD_GRUPO", "CD_LINHA"], - "interval_minutes": 60, + "interval_minutes": BILHETAGEM_AUXILIAR_INTERVAL, "truncate_hour": True, }, { @@ -305,7 +307,7 @@ class constants(Enum): # pylint: disable=c0103 "cd_versao_matriz", "cd_integracao", ], # id column to nest data on - "interval_minutes": 60, + "interval_minutes": BILHETAGEM_AUXILIAR_INTERVAL, "truncate_hour": True, }, ] From 9a057080da825807b738de162b8cdded57150c7c Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 23 Oct 2023 15:42:15 -0300 Subject: [PATCH 142/145] criar recaptura gps --- .../br_rj_riodejaneiro_bilhetagem/flows.py | 36 +++++++++++++++++-- pipelines/rj_smtr/constants.py | 10 +++--- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 03293ca0c..02c675860 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -70,7 +70,7 @@ bilhetagem_tracking_captura.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) bilhetagem_tracking_captura.run_config = KubernetesRun( image=emd_constants.DOCKER_IMAGE.value, - labels=[emd_constants.RJ_SMTR_DEV_AGENT_LABEL.value], + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_tracking_captura = set_default_parameters( @@ -132,7 +132,7 @@ # Configuração # timestamp = get_rounded_timestamp( - interval_minutes=constants.BILHETAGEM_AUXILIAR_INTERVAL.value + interval_minutes=constants.BILHETAGEM_TRATAMENTO_INTERVAL.value ) rename_flow_run = rename_current_flow_run_now_time( @@ -213,3 +213,35 @@ labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value], ) bilhetagem_transacao_tratamento.schedule = every_hour + + +with Flow( + "SMTR: Bilhetagem GPS Validador - Tratamento", + code_owners=["caio", "fernanda", "boris", "rodrigo"], +) as bilhetagem_gps_tratamento: + timestamp = get_rounded_timestamp( + interval_minutes=constants.BILHETAGEM_TRATAMENTO_INTERVAL.value + ) + + rename_flow_run = 
rename_current_flow_run_now_time( + prefix=bilhetagem_transacao_tratamento.name + " ", + now_time=timestamp, + ) + + LABELS = get_current_flow_labels() + + # Recaptura Transação + + run_recaptura_gps = create_flow_run( + flow_name=bilhetagem_recaptura.name, + project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, + labels=LABELS, + parameters=constants.BILHETAGEM_TRACKING_CAPTURE_PARAMS.value, + ) + + wait_recaptura_gps = wait_for_flow_run( + run_recaptura_gps, + stream_states=True, + stream_logs=True, + raise_final_state=True, + ) diff --git a/pipelines/rj_smtr/constants.py b/pipelines/rj_smtr/constants.py index a85e2af73..a4a8c375f 100644 --- a/pipelines/rj_smtr/constants.py +++ b/pipelines/rj_smtr/constants.py @@ -228,7 +228,7 @@ class constants(Enum): # pylint: disable=c0103 BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" - BILHETAGEM_AUXILIAR_INTERVAL = 60 + BILHETAGEM_TRATAMENTO_INTERVAL = 60 BILHETAGEM_CAPTURE_PARAMS = [ { @@ -247,7 +247,7 @@ class constants(Enum): # pylint: disable=c0103 """, }, "primary_key": ["CD_LINHA"], # id column to nest data on - "interval_minutes": BILHETAGEM_AUXILIAR_INTERVAL, + "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, "truncate_hour": True, }, { @@ -266,7 +266,7 @@ class constants(Enum): # pylint: disable=c0103 """, }, "primary_key": ["CD_GRUPO"], # id column to nest data on - "interval_minutes": BILHETAGEM_AUXILIAR_INTERVAL, + "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, "truncate_hour": True, }, { @@ -285,7 +285,7 @@ class constants(Enum): # pylint: disable=c0103 """, }, "primary_key": ["CD_GRUPO", "CD_LINHA"], - "interval_minutes": BILHETAGEM_AUXILIAR_INTERVAL, + "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, "truncate_hour": True, }, { @@ -307,7 +307,7 @@ class constants(Enum): # pylint: disable=c0103 "cd_versao_matriz", "cd_integracao", ], # id column to nest data on - "interval_minutes": BILHETAGEM_AUXILIAR_INTERVAL, + "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, "truncate_hour": True, }, ] From b5a403d9f9f8822487b43f15068100f90b3a5886 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 23 Oct 2023 15:42:26 -0300 Subject: [PATCH 143/145] corrigir docstring --- pipelines/rj_smtr/tasks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py index 2c7e60c16..9a8188ebe 100644 --- a/pipelines/rj_smtr/tasks.py +++ b/pipelines/rj_smtr/tasks.py @@ -424,7 +424,6 @@ def query_logs( max_recaptures (int, optional): maximum number of recaptures to be done interval_minutes (int, optional): interval in minutes between each recapture recapture_window_days (int, optional): Number of days to query for erros - truncate_hour: whether to truncate the timestamp to the hour or not Returns: lists: errors (bool), From 165f9abedd825ac8aff14fe2215f2f1ed659b996 Mon Sep 17 00:00:00 2001 From: Rafael Date: Mon, 23 Oct 2023 15:43:47 -0300 Subject: [PATCH 144/145] alterar comentario recaptura --- pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py index 02c675860..04f6eb61f 100644 --- a/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py +++ b/pipelines/rj_smtr/br_rj_riodejaneiro_bilhetagem/flows.py @@ -230,7 +230,7 @@ LABELS = get_current_flow_labels() - # Recaptura Transação + # Recaptura GPS run_recaptura_gps = create_flow_run( flow_name=bilhetagem_recaptura.name, From e8711b6bc9cbcc4bbfee46c7e2fc8752cccd231d 
Mon Sep 17 00:00:00 2001
From: Rafael
Date: Mon, 23 Oct 2023 15:51:34 -0300
Subject: [PATCH 145/145] voltar task get_current_timestamp

---
 pipelines/rj_smtr/tasks.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pipelines/rj_smtr/tasks.py b/pipelines/rj_smtr/tasks.py
index 9a8188ebe..236988282 100644
--- a/pipelines/rj_smtr/tasks.py
+++ b/pipelines/rj_smtr/tasks.py
@@ -297,9 +297,8 @@ def get_current_timestamp(timestamp=None, truncate_minute: bool = True) -> datetime:
     if not timestamp:
         timestamp = datetime.now(tz=timezone(constants.TIMEZONE.value))
     if truncate_minute:
-        timestamp = timestamp.replace(second=0, microsecond=0)
-
-    return timestamp
+        return timestamp.replace(second=0, microsecond=0)
+    return timestamp
 
 
 @task
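
The core helper this series converges on is get_rounded_timestamp, which floors a reference timestamp to the capture interval so that scheduled runs and recaptures agree on the same partition boundary. Below is a minimal standalone sketch of that rounding behaviour, assuming only the standard library: the function name round_timestamp, the sample timestamps, and the omission of the project timezone (constants.TIMEZONE) and ISO-string handling are illustrative assumptions, not part of the repository's task.

from datetime import datetime
from typing import Optional


def round_timestamp(timestamp: datetime, interval_minutes: Optional[int] = None) -> datetime:
    """Drop seconds/microseconds and floor the minutes to the capture interval."""
    timestamp = timestamp.replace(second=0, microsecond=0)

    if interval_minutes:
        if interval_minutes >= 60:
            # Intervals of an hour or more keep only the sub-hour remainder,
            # so interval_minutes=60 becomes 0 and the minutes reset to zero.
            interval_minutes = round((interval_minutes / 60 % 1) * 60)

        if interval_minutes == 0:
            rounded_minutes = 0
        else:
            rounded_minutes = (timestamp.minute // interval_minutes) * interval_minutes

        timestamp = timestamp.replace(minute=rounded_minutes)

    return timestamp


if __name__ == "__main__":
    reference = datetime(2023, 10, 23, 15, 47, 31)
    print(round_timestamp(reference, interval_minutes=60))  # 2023-10-23 15:00:00
    print(round_timestamp(reference, interval_minutes=10))  # 2023-10-23 15:40:00

With an hourly interval (the BILHETAGEM_TRATAMENTO_INTERVAL case) every run inside the same hour resolves to the top of that hour, which is what lets the treatment flow and the mapped auxiliary capture/recapture flows share one timestamp without passing it as a parameter.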