From 320b10ebcc6be43a9f428a65d27629b2111d2527 Mon Sep 17 00:00:00 2001 From: Daniel Herszenhut Date: Wed, 6 Nov 2024 16:27:12 -0300 Subject: [PATCH] padronizacao --- R/padronizacao.R | 108 ++++++++++++- _targets.R | 5 +- _targets/meta/meta | 4 +- renv.lock | 391 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 503 insertions(+), 5 deletions(-) diff --git a/R/padronizacao.R b/R/padronizacao.R index 0970758..fa92d53 100644 --- a/R/padronizacao.R +++ b/R/padronizacao.R @@ -9,10 +9,114 @@ padronizar_cnefe <- function() { "nom_titulo_seglogr", # titulo (e.g. general, papa, santa, etc) "nom_seglogr", # logradouro "num_adress", # numero - "dsc_modificador", # modificador do numero - "cep", # cep "lon", # longitude "lat", # latituted "nv_geo_coord" # nivel de geocodificacao ) + + cnefe <- ipeadatalake::ler_cnefe(2022, colunas = colunas_a_manter) + + # se número == 0, setar NA, que vira S/N depois + + cnefe <- mutate( + cnefe, + num_adress = ifelse(num_adress == 0, NA_integer_, num_adress) + ) + + # existem casos em que o titulo do logradouro é repetido no nome do + # logradouro. isso acontece mesmo quando o título do logradouro tem até 3 + # palavras. só podemos juntar o nome com o titulo nos casos em que essa + # repetição não ocorre + + cnefe <- mutate( + cnefe, + nwords_titulo = stringr::str_count(nom_titulo_seglogr, "\\S+") + ) + + cnefe <- data.table::setDT(collect(cnefe)) + + cnefe[nwords_titulo == 1, comeco_logr := stringr::word(nom_seglogr, 1, 1)] + cnefe[nwords_titulo == 2, comeco_logr := stringr::word(nom_seglogr, 1, 2)] + cnefe[nwords_titulo == 3, comeco_logr := stringr::word(nom_seglogr, 1, 3)] + cnefe[nom_titulo_seglogr == comeco_logr, juntar := FALSE] + cnefe[nwords_titulo == 0, juntar := FALSE] + cnefe[is.na(juntar), juntar := TRUE] + + cnefe[ + , + nome_logradouro := ifelse( + juntar, + paste(nom_titulo_seglogr, nom_seglogr), + nom_seglogr + ) + ] + cnefe[, c("nom_titulo_seglogr", "nom_seglogr") := NULL] + cnefe[, c("nwords_titulo", "comeco_logr", "juntar") := NULL] + + cnefe[, estado := enderecobr::padronizar_estados(code_state)] + cnefe[, code_state := NULL] + + cnefe[, municipio := enderecobr::padronizar_municipios(code_muni)] + cnefe[, code_muni := NULL] + + cnefe[, cep := enderecobr::padronizar_ceps(cep)] + + cnefe[, numero := enderecobr::padronizar_numeros(num_adress)] + cnefe[, num_adress := NULL] + + cnefe[ + , + `:=`( + logradouro_completo = paste(nom_tipo_seglogr, nome_logradouro, numero), + logradouro_sem_numero = paste(nom_tipo_seglogr, nome_logradouro) + ) + ] + + data.table::setnames( + cnefe, + old = c("code_address", "desc_localidade", "nom_tipo_seglogr"), + new = c("codigo_endereco", "localidade", "tipo_logradouro") + ) + + data.table::setcolorder( + cnefe, + c( + "codigo_endereco", "estado", "municipio", "localidade", "cep", + "tipo_logradouro", "nome_logradouro", "numero", "logradouro_sem_numero", + "logradouro_completo", "lon", "lat", "nv_geo_coord" + ) + ) + + schema_cnefe <- arrow::schema( + codigo_endereco = arrow::int32(), + estado = arrow::string(), + municipio = arrow::string(), + localidade = arrow::string(), + cep = arrow::string(), + tipo_logradouro = arrow::string(), + nome_logradouro = arrow::string(), + numero = arrow::string(), + logradouro_sem_numero = arrow::large_utf8(), + logradouro_completo = arrow::large_utf8(), + lon = arrow::float64(), + lat = arrow::float64(), + nv_geo_coord = arrow::int8() + ) + + cnefe_arrow <- arrow::as_arrow_table(cnefe, schema = schema_cnefe) + + dir_dados <- file.path( + Sys.getenv("USERS_DATA_PATH"), + "CGDTI/IpeaDataLab/projetos/geolocalizacao/cnefe_padronizado_tmpdir" + ) + + arrow::write_dataset( + cnefe_arrow, + path = dir_dados, + format = "parquet", + partitioning = "estado", + hive_style = TRUE + ) + + return(dir_dados) } diff --git a/_targets.R b/_targets.R index bccc999..25f3bb6 100644 --- a/_targets.R +++ b/_targets.R @@ -1,9 +1,12 @@ suppressPackageStartupMessages({ library(targets) + library(dplyr) }) +tar_option_set(trust_timestamps = TRUE) + source("R/padronizacao.R", encoding = "UTF-8") list( - tar_target(padronizacao, padronizar_cnefe()) + tar_target(padronizacao, padronizar_cnefe(), format = "file") ) diff --git a/_targets/meta/meta b/_targets/meta/meta index 0dd957e..d382772 100644 --- a/_targets/meta/meta +++ b/_targets/meta/meta @@ -1,3 +1,3 @@ name|type|data|command|depend|seed|path|time|size|bytes|format|repository|iteration|parent|children|seconds|warnings|error -padronizar_cnefe|function|6c2e94b92c42135a -padronizacao|stem|274ca85b9d987ab1|0e8b7c33747183c6|3b04dbe22eee49f6|704861656||t20032.832925249s|7abe23fc6ac7bb03|175|rds|local|vector|||0|| +padronizacao|stem|cfcfd9014a4db4b8|0e8b7c33747183c6|c0a30c215126d086|704861656|//storage6/usuarios/CGDTI/IpeaDataLab/projetos/geolocalizacao/cnefe_padronizado_tmpdir|t20033.7973247289s|8c2f75acc7334810|4927165576|file|local|vector|||1434.67|Potentially unsafe or invalid elements have been discarded from R metadata.ℹ Type externalptr If you trust the source, you can set optionsarrow.unsafe_metadata TRUE to preserve them.. Potentially unsafe or invalid elements have been discarded from R metadata.ℹ Type externalptr If you trust the source, you can set optionsarrow.unsafe_metadata TRUE to preserve them.. Potentially unsafe or invalid elements have been discarded from R metadata.ℹ Type externalptr If you trust the source, you can set optionsarrow.unsafe_metadata TRUE to preserve them.| +padronizar_cnefe|function|6e9f9f3a976f8006||||||||||||||| diff --git a/renv.lock b/renv.lock index 1971010..542bd48 100644 --- a/renv.lock +++ b/renv.lock @@ -9,6 +9,17 @@ ] }, "Packages": { + "DBI": { + "Package": "DBI", + "Version": "1.2.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "methods" + ], + "Hash": "065ae649b05f1ff66bb0c793107508f5" + }, "Matrix": { "Package": "Matrix", "Version": "1.7-1", @@ -36,6 +47,63 @@ ], "Hash": "470851b6d5d0ac559e9d01bb352b4021" }, + "Rcpp": { + "Package": "Rcpp", + "Version": "1.0.13-1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "methods", + "utils" + ], + "Hash": "6b868847b365672d6c1677b1608da9ed" + }, + "RcppArmadillo": { + "Package": "RcppArmadillo", + "Version": "14.0.2-1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "Rcpp", + "methods", + "stats", + "utils" + ], + "Hash": "edff747eebfb8f2e18eed194e000caa1" + }, + "arrow": { + "Package": "arrow", + "Version": "17.0.0.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "assertthat", + "bit64", + "cpp11", + "glue", + "methods", + "purrr", + "rlang", + "stats", + "tidyselect", + "utils", + "vctrs" + ], + "Hash": "14af96cb2973f6a6c220ce9c3e5b02cd" + }, + "assertthat": { + "Package": "assertthat", + "Version": "0.2.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "tools" + ], + "Hash": "50c838a310445e954bc13f26f26a6ecf" + }, "backports": { "Package": "backports", "Version": "1.5.0", @@ -56,6 +124,30 @@ ], "Hash": "0c54cf3a08cc0e550fbd64ad33166143" }, + "bit": { + "Package": "bit", + "Version": "4.5.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "5dc7b2677d65d0e874fc4aaf0e879987" + }, + "bit64": { + "Package": "bit64", + "Version": "4.5.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "bit", + "methods", + "stats", + "utils" + ], + "Hash": "e84984bf5f12a18628d9a02322128dfd" + }, "callr": { "Package": "callr", "Version": "3.7.6", @@ -69,6 +161,37 @@ ], "Hash": "d7e13f49c19103ece9e58ad2d83a7354" }, + "censobr": { + "Package": "censobr", + "Version": "0.4.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "DBI", + "R", + "arrow", + "checkmate", + "curl", + "dplyr", + "duckdb", + "duckplyr", + "fs", + "tools" + ], + "Hash": "1fda43f008044c4e7ada6a4b26c113a7" + }, + "checkmate": { + "Package": "checkmate", + "Version": "2.3.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "backports", + "utils" + ], + "Hash": "0e14e01ce07e7c88fd25de6d4260d26b" + }, "cli": { "Package": "cli", "Version": "3.6.3", @@ -90,6 +213,13 @@ ], "Hash": "61e097f35917d342622f21cdc79c256e" }, + "collections": { + "Package": "collections", + "Version": "0.3.7", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "90a0eda114ab0bef170ddbf5ef0cd93f" + }, "cpp11": { "Package": "cpp11", "Version": "0.5.0", @@ -100,6 +230,16 @@ ], "Hash": "91570bba75d0c9d3f1040c835cee8fba" }, + "curl": { + "Package": "curl", + "Version": "5.2.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "d91263322a58af798f6cf3b13fd56dde" + }, "data.table": { "Package": "data.table", "Version": "1.16.2", @@ -111,6 +251,82 @@ ], "Hash": "2e00b378fc3be69c865120d9f313039a" }, + "dplyr": { + "Package": "dplyr", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "cli", + "generics", + "glue", + "lifecycle", + "magrittr", + "methods", + "pillar", + "rlang", + "tibble", + "tidyselect", + "utils", + "vctrs" + ], + "Hash": "fedd9d00c2944ff00a0e2696ccf048ec" + }, + "duckdb": { + "Package": "duckdb", + "Version": "1.1.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "DBI", + "R", + "methods", + "utils" + ], + "Hash": "83a09ee9c8380fecfcea1daeaa99e3b2" + }, + "duckplyr": { + "Package": "duckplyr", + "Version": "0.4.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "DBI", + "R", + "cli", + "collections", + "dplyr", + "duckdb", + "glue", + "jsonlite", + "lifecycle", + "rlang", + "tibble", + "tidyselect", + "utils", + "vctrs" + ], + "Hash": "8de3d932a099f4c19907b4e459d854eb" + }, + "enderecobr": { + "Package": "enderecobr", + "Version": "0.2.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "checkmate", + "cli", + "data.table", + "rlang", + "stringi", + "stringr", + "tibble" + ], + "Hash": "7f867231a2e699d85682dc118066e239" + }, "evaluate": { "Package": "evaluate", "Version": "1.0.1", @@ -133,6 +349,28 @@ ], "Hash": "962174cf2aeb5b9eea581522286a911f" }, + "fs": { + "Package": "fs", + "Version": "1.6.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "methods" + ], + "Hash": "7f48af39fa27711ea5fbd183b399920d" + }, + "generics": { + "Package": "generics", + "Version": "0.1.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "methods" + ], + "Hash": "15e9634c0fcd294799e9b2e929ed1b86" + }, "glue": { "Package": "glue", "Version": "1.8.0", @@ -178,6 +416,43 @@ ], "Hash": "c03878b48737a0e2da3b772d7b2e22da" }, + "ipeadatalake": { + "Package": "ipeadatalake", + "Version": "0.1.0", + "Source": "Git", + "RemoteType": "git2r", + "RemoteUrl": "https://gitlab.ipea.gov.br/data/ipeadatalake", + "RemoteRef": "v0.1.0", + "RemoteSha": "fc27f89ab3e493135bfb56934ce1b0cc448e15cf", + "Requirements": [ + "DBI", + "R", + "arrow", + "bit64", + "censobr", + "checkmate", + "cli", + "data.table", + "dplyr", + "duckdb", + "duckplyr", + "fs", + "lifecycle", + "rlang", + "survey" + ], + "Hash": "7a975cff84657cbd1efec1e77d38c5a2" + }, + "jsonlite": { + "Package": "jsonlite", + "Version": "1.8.9", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "methods" + ], + "Hash": "4e993b65c2c3ffbffce7bb3e2c6f832b" + }, "knitr": { "Package": "knitr", "Version": "1.48", @@ -232,6 +507,38 @@ ], "Hash": "7ce2733a9826b3aeb1775d56fd305472" }, + "minqa": { + "Package": "minqa", + "Version": "1.2.8", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "Rcpp" + ], + "Hash": "785ef8e22389d4a7634c6c944f2dc07d" + }, + "mitools": { + "Package": "mitools", + "Version": "2.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "DBI", + "methods", + "stats" + ], + "Hash": "a4b659bd0528226724d55034f11ed7cb" + }, + "numDeriv": { + "Package": "numDeriv", + "Version": "2016.8-1.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "df58958f293b166e4ab885ebcad90e02" + }, "pillar": { "Package": "pillar", "Version": "1.9.0", @@ -283,6 +590,21 @@ ], "Hash": "b4404b1de13758dea1c0484ad0d48563" }, + "purrr": { + "Package": "purrr", + "Version": "1.0.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cli", + "lifecycle", + "magrittr", + "rlang", + "vctrs" + ], + "Hash": "1cba04a4e9414bdefc9dcaa99649a8dc" + }, "renv": { "Package": "renv", "Version": "1.0.11", @@ -314,6 +636,75 @@ ], "Hash": "eaf84737a6da68c1e843979963c09a6b" }, + "stringi": { + "Package": "stringi", + "Version": "1.8.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "stats", + "tools", + "utils" + ], + "Hash": "39e1144fd75428983dc3f63aa53dfa91" + }, + "stringr": { + "Package": "stringr", + "Version": "1.5.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cli", + "glue", + "lifecycle", + "magrittr", + "rlang", + "stringi", + "vctrs" + ], + "Hash": "960e2ae9e09656611e0b8214ad543207" + }, + "survey": { + "Package": "survey", + "Version": "4.4-2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "Matrix", + "R", + "Rcpp", + "RcppArmadillo", + "graphics", + "grid", + "lattice", + "methods", + "minqa", + "mitools", + "numDeriv", + "splines", + "stats", + "survival" + ], + "Hash": "b29af45d3afe5f718e387688d43d71e6" + }, + "survival": { + "Package": "survival", + "Version": "3.7-0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "Matrix", + "R", + "graphics", + "methods", + "splines", + "stats", + "utils" + ], + "Hash": "5aaa9cbaf4aba20f8e06fdea1850a398" + }, "targets": { "Package": "targets", "Version": "1.8.0",