Skip to content

Commit

Permalink
dados (#1506)
Browse files Browse the repository at this point in the history
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
  • Loading branch information
gabrielle-carv and mergify[bot] authored Jan 6, 2023
1 parent 086d6ac commit f849136
Show file tree
Hide file tree
Showing 8 changed files with 625 additions and 0 deletions.
7 changes: 7 additions & 0 deletions bases/world_wb_wwbi/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Como capturar os dados de world_wb_wwbi?

Para capturar esses dados, basta verificar o link dos dados originais indicado em dataset_config.yaml no item website.

Caso tenha sido utilizado algum código de captura ou tratamento, ele estará contido em code/. Se o dado publicado estiver em sua versão bruta, não existirá a pasta code/.

Os dados publicados estão disponíveis em: https://basedosdados.org/dataset/world-wb-wwbi
287 changes: 287 additions & 0 deletions bases/world_wb_wwbi/code/data.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,287 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "85c05889",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import os\n",
"#import basedosdados"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "6f52c930",
"metadata": {},
"outputs": [],
"source": [
"# Load the raw WWBI extract from ../input (resolved against the current working directory).\n",
"path = os.path.abspath(os.path.join(os.pardir, 'input'))\n",
"path_data = os.path.join(path, 'WWBIData.csv')\n",
"df = pd.read_csv(filepath_or_buffer=path_data)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "374fa6f9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"202"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of distinct country codes in the raw file (a missing code, if present, counts once).\n",
"df['Country Code'].nunique(dropna=False)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "d41e3750",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>year</th>\n",
" <th>country_name</th>\n",
" <th>country_code</th>\n",
" <th>indicator</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2000</td>\n",
" <td>Afghanistan</td>\n",
" <td>AFG</td>\n",
" <td>Core Public Administration workers, as a share...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2000</td>\n",
" <td>Afghanistan</td>\n",
" <td>AFG</td>\n",
" <td>Core Public Administration workers, as a share...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2000</td>\n",
" <td>Afghanistan</td>\n",
" <td>AFG</td>\n",
" <td>Core Public Administration workers, as a share...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2000</td>\n",
" <td>Afghanistan</td>\n",
" <td>AFG</td>\n",
" <td>Cross-country public sector pay comparison rat...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2000</td>\n",
" <td>Afghanistan</td>\n",
" <td>AFG</td>\n",
" <td>Cross-country public sector pay comparison rat...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1281079</th>\n",
" <td>2020</td>\n",
" <td>Zimbabwe</td>\n",
" <td>ZWE</td>\n",
" <td>Teachers, as a share of public formal employees</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1281080</th>\n",
" <td>2020</td>\n",
" <td>Zimbabwe</td>\n",
" <td>ZWE</td>\n",
" <td>Teachers, as a share of public paid employees</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1281081</th>\n",
" <td>2020</td>\n",
" <td>Zimbabwe</td>\n",
" <td>ZWE</td>\n",
" <td>Teachers, as a share of public total employees</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1281082</th>\n",
" <td>2020</td>\n",
" <td>Zimbabwe</td>\n",
" <td>ZWE</td>\n",
" <td>Wage bill as a percentage of GDP</td>\n",
" <td>6.440993</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1281083</th>\n",
" <td>2020</td>\n",
" <td>Zimbabwe</td>\n",
" <td>ZWE</td>\n",
" <td>Wage bill as a percentage of Public Expenditure</td>\n",
" <td>41.238518</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1281084 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" year country_name country_code \\\n",
"0 2000 Afghanistan AFG \n",
"1 2000 Afghanistan AFG \n",
"2 2000 Afghanistan AFG \n",
"3 2000 Afghanistan AFG \n",
"4 2000 Afghanistan AFG \n",
"... ... ... ... \n",
"1281079 2020 Zimbabwe ZWE \n",
"1281080 2020 Zimbabwe ZWE \n",
"1281081 2020 Zimbabwe ZWE \n",
"1281082 2020 Zimbabwe ZWE \n",
"1281083 2020 Zimbabwe ZWE \n",
"\n",
" indicator score \n",
"0 Core Public Administration workers, as a share... NaN \n",
"1 Core Public Administration workers, as a share... NaN \n",
"2 Core Public Administration workers, as a share... NaN \n",
"3 Cross-country public sector pay comparison rat... NaN \n",
"4 Cross-country public sector pay comparison rat... NaN \n",
"... ... ... \n",
"1281079 Teachers, as a share of public formal employees NaN \n",
"1281080 Teachers, as a share of public paid employees NaN \n",
"1281081 Teachers, as a share of public total employees NaN \n",
"1281082 Wage bill as a percentage of GDP 6.440993 \n",
"1281083 Wage bill as a percentage of Public Expenditure 41.238518 \n",
"\n",
"[1281084 rows x 5 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Reshape the wide source layout (one column per year, '2000'..'2020') into long\n",
"# format: one row per (year, country, indicator) with the value in 'score'.\n",
"#\n",
"# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and appending\n",
"# inside a loop re-copies the accumulated frame on every pass. A single melt\n",
"# stacks the year columns in the order given, so the rows come out year-major\n",
"# with a fresh 0..n-1 index.\n",
"year_columns = [str(year) for year in range(2000, 2021)]\n",
"\n",
"df2 = (\n",
"    df.melt(\n",
"        id_vars=['Country Name', 'Country Code', 'Indicator Name'],\n",
"        value_vars=year_columns,\n",
"        var_name='year',\n",
"        value_name='score',\n",
"    )\n",
"    .rename(columns={\n",
"        'Country Name': 'country_name',\n",
"        'Country Code': 'country_code',\n",
"        'Indicator Name': 'indicator',\n",
"    })\n",
"    # The year labels are column names (strings) in the source; store them as integers.\n",
"    .astype({'year': int, 'score': 'float'})\n",
")\n",
"\n",
"# Put the partition column first, followed by the identifiers and the value.\n",
"df2 = df2[['year', 'country_name', 'country_code', 'indicator', 'score']]\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1c75b7c7",
"metadata": {},
"outputs": [],
"source": [
"# Absolute path of ../output (resolved against the current working directory).\n",
"output = os.path.abspath(os.path.join(os.pardir, 'output'))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "cb8f4261",
"metadata": {},
"outputs": [],
"source": [
"# Create one partition folder (year=<ano>) per year under <output>/country_indicators.\n",
"for ano in range(2000, 2021):\n",
"    particao = output + f'/country_indicators/year={ano}/'\n",
"    # exist_ok=True keeps the cell safe to re-run and avoids the\n",
"    # check-then-create race of a separate os.path.exists() test.\n",
"    os.makedirs(particao, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "8d9730c3",
"metadata": {},
"outputs": [],
"source": [
"# Write each year's rows into its partition folder as country_indicators.csv.\n",
"for ano in range(2000, 2021):\n",
"    # The column used for partitioning must be excluded from the file itself;\n",
"    # .drop() returns a new frame, so df2 is left untouched.\n",
"    df_particao = df2.loc[df2['year'] == ano].drop(columns=['year'])\n",
"    particao = output + f'/country_indicators/year={ano}/country_indicators.csv'\n",
"    df_particao.to_csv(particao, index=False, encoding='utf-8', na_rep='')\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
27 changes: 27 additions & 0 deletions bases/world_wb_wwbi/country_indicators/publish.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
Query para publicar a tabela.
Esse é o lugar para:
- modificar nomes, ordem e tipos de colunas
- dar join com outras tabelas
- criar colunas extras (e.g. logs, proporções, etc.)
Qualquer coluna definida aqui deve também existir em `table_config.yaml`.
# Além disso, sinta-se à vontade para alterar alguns nomes obscuros
# para algo um pouco mais explícito.
TIPOS:
- Para modificar tipos de colunas, basta substituir STRING por outro tipo válido.
- Exemplo: `SAFE_CAST(column_name AS NUMERIC) column_name`
- Mais detalhes: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
*/

-- Publishes the typed view over the staging table. Every column is cast
-- explicitly; SAFE_CAST yields NULL instead of raising when a value cannot
-- be converted.
CREATE VIEW basedosdados-dev.world_wb_wwbi.country_indicators AS
SELECT
    SAFE_CAST(year AS INT64) AS year,
    SAFE_CAST(country_name AS STRING) AS country_name,
    SAFE_CAST(country_code AS STRING) AS country_code,
    SAFE_CAST(indicator AS STRING) AS indicator,
    SAFE_CAST(score AS FLOAT64) AS score
FROM basedosdados-dev.world_wb_wwbi_staging.country_indicators AS t
1 change: 1 addition & 0 deletions bases/world_wb_wwbi/country_indicators/schema-prod.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[{"name": "year", "bigquery_type": "int64", "description": "Year", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_data_tempo", "table_id": "ano", "column_name": "ano"}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": true, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "country_name", "bigquery_type": "string", "description": "Name of the country", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_mundo", "table_id": "pais", "column_name": "nome_ingles"}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "country_code", "bigquery_type": "string", "description": "Country 3-letter ISO code", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_mundo", "table_id": "pais", "column_name": "sigla_pais_iso3"}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "indicator", "bigquery_type": "string", "description": "Indicator name", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "score", "bigquery_type": "float64", "description": "Score of indicators", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, 
"type": "FLOAT", "mode": "NULLABLE"}]
1 change: 1 addition & 0 deletions bases/world_wb_wwbi/country_indicators/schema-staging.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[{"name": "country_name", "bigquery_type": "string", "description": "Name of the country", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_mundo", "table_id": "pais", "column_name": "nome_ingles"}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "country_code", "bigquery_type": "string", "description": "Country 3-letter ISO code", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_mundo", "table_id": "pais", "column_name": "sigla_pais_iso3"}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "indicator", "bigquery_type": "string", "description": "Indicator name", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "score", "bigquery_type": "float64", "description": "Score of indicators", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING"}]
Loading

0 comments on commit f849136

Please sign in to comment.