-
-
Notifications
You must be signed in to change notification settings - Fork 88
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
- Loading branch information
1 parent
086d6ac
commit f849136
Showing
8 changed files
with
625 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
Como capturar os dados de world_wb_wwbi? | ||
|
||
Para capturar esses dados, basta verificar o link dos dados originais indicado em dataset_config.yaml no item website. | ||
|
||
Caso tenha sido utilizado algum código de captura ou tratamento, este estará contido em code/. Se o dado publicado for em sua versão bruta, não existirá a pasta code/. | ||
|
||
Os dados publicados estão disponíveis em: https://basedosdados.org/dataset/world-wb-wwbi |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,287 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "85c05889", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import os\n", | ||
"#import basedosdados" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "6f52c930", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"path = os.path.abspath(os.path.join('..', 'input'))\n", | ||
"path_data = os.path.join(path, 'WWBIData.csv')\n", | ||
"df = pd.read_csv(path_data)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 24, | ||
"id": "374fa6f9", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"202" | ||
] | ||
}, | ||
"execution_count": 24, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"len(df['Country Code'].unique())" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "d41e3750", | ||
"metadata": { | ||
"scrolled": true | ||
}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>year</th>\n", | ||
" <th>country_name</th>\n", | ||
" <th>country_code</th>\n", | ||
" <th>indicator</th>\n", | ||
" <th>score</th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <th>0</th>\n", | ||
" <td>2000</td>\n", | ||
" <td>Afghanistan</td>\n", | ||
" <td>AFG</td>\n", | ||
" <td>Core Public Administration workers, as a share...</td>\n", | ||
" <td>NaN</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>1</th>\n", | ||
" <td>2000</td>\n", | ||
" <td>Afghanistan</td>\n", | ||
" <td>AFG</td>\n", | ||
" <td>Core Public Administration workers, as a share...</td>\n", | ||
" <td>NaN</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>2</th>\n", | ||
" <td>2000</td>\n", | ||
" <td>Afghanistan</td>\n", | ||
" <td>AFG</td>\n", | ||
" <td>Core Public Administration workers, as a share...</td>\n", | ||
" <td>NaN</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>3</th>\n", | ||
" <td>2000</td>\n", | ||
" <td>Afghanistan</td>\n", | ||
" <td>AFG</td>\n", | ||
" <td>Cross-country public sector pay comparison rat...</td>\n", | ||
" <td>NaN</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>4</th>\n", | ||
" <td>2000</td>\n", | ||
" <td>Afghanistan</td>\n", | ||
" <td>AFG</td>\n", | ||
" <td>Cross-country public sector pay comparison rat...</td>\n", | ||
" <td>NaN</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>...</th>\n", | ||
" <td>...</td>\n", | ||
" <td>...</td>\n", | ||
" <td>...</td>\n", | ||
" <td>...</td>\n", | ||
" <td>...</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>1281079</th>\n", | ||
" <td>2020</td>\n", | ||
" <td>Zimbabwe</td>\n", | ||
" <td>ZWE</td>\n", | ||
" <td>Teachers, as a share of public formal employees</td>\n", | ||
" <td>NaN</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>1281080</th>\n", | ||
" <td>2020</td>\n", | ||
" <td>Zimbabwe</td>\n", | ||
" <td>ZWE</td>\n", | ||
" <td>Teachers, as a share of public paid employees</td>\n", | ||
" <td>NaN</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>1281081</th>\n", | ||
" <td>2020</td>\n", | ||
" <td>Zimbabwe</td>\n", | ||
" <td>ZWE</td>\n", | ||
" <td>Teachers, as a share of public total employees</td>\n", | ||
" <td>NaN</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>1281082</th>\n", | ||
" <td>2020</td>\n", | ||
" <td>Zimbabwe</td>\n", | ||
" <td>ZWE</td>\n", | ||
" <td>Wage bill as a percentage of GDP</td>\n", | ||
" <td>6.440993</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>1281083</th>\n", | ||
" <td>2020</td>\n", | ||
" <td>Zimbabwe</td>\n", | ||
" <td>ZWE</td>\n", | ||
" <td>Wage bill as a percentage of Public Expenditure</td>\n", | ||
" <td>41.238518</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"<p>1281084 rows × 5 columns</p>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" year country_name country_code \\\n", | ||
"0 2000 Afghanistan AFG \n", | ||
"1 2000 Afghanistan AFG \n", | ||
"2 2000 Afghanistan AFG \n", | ||
"3 2000 Afghanistan AFG \n", | ||
"4 2000 Afghanistan AFG \n", | ||
"... ... ... ... \n", | ||
"1281079 2020 Zimbabwe ZWE \n", | ||
"1281080 2020 Zimbabwe ZWE \n", | ||
"1281081 2020 Zimbabwe ZWE \n", | ||
"1281082 2020 Zimbabwe ZWE \n", | ||
"1281083 2020 Zimbabwe ZWE \n", | ||
"\n", | ||
" indicator score \n", | ||
"0 Core Public Administration workers, as a share... NaN \n", | ||
"1 Core Public Administration workers, as a share... NaN \n", | ||
"2 Core Public Administration workers, as a share... NaN \n", | ||
"3 Cross-country public sector pay comparison rat... NaN \n", | ||
"4 Cross-country public sector pay comparison rat... NaN \n", | ||
"... ... ... \n", | ||
"1281079 Teachers, as a share of public formal employees NaN \n", | ||
"1281080 Teachers, as a share of public paid employees NaN \n", | ||
"1281081 Teachers, as a share of public total employees NaN \n", | ||
"1281082 Wage bill as a percentage of GDP 6.440993 \n", | ||
"1281083 Wage bill as a percentage of Public Expenditure 41.238518 \n", | ||
"\n", | ||
"[1281084 rows x 5 columns]" | ||
] | ||
}, | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# Reshape from wide (one column per year, 2000-2020) to long\n", | ||
"# (one row per country / indicator / year).\n", | ||
"# The per-year frames are collected in a list and concatenated once:\n", | ||
"# DataFrame.append was removed in pandas 2.0 and re-copied the whole\n", | ||
"# accumulated frame on every iteration.\n", | ||
"frames = []\n", | ||
"for year in range(2000, 2021):\n", | ||
"    # The source CSV names each year column with the year as a string.\n", | ||
"    frame = df[['Country Name', 'Country Code', 'Indicator Name', str(year)]].copy()\n", | ||
"    frame.columns = ['country_name', 'country_code', 'indicator', 'score']\n", | ||
"    frame['year'] = year\n", | ||
"    frames.append(frame)\n", | ||
"\n", | ||
"df2 = pd.concat(frames, ignore_index=True)\n", | ||
"df2 = df2[['year', 'country_name', 'country_code', 'indicator', 'score']]\n", | ||
"df2['year'] = df2['year'].astype(int)\n", | ||
"df2['score'] = df2['score'].astype('float')\n", | ||
"df2" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "1c75b7c7", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"output = os.path.abspath(os.path.join('..', 'output'))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "cb8f4261", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Create one partition directory (year=YYYY) for each year from 2000 to 2020.\n", | ||
"for ano in range(2000, 2021):\n", | ||
"    particao = output + f'/country_indicators/year={ano}/'\n", | ||
"    # exist_ok=True makes re-runs safe without a separate os.path.exists check,\n", | ||
"    # which could race with another process creating the same directory.\n", | ||
"    os.makedirs(particao, exist_ok=True)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"id": "8d9730c3", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Write one CSV per year into the matching year=YYYY directory.\n", | ||
"for ano in range(2000, 2021):\n", | ||
"    rows_for_year = df2['year'] == ano\n", | ||
"    # The partition column is carried by the directory name, so it is\n", | ||
"    # dropped from the rows written to the file.\n", | ||
"    df_particao = df2.loc[rows_for_year].drop(columns=['year'])\n", | ||
"    particao = output + f'/country_indicators/year={ano}/country_indicators.csv'\n", | ||
"    df_particao.to_csv(particao, index=False, encoding='utf-8', na_rep='')\n" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.12" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
/* | ||
Query to publish the table. | ||
It creates the view world_wb_wwbi.country_indicators by casting each column | ||
of the staging table to its published type. | ||
This is the place to: | ||
- modify column names, order, and types | ||
- join with other tables | ||
- create extra columns (e.g. logs, proportions, etc.) | ||
Any column defined here must also exist in `table_config.yaml`. | ||
# In addition, feel free to change obscure names | ||
# to something a little more explicit. | ||
TYPES: | ||
- To change a column's type, just replace STRING with another valid type. | ||
- Example: `SAFE_CAST(column_name AS NUMERIC) column_name` | ||
- More details: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types | ||
*/ | ||
|
||
CREATE VIEW basedosdados-dev.world_wb_wwbi.country_indicators AS | ||
SELECT | ||
SAFE_CAST(year AS INT64) year, | ||
SAFE_CAST(country_name AS STRING) country_name, | ||
SAFE_CAST(country_code AS STRING) country_code, | ||
SAFE_CAST(indicator AS STRING) indicator, | ||
SAFE_CAST(score AS FLOAT64) score | ||
FROM basedosdados-dev.world_wb_wwbi_staging.country_indicators AS t |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
[{"name": "year", "bigquery_type": "int64", "description": "Year", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_data_tempo", "table_id": "ano", "column_name": "ano"}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": true, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "country_name", "bigquery_type": "string", "description": "Name of the country", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_mundo", "table_id": "pais", "column_name": "nome_ingles"}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "country_code", "bigquery_type": "string", "description": "Country 3-letter ISO code", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_mundo", "table_id": "pais", "column_name": "sigla_pais_iso3"}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "indicator", "bigquery_type": "string", "description": "Indicator name", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "score", "bigquery_type": "float64", "description": "Score of indicators", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, 
"type": "FLOAT", "mode": "NULLABLE"}] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
[{"name": "country_name", "bigquery_type": "string", "description": "Name of the country", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_mundo", "table_id": "pais", "column_name": "nome_ingles"}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "country_code", "bigquery_type": "string", "description": "Country 3-letter ISO code", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_mundo", "table_id": "pais", "column_name": "sigla_pais_iso3"}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "indicator", "bigquery_type": "string", "description": "Indicator name", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "score", "bigquery_type": "float64", "description": "Score of indicators", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING"}] |
Oops, something went wrong.