dados (#1506)

Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
basedosdados · Jan 6, 2023 · f849136 · f849136
1 parent 086d6ac
commit f849136
Show file tree

Hide file tree

Showing 8 changed files with 625 additions and 0 deletions.
diff --git a/bases/world_wb_wwbi/README.md b/bases/world_wb_wwbi/README.md
@@ -0,0 +1,7 @@
+Como capturar os dados de world_wb_wwbi?
+
+Para capturar esses dados, basta verificar o link dos dados originais indicado em dataset_config.yaml no item website.
+
+Caso tenha sido utilizado algum código de captura ou tratamento, ele estará contido em code/. Se os dados publicados estiverem em sua versão bruta, a pasta code/ não existirá.
+
+Os dados publicados estão disponíveis em: https://basedosdados.org/dataset/world-wb-wwbi
diff --git a/bases/world_wb_wwbi/code/data.ipynb b/bases/world_wb_wwbi/code/data.ipynb
@@ -0,0 +1,287 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "85c05889",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import os\n",
+    "#import basedosdados"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "6f52c930",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "path = os.path.abspath(os.path.join(os.pardir, 'input'))  # ../input, resolved against the working directory\n",
+    "path_data = os.path.join(path, 'WWBIData.csv')\n",
+    "df = pd.read_csv(filepath_or_buffer=path_data)  # raw WWBI export"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "374fa6f9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "202"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(pd.unique(df['Country Code']))  # how many distinct country codes the raw file holds"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "d41e3750",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>year</th>\n",
+       "      <th>country_name</th>\n",
+       "      <th>country_code</th>\n",
+       "      <th>indicator</th>\n",
+       "      <th>score</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>2000</td>\n",
+       "      <td>Afghanistan</td>\n",
+       "      <td>AFG</td>\n",
+       "      <td>Core Public Administration workers, as a share...</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2000</td>\n",
+       "      <td>Afghanistan</td>\n",
+       "      <td>AFG</td>\n",
+       "      <td>Core Public Administration workers, as a share...</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2000</td>\n",
+       "      <td>Afghanistan</td>\n",
+       "      <td>AFG</td>\n",
+       "      <td>Core Public Administration workers, as a share...</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>2000</td>\n",
+       "      <td>Afghanistan</td>\n",
+       "      <td>AFG</td>\n",
+       "      <td>Cross-country public sector pay comparison rat...</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>2000</td>\n",
+       "      <td>Afghanistan</td>\n",
+       "      <td>AFG</td>\n",
+       "      <td>Cross-country public sector pay comparison rat...</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1281079</th>\n",
+       "      <td>2020</td>\n",
+       "      <td>Zimbabwe</td>\n",
+       "      <td>ZWE</td>\n",
+       "      <td>Teachers, as a share of public formal employees</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1281080</th>\n",
+       "      <td>2020</td>\n",
+       "      <td>Zimbabwe</td>\n",
+       "      <td>ZWE</td>\n",
+       "      <td>Teachers, as a share of public paid employees</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1281081</th>\n",
+       "      <td>2020</td>\n",
+       "      <td>Zimbabwe</td>\n",
+       "      <td>ZWE</td>\n",
+       "      <td>Teachers, as a share of public total employees</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1281082</th>\n",
+       "      <td>2020</td>\n",
+       "      <td>Zimbabwe</td>\n",
+       "      <td>ZWE</td>\n",
+       "      <td>Wage bill as a percentage of GDP</td>\n",
+       "      <td>6.440993</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1281083</th>\n",
+       "      <td>2020</td>\n",
+       "      <td>Zimbabwe</td>\n",
+       "      <td>ZWE</td>\n",
+       "      <td>Wage bill as a percentage of Public Expenditure</td>\n",
+       "      <td>41.238518</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>1281084 rows × 5 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         year country_name country_code  \\\n",
+       "0        2000  Afghanistan          AFG   \n",
+       "1        2000  Afghanistan          AFG   \n",
+       "2        2000  Afghanistan          AFG   \n",
+       "3        2000  Afghanistan          AFG   \n",
+       "4        2000  Afghanistan          AFG   \n",
+       "...       ...          ...          ...   \n",
+       "1281079  2020     Zimbabwe          ZWE   \n",
+       "1281080  2020     Zimbabwe          ZWE   \n",
+       "1281081  2020     Zimbabwe          ZWE   \n",
+       "1281082  2020     Zimbabwe          ZWE   \n",
+       "1281083  2020     Zimbabwe          ZWE   \n",
+       "\n",
+       "                                                 indicator      score  \n",
+       "0        Core Public Administration workers, as a share...        NaN  \n",
+       "1        Core Public Administration workers, as a share...        NaN  \n",
+       "2        Core Public Administration workers, as a share...        NaN  \n",
+       "3        Cross-country public sector pay comparison rat...        NaN  \n",
+       "4        Cross-country public sector pay comparison rat...        NaN  \n",
+       "...                                                    ...        ...  \n",
+       "1281079    Teachers, as a share of public formal employees        NaN  \n",
+       "1281080      Teachers, as a share of public paid employees        NaN  \n",
+       "1281081     Teachers, as a share of public total employees        NaN  \n",
+       "1281082                   Wage bill as a percentage of GDP   6.440993  \n",
+       "1281083    Wage bill as a percentage of Public Expenditure  41.238518  \n",
+       "\n",
+       "[1281084 rows x 5 columns]"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Wide -> long: the raw file has one column per year ('2000'..'2020'); melt() stacks them\n",
+    "# in that order into (year, score) pairs, so rows stay grouped by year with a fresh 0..N-1\n",
+    "# index. This replaces a per-year DataFrame.append loop: append was removed in pandas 2.0\n",
+    "# and re-copied the whole accumulated frame on every iteration.\n",
+    "id_columns = {'Country Name': 'country_name', 'Country Code': 'country_code', 'Indicator Name': 'indicator'}\n",
+    "year_columns = [str(ano) for ano in range(2000, 2021)]\n",
+    "df2 = df.melt(id_vars=list(id_columns), value_vars=year_columns, var_name='year', value_name='score')\n",
+    "df2 = df2.rename(columns=id_columns)\n",
+    "df2 = df2[['year', 'country_name', 'country_code', 'indicator', 'score']]\n",
+    "df2['year'] = df2['year'].astype(int)\n",
+    "df2['score'] = df2['score'].astype('float')\n",
+    "df2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "1c75b7c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output = os.path.abspath(os.path.join(os.pardir, 'output'))  # ../output, resolved against the working directory"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "cb8f4261",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Make sure output/country_indicators/year=<ano>/ exists for every year from 2000 to 2020.\n",
+    "for ano in range(2000, 2021):\n",
+    "    particao = os.path.join(output, 'country_indicators', f'year={ano}')\n",
+    "    os.makedirs(particao, exist_ok=True)  # no-op when the folder is already there"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "8d9730c3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for ano in range(2000, 2021):\n",
+    "    # The partition column is carried by the folder name (year=<ano>), so it is left out of the CSV.\n",
+    "    df_particao = df2.loc[df2['year'] == ano].drop(columns=['year'])\n",
+    "    particao = output + f'/country_indicators/year={ano}/country_indicators.csv'\n",
+    "    df_particao.to_csv(particao, index=False, encoding='utf-8', na_rep='')\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/bases/world_wb_wwbi/country_indicators/publish.sql b/bases/world_wb_wwbi/country_indicators/publish.sql
@@ -0,0 +1,27 @@
+/*
+Query that publishes the table.
+
+This is the place to:
+    - change column names, order and types
+    - join with other tables
+    - create extra columns (e.g. logs, proportions, etc.)
+
+Every column defined here must also exist in `table_config.yaml`.
+
+# Also, feel free to rename obscure column names
+# to something a little more explicit.
+
+TYPES:
+    - To change a column's type, just replace STRING with another valid type.
+    - Example: `SAFE_CAST(column_name AS NUMERIC) column_name`
+    - More details: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
+*/
+
+CREATE VIEW basedosdados-dev.world_wb_wwbi.country_indicators AS
+SELECT
+    SAFE_CAST(year AS INT64) AS year,
+    SAFE_CAST(country_name AS STRING) AS country_name,
+    SAFE_CAST(country_code AS STRING) AS country_code,
+    SAFE_CAST(indicator AS STRING) AS indicator,
+    SAFE_CAST(score AS FLOAT64) AS score
+FROM basedosdados-dev.world_wb_wwbi_staging.country_indicators AS t
diff --git a/bases/world_wb_wwbi/country_indicators/schema-prod.json b/bases/world_wb_wwbi/country_indicators/schema-prod.json
@@ -0,0 +1 @@
+[{"name": "year", "bigquery_type": "int64", "description": "Year", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_data_tempo", "table_id": "ano", "column_name": "ano"}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": true, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "country_name", "bigquery_type": "string", "description": "Name of the country", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_mundo", "table_id": "pais", "column_name": "nome_ingles"}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "country_code", "bigquery_type": "string", "description": "Country 3-letter ISO code", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_mundo", "table_id": "pais", "column_name": "sigla_pais_iso3"}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "indicator", "bigquery_type": "string", "description": "Indicator name", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "score", "bigquery_type": "float64", "description": "Score of indicators", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, 
"type": "FLOAT", "mode": "NULLABLE"}]
diff --git a/bases/world_wb_wwbi/country_indicators/schema-staging.json b/bases/world_wb_wwbi/country_indicators/schema-staging.json
@@ -0,0 +1 @@
+[{"name": "country_name", "bigquery_type": "string", "description": "Name of the country", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_mundo", "table_id": "pais", "column_name": "nome_ingles"}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "country_code", "bigquery_type": "string", "description": "Country 3-letter ISO code", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_mundo", "table_id": "pais", "column_name": "sigla_pais_iso3"}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "indicator", "bigquery_type": "string", "description": "Indicator name", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "score", "bigquery_type": "float64", "description": "Score of indicators", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING"}]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		[{"name": "year", "bigquery_type": "int64", "description": "Year", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_data_tempo", "table_id": "ano", "column_name": "ano"}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": true, "type": "INTEGER", "mode": "NULLABLE"}, {"name": "country_name", "bigquery_type": "string", "description": "Name of the country", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_mundo", "table_id": "pais", "column_name": "nome_ingles"}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "country_code", "bigquery_type": "string", "description": "Country 3-letter ISO code", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_mundo", "table_id": "pais", "column_name": "sigla_pais_iso3"}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "indicator", "bigquery_type": "string", "description": "Indicator name", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING", "mode": "NULLABLE"}, {"name": "score", "bigquery_type": "float64", "description": "Score of indicators", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, 
"type": "FLOAT", "mode": "NULLABLE"}]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		[{"name": "country_name", "bigquery_type": "string", "description": "Name of the country", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_mundo", "table_id": "pais", "column_name": "nome_ingles"}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "country_code", "bigquery_type": "string", "description": "Country 3-letter ISO code", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": "br_bd_diretorios_mundo", "table_id": "pais", "column_name": "sigla_pais_iso3"}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "indicator", "bigquery_type": "string", "description": "Indicator name", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING"}, {"name": "score", "bigquery_type": "float64", "description": "Score of indicators", "temporal_coverage": ["(1)"], "covered_by_dictionary": false, "directory_column": {"dataset_id": null, "table_id": null, "column_name": null}, "measurement_unit": null, "has_sensitive_data": false, "observations": null, "is_in_staging": true, "is_partition": false, "type": "STRING"}]