Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/i2mint/tabled
Browse files Browse the repository at this point in the history
  • Loading branch information
thorwhalen committed Feb 9, 2024
2 parents ba301ad + 9959432 commit dd57903
Show file tree
Hide file tree
Showing 6 changed files with 1,043 additions and 70 deletions.
847 changes: 847 additions & 0 deletions misc/JB_problem.ipynb

Large diffs are not rendered by default.

142 changes: 142 additions & 0 deletions misc/S_D_wip_scrap.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## WIP Scrap"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"\n",
"def extract_wikipedia_tables(wikiurl):\n",
" try:\n",
" # Send an HTTP GET request to the Wikipedia URL\n",
" response = requests.get(wikiurl)\n",
" response.raise_for_status() \n",
" # Parse the HTML content using BeautifulSoup\n",
" soup = BeautifulSoup(response.text, 'html.parser')\n",
" tables = soup.find_all('table', {'class': \"wikitable\"})\n",
"\n",
" if tables:\n",
" df_list = []\n",
" for table in tables:\n",
" df = pd.read_html(str(table))\n",
" if df:\n",
" df_list.append(pd.DataFrame(df[0]))\n",
" else:\n",
" print(\"No data found in one of the tables.\")\n",
" \n",
" if df_list:\n",
" return df_list\n",
" else:\n",
" print(\"No tables found with class 'wikitable' containing data.\")\n",
" else:\n",
" print(f\"No tables with class 'wikitable' found on the page.\")\n",
" except requests.exceptions.RequestException as e:\n",
" print(f\"An error occurred during the HTTP request: {e}\")\n",
" except Exception as e:\n",
" print(f\"An error occurred: {e}\")\n",
"\n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Table 1:\n",
" Rang2024 CodeInsee Commune Département Statut \\\n",
" Rang2024 CodeInsee Commune Département Statut \n",
"0 1 75056 Paris[a] Paris[a] Préfecture/Capitale \n",
"1 2 13055 Marseille Bouches-du-Rhône Préfecture \n",
"2 3 69123 Lyon Métropole de Lyon[b] Préfecture \n",
"3 4 31555 Toulouse Haute-Garonne Préfecture \n",
"4 5 06088 Nice Alpes-Maritimes Préfecture \n",
"\n",
" Région Population légale \\\n",
" Région 2021[1] 2015[5] 2010[6] \n",
"0 Île-de-France 2 133 111 2 206 488 2 243 833 \n",
"1 Provence-Alpes-Côte d'Azur 873 076 861 635 850 726 \n",
"2 Auvergne-Rhône-Alpes 522 250 513 275 484 344 \n",
"3 Occitanie 504 078 471 941 441 802 \n",
"4 Provence-Alpes-Côte d'Azur 348 085 342 522 343 304 \n",
"\n",
" \n",
" 1999[7],[8] 1990[7] 1982[7] 1975[7],[9] 1968[7],[10] \n",
"0 2 125 246 2 152 423 2 176 243 2 299 830 2 590 771 \n",
"1 798 430 800 550 874 436 908 600 889 029 \n",
"2 445 452 415 487 413 095 456 716 527 800 \n",
"3 390 350 358 688 347 995 373 796 370 796 \n",
"4 342 738 342 439 337 085 344 481 322 442 \n",
"Table 2:\n",
" Rang2024 CodeInsee Commune Département Statut \\\n",
"0 1 93014 Clichy-sous-Bois Seine-Saint-Denis — \n",
"1 2 92064 Saint-Cloud Hauts-de-Seine — \n",
"2 3 78146 Chatou Yvelines — \n",
"3 4 54547 Vandœuvre-lès-Nancy Meurthe-et-Moselle — \n",
"4 5 24322 Périgueux Dordogne Préfecture \n",
"\n",
" Région 2021[1] 2019[12] 2014[13] 2013[14] 2006[15] 1999[7],[8] \\\n",
"0 Île-de-France 29 735 28 782 29 933 30 725 29 412 28 288 \n",
"1 Île-de-France 29 727 30 012 29 360 29 109 29 385 28 157 \n",
"2 Île-de-France 29 649 30 153 30 876 30 809 29 472 28 588 \n",
"3 Grand Est 29 537 29 942 29 721 29 836 31 447 32 048 \n",
"4 Nouvelle-Aquitaine 29 516 29 896 30 069 30 036 29 558 30 193 \n",
"\n",
" 1990[7] 1982[7] 1975[7] 1968[7] \n",
"0 28 180 24 654 22 422 16 357 \n",
"1 28 597 28 561 28 139 28 158 \n",
"2 27 977 28 437 26 550 22 619 \n",
"3 34 105 33 682 33 909 19 686 \n",
"4 30 280 32 916 35 120 37 450 \n"
]
}
],
"source": [
"wikiurl = \"https://fr.wikipedia.org/wiki/Liste_des_communes_de_France_les_plus_peupl%C3%A9es\"\n",
"resulting_dfs = extract_wikipedia_tables(wikiurl)\n",
"if resulting_dfs is not None:\n",
" for idx, df in enumerate(resulting_dfs):\n",
" print(f\"Table {idx + 1}:\")\n",
" print(df.head())\n",
"else:\n",
" print(\"Extraction failed.\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = tabled
version = 0.1.4
version = 0.1.7
url = https://github.com/i2mint/tabled
platforms = any
description_file = README.md
Expand Down
9 changes: 8 additions & 1 deletion tabled/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ class Join:
class Remove:
    """Table command holding field name(s) to be removed.

    NOTE(review): consumed by ``remove_func`` (body not visible in this
    view) — presumably drops these fields from the cumulative table;
    confirm against the full module.
    """

    fields: Union[str, Iterable[str]]


@dataclass
class Rename:
    """Table command: rename columns of the cumulative table.

    ``rename_mapping`` maps existing column names to their new names;
    it is applied by ``rename_func`` and also recorded in the scope so
    that subsequent joins can align the incoming table's columns.
    """

    rename_mapping: Dict[str, str]
Expand All @@ -92,7 +93,11 @@ def load_func(scope, command):


def join_func(scope, command):
    """Inner-join ``scope['cumul']`` with the table at ``command.table_key``.

    If an earlier Rename command recorded a mapping under
    ``scope['renamed_columns']``, the same renames are first applied to
    the incoming table so that the join columns line up.

    Side effect: replaces ``scope['cumul']`` with the merged table.
    """
    # Work on a copy so the in-place renames below don't mutate the
    # table stored in `scope`. (Removed a leftover duplicate assignment
    # without .copy() that was immediately overwritten.)
    table = scope[command.table_key].copy()
    if 'renamed_columns' in scope:
        for old_col, new_col in scope['renamed_columns'].items():
            # `in table` tests column membership for a DataFrame.
            if old_col in table:
                table.rename(columns={old_col: new_col}, inplace=True)
    cumul = scope['cumul']
    scope['cumul'] = cumul.merge(table, how='inner')

Expand All @@ -102,9 +107,11 @@ def remove_func(scope, command):


def rename_func(scope, command):
    """Apply a Rename command to the cumulative table.

    Records ``command.rename_mapping`` under ``scope['renamed_columns']``
    (so later join steps can align incoming tables), then renames the
    columns of ``scope['cumul']`` accordingly.
    """
    scope['renamed_columns'] = command.rename_mapping
    cumul = scope['cumul']
    for before, after in command.rename_mapping.items():
        cumul = cumul.rename(columns={before: after})
    scope['cumul'] = cumul


dflt_tables_interpreter_map = {
Load: load_func,
Join: join_func,
Expand Down
68 changes: 45 additions & 23 deletions tabled/tests/join_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,12 @@ def test_execute_commands_simply():

tables = {'table1': table1, 'table2': table2, 'table3': table3}

commands = [Load('table1'), Rename({'Name': 'First Name'}), Remove(['First Name']), Join('table3')]
commands = [
Load('table1'),
Rename({'Name': 'First Name'}),
Remove(['First Name']),
Join('table3'),
]

scope = tables
extra_scope = dict()
Expand All @@ -178,40 +183,57 @@ def are_equal(a, b):
next(it)
assert list(extra_scope['cumul']) == ['ID', 'Salary']

# Test wiki table
# (Removed the stale `from wiki_table import extract_wikipedia_tables`:
# wiki_table.py was deleted in this commit, so that import would fail,
# and the superseded old test body left a `for` loop with no body.)
from tabled.html import *


def test_extract_wikipedia_tables():
    """Smoke-test extraction of 'wikitable' tables from a live wiki page.

    NOTE(review): hits the network; requires connectivity to
    fr.wikipedia.org, so this is an integration test, not a unit test.
    """
    wikiurl = 'https://fr.wikipedia.org/wiki/Liste_des_communes_de_France_les_plus_peupl%C3%A9es'
    converter = url_to_html_func('requests')
    tables = get_tables_from_url(wikiurl, url_to_html=converter)
    assert tables is not None
    assert len(tables) > 0
    for df in tables:
        assert isinstance(df, pd.DataFrame)
        assert not df.empty


@pytest.fixture
def extracted_dataframes():
    """Fetch table lists from two French-Wikipedia airport pages.

    Returns a ``(dfs_aeroports_frequentes, dfs_aeroports_vastes)`` tuple
    of lists of DataFrames.

    NOTE(review): hits the network on every use of the fixture.
    (Removed stale duplicate assignments that called the deleted
    ``extract_wikipedia_tables`` — those would raise NameError.)
    """
    converter = url_to_html_func('requests')
    url_aeroports_frequentes = 'https://fr.wikipedia.org/wiki/Liste_des_a%C3%A9roports_les_plus_fr%C3%A9quent%C3%A9s_en_France'
    url_aeroports_vastes = 'https://fr.wikipedia.org/wiki/Liste_des_a%C3%A9roports_les_plus_vastes_au_monde'

    dfs_aeroports_frequentes = get_tables_from_url(
        url_aeroports_frequentes, url_to_html=converter
    )
    dfs_aeroports_vastes = get_tables_from_url(
        url_aeroports_vastes, url_to_html=converter
    )

    return dfs_aeroports_frequentes, dfs_aeroports_vastes


def test_execute_commands_wiki(extracted_dataframes):
    """End-to-end check of Load/Remove/Rename/Join on scraped tables.

    Renames 'Code' to 'Code IATA' in the second table so it can be
    inner-joined with the first; exactly one row is expected to match.
    (Removed the superseded commented-out version of this test that was
    interleaved with the live body.)
    """
    from tabled.multi import Join, Remove, Load, Rename

    # First table of each page's extracted table list.
    table1_wiki = extracted_dataframes[0][0]
    table2_wiki = extracted_dataframes[1][0]

    tables = {'table1_wiki': table1_wiki, 'table2_wiki': table2_wiki}
    commands = [
        Load('table2_wiki'),
        Remove('Aéroport'),
        Rename({'Code': 'Code IATA'}),
        Join('table1_wiki'),
    ]

    scope = tables
    extra_scope = dict()
    it = execute_table_commands(commands, scope, extra_scope=extra_scope)
    for _ in commands:  # one generator step per command
        next(it)
    assert extra_scope['cumul'].shape[0] == 1
45 changes: 0 additions & 45 deletions wiki_table.py

This file was deleted.

0 comments on commit dd57903

Please sign in to comment.