feat: wiki tables #4
sana-dibe committed Jan 22, 2024
1 parent 0f3aad9 commit 22b1a70
Showing 2 changed files with 84 additions and 2 deletions.
41 changes: 39 additions & 2 deletions tabled/tests/join_tables.py
@@ -128,6 +128,7 @@ def test_compute_join_resolution(tables):


from tabled.multi import execute_commands
from tabled.multi import execute_table_commands


def test_execute_commands_simply():
@@ -160,8 +161,6 @@ def test_execute_commands_simply():
    scope = tables
    extra_scope = dict()

    from tabled.multi import execute_table_commands

    it = execute_table_commands(commands, tables, extra_scope=extra_scope)

    def are_equal(a, b):
@@ -176,3 +175,41 @@ def are_equal(a, b):
    assert are_equal(extra_scope['cumul'], pd.DataFrame({'ID': [1, 2, 3]}))
    next(it)
    assert list(extra_scope['cumul']) == ['ID', 'Salary']

from wiki_table import extract_wikipedia_tables

# Test that extract_wikipedia_tables returns a non-empty list of DataFrames
def test_extract_wikipedia_tables():
    wikiurl = "https://fr.wikipedia.org/wiki/Liste_des_communes_de_France_les_plus_peupl%C3%A9es"
    resulting_dfs = extract_wikipedia_tables(wikiurl)
    assert resulting_dfs is not None
    assert len(resulting_dfs) > 0
    for df in resulting_dfs:
        assert isinstance(df, pd.DataFrame)
        assert not df.empty

@pytest.fixture
def extracted_dataframes():
    url_aeroports_frequentes = "https://fr.wikipedia.org/wiki/Liste_des_a%C3%A9roports_les_plus_fr%C3%A9quent%C3%A9s_en_France"
    url_aeroports_vastes = "https://fr.wikipedia.org/wiki/Liste_des_a%C3%A9roports_les_plus_vastes_au_monde"

    dfs_aeroports_frequentes = extract_wikipedia_tables(url_aeroports_frequentes)
    dfs_aeroports_vastes = extract_wikipedia_tables(url_aeroports_vastes)

    return dfs_aeroports_frequentes, dfs_aeroports_vastes


def test_execute_commands_wiki(extracted_dataframes):
    from tabled.multi import Join, Remove, Load

    # extract_wikipedia_tables returns a list of DataFrames per page,
    # so take the first table from each page for the join.
    table1_wiki = extracted_dataframes[0][0]
    table2_wiki = extracted_dataframes[1][0]

    tables = {'table1_wiki': table1_wiki, 'table2_wiki': table2_wiki}
    commands = [Load('table1_wiki'), Join('table2_wiki')]

    scope = tables
    extra_scope = dict()
    it = execute_table_commands(commands, scope, extra_scope=extra_scope)
    next(it)
    next(it)
    assert extra_scope['cumul'].shape[0] == 1
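
For reference, a minimal network-free sketch of the same Load/Join pipeline on small in-memory frames. The tiny DataFrames here are illustrative; the calls assume only the tabled.multi behavior exercised by the tests above (each next() applies one command and updates extra_scope['cumul']):

import pandas as pd
from tabled.multi import execute_table_commands, Join, Load

def demo_join_pipeline():
    # Two toy tables sharing an 'ID' column (hypothetical data).
    tables = {
        'people': pd.DataFrame({'ID': [1, 2, 3], 'Name': ['a', 'b', 'c']}),
        'pay': pd.DataFrame({'ID': [1, 2, 3], 'Salary': [10, 20, 30]}),
    }
    extra_scope = dict()
    it = execute_table_commands(
        [Load('people'), Join('pay')], tables, extra_scope=extra_scope
    )
    next(it)  # Load: 'cumul' now holds the 'people' table
    next(it)  # Join: 'cumul' gains the 'Salary' column, merged on 'ID'
    return extra_scope['cumul']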
45 changes: 45 additions & 0 deletions wiki_table.py
@@ -0,0 +1,45 @@
import pandas as pd
import requests
from bs4 import BeautifulSoup

def extract_wikipedia_tables(wikiurl):
    try:
        # Send an HTTP GET request to the Wikipedia URL
        response = requests.get(wikiurl)
        response.raise_for_status()
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
        tables = soup.find_all('table', {'class': "wikitable"})

        if tables:
            df_list = []
            for table in tables:
                # pd.read_html returns a list of DataFrames; keep the first one
                dfs = pd.read_html(str(table))
                if dfs:
                    df_list.append(dfs[0])
                else:
                    print("No data found in one of the tables.")

            if df_list:
                return df_list
            else:
                print("No tables found with class 'wikitable' containing data.")
        else:
            print("No tables with class 'wikitable' found on the page.")
    except requests.exceptions.RequestException as e:
        print(f"An error occurred during the HTTP request: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")

    return None

if __name__ == "__main__":
    wikiurl = "https://fr.wikipedia.org/wiki/Liste_des_communes_de_France_les_plus_peupl%C3%A9es"
    resulting_dfs = extract_wikipedia_tables(wikiurl)
    # Check for None before touching the result, since extraction can fail
    if resulting_dfs is not None:
        print(resulting_dfs[0].columns)
        for idx, df in enumerate(resulting_dfs):
            print(f"Table {idx + 1}:")
            print(df.head())
    else:
        print("Extraction failed.")
