-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' of https://github.com/i2mint/tabled
- Loading branch information
Showing
6 changed files
with
1,043 additions
and
70 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## WIP Scrap" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import requests\n", | ||
"from bs4 import BeautifulSoup\n", | ||
"\n", | ||
"def extract_wikipedia_tables(wikiurl):\n", | ||
" try:\n", | ||
" # Send an HTTP GET request to the Wikipedia URL\n", | ||
" response = requests.get(wikiurl)\n", | ||
" response.raise_for_status() \n", | ||
" # Parse the HTML content using BeautifulSoup\n", | ||
" soup = BeautifulSoup(response.text, 'html.parser')\n", | ||
" tables = soup.find_all('table', {'class': \"wikitable\"})\n", | ||
"\n", | ||
" if tables:\n", | ||
" df_list = []\n", | ||
" for table in tables:\n", | ||
" df = pd.read_html(str(table))\n", | ||
" if df:\n", | ||
" df_list.append(pd.DataFrame(df[0]))\n", | ||
" else:\n", | ||
" print(\"No data found in one of the tables.\")\n", | ||
" \n", | ||
" if df_list:\n", | ||
" return df_list\n", | ||
" else:\n", | ||
" print(\"No tables found with class 'wikitable' containing data.\")\n", | ||
" else:\n", | ||
" print(f\"No tables with class 'wikitable' found on the page.\")\n", | ||
" except requests.exceptions.RequestException as e:\n", | ||
" print(f\"An error occurred during the HTTP request: {e}\")\n", | ||
" except Exception as e:\n", | ||
" print(f\"An error occurred: {e}\")\n", | ||
"\n", | ||
" return None" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Table 1:\n", | ||
" Rang2024 CodeInsee Commune Département Statut \\\n", | ||
" Rang2024 CodeInsee Commune Département Statut \n", | ||
"0 1 75056 Paris[a] Paris[a] Préfecture/Capitale \n", | ||
"1 2 13055 Marseille Bouches-du-Rhône Préfecture \n", | ||
"2 3 69123 Lyon Métropole de Lyon[b] Préfecture \n", | ||
"3 4 31555 Toulouse Haute-Garonne Préfecture \n", | ||
"4 5 06088 Nice Alpes-Maritimes Préfecture \n", | ||
"\n", | ||
" Région Population légale \\\n", | ||
" Région 2021[1] 2015[5] 2010[6] \n", | ||
"0 Île-de-France 2 133 111 2 206 488 2 243 833 \n", | ||
"1 Provence-Alpes-Côte d'Azur 873 076 861 635 850 726 \n", | ||
"2 Auvergne-Rhône-Alpes 522 250 513 275 484 344 \n", | ||
"3 Occitanie 504 078 471 941 441 802 \n", | ||
"4 Provence-Alpes-Côte d'Azur 348 085 342 522 343 304 \n", | ||
"\n", | ||
" \n", | ||
" 1999[7],[8] 1990[7] 1982[7] 1975[7],[9] 1968[7],[10] \n", | ||
"0 2 125 246 2 152 423 2 176 243 2 299 830 2 590 771 \n", | ||
"1 798 430 800 550 874 436 908 600 889 029 \n", | ||
"2 445 452 415 487 413 095 456 716 527 800 \n", | ||
"3 390 350 358 688 347 995 373 796 370 796 \n", | ||
"4 342 738 342 439 337 085 344 481 322 442 \n", | ||
"Table 2:\n", | ||
" Rang2024 CodeInsee Commune Département Statut \\\n", | ||
"0 1 93014 Clichy-sous-Bois Seine-Saint-Denis — \n", | ||
"1 2 92064 Saint-Cloud Hauts-de-Seine — \n", | ||
"2 3 78146 Chatou Yvelines — \n", | ||
"3 4 54547 Vandœuvre-lès-Nancy Meurthe-et-Moselle — \n", | ||
"4 5 24322 Périgueux Dordogne Préfecture \n", | ||
"\n", | ||
" Région 2021[1] 2019[12] 2014[13] 2013[14] 2006[15] 1999[7],[8] \\\n", | ||
"0 Île-de-France 29 735 28 782 29 933 30 725 29 412 28 288 \n", | ||
"1 Île-de-France 29 727 30 012 29 360 29 109 29 385 28 157 \n", | ||
"2 Île-de-France 29 649 30 153 30 876 30 809 29 472 28 588 \n", | ||
"3 Grand Est 29 537 29 942 29 721 29 836 31 447 32 048 \n", | ||
"4 Nouvelle-Aquitaine 29 516 29 896 30 069 30 036 29 558 30 193 \n", | ||
"\n", | ||
" 1990[7] 1982[7] 1975[7] 1968[7] \n", | ||
"0 28 180 24 654 22 422 16 357 \n", | ||
"1 28 597 28 561 28 139 28 158 \n", | ||
"2 27 977 28 437 26 550 22 619 \n", | ||
"3 34 105 33 682 33 909 19 686 \n", | ||
"4 30 280 32 916 35 120 37 450 \n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"wikiurl = \"https://fr.wikipedia.org/wiki/Liste_des_communes_de_France_les_plus_peupl%C3%A9es\"\n", | ||
"resulting_dfs = extract_wikipedia_tables(wikiurl)\n", | ||
"if resulting_dfs is not None:\n", | ||
" for idx, df in enumerate(resulting_dfs):\n", | ||
" print(f\"Table {idx + 1}:\")\n", | ||
" print(df.head())\n", | ||
"else:\n", | ||
" print(\"Extraction failed.\")" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.0" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.