-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathelt_project_gdp.py
131 lines (98 loc) · 5.2 KB
/
elt_project_gdp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# Task:
# Automated script that can extract the list of all countries in order of their GDPs in billion USDs (rounded to 2 decimal places), as logged by the International Monetary Fund (IMF).
# website: https://web.archive.org/web/20230902185326/https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29
# The required information needs to be made accessible as a JSON file 'Countries_by_GDP.json' as well as a table 'Countries_by_GDP' in a database file 'World_Economies.db' with attributes 'Country' and 'GDP_USD_billion.'
# Your boss wants you to demonstrate the success of this code by running a query on the database table to display only the entries with more than a 100 billion USD economy. Also, log the entire process of execution in a file named 'etl_project_log.txt'.
# Log the progress of the code with appropriate timestamps.
# Note: you can do 'from pprint import pprint' and then print out dictionaries cleaner by doing pprint(dict)
# Install libraries
# While requests, sqlite3, and datetime come bundled with python, the other libraries will have to be installed.
# In terminal:
# pip install pandas
# pip install numpy
# pip install bs4
# Code for ETL operations on Country-GDP data
# str(col[1].a.contents[0])
# Importing the required libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import sqlite3
from datetime import datetime
# Define known entities
url = 'https://web.archive.org/web/20230902185326/https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29'
table_attribs = ["Country", "GDP_USD_millions"]
db_name = 'World_Economies.db'
table_name = 'Countries_by_GDP'
csv_path = '/mnt/c/Users/Amanda.Rodgers/OneDrive - Voyatek/Code/python_data_science//Countries_by_GDP.csv'
def extract(url, table_attribs):
''' This function extracts the required
information from the website and saves it to a dataframe. The
function returns the dataframe for further processing. '''
page = requests.get(url).text
data = BeautifulSoup(page,'html.parser')
df = pd.DataFrame(columns=table_attribs)
tables = data.find_all('tbody')
rows = tables[2].find_all('tr')
for row in rows:
col = row.find_all('td')
if len(col)!=0:
if col[0].find('a') is not None and '—' not in col[2]:
data_dict = {"Country": col[0].a.contents[0],
"GDP_USD_millions": col[2].contents[0]}
df1 = pd.DataFrame(data_dict, index=[0])
df = pd.concat([df,df1], ignore_index=True)
return df
def transform(df):
''' This function converts the GDP information from Currency
format to float value, transforms the information of GDP from
USD (Millions) to USD (Billions) rounding to 2 decimal places.
The function returns the transformed dataframe.'''
GDP_list = df["GDP_USD_millions"].tolist()
GDP_list = [float("".join(x.split(','))) for x in GDP_list]
GDP_list = [np.round(x/1000,2) for x in GDP_list]
df["GDP_USD_millions"] = GDP_list
df=df.rename(columns = {"GDP_USD_millions":"GDP_USD_billions"})
return df
def load_to_csv(df, csv_path):
''' This function saves the final dataframe as a `CSV` file
in the provided path. Function returns nothing.'''
df.to_csv(csv_path)
def load_to_db(df, sql_connection, table_name):
''' This function saves the final dataframe as a database table
with the provided name. Function returns nothing. Save the transformed dataframe as a table in the database'''
df.to_sql(table_name, sql_connection, if_exists='replace', index=False)
def run_query(query_statement, sql_connection):
''' This function runs the stated query on the database table and
prints the output on the terminal. Function returns nothing. '''
print(query_statement)
query_output = pd.read_sql(query_statement, sql_connection)
print(query_output)
def log_progress(message):
''' This function logs the mentioned message at a given stage of the code execution to a log file. Function returns nothing'''
timestamp_format = '%Y-%h-%d-%H:%M:%S' # Year-Monthname-Day-Hour-Minute-Second
now = datetime.now() # get current timestamp
timestamp = now.strftime(timestamp_format)
with open("./etl_project_log.txt","a") as f:
f.write(timestamp + ' : ' + message + '\n')
''' Here, you define the required entities and call the relevant
functions in the correct order to complete the project. Note that this
portion is not inside any function.'''
log_progress('Preliminaries complete. Initiating ETL process')
df = extract(url, table_attribs)
log_progress('Data extraction complete. Initiating Transformation process')
df = transform(df)
log_progress('Data transformation complete. Initiating loading process')
load_to_csv(df, csv_path)
log_progress('Data saved to CSV file')
sql_connection = sqlite3.connect('World_Economies.db')
log_progress('SQL Connection initiated.')
load_to_db(df, sql_connection, table_name)
log_progress('Data loaded to Database as table. Running the query')
query_statement = f"SELECT * from {table_name} WHERE GDP_USD_billions >= 100"
run_query(query_statement, sql_connection)
log_progress('Process Complete.')
sql_connection.close()
# terminal:
# python3 etl_project_gdp.py