Skip to content

Commit

Permalink
Progress on retrieving subcounty data
Browse files: browse the repository at this point in the history
  • Loading branch information
kihuha committed Mar 17, 2021
1 parent 01a2319 commit 3491ef3
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 31 deletions.
4 changes: 4 additions & 0 deletions miner/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@ verify_ssl = true
name = "pypi"

[packages]
pandas = "*"
tabula-py = "*"

[dev-packages]
pylint = "*"
autopep8 = "*"

[requires]
python_version = "3.8"
75 changes: 47 additions & 28 deletions miner/counties.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,33 @@
import tabula
from tabula.io import read_pdf
import pandas as pd
import re


class County:
def __init__(self):
    """Build the combined county table and store it on ``self.data``.

    The frames returned by ``getCounties``, ``getHouseholds``,
    ``getPopulationDensity`` and ``getCountyCodes`` are joined side by
    side (``axis=1``), in that order.
    """
    frames = [
        self.getCounties(),
        self.getHouseholds(),
        self.getPopulationDensity(),
        self.getCountyCodes(),
    ]
    # axis=1 places the frames next to each other as columns.
    self.data = pd.concat(frames, axis=1)

def getCounties(self):
file = "VOLUME-1-KPHC-2019.pdf"

# TABLE ONE
table = tabula.read_pdf(file,pages=17)
table = read_pdf(file, pages=17)

# Rename columns and remove unused columns
raw = table[0].rename(columns={'Unnamed: 0': 'Male', 'Unnamed: 1': 'B', 'Sex': 'Female', 'Unnamed: 2': 'Intersex', 'Unnamed: 3': 'Total' })
raw = table[0].rename(
columns={
'Unnamed: 0': 'Male',
'Unnamed: 1': 'B',
'Sex': 'Female',
'Unnamed: 2': 'Intersex',
'Unnamed: 3': 'Total'
}
)
df = raw.drop(0).drop(1).drop(2).drop(3).drop(columns="B")

counties = []
Expand All @@ -30,7 +43,7 @@ def getCounties(self):
male_value = re.sub(r'[\,\/\-\']', "", raw_male_value)
counties.append(re.sub(r'\,', "", county_name))
male.append(male_value)

for item in df['Female'].array:
female_value = re.sub(r'[\,\/\-\']', "", item)
female.append(female_value)
Expand All @@ -42,22 +55,29 @@ def getCounties(self):
total_value = re.sub(r'[\,\/\-\']', "", item)
total.append(total_value)


df1 = df.assign(County=counties).drop(columns="Male")
df2 = df1.assign(Male=male)
df3 = df2.assign(Female=female)
df4 = df3.assign(Intersex=intersex)
df5 = df4.assign(Total=total)

d = {'County': counties, 'Male': male, 'Female': female, 'Intersex': intersex, 'Total': total }
d = {
'County': counties,
'Male': male,
'Female': female,
'Intersex': intersex,
'Total': total
}
final = pd.DataFrame(data=d)
return final

def getHouseholds(self):
file = "VOLUME-1-KPHC-2019.pdf"
table = tabula.read_pdf(file,pages=19, area=[100.8125,0.6425,1000.2825,1000.1025])
raw = table[0].drop(0).drop(columns="National/ County Population+").drop(1).drop(2).drop(50).drop(51).drop(52).drop(53)
df = raw.rename(columns={'Number of': 'households', 'Average': 'average_households'})
table = read_pdf(file, pages=19, area=[
100.8125, 0.6425, 1000.2825, 1000.1025])
raw = table[0].drop(0).drop(
columns="National/ County Population+"
).drop(1).drop(2).drop(50).drop(51).drop(52).drop(53)
df = raw.rename(
columns={
'Number of': 'households',
'Average': 'average_households'
}
)

households = []
avg = []
Expand All @@ -66,17 +86,19 @@ def getHouseholds(self):
households.append(value)
for item in df['average_households'].array:
avg.append(item)


d = {'Households': households, 'Average-Households': avg }
d = {'Households': households, 'Average-Households': avg}
final = pd.DataFrame(data=d)
return final

def getPopulationDensity(self):
file = "VOLUME-1-KPHC-2019.pdf"
table = tabula.read_pdf(file,pages=20, area=[100.8125,0.6425,1000.2825,1000.1025])
df1 = table[0].drop(0).drop(1).drop(49).drop(50).drop(51).drop(columns="Unnamed: 0").drop(columns="Unnamed: 1").drop(columns="Unnamed: 2").drop(columns="Unnamed: 3").drop(columns="Unnamed: 4")
df2 = df1.rename(columns={'(Sq. Km)': 'land_area', '(No. per Sq. Km)': 'density'})
table = read_pdf(file, pages=20, area=[
100.8125, 0.6425, 1000.2825, 1000.1025])
df1 = table[0].drop(0).drop(1).drop(49).drop(50).drop(51).drop(columns="Unnamed: 0").drop(
columns="Unnamed: 1").drop(columns="Unnamed: 2").drop(columns="Unnamed: 3").drop(columns="Unnamed: 4")
df2 = df1.rename(
columns={'(Sq. Km)': 'land_area', '(No. per Sq. Km)': 'density'})

land_area = []
density = []
Expand All @@ -86,20 +108,17 @@ def getPopulationDensity(self):
for item in df2['density'].array:
value = re.sub(r'[\,\/\-\']', "", item)
density.append(value)
d = {'land_area': land_area, 'density': density }

d = {'land_area': land_area, 'density': density}
final = pd.DataFrame(data=d)
return final

def getCountyCodes(self):
    """Return a one-column frame ('code') numbering the counties from 1.

    One code is produced per entry in the 'County' column of the frame
    returned by ``getCounties``, in the same order.
    """
    county_names = self.getCounties()['County'].array
    # Codes are simply the 1-based positions of the county rows.
    codes = list(range(1, len(county_names) + 1))
    return pd.DataFrame(data={'code': codes})
25 changes: 22 additions & 3 deletions miner/subcounties.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,27 @@
import tabula
from tabula.io import read_pdf
import pandas as pd
import re


def getPopulation():
    """Extract and clean the second table read from the census PDF.

    Reads pages 22 through 30 of ``VOLUME-1-KPHC-2019.pdf`` with
    ``read_pdf``, restricted to a fixed extraction area, then cleans the
    second table in the result.

    Returns:
        The cleaned table: rows without a 'National/ County' value are
        removed, the 'Unnamed: 0' column is dropped, and the rows
        labelled 2 and 3 are dropped.
    """
    pdf_path = "VOLUME-1-KPHC-2019.pdf"

    # Contiguous page range 22..30. The previous literal list contained a
    # stray 2 where 28 belongs, which skipped page 28 and pulled in page 2.
    pages = list(range(22, 31))

    raw = read_pdf(pdf_path, pages=pages, area=[
        100.8125, 0.6425, 1000.2825, 1000.1025
    ])

    table1 = raw[1].dropna(
        subset=['National/ County']
    ).drop(columns='Unnamed: 0').drop(2).drop(3)

    # TODO: only raw[1] is cleaned so far; the remaining extracted tables
    # are not processed yet.
    return table1


# NOTE(review): this appears to be a module-level call, so the PDF would be
# read whenever this module is imported, not only when it is run as a
# script - confirm whether that is intended.
getPopulation()

0 comments on commit 3491ef3

Please sign in to comment.