-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path2008preterm.py
170 lines (170 loc) · 8.21 KB
/
2008preterm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# -*- coding: utf-8 -*-
# Analysis of 2008 preterm infant deaths from Tomaslav Urban's data
#
# Dave Semeraro - January 2019
import pandas as pd
import sys
import os
from CDCDataTools import CDCFilters
import matplotlib.pyplot as plt
# These are the tags or column names I will need.
clinical_age = 'clinical_gestation_estimate'
obstetric_age = 'gestation_in_weeks'
resident = 'residence_status'
clinical_flag = 'clinical_estimate_of_gestation_used_flag'
death_age = 'age_at_death_in_days'
death_year = 'death_year'
delivery_year = 'delivery_year'

# Directory that holds the 2008 data files and their header files.
# Built with os.path.join so the home directory is expanded and the
# separators are correct on every platform, not only on Windows.
data_dir = os.path.join(os.path.expanduser("~"),
                        "Box Sync", "DellMed", "Data", "TUdata")


def _read_selected_columns(data_path, header_path, wanted):
    """Read only the *wanted* columns of a data file and label them.

    The column names live in a separate header file; the position of each
    wanted name in that header file selects the column to read from the
    data file.

    Parameters
    ----------
    data_path : str
        Path of the CSV file holding the records.
    header_path : str
        Path of the CSV file whose first row holds the column names.
    wanted : list of str
        Column names to keep.

    Returns
    -------
    pandas.DataFrame
        The selected columns, labelled by name, in the order of *wanted*.

    Raises
    ------
    KeyError
        If a name in *wanted* is not present in the header file.
    """
    header = pd.read_csv(header_path, index_col=False, header=0)
    # read_csv ignores the order of integer usecols and returns the columns
    # in file order, so sort the positions and derive the labels from them;
    # assigning *wanted* directly would mislabel columns whenever *wanted*
    # is not already in file order.
    positions = sorted(header.columns.get_loc(name) for name in wanted)
    # NOTE(review): the original comments say the data files carry no header
    # row, yet header=0 makes pandas consume the first row as a header.
    # Behaviour is kept as-is; confirm against the files whether header=None
    # is what is really wanted.
    # index_col=False stops read_csv from using the first column as index.
    table = pd.read_csv(data_path, index_col=False, header=0,
                        usecols=positions)
    table.columns = [header.columns[pos] for pos in positions]
    # Hand the columns back in the order the caller asked for.
    return table.reindex(columns=list(wanted))


# read the infant death data
death_columns_used = [death_age, death_year, delivery_year, resident,
                      clinical_age, obstetric_age, clinical_flag]
infant_deaths = _read_selected_columns(
    os.path.join(data_dir, "infant_deaths_2008.csv"),
    os.path.join(data_dir, "headers_infant_deaths.csv"),
    death_columns_used,
)

# read the birth data.
# the column header situation is the same for live births as for infant death
live_columns_used = [delivery_year, resident, clinical_age, obstetric_age,
                     clinical_flag]
live_births = _read_selected_columns(
    os.path.join(data_dir, "live_births_2008.csv"),
    os.path.join(data_dir, "headers_live_births.csv"),
    live_columns_used,
)
# Both tables go through the same CDCFilters clean-up, applied in place.
tables = (infant_deaths, live_births)
# drop the non-resident records first ...
for table in tables:
    CDCFilters.remove_nonresidents(table, inplace=True)
# ... then the records whose gestational age is unknown.
for table in tables:
    CDCFilters.remove_unknown_gestation_age(table, inplace=True)
# Separate each table into the records that use the clinical estimate and
# the records that do not.
death_parts = CDCFilters.split_on_clinical_flag(infant_deaths)
clindeath, obsdeath = death_parts
live_parts = CDCFilters.split_on_clinical_flag(live_births)
clinlive, obslive = live_parts
# Value stored in the obstetric gestation column when the age is unknown.
UNKNOWN_OBSTETRIC_WEEKS = 99


def _bar_chart(fig_number, counts, title, ylabel):
    """Draw *counts* as a labelled bar chart on figure *fig_number*.

    Makes the numbered matplotlib figure current, draws the bars with the
    pandas plot accessor, sets the title, the fixed x label
    "Gestational Age" and the given y label, all with the font sizes used
    throughout this script.

    Parameters
    ----------
    fig_number : int
        Number passed to ``plt.figure``.
    counts : object with a ``plot.bar()`` accessor
        The counts to draw (the result of ``CDCFilters.count_and_sort``).
    title : str
        Plot title.
    ylabel : str
        Label for the y axis.

    Returns
    -------
    The object returned by ``counts.plot.bar()``.
    """
    plt.figure(fig_number)
    axes = counts.plot.bar()
    axes.set_title(title, fontsize=18)
    axes.set_xlabel("Gestational Age", fontsize=16)
    axes.set_ylabel(ylabel, fontsize=16)
    return axes


# count total deaths vs gestational age for clin
clindvsage = CDCFilters.count_and_sort(clindeath, clinical_age)
clindeadplot = _bar_chart(1, clindvsage, "Clinical Estimate Used", "Deaths")
# count total births vs ga for clinical estimate.
clinbvsage = CDCFilters.count_and_sort(clinlive, clinical_age)
clinbirthplot = _bar_chart(2, clinbvsage, "Clinical Estimate Used", "Births")
# plot just the preterm data (the first six entries of the counts)
clinbirthplot = _bar_chart(3, clinbvsage[0:6],
                           "Clinical Estimate Used", "Births")
# count deaths vs gestational age for obstetric estimate
obsdvsage = CDCFilters.count_and_sort(obsdeath, obstetric_age)
obsdeadplot = _bar_chart(4, obsdvsage, "Clinical Estimate Not Used", "Deaths")
# count births vs gestational age for obstetric estimates
obsbvsage = CDCFilters.count_and_sort(obslive, obstetric_age)
obsliveplot = _bar_chart(5, obsbvsage, "Clinical Estimate Not Used", "Births")
# just the preterm (the first six entries of the counts).
obsliveplot = _bar_chart(6, obsbvsage[0:6],
                         "Clinical Estimate Not Used", "Births")
# look at clinical estimate in obstetric records with unknown obs age
unknowndead = obsdeath[obsdeath[obstetric_age] == UNKNOWN_OBSTETRIC_WEEKS]
unknowndeadvsclinga = CDCFilters.count_and_sort(unknowndead, clinical_age)
ukdvclinplot = _bar_chart(7, unknowndeadvsclinga,
                          "Unknown Obstetric Deaths", "Deaths")
# live
unknownlive = obslive[obslive[obstetric_age] == UNKNOWN_OBSTETRIC_WEEKS]
unknownlivevsclinga = CDCFilters.count_and_sort(unknownlive, clinical_age)
# The y label now gets the same font size as on every other figure.
uklvclinplot = _bar_chart(8, unknownlivevsclinga,
                          "Unknown Obstetric Births", "Births")
# clean up
del clinbvsage, clindeath, clindvsage, clinlive
del obsbvsage, obsdeath, obsdvsage, obslive
def _label_axes(axes, title, ylabel):
    """Set the title, the "Gestational Age" x label and the y label."""
    axes.set_title(title, fontsize=18)
    axes.set_xlabel("Gestational Age", fontsize=16)
    axes.set_ylabel(ylabel, fontsize=16)


# Give both tables a single 'GA' column. Per the original notes it holds the
# clinical estimate when the flag is set or the obstetric age is missing,
# and the obstetric age otherwise; the choice is made row by row by
# CDCFilters.GA.
infant_deaths['GA'] = infant_deaths.apply(CDCFilters.GA, axis=1).astype(int)
live_births['GA'] = live_births.apply(CDCFilters.GA, axis=1).astype(int)

# deaths per gestational age
infantdeathsvsga = CDCFilters.count_and_sort(infant_deaths, 'GA')
plt.figure(9)
idgaplt = infantdeathsvsga.plot.bar()
_label_axes(idgaplt, "Deaths VS GA", "Deaths")

# births per gestational age
live_birthsvsga = CDCFilters.count_and_sort(live_births, 'GA')
plt.figure(10)
lbgaplt = live_birthsvsga.plot.bar()
_label_axes(lbgaplt, "Surviving Births VS GA", "Births")

# get the total: births plus deaths per gestational age, counting an age
# that is missing from one of the two tables as zero
birthvsga = live_birthsvsga.to_frame()
deathvsga = infantdeathsvsga.to_frame()
totalbirthvsga = birthvsga.add(deathvsga, axis=1, fill_value=0)

# calculate neonatal death: keep deaths at an age below 28 days
is_neonatal = infant_deaths[death_age] < 28
neonatal_deaths = infant_deaths[is_neonatal]
neonvsga = CDCFilters.count_and_sort(neonatal_deaths, 'GA')
plt.figure(11)
ndgaplt = neonvsga.plot.bar()
_label_axes(ndgaplt, "Neonatal Deaths VS GA", "Deaths")

# neonatal deaths divided by the total for each gestational age
neonframe = neonvsga.to_frame()
prob = neonframe.divide(totalbirthvsga, fill_value=0)
# split tables by gestational estimate type
#clinical_deaths,obstetric_deaths = CDCFilters.split_on_clinical_flag(infant_deaths)
#clinical_neonatal,obstetric_neonatal = CDCFilters.split_on_clinical_flag(neonatal_deaths)
#delete records with missing obstetric age in obstetric data
#CDCFilters.remove_unknown_obstetric_age(obstetric_deaths,inplace=True)
#CDCFilters.remove_unknown_obstetric_age(obstetric_neonatal,inplace=True)
#calculate death counts vs gestational age
#clinical_deaths_vs_gestation = CDCFilters.count_and_sort(clinical_deaths,clinical_age)
# count births and deaths vs gestational age.
#totaldeaths_vs_gestation = infant_deaths['gestation_in_weeks'].value_counts().sort_index()
#neondeath_vs_gestation = neonatal_deaths['gestation_in_weeks'].value_counts().sort_index()
#births_vs_gestation = live_births['gestation_in_weeks'].value_counts().sort_index()
#total_births_vs_gestation = totaldeaths_vs_gestation + births_vs_gestation
#probability_of_neonatal_death = neondeath_vs_gestation/total_births_vs_gestation
# plot it
#plt.figure(1)
#dead = neondeath_vs_gestation.plot.bar()
#dead.set_title("Neonatal deaths infants born in 2008",fontsize=18)
#dead.set_xlabel("Gestation age (weeks)",fontsize=16)
#dead.set_ylabel("Deaths",fontsize=16)
#plt.figure(2)
#prob = probability_of_neonatal_death.plot.bar()
#prob.set_title("Probability of Neonatal Death 2008",fontsize=18)
#prob.set_xlabel("Gestation age (weeks)",fontsize=16)
#prob.set_ylabel("Probability",fontsize=16)