forked from eveningdong/New_York_State_Inpatients_Medical_Treatment_and_Hospital_Recommender_System_Design
-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
55 lines (42 loc) · 1.42 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import pandas as pd
# Load the raw hospital dataset. The relative path means the file is looked
# up in the current working directory.
raw_data = pd.read_csv('Hospital.csv')
# Source CSV column header -> short column name used after renaming.
names = {
    'Hospital County': 'county',
    'Facility Id': 'facility',
    'Age Group': 'age',
    'Gender': 'gender',
    'Race': 'race',
    'Ethnicity': 'ethnicity',
    'Admit Day of Week': 'admit_day',
    'Type of Admission': 'admit_type',
    'Patient Disposition': 'disposition',
    'CCS Diagnosis Code': 'diagnosis',
    'CCS Procedure Code': 'procedure',
    'APR DRG Code': 'DRG',
    'APR MDC Code': 'MDC',
    'APR Severity of Illness Code': 'severity',
    'APR Risk of Mortality': 'risk',
    'APR Medical Surgical Description': 'method',
    'Emergency Department Indicator': 'emg',
}

# Short names for the three payment typology columns
# ('Payment Typology 1' -> 'payment1', and so on).
payment_methods = {
    'Payment Typology {}'.format(rank): 'payment{}'.format(rank)
    for rank in (1, 2, 3)
}
# Apply the short column names to raw_data in place. Headers that are not
# keys of `names` keep their original label.
raw_data.rename(columns=names, inplace=True)

# Select the relevant variables: only the renamed columns, ordered by
# sorted() on their names (uppercase 'DRG' and 'MDC' therefore come before
# the lowercase names).
data = raw_data[sorted(names.values())]
# Data cleaning and data transformation.

# Drop incomplete rows first so the integer cast on 'facility' below cannot
# encounter missing values.
data = data.dropna()

# Exclude records whose gender, race or ethnicity is recorded as unknown.
known_demographics = (
    (data['gender'] != 'U')
    & (data['race'] != 'Unknown')
    & (data['ethnicity'] != 'Unknown')
)

# Cast the facility id to an integer and the code columns to strings.
# astype() returns a new frame, so no value is assigned into a filtered
# slice of another DataFrame (the pattern that raises pandas'
# SettingWithCopyWarning).
data = data[known_demographics].astype({
    'facility': int,
    'diagnosis': str,
    'procedure': str,
    'DRG': str,
    'MDC': str,
    'severity': str,
})

# The DataFrame index is written as the first, unnamed CSV column.
data.to_csv('cleaned_data.csv')