-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaux_functions.py
225 lines (185 loc) · 8.79 KB
/
aux_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
# -*- coding: utf-8 -*-
"""
This code was created by Ricardo and Lucas to parse ECG files in the .xml format
The code contains functions to load all files and map them to their IDS, save and plot them.
The names of the fields can change depending on the database, for the MUSE it is already set
If you have any questions feel free to contact us on:
[email protected] Ricardo R Lopes or
[email protected] - Lucas A. Ramos
"""
import xmltodict #need to install
import base64
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import pandas as pd
from collections import defaultdict
from tqdm import tqdm #need to install
import pickle  # standard library, no installation needed
from datetime import datetime
def _draw_ecg_grid(ax):
    """Draw the red major/minor grid on ax and hide its tick labels."""
    ax.xaxis.set_major_locator(plt.MultipleLocator(0.25))
    ax.xaxis.set_minor_locator(plt.MultipleLocator(0.05))
    ax.yaxis.set_major_locator(plt.MultipleLocator(50.0))
    ax.yaxis.set_minor_locator(plt.MultipleLocator(10.0))
    ax.grid(which='major', axis='x', linewidth=2, alpha=0.3, linestyle='-', color='red')
    ax.grid(which='minor', axis='x', linewidth=0.5, alpha=0.2, linestyle='-', color='red')
    ax.grid(which='major', axis='y', linewidth=2, alpha=0.3, linestyle='-', color='red')
    ax.grid(which='minor', axis='y', linewidth=0.5, alpha=0.2, linestyle='-', color='red')
    ax.set_xticklabels([])
    ax.set_yticklabels([])


def plot_signal(signal_list, file_name, path_files, show_grid=False):
    """
    This function plots the first half of every signal in its own subplot and saves the figure as a .png
    Input:
    signal_list: list of 1-D sequences, one row of the figure is created per entry
    file_name: string, name of the image without the extension
    path_files: string, base folder, the image is written to <path_files>/images/<file_name>.png
    show_grid: boolean, if True the red grid is drawn on every subplot (default False, the grid
               code was switched off in the previous version)
    Output:
    None, the figure is written to disk and closed
    """
    n_plots = len(signal_list)
    fig = plt.figure(figsize=(35, 15))
    for row, raw_signal in enumerate(signal_list, start=1):
        samples = np.asarray(raw_signal)
        # One row per signal; the previous fixed 211/212 layout only worked for two signals.
        ax = fig.add_subplot(n_plots, 1, row)
        if show_grid:
            _draw_ecg_grid(ax)
        half = int(samples.shape[0] / 2)
        # Only the first half of the samples is drawn, on an x axis running from 0 to 5
        # (presumably seconds of a 10 s recording - TODO confirm).
        # linspace always returns exactly `half` points, so x and y cannot differ in length
        # through float rounding of the step, and an empty signal no longer divides by zero.
        time_axis = np.linspace(0, 5, half, endpoint=False)
        ax.plot(time_axis, samples[:half])
    # Build the path with os.path so it is valid on every platform, and make sure the
    # images folder exists before saving.
    images_dir = os.path.join(path_files, 'images')
    os.makedirs(images_dir, exist_ok=True)
    fig.savefig(os.path.join(images_dir, file_name + '.png'))
    plt.close(fig)
def get_xml_hash(xml_path, xml_field=None):
    """
    This function gets all the xml files matching a given path and maps each patient ID to the paths of its xml files
    This way we don't have to search for the patients every time
    Input:
    xml_path: string, glob pattern for the xml files, if you have several folders use ** for the different folders
              (the search is done with recursive=True)
    xml_field: string, name of the field under RestingECG/PatientDemographics that holds the patient ID.
               If None, a module-level variable called xml_field is used when it exists, otherwise 'PatientID'
               (presumably the MUSE field name - confirm for other databases)
    Output:
    hash_table: dictionary with patient ID as key and a list of paths for each ID, this has to be saved outside the function
    error_table: list with the paths of the files that could not be opened, parsed or did not contain the field
    """
    if xml_field is None:
        # The previous version read an undefined global here; the resulting NameError was
        # swallowed and every file ended up in error_table. Keep honouring such a global
        # if somebody sets it on the module, otherwise fall back to a default.
        xml_field = globals().get('xml_field', 'PatientID')
    hash_table = defaultdict(list)
    error_table = []
    print("Catching all files!")
    for file in tqdm(glob.glob(xml_path, recursive=True)):
        try:
            with open(file) as fd:
                ecg_dict = xmltodict.parse(fd.read(), process_namespaces=True)
            f_id = ecg_dict['RestingECG']['PatientDemographics'][xml_field]
            hash_table[f_id].append(file)
        except Exception:
            # Best effort: a broken file is recorded and skipped. Exception (not a bare
            # except) so that Ctrl+C still stops a long scan.
            error_table.append(file)
    return (hash_table, error_table)
def get_excel_from_xml(excel_ids, hash_table):
    """
    This function loads the ecg signals from the xml files according to the excel file
    Input:
    excel_ids: list of all the ids from the excel file (PIN)
    hash_table: output from get_xml_hash (dictionary with patient ID as key and a list of xml paths)
    Output:
    data_signal: dictionary with patient ID as key and a list with one scaled signal (numpy array) per lead of every file
    date: dictionary with patient ID as key and a list with the AcquisitionDate of every file
    data_amp: dictionary with amplitudes just to make sure they are not changing = (to be removed)
    """
    data_signal = defaultdict(list)
    date = defaultdict(list)
    data_amp = defaultdict(list)
    for key in tqdm(excel_ids):
        # .get instead of [] so that ids without files neither add empty entries to the
        # caller's defaultdict nor raise KeyError when a plain dict is passed.
        for file_name in hash_table.get(key, ()):
            with open(file_name) as fd:
                ecg_dict = xmltodict.parse(fd.read(), process_namespaces=True)
            resting_ecg = ecg_dict['RestingECG']
            # Exactly two Waveform entries are expected; only the second one is used.
            _mean_wave, leads = resting_ecg['Waveform']
            date[key].append(resting_ecg['TestDemographics']['AcquisitionDate'])
            for lead in leads['LeadData']:
                amp = float(lead['LeadAmplitudeUnitsPerBit'])
                data_amp[key].append(amp)
                # The samples are base64 text split over several lines.
                b64_encoded = lead['WaveFormData'].replace('\n', '')
                decoded = base64.b64decode(b64_encoded)
                # 16-bit integers in the machine's native byte order, scaled by the per-bit amplitude.
                signal = np.frombuffer(decoded, dtype='int16')
                data_signal[key].append(signal * amp)
    return (data_signal, date, data_amp)
def filter_by_date(hash_table, excel_ids, excel_dates, date, six_months):
    """
    This function will further filter patients based on the date read from the file (excel in this case)
    Current options are 6 months before surgery or just before surgery, but that can be easily adapted.
    Input:
    hash_table: dictionary with patient ID as key and a list of .xml paths
    excel_ids: list of all the ids from the excel file (PIN)
    excel_dates: dictionary with the keys from each patient ID and the date to be filtered (surgery date in this case),
                 written as day-month-year
    date: dictionary with patient ID as key and a list of ECG dates written as month-day-year,
          in the same order as the paths in hash_table
    six_months: boolean, if True, only keeps the ECGs that were done less than 183 days before the surgery date
    Output:
    filter_date: Dictionary with a list of .xml paths for each key whose date was < than surgery date
    later_date: dictionary with, for each key without any accepted ECG, a list holding the full list of paths of
                that key (note the extra level of nesting)
    Raises KeyError if an id has no entry in excel_dates and ValueError if a date does not match its format.
    """
    filter_date = defaultdict(list)
    later_date = defaultdict(list)
    # number of days in 6 months
    days_in_months = 183
    for key in excel_ids:
        # .get instead of [] so that unknown ids do not add empty entries to the caller's
        # defaultdicts and do not raise KeyError on plain dictionaries.
        file_names = hash_table.get(key, [])
        file_dates = date.get(key, [])
        surgery_day = datetime.strptime(excel_dates[key], "%d-%m-%Y")
        accepted = []
        for file_name, raw_date in zip(file_names, file_dates):
            ecg_day = datetime.strptime(raw_date, "%m-%d-%Y")
            if six_months:
                # strictly before the surgery and less than six months before it
                keep = 0 < (surgery_day - ecg_day).days < days_in_months
            else:
                keep = ecg_day < surgery_day
            if keep:
                accepted.append(file_name)
        if accepted:
            filter_date[key].extend(accepted)
        else:
            # Kept as one nested entry, which is the shape callers already receive.
            later_date[key].append(file_names)
    return (filter_date, later_date)
def save_file(table_save, path_save, file_name):
    """
    This function saves an object to disk with pickle
    Input:
    table_save: object to be saved (for example the hash_table from get_xml_hash)
    path_save: string, folder where the file is written, it is joined to file_name as is,
               so it has to end with the path separator
    file_name: string, name of the file to be written
    Output:
    None
    """
    filename = path_save + file_name
    # The with block closes the file even if pickle.dump raises.
    with open(filename, 'wb') as outfile:
        pickle.dump(table_save, outfile)
def load_file(path_save, filename):
    """
    This function loads an object that was saved with pickle (see save_file)
    Input:
    path_save: string, folder where the file is stored, it is joined to filename as is,
               so it has to end with the path separator
    filename: string, name of the file to be read
    Output:
    new_dict: the object stored in the file
    """
    # NOTE: pickle can run arbitrary code while loading, only open files you created yourself.
    # The with block closes the file even if pickle.load raises.
    with open(path_save + filename, 'rb') as infile:
        new_dict = pickle.load(infile)
    return new_dict
def fix_excel(df, excel_field, excel_surgery, label_name):
    """
    This function fixes some errors that came with the excel file, probably not very useful if you switch files, but can be adapted and makes the code easier to read
    Input:
    df: Pandas Dataframe from the excel file
    excel_field: name of Excel field of patient ID, in this case it is the PIN
    excel_surgery: name of Excel field of Surgery Date
    label_name: name of Excel field with the label of each patient
    Output:
    excel_ids: list of all the ids from the excel file (PIN) after corrections (text, left-padded with zeros to 7 characters)
    excel_dates: dictionary with the keys from each patient ID and the date to be filtered (surgery date in this case)
    labels: dictionary with the keys from each patient ID and the label as text
    If an ID appears more than once, the last row wins in both dictionaries.
    """
    # Text that pandas produces for an empty cell, depending on the column type.
    missing_markers = {'nan', 'NaT', 'None'}
    # add zeros because the number should have 7 digits (rjust leaves longer ids untouched)
    excel_ids = [str(pin).rjust(7, '0') for pin in df[excel_field].astype(str)]
    excel_labels = list(df[label_name].astype(str))
    # Replacing missing dates with ridiculous date instead of excluding, they will never be used anyway
    excel_dates = [
        '1-1-1500' if str(day) in missing_markers else day
        for day in df[excel_surgery].astype(str)
    ]
    return (excel_ids, dict(zip(excel_ids, excel_dates)), dict(zip(excel_ids, excel_labels)))
def filter_by_rhythm(filter_date):
    """
    This function splits the xml files according to whether one of the sinus rhythm labels shows up in their content
    Input:
    filter_date: dictionary with patient ID as key and a list of .xml paths for each key
    Output:
    filter_rhythm: dictionary with, for each key, the paths of the files whose content contains one of the labels
    other_rhythms: dictionary with, for each key, the parsed content (as text, not the path) of every other file
    """
    # Labels searched for anywhere in the text form of the parsed xml (case sensitive).
    sinus_labels = ('Sinusritme', 'Sinus ritme', 'Sinusbradycardie', 'Sinustachycardie')
    filter_rhythm = defaultdict(list)
    other_rhythms = defaultdict(list)
    for key in tqdm(filter_date):
        for xml_path in filter_date[key]:
            with open(xml_path) as fd:
                parsed_text = str(xmltodict.parse(fd.read(), process_namespaces=True))
            if any(label in parsed_text for label in sinus_labels):
                filter_rhythm[key].append(xml_path)
                continue
            other_rhythms[key].append(parsed_text)
    return (filter_rhythm, other_rhythms)