-
Notifications
You must be signed in to change notification settings - Fork 1
/
bipolar_study_subject_selection_abcd4.0.py
275 lines (213 loc) · 9.64 KB
/
bipolar_study_subject_selection_abcd4.0.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.14.1
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# %% [markdown]
# The purpose of this notebook is to select a reasonable collection of subjects for a dimensional neuroimaging study of the onset of BP symptoms.
# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %%
table_path_fstring = "/home/ebrahim/data/abcd/Package_1200530/{}.txt"
table_path_fstring2 = "/home/ebrahim/data/abcd/Package_1200951/{}.txt"
dict_path_fstring = "/home/ebrahim/data/abcd/abcd-4.0-data-dictionaries/{}.csv"
def read_abcd_table(table_name, include_data_dict = True, use_alternative_package = False):
if use_alternative_package:
table_path = table_path_fstring2.format(table_name)
else:
table_path = table_path_fstring.format(table_name)
df = pd.read_csv(table_path, sep='\t', header=0, skiprows=[1], low_memory=False, parse_dates=['interview_date'])
if include_data_dict:
dict_path = dict_path_fstring.format(table_name)
data_dictionary = pd.read_csv(dict_path, index_col='ElementName')
return df, data_dictionary
else:
return df
# %%
# Load data
# Demographic data
demo, demo_dd = read_abcd_table("pdem02")
ldemo, ldemo_dd = read_abcd_table("abcd_lpds01") # longitudinal version
# raw survey data for bipolar
bp, bp_dd = read_abcd_table("bipolar_disorders_p01", use_alternative_package=True) # parent survey
bp_youth, bp_youth_dd = read_abcd_table("bipolar_disorders01", use_alternative_package=True) # youth survey
# mental health data
mh, mh_dd = read_abcd_table("abcd_ksad01")
# DMRI imaging data
dmri = read_abcd_table("fmriresults01", include_data_dict=False)
# %%
# Fix row duplication issue
demo = demo.drop_duplicates()
mh = mh.drop_duplicates()
dmri = dmri.drop_duplicates()
# ldemo also has a row duplication problem, but there are two subjects that have
# their 2 and 3 year follow up occuring at the same interview_age. drop these. then fix the row duplication
ldemo = ldemo[(ldemo.subjectkey!='NDAR_INV7YM285FW') & (ldemo.subjectkey!='NDAR_INV99J6PWJX')]
ldemo = ldemo.groupby(['subjectkey', 'interview_age'], as_index=False).apply(lambda s: s.iloc[0])
# %% [markdown]
# # Subject Selection
# %% [markdown]
# There are up to 3 time points for these interviews:
# %%
bp.eventname.value_counts(sort=False)
# %% [markdown]
# Our largest number of subjects to select is the number who completed at least one survey from the 3 time points:
# %%
bp.subjectkey.nunique()
# %% [markdown]
# Or we could only keep the subjects who did the survey at all 3 time points:
# %%
sum(bp.groupby('subjectkey').apply(lambda x : len(x)) == 3)
# %% [markdown]
# We can instead focus on subjects who had any BP symptom as indicated by KSADS diagnistics, with a sampling of the same number of healthy controls.
# %%
# Collect list of KSADS diagnostic column names related to BP
bp_elementnames = mh_dd[mh_dd.ElementDescription.str.contains("Bipolar", case=False)].index
for en in bp_elementnames:
print(mh_dd.loc[en].ElementDescription)
# %%
mh[bp_elementnames].ksads_2_837_p.value_counts()
# %%
mh['some_bp_thing'] = mh[bp_elementnames].apply(lambda x : (x==1).any(), axis=1)
# %%
mh.groupby('subjectkey').some_bp_thing.agg(func='any').value_counts()
# %% [markdown]
# That would be 3189 subjects; maybe we can restrict to the more clear diagnositic columns only.
# %%
mh['some_bp_thing2'] = mh[bp_elementnames[:10]].apply(lambda x : (x==1).any(), axis=1)
mh['some_bp_thing3'] = mh[bp_elementnames[:8]].apply(lambda x : (x==1).any(), axis=1)
# %%
mh.groupby('subjectkey').some_bp_thing2.agg(func='any').value_counts()
# %%
mh.groupby('subjectkey').some_bp_thing3.agg(func='any').value_counts()
# %% [markdown]
# That last one seems like a good number to take: 568 subjects.
# So that would be those who have a positive in at least one of the following KSADS diagnostics for at least one interview age:
# %%
for en in bp_elementnames[:8]:
print(mh_dd.loc[en].ElementDescription)
# %%
# indicates whether a given *subject* had at least one interview that contained a positive diagnostic
subject_positive_bp = mh.groupby('subjectkey').some_bp_thing3.agg(func='any')
# indicates whether a given subject did all three of their interviews
subject_interview_complete = mh.groupby('subjectkey').apply(lambda x : len(x)) == 3
# indicates whether a given subject did all two of their scans
subject_scans_complete = dmri.groupby(['subjectkey']).apply(lambda x : x.interview_age.nunique()) == 2
# Concatenate into a dataframe
subject_inclusion_criteria = pd.concat(
{'positive_bp':subject_positive_bp,
'interview_complete':subject_interview_complete,
'scans_complete':subject_scans_complete
}, axis=1)
subject_inclusion_criteria = subject_inclusion_criteria.replace(np.nan, False)
# %% [markdown]
# If we want to include the subjects that satisfy all three of these criteria, then here's how many subjects we get:
# %%
subject_inclusion_criteria.all(axis=1).sum()
# %% [markdown]
# If we then want to include the same number of healthy controls, then we can take a random sample that is stratified based on the demographics of these 290. (Or should it be stratified based on the demographics of the full abcd study cohort?)
# %% [markdown]
# # COVID-19 Lockdown Effects
#
# ## Data Collection Timing
# Let's now look at the interview dates for those included subjects.
# %%
included_subjects = subject_inclusion_criteria[subject_inclusion_criteria.all(axis=1)].index
# %%
time_index_mapping = {
'baselineYear1Arm1' : 0,
'2YearFollowUpYArm1' : 1
}
dmri['time_index'] = dmri.derived_files.apply(lambda x : time_index_mapping[x.split('/')[-1].split('_')[1]])
# %%
def year_and_month_to_month_offset(year, month, day):
return (year-2018)*12 + month + day/30.5
dmri_second_scan_included_subjects = dmri[dmri.subjectkey.isin(included_subjects) & (dmri.time_index==1)]
dmri_second_scan_included_unique_subjects = dmri_second_scan_included_subjects.groupby('subjectkey').apply(lambda x : x.iloc[0])
month_offsets = dmri_second_scan_included_unique_subjects.interview_date.apply(lambda x : year_and_month_to_month_offset(x.year,x.month, x.day))
lockdown_month_offset = year_and_month_to_month_offset(2020,3,20)
# %%
bins = range(8,38,2)
plt.figure(figsize=(4,2))
plt.hist(month_offsets, bins=bins)
plt.plot([lockdown_month_offset,lockdown_month_offset],[0,100], label="Lockdown")
plt.xticks(ticks = bins)
plt.ylim(0,45)
plt.yticks(ticks=range(0,50,10))
plt.xlabel("Months since Jan 2018")
plt.title("2-Year Follow-Up Scan Dates")
plt.legend()
plt.show()
# %%
(month_offsets > lockdown_month_offset).value_counts()
# %% [markdown]
# This is great. Of the 290 selected BP subjects, 42 of them had their second scan after lockdown measures, and 248 of them before. That means we can _ignore_ lockdown effects by restricting to those 248 and it's not a serious loss at all. And we can also _observe_ lockdown effects if we want to, because 42 is a decent number of positive post-lockdown cases to have.
# %% [markdown]
# Let's now check the same for the mental health interview data.
# %%
time_index_mapping = {
'baseline_year_1_arm_1' : 0,
'1_year_follow_up_y_arm_1' : 1,
'2_year_follow_up_y_arm_1' : 2,
}
mh['time_index'] = mh.eventname.apply(lambda x : time_index_mapping[x])
mh_time1_included_subjects = mh[mh.subjectkey.isin(included_subjects) & (mh.time_index==1)]
month_offsets1 = mh_time1_included_subjects.interview_date.apply(lambda x : year_and_month_to_month_offset(x.year,x.month, x.day))
mh_time2_included_subjects = mh[mh.subjectkey.isin(included_subjects) & (mh.time_index==2)]
month_offsets2 = mh_time2_included_subjects.interview_date.apply(lambda x : year_and_month_to_month_offset(x.year,x.month, x.day))
# %%
bins = range(8,38,2)
plt.hist(month_offsets2, bins=bins)
plt.plot([lockdown_month_offset,lockdown_month_offset],[0,100], label="Lockdown")
plt.xticks(ticks = bins)
plt.ylim(0,45)
plt.yticks(ticks=range(0,50,10))
plt.xlabel("Months since Jan 2018")
plt.title("2-Year Follow-Up Interview Dates")
plt.legend()
plt.show()
# %% [markdown]
# ## Data Collection Adjustments
#
# The table `abcd_lt01` provides some information on study sites and on whether a visit was in person, remote, or hybrid. To determine visit type:
#
# - If `sched_delay = 7`, then `visit = In person`
# - If `sched_delay = 9` and `sched_hybrid = 0`, then `visit = Remote`
# - If `sched_delay = 9` and `sched_hybrid = 1`, then `visit = Hybrid`
# %%
lt, lt_dd = read_abcd_table("abcd_lt01")
lt = lt.groupby('subjectkey').apply(lambda x : x.iloc[0]) # Remove duplicate rows
lt_2year = lt[lt.eventname.isin(['2_year_follow_up_y_arm_1'])]
# %%
lt_2year_included_subjects = lt_2year[lt_2year.subjectkey.isin(included_subjects)]
month_offsets = lt_2year_included_subjects.interview_date.apply(lambda x : year_and_month_to_month_offset(x.year,x.month, x.day))
# %%
bins = range(8,38,2)
plt.hist(month_offsets, bins=bins)
plt.plot([lockdown_month_offset,lockdown_month_offset],[0,100], label="Lockdown")
plt.xticks(ticks = bins)
plt.ylim(0,45)
plt.yticks(ticks=range(0,50,10))
plt.xlabel("Months since Jan 2018")
plt.title("2-Year Follow-Up Events")
plt.legend()
plt.show()
# %%
len(lt_2year_included_subjects)
# %% [markdown]
# Hmm why is this different from the following?
# %%
len(mh_time2_included_subjects)
# %% [markdown]
# The `abcd_lt01` table does not seem to have entries to go with all the events I find in, say, the `abcd_ksad01` table. Need to figure out why that is in order to map each of my cases reliably to in-person/remote/hybrid status.