-
Notifications
You must be signed in to change notification settings - Fork 0
/
SEPIA.py
597 lines (458 loc) · 21.9 KB
/
SEPIA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
#!/usr/bin/env python3
"""
File takes in a priortization ordering and runs through the SEPIA workflow
to output the Kendall Tau B correlation coefficient between their ordering
and the most optimal ordering, as generated by the chosen metric.
If verbose flag is specified, intermediate data in the process can be outputted to stderr.
"""
# imports
from gzip import open as gopen
from itertools import repeat
from numpy import append, linspace
from scipy.stats import kendalltau, linregress
from sys import stderr, stdin
import argparse
# CONSTANTS
DEF_POINTS_PER_STEP = 10
METRIC1 = 1; METRIC2 = 2; METRIC3 = 3; METRIC4 = 4; METRIC5 = 5; METRIC6 = 6
TAB_CHAR = '\t'
def pairCounts(transmissionHist, contactNet, lowerBound: int, upperBound: int, metric: float) -> dict:
"""
DRIVER, DIRECTLY CALLED FROM COMPUTE_EFFICACY
Pairs each individual with a count value, where a higher count value indicates
that an individual has a higher priority. Count values are calculated based
on the corresponding chosen metric.
This function calls other functions that handle building the dictionaries for the chosen metric
and handles error checking/input formatting.
There are currently six metrics to choose from:
Metric 1 - Finds the number of direct transmissions from one individual to another
Metric 2 - Performs linear regression per individual on to analyze their rate of infection.
Metric 3 - Finds the number of indirect transmissions from the individuals HIV was
transmitted to from a given individual.
Metric 4 - Totals the numbers from metric 1 and metric 3 for each individual.
Metric 5 - Finds the number of contacts for each individual in the contact number.
Metric 6 - TODO - MCKENNA!!
Returns a dictionary where each key is an individual and their value
is their corresponding count.
Parameters
----------
tranmissionHist - the file object with data on tranmissions used to build the
dictionary
contactNet - the file object with data on the contact network used to build the
dictionary
lowerBound - lower bound of time range
upperBound - upper bound of timerange
metric - float, specifies the chosen metric
"""
# Error checking, check if necessary files were provided
if metric in [METRIC1, METRIC2, METRIC3, METRIC4, METRIC6] and transmissionHist == '':
raise ValueError("Missing transmission history file for metric " + str(metric) + ".\nSpecify with '-t TRANSMISSIONHIST'")
if metric in [METRIC5, METRIC6] and contactNet == '':
raise ValueError("Missing contact network file for metric " + str(metric) + ".\nSpecify with '-c CONTACTNET'")
# Call the function corresponding to the chosen metric
if (metric == METRIC1):
return directTransmissions(transmissionHist, lowerBound, upperBound)
elif (int(metric) == METRIC2):
# Parse num dots per line (linspace) from the input and convert it to an int
numPointsPerStep = str(metric).split("."); numPointsPerStep = float(numPointsPerStep[1])
while not numPointsPerStep.is_integer():
numPointsPerStep *= 10
return bestfitGraph(transmissionHist, lowerBound, upperBound, int(numPointsPerStep))
elif (int(metric) == METRIC3):
# Parse number of degrees away and convert it to an int
numDegrees = str(metric).split("."); numDegrees = float(numDegrees[1])
while not numDegrees.is_integer():
numDegrees *= 10
# If number degrees away is less than 2, default to 2
if numDegrees < 2:
numDegrees = 2
return indirectTransmissions(transmissionHist, int(numDegrees), lowerBound, upperBound)
elif (metric == METRIC4):
return totalTransmissions(transmissionHist, lowerBound, upperBound)
elif (metric == METRIC5):
return numContacts(contactNet, lowerBound, upperBound)
elif (metric == METRIC6):
return numContactInfect(transmissionHist, contactNet, lowerBound, upperBound)
else:
raise ValueError("No metric " + str(metric) + " exists.\nPlease specify one between 1-6.")
# FUNCTIONS PERFORMING DIFFERENT METRICS (1-6) --------------------------------------------------------------------------
def directTransmissions(transmissionHist, lowerBound: int, upperBound: int) -> dict:
"""
METRIC 1
Counts the number of times each individual infected someone else in a file.
Returns a dictionary where each key is an individual and their value
is their corresponding infection count.
Parameters
----------
tranmissionHist - the file object with data on tranmissions used to build the
dictionary
lowerBound - lower bound of years range
upperBound - upper bound of years range
"""
infectedPersons= []; people = []; numInfected = dict()
lines = opengzip(transmissionHist)
# Loop over each line in the file.
for line in lines:
u,v,t = line.split(TAB_CHAR)
u = u.strip(); v = v.strip() # strip leading/trailing white space
# Only considers infections within a given range of years
if (lowerBound > float(t)) | (float(t) > upperBound):
continue
if not u or u == 'None':
continue
if u not in numInfected:
numInfected[u] = 0
numInfected[u] += 1
return numInfected
def bestfitGraph(transmissionHist, lowerBound: int, upperBound: int, numPointsPerStep: int) -> dict:
"""
METRIC 2
Returns a dictionary where each key is an individual and their value
is their corresponding count as calculated by the slope linear regression
of their transmissions over time.
If the upperBound was not specified when running the script, it defaults
to being the time of the latest transmission.
Parameters
----------
tranmissionHist - the file object with data on tranmissions used to build the
dictionary
lowerBound - lower bound of time range
upperBound - upper bound of time range
numPointsPerStep - TODO
"""
# Use default if none specified
if numPointsPerStep == 0:
numPointsPerStep = DEF_POINTS_PER_STEP
infectedPersons= []; people = []
# build timesInfected, a dict where each person is
# matched up with a list of times at which they transmitted
timesInfected = dict()
lines = opengzip(transmissionHist)
# Deal with upper bound setting
isUpperBoundSet = True; latestInfectionTime = -1;
# Check if an upper bound for time was set
if upperBound == float('inf'):
isUpperBoundSet = False
else:
latestInfectionTime = upperBound
# Loop over each line in the transmissions file to build timesInfected
for line in lines:
u,v,t = line.split(TAB_CHAR)
u = u.strip(); v = v.strip()
# Only considers infections within a given range of years
if (lowerBound > float(t)) | (float(t) > upperBound):
continue
if u == 'None':
continue
if u not in timesInfected:
timesInfected[u] = []
# Append this time to u's list
timesInfected[u].append(float(t))
# Keep iterating to get the globally latest infection time
if not isUpperBoundSet and float(t) > latestInfectionTime:
latestInfectionTime = float(t)
# Build a dict with users as keys paired with their slopes
slopesDict = dict()
count = 0
# Loop over all transmitters
for u in timesInfected:
# Build two lists, one with xCoordinates and
# one with yCoordinates, for linear regression
x = [0]
y = [0]
# Gets times of transmissions for transmitter u
times = timesInfected[u]
# Plot up to the first transmission time, step 0
step = linspace(lowerBound, times[0], numPointsPerStep, endpoint=True)
x = append(x, step)
y = append(y, list(repeat(0, numPointsPerStep)))
# Loop over all the transmission times of u
for i in range(len(times)):
# At the last step (to the upperBound),
# only plot the start point and then up to the latest time of
# infection globally OR plot up to upperBound if its set
if (i == len(times) - 1):
step = linspace(times[i], latestInfectionTime,
numPointsPerStep, endpoint=True)
x = append(x, step)
y = append(y, list(repeat(i + 1, numPointsPerStep)))
break
# plot the xcoords of this step
step = linspace(times[i], times[i+1], numPointsPerStep, endpoint=True)
x = append(x, step)
# plot the ycoords of this step
y = append(y, list(repeat(i + 1, numPointsPerStep)))
lr = linregress(x[1:], y[1:])
slopesDict[u] = lr.slope
count += 1
return slopesDict
def indirectTransmissions(transmissionHist, numDegrees: int, lowerBound: int, upperBound: int) -> dict:
"""
METRIC 3
Returns a dictionary where each key is an individual and their value
is their corresponding indirect infection count.
Parameters
----------
tranmissionHist - the file object with data on tranmissions used to build the
dictionary
numDegrees - number of degrees away to measure indirect transmissions to
lowerBound - lower bound of years range
upperBound - upper bound of years range
"""
infectedPersons= []; people = []
lines = opengzip(transmissionHist)
direct = dict() # will be populated with all of key's indirect transmissions to a specified degree
# Loop over each line in the file.
for line in lines:
u,v,t = line.split(TAB_CHAR)
u = u.strip()
v = v.strip()
# Only considers infections within a given range of years
if (lowerBound > float(t)) | (float(t) > upperBound):
continue
if u == 'None':
continue
# creates entry in direct for all individuals, including those w/o outgoing transmissions
if u not in direct:
direct[u] = []
if v not in direct:
direct[v] = []
direct[u].append(v)
numIndirect = dict() # counts each person's number of indirect transmissions
lastDegree = direct.copy()
thisDegree = dict()
for n in range(1,numDegrees): # iterating through number of degrees away
# adds infectees at current degree to list in thisDegree
for key in lastDegree:
if key not in thisDegree:
thisDegree[key] = []
for value in lastDegree[key]:
thisDegree[key].extend(direct[value])
# adds count at this degree to individuals indirect transmisisons count
for key in thisDegree:
if key not in numIndirect:
numIndirect[key] = 0
numIndirect[key] += len(thisDegree[key])
lastDegree = thisDegree.copy()
thisDegree.clear()
return numIndirect
def totalTransmissions(transmissionHist, lowerBound: int, upperBound: int) -> dict:
"""
METRIC 4
Returns a dictionary where each key is an individual and their value
is their corresponding indirect infection count.
Parameters
----------
tranmissionHist - the file object with data on tranmissions used to build the
dictionary
lowerBound - lower bound of years range
upperBound - upper bound of years range
"""
infectedPersons= []; people = []; numInfected = dict()
lines = opengzip(transmissionHist)
# Loop over each line in the file.
for line in lines:
u,v,t = line.split(TAB_CHAR)
u = u.strip()
v = v.strip()
# Only considers infections within a given range of years
if (lowerBound > float(t)) | (float(t) > upperBound):
continue
if u == 'None':
continue
if u not in numInfected:
numInfected[u] = 0
numInfected[u] += 1
numIndirect = dict()
for line in lines:
u,v,t = line.split(TAB_CHAR)
u = u.strip()
v = v.strip()
# Only considers infections within a given range of years
if (lowerBound > float(t)) | (float(t) > upperBound):
continue
if u == 'None':
continue
if u not in numIndirect:
numIndirect[u] = 0
# should get the number of people that were indirected impacted
if v in numInfected:
numIndirect[u] += numInfected.get(v)
numTotal = dict()
# go through loop
for person in numInfected:
if person not in numTotal:
numTotal[person] = 0
if person in numInfected:
numTotal[person] += numInfected[person]
if person in numIndirect:
numIndirect[person]
return numTotal
def numContacts(transmissionHist, lowerBound: int, upperBound: int) -> dict:
"""
METRIC 5
Counts the number of contacts an individual has.
Returns a dictionary where each key is an individual and their value
is their corresponding number of contacts in the file.
Parameters
----------
transmissionHist - the file object with data on transmissions used to
build the dictionary.
lowerBound - Ignored for contact networks
upperBound - Ignored for contact networks
"""
infectedPersons= []; people = []
numberContacts = dict()
lines = opengzip(transmissionHist)
# Loop over each line in the file.
for line in lines:
# Skip over lines listing the nodes
if(line[0:4] == 'NODE'):
continue
u,v,t,w,x = line.split('\t')
u = u.strip()
v = v.strip()
if u == 'None':
continue
# Add person to numberContacts if they don't already exist in the dict
if v not in numberContacts:
numberContacts[v] = 0
if t not in numberContacts:
numberContacts[t] = 0
# Increment their number of contacts
numberContacts[v] += 1
numberContacts[t] += 1
return numberContacts
def numContactInfect(transmissionHist, contactNet, lowerBound: int, upperBound: int) -> dict:
"""
METRIC 6
Counts the number of contacts and transmissions a person has had.
Returns a dictionary where each key is an individual and their value
is their corresponding number of contacts in the file.
Parameters
----------
transmissionHist - the file object with data on transmissions used to
build the dictionary.
contactNet - includes all the contacts a person has had
lowerBound - Ignored for contact networks
upperBound - Ignored for contact networks
"""
infectedPersons= []; people = []; numInfected = dict()
lines = opengzip(transmissionHist)
# Loop over each line in the file.
for line in lines:
u,v,t = line.split(TAB_CHAR)
u = u.strip()
v = v.strip()
# Only considers infections within a given range of years
if (lowerBound > float(t)) | (float(t) > upperBound):
continue
if u == 'None':
continue
if u not in numInfected:
numInfected[u] = 0
numInfected[u] += 1
infectedPersons= []; people = []
totalContactCount = 0
lines = opengzip(contactNet)
numberContacts = dict()
# Loop over each line in the file.
for line in lines:
# Skip over lines listing the nodes
if(line[0:4] == 'NODE'):
continue
u,v,t,w,x = line.split('\t')
u = u.strip()
v = v.strip()
if u == 'None':
continue
# Add person to numberContacts if they don't already exist in the dict
if v not in numberContacts:
numberContacts[v] = 0
if t not in numberContacts:
numberContacts[t] = 0
# Increment their number of contacts
if t in numInfected:
numberContacts[v] += numInfected[t]
if v in numInfected:
numberContacts[t] += numInfected[v]
return numberContacts
# HELPER METHODS -----------------------------------------------------------------------------------------------------
def matchInfectorCounts(infectionsDict: dict, inputOrder) -> None:
"""
Matches the infectors in a user inputted file to their corresponding
infection count. Returns a list of tuples (<indvidiual>, <count>),
maintaing the original order of individuals in input.
Parameters
----------
infectionsDict - a dict with keys as infectors and values as
their infection counts
inputOrder - the user's ordering of individuals
"""
res = []
for line in inputOrder:
p = line.strip()
if p in infectionsDict.keys():
res.append((p, infectionsDict[p]))
else:
# individual not in the dictionary, just print them with 0 infections
res.append((p, 0))
return res
def opengzip(transmissionHist: str) -> list:
"""
Helper method - Opens a gzip and returns the lines of the file.
Parameters
----------
transmissionHist - the gzip to open. the file object with data on
tranmissions.
"""
if isinstance(transmissionHist,str):
if transmissionHist.lower().endswith('.gz'):
lines = [l.strip() for l in gopen(transmissionHist,'rb').read().decode().strip().splitlines()]
else:
lines = [l.strip() for l in open(transmissionHist).read().strip().splitlines()]
else:
lines = [l.strip() for l in transmissionHist.read().strip().splitlines()]
return lines
def calculateTauB(userOrder) -> None:
"""
Calculates the Kendall Tau B correlation coefficient between user ordering
and most optimal ordering.
Outputs coefficient and pvalue in the following format: "<tau> <pvalue>".
Returns void.
Parameters
----------
userOrder- an ordering of infectors and their counts
- generated by the user's algorithm
outfile - the file the tau and pvalue are outputted
"""
optimalOrder = list(range(len(userOrder), 0, -1))
tau, pvalue = kendalltau(optimalOrder, userOrder)
print("%s\t%s\n" % (tau, pvalue))
if __name__ == "__main__":
# parse user arguments [-h] -m METRIC [-i INPUT] [-t TRANMSISSIONHIST] [-c CONTACTNET] -s START [-e END] [-v]
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-m', '--metric', required=True, type=float, help="Metric of prioritization (1-6)")
parser.add_argument('-i', '--input', required=False, type=str, default='stdin', help="Input File - User's Ordering")
parser.add_argument('-t', '--transmissionHist', required=False, type=str, default='', help='Transmission History File')
parser.add_argument('-c', '--contactNet', required=False, type=str, default='', help='Contact History File')
parser.add_argument('-s', '--start', required=True, type=float, help='Time Start')
parser.add_argument('-e', '--end', required=False, type=float, default=float('inf'), help='Time End') # end defaults to infinity
parser.add_argument('-v', '--verbose', required=False, action='store_true', help='Print Intermediate List with Individuals Matched to Counts')
args = parser.parse_args()
# handle input, save into infile var
if args.input == 'stdin':
order = [l.strip() for l in stdin]
elif args.input.lower().endswith('.gz'):
order = [l.strip() for l in gopen(args.input).read().decode().strip().splitlines()]
else:
order = [l.strip() for l in open(args.input).read().strip().splitlines()]
# Create a dictionary matching individuals to infection counts using tranmission history data
infectionsDict = pairCounts(args.transmissionHist, args.contactNet, args.start, args.end, args.metric)
# Read the user's ordering and create a list of tuple pairs with individuals and their respective counts in the same order
countsList = matchInfectorCounts(infectionsDict, order)
# output verbose to sdterr if verbose flag was specified
if args.verbose:
print(countsList, stderr)
# calculate and output Tau B to stdout
calculateTauB([x[1] for x in countsList])