-
Notifications
You must be signed in to change notification settings - Fork 1
/
cluster_VAFs_422.py
153 lines (135 loc) · 5.54 KB
/
cluster_VAFs_422.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 4 12:43:41 2018
@author: Lucian
"""
from __future__ import division
from os import walk
from os import path
from os import readlink
from os import mkdir
from os.path import isfile
from copy import deepcopy
from ete3 import Tree
import numpy
import math
import matplotlib.pyplot as plt
import csv
import lucianSNPLibrary as lsl
onlysomepatients = True
somepatients = ["521"]
#mutation_file = "snv_plus_indels.twoPlus.20181030.csv"
mutation_dir = "full_mutations/"
outdir = "VAFclusters_521_oneplus/"
if not path.isdir(outdir):
mkdir(outdir)
mutfiles = []
for __, _, files in walk(mutation_dir):
mutfiles += files
def writeAllSampleVAFs(mutations, patientSampleMap, deletions):
for patient in mutations:
#Collect a set of clusters
print("Writing data for patient", patient)
clustercount = {}
for chr in mutations[patient]:
for pos in mutations[patient][chr]:
for alt in mutations[patient][chr][pos]:
cluster = list(mutations[patient][chr][pos][alt].keys())
cluster.sort()
for sample in patientSampleMap[patient]:
if sample in cluster:
continue
if isDeleted(patient, sample, chr, pos, deletions):
cluster.append("-" + sample)
mutations[patient][chr][pos][alt]["-" + sample] = ""
cluster = tuple(cluster)
if cluster not in clustercount:
clustercount[cluster] = 0
clustercount[cluster] += 1
clusterlist = []
for cluster in clustercount:
# if clustercount[cluster] > 40:
clusterlist.append(cluster)
for sample in patientSampleMap[patient]:
patientVAFs = open(outdir + patient + "_" + sample + "_VAFs.tsv", "w")
patientVAFs.write("Patient")
patientVAFs.write("\tSample")
patientVAFs.write("\tchr")
patientVAFs.write("\tpos")
patientVAFs.write("\talt")
for cluster in clusterlist:
if sample in cluster:
patientVAFs.write("\t" + str(cluster))
patientVAFs.write("\n")
for chr in mutations[patient]:
posvec = list(mutations[patient][chr].keys())
posvec.sort()
for pos in posvec:
for alt in mutations[patient][chr][pos]:
outstr = patient
outstr += ("\t" + sample)
outstr += ("\t" + chr)
outstr += ("\t" + str(pos))
outstr += ("\t" + alt)
writeVAF = False
for cluster in clusterlist:
if sample in cluster and sample in mutations[patient][chr][pos][alt]:
#We have to write something: VAF or a tab.
outstr += "\t"
if set(cluster) == set(mutations[patient][chr][pos][alt].keys()):
outstr += str(mutations[patient][chr][pos][alt][sample])
writeVAF = True
if writeVAF:
patientVAFs.write(outstr + "\n")
patientVAFs.close()
def isDeleted(patient, sample, chrom, pos, deletions):
if patient not in deletions:
return False
if sample not in deletions[patient]:
return False
if chrom not in deletions[patient][sample]:
return False
for (start, end) in deletions[patient][sample][chrom]:
if start <= pos and end >= pos:
return True
return False
mutations = {}
patientSampleMap = {}
samplePatientMap = {}
for mutation_file in mutfiles:
patient, sample = mutation_file.split('-')[0:2]
samplePatientMap[sample] = patient
if patient not in patientSampleMap:
patientSampleMap[patient] = []
patientSampleMap[patient].append(sample)
with open(mutation_dir + mutation_file, 'r') as csvfile:
for lvec in csv.reader(csvfile, delimiter="\t"):
if "DNANum" in lvec[0]:
continue
(chr, pos, __, ref, alt) = lvec[0:5]
if len(ref)>1 or len(alt)>1 or ref=="." or alt==".":
continue
# if (is_2p=="f"):
# continue
# if ("N" in sample):
# continue
if chr=="X":
chr = "23"
refcnt = int(lvec[-3])
bafcnt = int(lvec[-4])
VAF = bafcnt/(refcnt+bafcnt)
if onlysomepatients and patient not in somepatients:
continue
if patient not in mutations:
mutations[patient] = {}
print("Reached patient", patient)
if chr not in mutations[patient]:
mutations[patient][chr] = {}
pos = int(pos)
if pos not in mutations[patient][chr]:
mutations[patient][chr][pos] = {}
if alt not in mutations[patient][chr][pos]:
mutations[patient][chr][pos][alt] = {}
mutations[patient][chr][pos][alt][sample] = VAF
deletions = lsl.loadDeletions(samplePatientMap)
writeAllSampleVAFs(mutations, patientSampleMap, deletions)