-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDP4_extract_varscan2.py
82 lines (76 loc) · 3.21 KB
/
DP4_extract_varscan2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def _parse_varscan2_records(text, source="<input>"):
    """Return ``[chrom, pos, a, b, c, d]`` rows parsed from VarScan2 VCF text.

    Each data line must have exactly 11 whitespace-separated columns. The
    first two are taken as chromosome and position; the last column is
    expected to contain ``...%:a,b,c,d`` (the DP4 counts following the
    percentage field). All values are returned as strings.

    Args:
        text: Full contents of one VCF file.
        source: Label (typically the file path) used in error messages.

    Raises:
        ValueError: If a data line does not have 11 columns, its last column
            has no ``%:`` separator, or fewer than four DP4 values follow it.
    """
    expected_fields = 11
    rows = []
    for line_number, raw_line in enumerate(text.splitlines(), start=1):
        stripped = raw_line.strip()
        # Blank lines and "#"-prefixed meta/header lines carry no variant data.
        if not stripped or stripped.startswith("#"):
            continue
        fields = stripped.split()
        if len(fields) != expected_fields:
            raise ValueError(
                f"{source}, line {line_number}: expected {expected_fields} "
                f"columns, found {len(fields)}"
            )
        # Same slicing as before: the piece right after the first "%:".
        sample_parts = fields[-1].split("%:")
        if len(sample_parts) < 2:
            raise ValueError(
                f"{source}, line {line_number}: no '%:' separator in the "
                f"last column ({fields[-1]!r})"
            )
        dp4 = sample_parts[1].split(",")
        if len(dp4) < 4:
            raise ValueError(
                f"{source}, line {line_number}: expected 4 DP4 values, "
                f"found {len(dp4)}"
            )
        rows.append([fields[0], fields[1], *dp4[:4]])
    return rows


def varscan2_anno_DP4(dirpath, regexp):
    """Collect the DP4 counts from every VarScan2 VCF file in a directory.

    Every entry of ``dirpath`` is opened as a text file. Group 1 of ``regexp``
    matched against the file name becomes that file's ID, e.g.
    ``TCGA-123-DFB-23R2F_ANNOTATED_VARSCAN2`` can be collapsed to ``TCGA-123``.

    Args:
        dirpath: Directory holding the VCF files (with or without a trailing
            path separator).
        regexp: Pattern (string or compiled) with at least one capture group;
            group 1 is used as the ID.

    Returns:
        A pandas DataFrame with columns ``chrom``, ``pos`` (both strings),
        ``a``, ``b``, ``c``, ``d`` (the four DP4 values as integers; the
        original comments label them ref fwd, ref rev, alt fwd, alt rev) and
        ``ID``. Files are processed in sorted name order and each file's rows
        keep their own 0-based index, so index labels repeat across files.
        An empty frame with these columns is returned when no records exist.

    Raises:
        ValueError: If a file name does not match ``regexp``, a record is
            malformed, or a DP4 value is not an integer.
    """
    # Function-scope imports are kept because the file has no module-level ones.
    import os
    import re
    import pandas as pd

    value_columns = ["chrom", "pos", "a", "b", "c", "d"]
    int_columns = {"a": int, "b": int, "c": int, "d": int}
    pattern = re.compile(regexp)

    frames = []
    # Sorting makes the row order reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(dirpath)):
        match = pattern.search(filename)
        if match is None:
            raise ValueError(
                f"file name {filename!r} does not match the ID pattern "
                f"{pattern.pattern!r}"
            )
        sample_id = match[1]

        # os.path.join works whether or not dirpath ends with a separator.
        path = os.path.join(dirpath, filename)
        with open(path, "rt") as handle:
            rows = _parse_varscan2_records(handle.read(), source=path)
        if not rows:
            # Nothing to contribute; skipping also keeps pd.concat free of
            # empty frames.
            continue

        frame = pd.DataFrame(rows, columns=value_columns)
        frame["ID"] = sample_id
        frames.append(frame)

    if not frames:
        return pd.DataFrame(columns=value_columns + ["ID"]).astype(int_columns)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat(frames).astype(int_columns)