-
Notifications
You must be signed in to change notification settings - Fork 1
/
resolve_multiple_gene.py
42 lines (38 loc) · 1.02 KB
/
resolve_multiple_gene.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import pandas as pd
def _resolve_symbol(value):
    """Return the single, upper-cased gene symbol for one 'Gene Symbol' cell."""
    # Missing cells become an empty symbol. pd.isna recognises both float NaN
    # and pd.NA, whereas comparing str(value) against "nan" only catches NaN.
    if pd.isna(value):
        return ""
    entry = str(value)
    # "///" separates several alternative symbols; parse() chooses one of them.
    if "///" in entry:
        entry = parse(entry)
    return entry.upper()


def Main(infile, outfile="../data/Array_resolved.txt"):
    """
    Resolve cells that list multiple gene symbols down to a single symbol.

    Reads the tab-separated table at ``infile``, rewrites its 'Gene Symbol'
    column and writes the whole table, tab-separated with a header row, to
    ``outfile``. For each cell of that column:

    * a missing value becomes the empty string;
    * a value containing "///" is reduced to one symbol by ``parse``;
    * any other value is kept;

    and the result is upper-cased.

    Parameters
    ----------
    infile : path of the tab-separated input table; it must contain a
        'Gene Symbol' column.
    outfile : path of the table to write. Defaults to the location this
        script has always written to.

    Returns
    -------
    None. The result is only written to ``outfile``.
    """
    df = pd.read_csv(infile, sep="\t")
    # One column-wise pass instead of a row-by-row iterrows()/df.at loop.
    df['Gene Symbol'] = df['Gene Symbol'].map(_resolve_symbol)
    # index is left at to_csv's default, as before, so the row index is
    # written as the first column.
    df.to_csv(outfile, sep="\t", header=True)
def parse(entry):
    """
    Choose one gene symbol from a "///"-separated list of symbols.

    Every space is removed from each token. The first token that is neither
    blank nor starts with "LOC" is returned.

    Parameters
    ----------
    entry : str holding one or more symbols separated by "///".

    Returns
    -------
    The chosen symbol, or "" when every token is blank or starts with "LOC".
    """
    for token in entry.split("///"):
        symbol = token.replace(" ", "")
        # Skip blank tokens (left by a leading, trailing or doubled "///") so
        # that an empty string is never chosen over a real symbol later on.
        if symbol and not symbol.startswith("LOC"):
            return symbol
    return ""
if __name__ == "__main__":
    # Main() writes its result to disk and returns nothing, so there is no
    # return value worth printing.
    Main("../data/Array_design.txt")