forked from Atkinson-Lab/Post-QC
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathupdate_rsID_bim_arg.py
76 lines (66 loc) · 3.14 KB
/
update_rsID_bim_arg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
n__author__= "mlin, adapted for specific use by egatkinson"
import argparse
import numpy as np
import pandas as pd
USAGE = """
update_rsID_bim.py --bim <PLINK bim file to update SNP ID>
--bed <bed file containing newest ID>
--format <formatting unmatched ID as chr:bp>
--codechr < if need to use X, Y, XY, M in bed file, default T>
--out <new bim file name>
"""
parser = argparse.ArgumentParser()
parser.add_argument("--bim", help = "PLINK bim file to update SNP ID")
parser.add_argument("--bed", help = "e.g. dbSNP build bed file")
parser.add_argument("-f", "--format", default = "F", help = "T or F, formatting unmatched ID as chr:bp")
parser.add_argument("-c", "--codechr", default = "T", help = "T or F")
parser.add_argument("-o", "--out", default = "out")
args=parser.parse_args()
bimfile = args.bim
bedfile = args.bed
out = file(args.out,'w')
#read in
bim = pd.read_csv(bimfile, header=-1, sep='\s+', dtype = str)
bed = pd.read_csv(bedfile, header=-1, sep='\s+', dtype = str)
bim['chr'] = bim[0] # temporarily store original chr values
if args.codechr=='T':
for i in range(0,bim[0].size):
if bim[0][i]=="23":
bim[0][i] = "X"
if bim[0][i]=="24":
bim[0][i] = "Y"
if bim[0][i]=="25":
bim[0][i] = "XY"
if bim[0][i]=="26":
bim[0][i] = "M"
#creating an index according to chr and bp
##bim['position'] = pd.Series(['chr%s']*len(bim[0]) % bim[0].astype(str)).str.cat(bim[3].astype(str),sep="_")
bim['position'] = pd.Series(bim[0].astype(str)).str.cat(bim[3].astype(str), sep=":") # key in the form of chr:bp
if 'chr' in bed[0][0]:
##bed['position'] = pd.Series(bed[0].astype(str)).str.cat(bed[2].astype(str),sep="_")
bed['position'] = pd.Series(bed[0].astype(str).str.lstrip('chr')).str.cat(bed[2].astype(str),sep=":")
else:
##bed['position'] = pd.Series(['chr%s']*len(bed[0]) % bed[0].astype(str)).str.cat(bed[2].astype(str),sep="_")
bed['position'] = pd.Series(bed[0].astype(str)).str.cat(bed[2].astype(str),sep=":")
#building a dictionary based on index and rsID of the reference
bed_dict = dict(zip(bed['position'],bed[3]))
# a mark of indels
bim['indel'] = 'NA'
for i in range(0, bim[0].size):
bim['indel'][i] = int((bim[4][i]=='D') | (bim[4][i]=='I') | (bim[5][i]=='D') | (bim[5][i]=='I') | (len(bim[4][i])>1) | (len(bim[5][i])>1))
#updating new rsIDs of bim
bim['new'] = [0]*len(bim[0])
bim[0]=bim['chr'] # convert back to original chr values
for i,snp in enumerate(bim['position']):
if (args.format == 'T'):
if bim['indel'][i] == 0: #not indel
bim['new'][i] = bed_dict.get(snp, snp) #if not matched, return "chr:bp" as the new ID of the locus
else: #indel renamed as 'chr:pos_indel'
bim['new'][i] = snp + '_indel'
else:
if bim['indel'][i] == 0:
bim['new'][i] = bed_dict.get(snp, bim[1][i])
else: #indel, don't change name
bim['new'][i] = bim[1][i]
out.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (bim[0][i],bim['new'][i],str(bim[2][i]),str(int(bim[3][i])),bim[4][i],bim[5][i]))
out.close()