-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathextract_homologs.py
124 lines (107 loc) · 5.45 KB
/
extract_homologs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from xml.etree import ElementTree as ET
import itertools
import argparse, os, sys
'''
Extract 1-1 homologs from a homologene XML.
1-1 homolog mappings are generated by selecting pairs
with reciprocol best bitscores.
'''
# Parse command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', type=str, required=True)
parser.add_argument('-ids', '--tax_ids', type=str, nargs='+', required=True)
parser.add_argument('-o', '--output', required=True)
parser.add_argument('--use_refseq_id', action='store_true')
args = parser.parse_args()
assert len(args.tax_ids) == 2
# Needs lots of memory if XML is big
with open(args.input, 'r') as fp:
tree = ET.parse(fp)
root_element = tree.getroot()
# Root contains one entry: HG-EntrySet_entries
assert(len(root_element) == 1)
entries = root_element[0]
# Parsing is guided by documentation in
# ftp://ftp.ncbi.nih.gov/pub/HomoloGene/HomoloGene_Field_Description.txt
# Tax IDs for different species can be found in:
# from ftp://ftp.ncbi.nih.gov/pub/HomoloGene/build68/build_inputs/taxid_taxname
S1_taxid = args.tax_ids[0]
S2_taxid = args.tax_ids[1]
n_pairs = 0
err = 0
with open(args.output, 'w') as of:
# each entry is one group of homologous genes
# there are multiple species present and there
# can be multiple genes for a give species in the group
for entry in entries:
# set of Gene objects
genes = entry.find('HG-Entry_genes')
# set of Stats objects
stats = entry.find('HG-Entry_distances')
# what we want is the following. if there is a group in which both
# Sp and Sc genes are present, choose the Sp-Sc pair which is reciprocal-best
# in BLAST score. This is stored in the Stats_recip-best field
# in the stats element
S1_genes = []
S2_genes = []
for gene in genes:
# Contains information about each gene in a HomoloGene group. Contains
# the following:
# - Gene_geneid: GeneID (from EntrezGene) identifier
# - Gene_symbol: "best" symbol for gene (official if available)
# - Gene_title: name of the gene
# - Gene_taxid: taxonomic id for the gene
# - Gene_prot-gi: gi of protein used for calculations
# - Gene_prot-acc: accession of protein used for calculations
# - Gene_prot-len: length of protein used for calculations
# - Gene_nuc-gi: gi of corresponding mRNA to protein used in
# calculations (if available)
# - Gene_nuc-acc: accession of corresponding mRNA to proteinused in
# calculations (if available)
# - Gene_domains: set of Domain objects
taxid = gene.find('HG-Gene_taxid').text
if taxid == S1_taxid:
S1_genes.append(gene)
elif taxid == S2_taxid:
S2_genes.append(gene)
if (S1_genes and S2_genes):
# create a structure that allows us to quickly look up stats for a gene
# pair regardless of ordering (XML representation is inflexible here)
stats_array = {frozenset([stat.find('HG-Stats_gi1').text,
stat.find('HG-Stats_gi2').text]): stat for stat in stats}
for S1, S2 in itertools.product(S1_genes, S2_genes):
gene_ids = frozenset([S1.find('HG-Gene_prot-gi').text,
S2.find('HG-Gene_prot-gi').text])
stat = stats_array.get(gene_ids)
if stat is not None:
if stat.find('HG-Stats_recip-best').attrib['value'] == 'true':
if args.use_refseq_id:
of.write('{}\t{}\n'.format(
S1.find('HG-Gene_prot-acc').text,
S2.find('HG-Gene_prot-acc').text))
n_pairs += 1
else:
of.write('{}\t{}\n'.format(
S1.find('HG-Gene_locus-tag').text,
S2.find('HG-Gene_locus-tag').text))
n_pairs += 1
# stats object:
# Contains different pairwise statistics between the proteins in a
# HomoloGene group. Contains the following:
# - Stats_gi1: gi of the first protein in the pair
# - Stats_gi2: gi of the second protein in the pair
# - Stats_nuc-change: ratio of nucleotide differences between the pair
# - Stats_nuc-change-jc: ratio of nucleotide differences between the
# pair (corrected for back substitions through Jukes and Cantor
# formula)
# - Stats_prot-change: ratio of amino acid differences between the pair
# - Stats_ka: Ka for the pair (ratio of non-synonymous differences per
# non-synonymous site)
# - Stats_ks: Ks for the pair (ratio of synonymous differences per
# synonymous site)
# - Stats_knr: Knr for the pair (ratio of radical non-synonymous
# differences per radical non-synonymous site)
# - Stats_knc: Knc for the pair (ratio of conservative non-synonymous
# differences per conservative non-synonymous site)
# - Stats_recip-best: is this pair reciprocal best using bitscore as
# the measurement