extract_homologs.py

from xml.etree import ElementTree as ET
import itertools
import argparse, os, sys
'''
Extract 1-1 homologs from a homologene XML.
1-1 homolog mappings are generated by selecting pairs 
with reciprocol best bitscores.
'''

# Parse command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', type=str, required=True)
parser.add_argument('-ids', '--tax_ids', type=str, nargs='+', required=True)
parser.add_argument('-o', '--output', required=True)
parser.add_argument('--use_refseq_id', action='store_true')
args = parser.parse_args()
assert len(args.tax_ids) == 2

# Needs lots of memory if XML is big
with open(args.input, 'r') as fp:
    tree = ET.parse(fp)
root_element = tree.getroot()

# Root contains one entry: HG-EntrySet_entries
assert(len(root_element) == 1)
entries = root_element[0]

# Parsing is guided by documentation in 
# ftp://ftp.ncbi.nih.gov/pub/HomoloGene/HomoloGene_Field_Description.txt
# Tax IDs for different species can be found in:
# from ftp://ftp.ncbi.nih.gov/pub/HomoloGene/build68/build_inputs/taxid_taxname
S1_taxid = args.tax_ids[0]
S2_taxid = args.tax_ids[1]
n_pairs = 0
err = 0
with open(args.output, 'w') as of:

    # each entry is one group of homologous genes
    # there are multiple species present and there
    # can be multiple genes for a give species in the group
    for entry in entries:

        # set of Gene objects 
        genes = entry.find('HG-Entry_genes')
        # set of Stats objects
        stats = entry.find('HG-Entry_distances')

        # what we want is the following.  if there is a group in which both
        # Sp and Sc genes are present, choose the Sp-Sc pair which is reciprocal-best
        # in BLAST score.  This is stored in the Stats_recip-best field
        # in the stats element

        S1_genes = []
        S2_genes = []

        for gene in genes:

            # Contains information about each gene in a HomoloGene group.  Contains
            # the following:
            # - Gene_geneid: GeneID (from EntrezGene) identifier
            # - Gene_symbol: "best" symbol for gene (official if available)
            # - Gene_title: name of the gene
            # - Gene_taxid: taxonomic id for the gene
            # - Gene_prot-gi: gi of protein used for calculations
            # - Gene_prot-acc: accession of protein used for calculations
            # - Gene_prot-len: length of protein used for calculations
            # - Gene_nuc-gi: gi of corresponding mRNA to protein used in
            #   calculations (if available)
            # - Gene_nuc-acc: accession of corresponding mRNA to proteinused in
            #   calculations (if available)
            # - Gene_domains: set of Domain objects

            taxid = gene.find('HG-Gene_taxid').text
            if taxid == S1_taxid:
                S1_genes.append(gene)
            elif taxid == S2_taxid:
                S2_genes.append(gene)
        if (S1_genes and S2_genes):

            # create a structure that allows us to quickly look up stats for a gene
            # pair regardless of ordering (XML representation is inflexible here)
            stats_array = {frozenset([stat.find('HG-Stats_gi1').text, 
                            stat.find('HG-Stats_gi2').text]): stat for stat in stats}

            for S1, S2 in itertools.product(S1_genes, S2_genes):
                
                gene_ids = frozenset([S1.find('HG-Gene_prot-gi').text,
                                              S2.find('HG-Gene_prot-gi').text])
                stat = stats_array.get(gene_ids)

                if stat is not None:
                    if stat.find('HG-Stats_recip-best').attrib['value'] == 'true':
                        if args.use_refseq_id:
                            of.write('{}\t{}\n'.format(
                            S1.find('HG-Gene_prot-acc').text,
                            S2.find('HG-Gene_prot-acc').text))
                            n_pairs += 1
                        else:
                            of.write('{}\t{}\n'.format(                    
                            S1.find('HG-Gene_locus-tag').text,
                            S2.find('HG-Gene_locus-tag').text))
                            n_pairs += 1

                # stats object:
                # Contains different pairwise statistics between the proteins in a
                # HomoloGene group.  Contains the following:

                # - Stats_gi1: gi of the first protein in the pair
                # - Stats_gi2: gi of the second protein in the pair
                # - Stats_nuc-change: ratio of nucleotide differences between the pair
                # - Stats_nuc-change-jc: ratio of nucleotide differences between the
                #   pair (corrected for back substitions through Jukes and Cantor
                #   formula)
                # - Stats_prot-change: ratio of amino acid differences between the pair
                # - Stats_ka: Ka for the pair (ratio of non-synonymous differences per
                #   non-synonymous site)
                # - Stats_ks: Ks for the pair (ratio of synonymous differences per
                #   synonymous site)
                # - Stats_knr: Knr for the pair (ratio of radical non-synonymous
                #   differences per radical non-synonymous site)
                # - Stats_knc: Knc for the pair (ratio of conservative non-synonymous
                #   differences per conservative non-synonymous site)
                # - Stats_recip-best: is this pair reciprocal best using bitscore as
                #   the measurement