Skip to content

Commit

Permalink
Merge pull request #8 from abretaud/typedefs
Browse files Browse the repository at this point in the history
Properly load orphan typedefs
  • Loading branch information
abretaud authored Nov 23, 2016
2 parents 9d78061 + fa6cb99 commit c3a78ae
Show file tree
Hide file tree
Showing 3 changed files with 159 additions and 0 deletions.
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,6 @@ ADD load.conf.tt2 /opt/load.conf.tt2
ADD cvtermpath_fix.sql /opt/cvtermpath_fix.sql
ADD update_urls.sql /opt/update_urls.sql
ADD fix_relationshiptype_lc.diff /opt/fix_relationshiptype_lc.diff
ADD obo_extract_typedefs.py /opt/obo_extract_typedefs.py

ADD build.sh /docker-entrypoint-initdb.d/build.sh
16 changes: 16 additions & 0 deletions build.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
#!/bin/bash
set -ex

function fix_typedefs () {
    # Extract the typedefs of /build/<name>.obo that no term references,
    # convert them to Chado XML and store them in the local PostgreSQL database.
    #   $1: base name of the ontology file in /build (without the .obo extension)
    # The argument is kept in a quoted local so that a name containing spaces
    # or glob characters is not split or expanded by the shell.
    local name="$1"
    python /opt/obo_extract_typedefs.py "/build/${name}.obo" "/build/${name}_typedefs.obo"
    go2fmt.pl -p obo_text -w xml "/build/${name}_typedefs.obo" | go-apply-xslt oboxml_to_chadoxml - > "/build/${name}_typedefs.xml"
    stag-storenode.pl -d 'dbi:Pg:dbname=postgres;host=localhost;port=5432' --user postgres --password postgres "/build/${name}_typedefs.xml"
}

wget --quiet "https://github.com/GMOD/Chado/archive/${BRANCH}.tar.gz"

# Download ontologies
Expand Down Expand Up @@ -34,6 +41,15 @@ gmod_load_cvterms.pl -s SOFP load/etc/feature_property.obo
gmod_load_cvterms.pl -s PO /build/po.obo
gmod_load_cvterms.pl -s TAXRANK /build/taxrank.obo

# gmod_load_cvterms.pl only loads the typedefs of an OBO file that are actually used by a term.
# The following lines load the typedefs that were skipped.
ln -s "/build/Chado-${BRANCH}/chado/load/etc/feature_property.obo" /build/feature_property.obo
fix_typedefs so
fix_typedefs go
fix_typedefs feature_property
fix_typedefs po
fix_typedefs taxrank

# Populate cvtermpath table
psql -h localhost -p 5432 -U postgres < /opt/cvtermpath_fix.sql
echo "select * from fill_cvtermpath('sequence');" | psql -h localhost -p 5432 -U postgres
Expand Down
142 changes: 142 additions & 0 deletions obo_extract_typedefs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
A parser for the OBO v1.2 format
Inspired from https://techoverflow.net/blog/2013/11/18/a-geneontology-obo-v1.2-parser-in-python/
"""

from __future__ import with_statement, print_function
from collections import OrderedDict

__author__ = "Uli Koehler, Anthony Bretaudeau"
__copyright__ = "Copyright 2013 Uli Koehler, 2016 Anthony Bretaudeau"
__license__ = "Apache v2.0"

def processTerm(term):
    """
    Collapse single-element lists of a parsed stanza to their only member.

    The mapping is modified in place and the very same object is returned,
    so the key order of an OrderedDict input is preserved.

    Keyword arguments:
        term: A mapping of tag name -> list of values for one stanza

    Returns the input mapping, where every value holding exactly one
    element has been replaced by that element. Values with zero or
    several elements are left untouched.
    """
    # Iterate over a snapshot of the items: dict.iteritems() only exists on
    # Python 2, and the snapshot lets us reassign values while looping on
    # both Python 2 and Python 3.
    for key, value in list(term.items()):
        if len(value) == 1:
            term[key] = value[0]
    return term

def parseHeader(filename):
    """
    Reads the header of a file in OBO v1.2 format.

    The header is made of every non-empty line found before the first
    stanza line, i.e. the first line of the form "[...]" such as
    [Term], [Typedef] or [Instance].

    Keyword arguments:
        filename: The filename to read

    Returns the header as a single string: each header line is stripped
    of its surrounding whitespace and terminated by a newline. The result
    is an empty string when the file starts with a stanza or is empty.
    """
    headerLines = []
    with open(filename, "r") as infile:
        for line in infile:
            line = line.strip()
            if not line:
                continue  # Skip empty
            if line.startswith("[") and line.endswith("]"):
                # First stanza reached: the header is complete
                break
            headerLines.append(line + "\n")
    # Join once instead of growing a string line after line
    return "".join(headerLines)

def parseOBOTerms(filename):
    """
    Parses the [Term] stanzas of a file in OBO v1.2 format.

    Every "tag: value" line of a [Term] stanza is collected under its tag;
    the header and all other stanzas ([Typedef], [Instance], ...) are
    ignored. Stanzas without any tag line are not yielded.

    Keyword arguments:
        filename: The filename to read

    Yields each term as an OrderedDict of tag -> value, as returned by
    processTerm() (a tag maps to a list of values when it occurs several
    times in the stanza).
    """
    with open(filename, "r") as infile:
        currentTerm = None  # None while we are outside of a [Term] stanza
        for line in infile:
            line = line.strip()
            if not line:
                continue  # Skip empty
            if line.startswith("[") and line.endswith("]"):
                # A new stanza begins. Emit the term collected so far
                # whatever the kind of the next stanza is, otherwise a term
                # directly followed by a [Typedef] would be silently lost.
                if currentTerm:
                    yield processTerm(currentTerm)
                currentTerm = OrderedDict() if line == "[Term]" else None
                continue
            # Only process if we're inside a [Term] environment
            if currentTerm is None:
                continue
            key, _, val = line.partition(":")
            currentTerm.setdefault(key, []).append(val.strip())
        # Add last term
        if currentTerm:
            yield processTerm(currentTerm)

def parseOBOTypedefs(filename):
    """
    Parses the [Typedef] stanzas of a file in OBO v1.2 format.

    Every "tag: value" line of a [Typedef] stanza is collected under its
    tag, except 'is_a' lines which are dropped on purpose. The header and
    all other stanzas ([Term], [Instance], ...) are ignored. Stanzas
    without any kept tag line are not yielded.

    Keyword arguments:
        filename: The filename to read

    Yields each typedef as an OrderedDict of tag -> value, as returned by
    processTerm() (a tag maps to a list of values when it occurs several
    times in the stanza).
    """
    with open(filename, "r") as infile:
        currentTypedef = None  # None while we are outside of a [Typedef] stanza
        for line in infile:
            line = line.strip()
            if not line:
                continue  # Skip empty
            if line.startswith("[") and line.endswith("]"):
                # A new stanza begins. Emit the typedef collected so far
                # whatever the kind of the next stanza is, otherwise a
                # typedef directly followed by a [Term] would be silently lost.
                if currentTypedef:
                    yield processTerm(currentTypedef)
                currentTypedef = OrderedDict() if line == "[Typedef]" else None
                continue
            # Only process if we're inside a [Typedef] environment
            if currentTypedef is None:
                continue
            key, _, val = line.partition(":")
            if key == 'is_a':
                # This is not really used and causes problem with SO
                continue
            currentTypedef.setdefault(key, []).append(val.strip())
        # Add last typedef
        if currentTypedef:
            yield processTerm(currentTypedef)

if __name__ == "__main__":
    # Command line entry point: writes to `outfile` the header of `infile`
    # followed by every [Typedef] stanza of `infile` whose id is never used
    # as a relationship type by a [Term] stanza of the same file.
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('infile', help='The input file in OBO v1.2 format.')
    parser.add_argument('outfile', help='The output file in OBO v1.2 format with only Typedef not used in the OBO file.')
    args = parser.parse_args()

    # Iterate over terms to collect the relationship types they use.
    # A set gives constant-time membership tests when typedefs are checked.
    termCounter = 0
    seenRelationships = set()
    for term in parseOBOTerms(args.infile):
        relationships = term.get('relationship', [])
        # processTerm() collapses single-element lists, so a term with only
        # one relationship holds a bare string instead of a list.
        if not isinstance(relationships, list):
            relationships = [relationships]
        for r in relationships:
            parts = r.split()
            if parts:  # Ignore an empty "relationship:" line
                seenRelationships.add(parts[0])
        termCounter += 1
    print("Found %d terms" % termCounter)

    typedefCounter = 0
    unusedCounter = 0
    with open(args.outfile, "w") as outfile:
        header = parseHeader(args.infile)
        print(header, file=outfile)

        for typedef in parseOBOTypedefs(args.infile):
            if typedef['id'] not in seenRelationships:
                # Fall back on the id when the typedef has no name
                print("%s typedef was never seen" % typedef.get('name', typedef['id']))
                print("[Typedef]", file=outfile)
                for k in typedef:
                    values = typedef[k]
                    # A tag occurring several times is stored as a list:
                    # write one "tag: value" line per value rather than
                    # the Python representation of the list.
                    if not isinstance(values, list):
                        values = [values]
                    for v in values:
                        print("%s: %s" % (k, v), file=outfile)
                print("", file=outfile)
                unusedCounter += 1
            typedefCounter += 1
    print("Found %d typedefs, %s unused" % (typedefCounter, unusedCounter))

1 comment on commit c3a78ae

@hexylena
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh dear... didn't mean for you to have to write so much code for this problem...

Remind me to buy you a couple beverages of your choice at next gcc / food / something. Really appreciate you working on this.

Please sign in to comment.