Skip to content

Commit

Permalink
update Biojava 1.9 to handle new FT lines .dat format introduced in U…
Browse files Browse the repository at this point in the history
…niprot release 2019_12
  • Loading branch information
pgdurand committed Mar 17, 2022
1 parent 7b6c6c4 commit d94be2c
Show file tree
Hide file tree
Showing 9 changed files with 3,591 additions and 3 deletions.
Binary file added jar/biojava-1.9.6b.jar
Binary file not shown.
10 changes: 8 additions & 2 deletions src/bzh/plealog/dbmirror/reader/DBUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -820,7 +820,7 @@ private static void analyseEmblIDLine(String idLine, IBankSequenceInfo si) {
/**
* Returns a SequenceInfo object from a Uniprot sequence.
*/
private static IBankSequenceInfo returnUPSeqInfo(Sequence seq, boolean isEmbl) {
public static IBankSequenceInfo returnUPSeqInfo(Sequence seq, boolean isEmbl) {
IBankSequenceInfo si;
String value;
Annotation annotItem;
Expand Down Expand Up @@ -1075,6 +1075,12 @@ public static void retrieveUPFeatures(FeatureTable ft, Sequence seq,
iter = seq.features();
while (iter.hasNext()) {
feat = (Feature) iter.next();
//Fuzzy Location from Biojava: not handled by BeeDeeM
if ((feat.getLocation().getMin()<0 ||
feat.getLocation().getMin()==Integer.MAX_VALUE ||
feat.getLocation().getMax()<0 ||
feat.getLocation().getMax()==Integer.MAX_VALUE))
continue;
strLoc = gff.formatLocation(feat);
fLoc = analyseFeatureLocation(strLoc, id != null ? id : "seqId", -adjust);
if (fLoc == null)
Expand Down Expand Up @@ -1139,7 +1145,7 @@ private static String cutSequence(String seq, int from, int to) {
*
* @return a feature table.
*/
private static FeatureTable returnUPFeatureTable(Sequence seq, String id,
public static FeatureTable returnUPFeatureTable(Sequence seq, String id,
int start, int stop, boolean remap) {

FeatureTable ft;
Expand Down
3 changes: 2 additions & 1 deletion src/test/unit/AllTests.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@
PAnnotateBlastResultTest.class,
CmdLineQueryTest.class,
PFTPLoaderTest.class,
BankJsonTest.class
BankJsonTest.class,
UniprotFormatTest.class
})
public class AllTests {

Expand Down
306 changes: 306 additions & 0 deletions src/test/unit/UniprotFormatTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,306 @@
/* Copyright (C) 2019-2022 Patrick G. Durand
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* You may obtain a copy of the License at
*
* https://www.gnu.org/licenses/agpl-3.0.txt
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*/
package test.unit;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.text.MessageFormat;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Iterator;

import org.biojava.bio.seq.Feature;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.SequenceIterator;
import org.biojava.bio.seq.io.SeqIOTools;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

import bzh.plealog.bioinfo.api.data.feature.FeatureTable;
import bzh.plealog.bioinfo.data.sequence.IBankSequenceInfo;
import bzh.plealog.dbmirror.reader.DBUtils;

@SuppressWarnings("deprecation")
public class UniprotFormatTest {
// change in Uniprot data format (Feature Table) introduce in release 2019_12
// see https://www.uniprot.org/news/2019/12/18/release#text%5Fft
// We use an update release of Biojava legacy 1.9 from
// https://github.com/pgdurand/biojava-legacy.git

private static String up_file_new = "019R_FRG3G_nf.dat";
private static int expect_feat_new = 12;

@SuppressWarnings("unused")
private static String up_file_old = "019R_FRG3G_of.dat";
@SuppressWarnings("unused")
private static int expect_feat_old = 8;

private static String current_file = up_file_new;
private static int current_feats = expect_feat_new;

private static final boolean TEST_ENTIRE_SWISS = false;
private static String SWISS_PATH =
"/tmp/biobanks/p/Uniprot_SwissProt/current/Uniprot_SwissProt/uniprot_sprot.dat";
private static final boolean DISPLAY_ID = true;

private static HashSet<String> featBiojavaLocRef;

@BeforeClass
public static void setUpBeforeClass() throws Exception {
UtilsTest.configureApp();
featBiojavaLocRef = new HashSet<String>();
featBiojavaLocRef.add("chain-1-129-/note=\"toto\" /evidence=\"titi\"");
featBiojavaLocRef.add("mod_res-1-1-/IsoformId=\"P68250-2\" /note=\"toto\" /evidence=\"titi\"");
featBiojavaLocRef.add("chain-2 147 483 647-245-/note=\"toto\" /evidence=\"titi\"");
featBiojavaLocRef.add("chain-31-137-/note=\"toto\" /evidence=\"titi\"");
featBiojavaLocRef.add("chain-1-55-/note=\"toto\" /evidence=\"titi\"");
featBiojavaLocRef.add("transit-1--2 147 483 648-/note=\"toto\" /evidence=\"titi\"");
featBiojavaLocRef.add("chain-69-255-/note=\"toto\" /evidence=\"titi\"");
featBiojavaLocRef.add("chain-1-24-/note=\"toto\" /evidence=\"titi\"");
featBiojavaLocRef.add("signal-1--2 147 483 648-/note=\"toto\" /evidence=\"titi\"");
}

@AfterClass
public static void tearDownAfterClass() throws Exception {
}

@Before
public void setUp() throws Exception {
}

@After
public void tearDown() throws Exception {
}
public static final MessageFormat FEATURE_FORMAT = new MessageFormat("{0}-{1}-{2}-{3}");
private String featureToString(Feature feat) {
String f = FEATURE_FORMAT.format(new Object[] {
feat.getType().toLowerCase(),
feat.getLocation().getMin(),
feat.getLocation().getMax(),
feat.getAnnotation().getProperty("swissprot.featureattribute").toString().trim()});
//System.out.println(" " + f);
return f;
}
// Method used to encode BeeDeeM Feature properties in order to compare them with BioJava ones
private String getDescription(bzh.plealog.bioinfo.api.data.feature.Feature feat) {
Enumeration<bzh.plealog.bioinfo.api.data.feature.Qualifier> enumQ = feat.enumQualifiers();
while(enumQ.hasMoreElements()) {
bzh.plealog.bioinfo.api.data.feature.Qualifier qual = enumQ.nextElement();
if (qual.getName().equalsIgnoreCase(feat.getKey())) {
return qual.getValue();
}
}
return "?";
}
//Method used to encode BioJava Feature properties in order to compare them with BeeDeeM ones
private String featureToString(bzh.plealog.bioinfo.api.data.feature.Feature feat) {
String f = FEATURE_FORMAT.format(new Object[] {
feat.getKey(),
feat.getFrom(),
feat.getTo(),
getDescription(feat)});
//System.out.println(" " + f);
return f;
}

@SuppressWarnings("rawtypes" )
private void testSequence(Sequence seq, int curFeats) {
IBankSequenceInfo si;
FeatureTable ft;
HashSet<String> featBiojava, featBeeDeeM;
assertNotNull(seq);
if (DISPLAY_ID) {
System.out.println("> " + seq.getName());
}
Iterator iterF = seq.features();
featBiojava = new HashSet<String>();
while (iterF.hasNext()) {
Feature feat = (Feature) iterF.next();
if (
!(feat.getLocation().getMin()<0 ||
feat.getLocation().getMin()==Integer.MAX_VALUE ||
feat.getLocation().getMax()<0 ||
feat.getLocation().getMax()==Integer.MAX_VALUE)
) {
//BeeDeeM does not handle fuzzy locations
featBiojava.add(featureToString(feat));
}
}
if (curFeats!=-1) {
assertEquals(featBiojava.size(), curFeats);
}
// get sequence info
si = DBUtils.returnUPSeqInfo(seq, false);
// get feature table; BeeDeeM always add source and protein features w/t what Biojava does provide
// (source contains taxon info, protein contains protein description and db_xrefs)
ft = DBUtils.returnUPFeatureTable(seq, si.getId(), 0, 0, false);

assertEquals(featBiojava.size(), ft.features()-2);
Enumeration<bzh.plealog.bioinfo.api.data.feature.Feature> enumF = ft.enumFeatures();
featBeeDeeM = new HashSet<String>();
while(enumF.hasMoreElements()) {
bzh.plealog.bioinfo.api.data.feature.Feature feat = enumF.nextElement();
featBeeDeeM.add(featureToString(feat));
}
for(String f : featBeeDeeM) {
if (featBiojava.contains(f)) {
featBiojava.remove(f);
}
}
assertEquals(featBiojava.size(), 0);
}

@Test
public void testUniprotFormatChange() {
System.out.println(">> testUniprotFormatChange");
// Uniprot format manual: https://web.expasy.org/docs/userman.html
// Test Uniprot data format, i.e. Feature Table encoded as:
//FT DOMAIN 128 409 Protein kinase.
//FT NP_BIND 134 142 ATP (By similarity).
// From Uniprot release 76 and above, Features are encoded as:
//FT CHAIN 1..851
//FT /note="Putative serine/threonine-protein kinase 019R"
//FT /id="PRO_0000410576"

String file = UtilsTest.getTestFilePath("databank", "uniprot", current_file);
try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
Sequence seq;


// read the Uniprot File
SequenceIterator iter = SeqIOTools.readSwissprot(reader);
assertTrue(iter.hasNext());
seq = iter.nextSequence();
testSequence(seq, current_feats);
} catch (Exception ex) {
System.err.println("unable to read UniProt file: " + file + ": " + ex);
}
}

@Test
public void testLargeFile() {
System.out.println(">> testLargeFile");

String file = UtilsTest.getTestFilePath("databank", "uniprot", "uniprot_new.dat");
try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
Sequence seq;
// read the Uniprot File
SequenceIterator iter = SeqIOTools.readSwissprot(reader);
while(iter.hasNext()) {
seq = iter.nextSequence();
testSequence(seq, -1);
}
} catch (Exception ex) {
System.err.println("unable to read UniProt file: " + file + ": " + ex);
}
}

@Test
public void testUniqueFile() {
//specific test to debug entire SwissProt
//Use this URL to get specifi entry:
// https://www.uniprot.org/uniprot/C0HLM8.txt
System.out.println(">> testUniqueFile");
String file = UtilsTest.getTestFilePath("databank", "uniprot", "single-up-test.dat");
try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
Sequence seq;
// read the Uniprot File
SequenceIterator iter = SeqIOTools.readSwissprot(reader);
while(iter.hasNext()) {
seq = iter.nextSequence();
testSequence(seq, -1);
}
} catch (Exception ex) {
System.err.println("unable to read UniProt file: " + file + ": " + ex);
}
}

@SuppressWarnings("rawtypes")
@Test
public void testAllLocationTypes() {
//specific test to check all types of Uniprot FT locations
System.out.println(">> testUniqueFile");
String file = UtilsTest.getTestFilePath("databank", "uniprot", "locations-up-test.dat");
try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
HashSet<String> featBiojava;
Sequence seq;
// read the Uniprot File
SequenceIterator iter = SeqIOTools.readSwissprot(reader);
while(iter.hasNext()) {
seq = iter.nextSequence();
/**/

assertNotNull(seq);
if (DISPLAY_ID) {
System.out.println("> " + seq.getName());
}
Iterator iterF = seq.features();
featBiojava = new HashSet<String>();
while (iterF.hasNext()) {
Feature feat = (Feature) iterF.next();
//BeeDeeM does not handle fuzzy locations
featBiojava.add(featureToString(feat));
}
assertEquals(featBiojava.size(), 9);
for (String str : featBiojavaLocRef) {
featBiojava.remove(str);
}
assertEquals(featBiojava.size(), 0);
}
} catch (Exception ex) {
System.err.println("unable to read UniProt file: " + file + ": " + ex);
}
}

@Test
public void testEntireSwissProt() {
if (!TEST_ENTIRE_SWISS)
return;
System.out.println(">> testEntireSwissProt");

//not really a bad error
if (new File(SWISS_PATH).exists() == false) {
System.err.println("File not found: " + SWISS_PATH);
return;
}
int counter=0;
try (BufferedReader reader = new BufferedReader(new FileReader(SWISS_PATH))) {
Sequence seq;
// read the Uniprot File
SequenceIterator iter = SeqIOTools.readSwissprot(reader);
while(iter.hasNext()) {
seq = iter.nextSequence();
testSequence(seq, -1);
counter++;
if (counter % 10000 == 0) {
System.out.println("Sequences reviewed: " + counter);
}
}
} catch (Exception ex) {
System.err.println("unable to read UniProt file: " + SWISS_PATH + ": " + ex);
}
System.out.println("Total Sequences: " + counter);
}
}
Loading

0 comments on commit d94be2c

Please sign in to comment.