From 99548ced8e594cd1c06f2c72167f50d45871b9a7 Mon Sep 17 00:00:00 2001 From: kachulis <39926576+kachulis@users.noreply.github.com> Date: Wed, 8 Jul 2020 12:28:01 -0400 Subject: [PATCH] Add Gff3 Writer (#1486) * Adds a Gff3Writer to write out Gff3 files. * Also adjusts the attributes field to account for the fact that attributes map a key to a list of strings, which are separated by commas. As previously written, the comma-separated list was left as a single string, which could later be processed into a list by the user. This was vulnerable to situations where one of the entries in the list contained an encoded comma (i.e. %2C), so that after decoding the decoded comma could not be differentiated from the list-separating commas. * This required breaking changes to the existing Gff3 APIs, but the changes should be simple for downstream users to adapt. --- .../java/htsjdk/tribble/gff/Gff3BaseData.java | 47 ++-- .../java/htsjdk/tribble/gff/Gff3Codec.java | 174 +++++++++++---- .../htsjdk/tribble/gff/Gff3Constants.java | 13 ++ .../java/htsjdk/tribble/gff/Gff3Feature.java | 10 +- .../htsjdk/tribble/gff/Gff3FeatureImpl.java | 16 +- .../java/htsjdk/tribble/gff/Gff3Writer.java | 175 +++++++++++++++ .../htsjdk/tribble/gff/SequenceRegion.java | 28 ++- .../htsjdk/tribble/gff/Gff3CodecTest.java | 183 ++++++++++++---- .../htsjdk/tribble/gff/Gff3FeatureTest.java | 96 ++++---- .../htsjdk/tribble/gff/Gff3WriterTest.java | 205 ++++++++++++++++++ .../feature_extends_past_circular_region.gff3 | 2 +- .../htsjdk/tribble/gff/url_encoding.gff3 | 2 +- 12 files changed, 785 insertions(+), 166 deletions(-) create mode 100644 src/main/java/htsjdk/tribble/gff/Gff3Constants.java create mode 100644 src/main/java/htsjdk/tribble/gff/Gff3Writer.java create mode 100644 src/test/java/htsjdk/tribble/gff/Gff3WriterTest.java diff --git a/src/main/java/htsjdk/tribble/gff/Gff3BaseData.java b/src/main/java/htsjdk/tribble/gff/Gff3BaseData.java index 600bfa56b0..e484d0da6b 100644 --- 
a/src/main/java/htsjdk/tribble/gff/Gff3BaseData.java +++ b/src/main/java/htsjdk/tribble/gff/Gff3BaseData.java @@ -2,8 +2,10 @@ import htsjdk.tribble.annotation.Strand; +import java.util.ArrayList; import java.util.Collections; import java.util.LinkedHashMap; +import java.util.List; import java.util.Map; public class Gff3BaseData { @@ -18,15 +20,15 @@ public class Gff3BaseData { private final double score; private final Strand strand; private final int phase; - private final Map attributes; + private final Map> attributes; private final String id; private final String name; - private final String alias; + private final List aliases; private final int hashCode; public Gff3BaseData(final String contig, final String source, final String type, final int start, final int end, final Double score, final Strand strand, final int phase, - final Map attributes) { + final Map> attributes) { this.contig = contig; this.source = source; this.type = type; @@ -35,13 +37,24 @@ public Gff3BaseData(final String contig, final String source, final String type, this.score = score; this.phase = phase; this.strand = strand; - this.attributes = Collections.unmodifiableMap(new LinkedHashMap<>(attributes)); - this.id = attributes.get(ID_ATTRIBUTE_KEY); - this.name = attributes.get(NAME_ATTRIBUTE_KEY); - this.alias = attributes.get(ALIAS_ATTRIBUTE_KEY); + this.attributes = copyAttributesSafely(attributes); + this.id = Gff3Codec.extractSingleAttribute(attributes.get(ID_ATTRIBUTE_KEY)); + this.name = Gff3Codec.extractSingleAttribute(attributes.get(NAME_ATTRIBUTE_KEY)); + this.aliases = attributes.getOrDefault(ALIAS_ATTRIBUTE_KEY, Collections.emptyList()); this.hashCode = computeHashCode(); } + private static Map> copyAttributesSafely(final Map> attributes) { + final Map> modifiableDeepMap = new LinkedHashMap<>(); + + for (final Map.Entry> entry : attributes.entrySet()) { + final List unmodifiableDeepList = Collections.unmodifiableList(new ArrayList<>(entry.getValue())); + 
modifiableDeepMap.put(entry.getKey(), unmodifiableDeepList); + } + + return Collections.unmodifiableMap(modifiableDeepMap); + } + @Override public boolean equals(Object other) { if (other == this) { @@ -73,11 +86,7 @@ public boolean equals(Object other) { ret = ret && otherBaseData.getName() != null && otherBaseData.getName().equals(getName()); } - if (getAlias() == null) { - ret = ret && otherBaseData.getAlias() == null; - } else { - ret = ret && otherBaseData.getAlias() != null && otherBaseData.getAlias().equals(getAlias()); - } + ret = ret && otherBaseData.getAliases().equals(getAliases()); return ret; } @@ -105,9 +114,7 @@ private int computeHashCode() { hash = 31 * hash + getName().hashCode(); } - if (getAlias() != null) { - hash = 31 * hash + getAlias().hashCode(); - } + hash = 31 * hash + aliases.hashCode(); return hash; } @@ -144,10 +151,14 @@ public int getPhase() { return phase; } - public Map getAttributes() { + public Map> getAttributes() { return attributes; } + public List getAttribute(final String key) { + return attributes.getOrDefault(key, Collections.emptyList()); + } + public String getId() { return id; } @@ -156,7 +167,7 @@ public String getName() { return name; } - public String getAlias() { - return alias; + public List getAliases() { + return aliases; } } diff --git a/src/main/java/htsjdk/tribble/gff/Gff3Codec.java b/src/main/java/htsjdk/tribble/gff/Gff3Codec.java index 4659e65feb..17a0188177 100644 --- a/src/main/java/htsjdk/tribble/gff/Gff3Codec.java +++ b/src/main/java/htsjdk/tribble/gff/Gff3Codec.java @@ -16,6 +16,7 @@ import htsjdk.tribble.util.ParsingUtils; + import java.io.*; import java.net.URLDecoder; import java.nio.file.Files; @@ -35,10 +36,7 @@ public class Gff3Codec extends AbstractFeatureCodec { - private static final char FIELD_DELIMITER = '\t'; - private static final char ATTRIBUTE_DELIMITER = ';'; - private static final char KEY_VALUE_SEPARATOR = '='; - private static final char VALUE_DELIMITER = ','; + private static final 
int NUM_FIELDS = 9; @@ -52,11 +50,7 @@ public class Gff3Codec extends AbstractFeatureCodec { private static final int GENOMIC_PHASE_INDEX = 7; private static final int EXTRA_FIELDS_INDEX = 8; - private static final String COMMENT_START = "#"; - private static final String DIRECTIVE_START = "##"; - - static final String PARENT_ATTRIBUTE_KEY = "Parent"; private static final String IS_CIRCULAR_ATTRIBUTE_KEY = "Is_circular"; private static final String ARTEMIS_FASTA_MARKER = ">"; @@ -66,7 +60,8 @@ public class Gff3Codec extends AbstractFeatureCodec { private final Map> activeFeaturesWithIDs = new HashMap<>(); private final Map> activeParentIDs = new HashMap<>(); - private final Map sequenceRegionMap = new HashMap<>(); + private final Map sequenceRegionMap = new LinkedHashMap<>(); + private final Map commentsWithLineNumbers = new LinkedHashMap<>(); private final static Log logger = Log.getInstance(Gff3Codec.class); @@ -122,11 +117,12 @@ private Gff3Feature decode(final LineIterator lineIterator, final DecodeDepth de return featuresToFlush.poll(); } - if (line.startsWith(COMMENT_START) && !line.startsWith(DIRECTIVE_START)) { + if (line.startsWith(Gff3Constants.COMMENT_START) && !line.startsWith(Gff3Constants.DIRECTIVE_START)) { + commentsWithLineNumbers.put(currentLine, line.substring(Gff3Constants.COMMENT_START.length())); return featuresToFlush.poll(); } - if (line.startsWith(DIRECTIVE_START)) { + if (line.startsWith(Gff3Constants.DIRECTIVE_START)) { parseDirective(line); return featuresToFlush.poll(); } @@ -137,9 +133,7 @@ private Gff3Feature decode(final LineIterator lineIterator, final DecodeDepth de activeFeatures.add(thisFeature); if (depth == DecodeDepth.DEEP) { //link to parents/children/co-features - final String parentIDAttribute = thisFeature.getAttribute(PARENT_ATTRIBUTE_KEY); - final List parentIDs = parentIDAttribute != null? 
ParsingUtils.split(parentIDAttribute, VALUE_DELIMITER) : new ArrayList<>(); - + final List parentIDs = thisFeature.getAttribute(Gff3Constants.PARENT_ATTRIBUTE_KEY); final String id = thisFeature.getID(); for (final String parentID : parentIDs) { @@ -190,21 +184,24 @@ private Gff3Feature decode(final LineIterator lineIterator, final DecodeDepth de * @return map of keys to values for attributes of this feature * @throws UnsupportedEncodingException */ - static private Map parseAttributes(final String attributesString) throws UnsupportedEncodingException { - final Map attributes = new LinkedHashMap<>(); - final List splitLine = ParsingUtils.split(attributesString,ATTRIBUTE_DELIMITER); + static private Map> parseAttributes(final String attributesString) throws UnsupportedEncodingException { + if (attributesString.equals(Gff3Constants.UNDEFINED_FIELD_VALUE)) { + return Collections.emptyMap(); + } + final Map> attributes = new LinkedHashMap<>(); + final List splitLine = ParsingUtils.split(attributesString,Gff3Constants.ATTRIBUTE_DELIMITER); for(String attribute : splitLine) { - final List key_value = ParsingUtils.split(attribute,KEY_VALUE_SEPARATOR); - if (key_value.size()<2) { - continue; + final List key_value = ParsingUtils.split(attribute,Gff3Constants.KEY_VALUE_SEPARATOR); + if (key_value.size() != 2) { + throw new TribbleException("Attribute string " + attributesString + " is invalid"); } - attributes.put(URLDecoder.decode(key_value.get(0).trim(), "UTF-8"), URLDecoder.decode(key_value.get(1).trim(), "UTF-8")); + attributes.put(URLDecoder.decode(key_value.get(0).trim(), "UTF-8"), decodeAttributeValue(key_value.get(1).trim())); } return attributes; } static private Gff3BaseData parseLine(final String line, final int currentLine) { - final List splitLine = ParsingUtils.split(line, FIELD_DELIMITER); + final List splitLine = ParsingUtils.split(line, Gff3Constants.FIELD_DELIMITER); if (splitLine.size() != NUM_FIELDS) { throw new TribbleException("Found an invalid number 
of columns in the given Gff3 file at line + " + currentLine + " - Given: " + splitLine.size() + " Expected: " + NUM_FIELDS + " : " + line); @@ -216,10 +213,10 @@ static private Gff3BaseData parseLine(final String line, final int currentLine) final String type = URLDecoder.decode(splitLine.get(FEATURE_TYPE_INDEX), "UTF-8"); final int start = Integer.parseInt(splitLine.get(START_LOCATION_INDEX)); final int end = Integer.parseInt(splitLine.get(END_LOCATION_INDEX)); - final double score = splitLine.get(SCORE_INDEX).equals(".") ? -1 : Double.parseDouble(splitLine.get(SCORE_INDEX)); - final int phase = splitLine.get(GENOMIC_PHASE_INDEX).equals(".") ? -1 : Integer.parseInt(splitLine.get(GENOMIC_PHASE_INDEX)); + final double score = splitLine.get(SCORE_INDEX).equals(Gff3Constants.UNDEFINED_FIELD_VALUE) ? -1 : Double.parseDouble(splitLine.get(SCORE_INDEX)); + final int phase = splitLine.get(GENOMIC_PHASE_INDEX).equals(Gff3Constants.UNDEFINED_FIELD_VALUE) ? -1 : Integer.parseInt(splitLine.get(GENOMIC_PHASE_INDEX)); final Strand strand = Strand.decode(splitLine.get(GENOMIC_STRAND_INDEX)); - final Map attributes = parseAttributes(splitLine.get(EXTRA_FIELDS_INDEX)); + final Map> attributes = parseAttributes(splitLine.get(EXTRA_FIELDS_INDEX)); return new Gff3BaseData(contig, source, type, start, end, score, strand, phase, attributes); } catch (final NumberFormatException ex ) { throw new TribbleException("Cannot read integer value for start/end position from line " + currentLine + ". Line is: " + line, ex); @@ -228,6 +225,30 @@ static private Gff3BaseData parseLine(final String line, final int currentLine) } } + /** + * Get list of sequence regions parsed by the codec. + * @return list of sequence regions + */ + public List getSequenceRegions() { + return Collections.unmodifiableList(new ArrayList<>(sequenceRegionMap.values())); + } + + /** + * Gets map from line number to comment found on that line. The text of the comment EXCLUDES the leading # which indicates a comment line. 
+ * @return Map from line number to comment found on line + */ + public Map getCommentsWithLineNumbers() { + return Collections.unmodifiableMap(new LinkedHashMap<>(commentsWithLineNumbers)); + } + + /** + * Gets list of comments parsed by the codec. Excludes leading # which indicates a comment line. + * @return + */ + public List getCommentTexts() { + return Collections.unmodifiableList(new ArrayList<>(commentsWithLineNumbers.values())); + } + /** * If sequence region of feature's contig has been specified with sequence region directive, validates that * feature's coordinates are within the specified sequence region. TribbleException is thrown if invalid. @@ -238,7 +259,7 @@ private void validateFeature(final Gff3Feature feature) { final SequenceRegion region = sequenceRegionMap.get(feature.getContig()); if (feature.getStart() == region.getStart() && feature.getEnd() == region.getEnd()) { //landmark feature - final boolean isCircular = Boolean.parseBoolean(feature.getAttribute(IS_CIRCULAR_ATTRIBUTE_KEY)); + final boolean isCircular = Boolean.parseBoolean(extractSingleAttribute(feature.getAttribute(IS_CIRCULAR_ATTRIBUTE_KEY))); region.setCircular(isCircular); } if (region.isCircular()? 
!region.overlaps(feature) : !region.contains(feature)) { @@ -274,7 +295,7 @@ public boolean canDecode(final String inputFilePath) { if (Gff3Directive.toDirective(line) != Gff3Directive.VERSION3_DIRECTIVE) { return false; } - while (line.startsWith(COMMENT_START)) { + while (line.startsWith(Gff3Constants.COMMENT_START)) { line = br.readLine(); if ( line == null ) { return false; @@ -282,7 +303,7 @@ public boolean canDecode(final String inputFilePath) { } // make sure line conforms to gtf spec - final List fields = ParsingUtils.split(line,FIELD_DELIMITER); + final List fields = ParsingUtils.split(line, Gff3Constants.FIELD_DELIMITER); canDecode &= fields.size() == NUM_FIELDS; @@ -318,13 +339,40 @@ public boolean canDecode(final String inputFilePath) { return canDecode; } + static List decodeAttributeValue(final String attributeValue) { + //split on VALUE_DELIMITER, then decode + final List splitValues = ParsingUtils.split(attributeValue, Gff3Constants.VALUE_DELIMITER); + + final List decodedValues = new ArrayList<>(); + for (final String encodedValue : splitValues) { + try { + decodedValues.add(URLDecoder.decode(encodedValue.trim(), "UTF-8")); + } catch (final UnsupportedEncodingException ex) { + throw new TribbleException("Error decoding attribute " + encodedValue, ex); + } + } + + return decodedValues; + } + + static String extractSingleAttribute(final List values) { + if (values == null || values.isEmpty()) { + return null; + } + + if (values.size() != 1) { + throw new TribbleException("Attribute has multiple values when only one expected"); + } + return values.get(0); + } + @Override public FeatureCodecHeader readHeader(LineIterator lineIterator) { List header = new ArrayList<>(); while(lineIterator.hasNext()) { String line = lineIterator.peek(); - if (line.startsWith(COMMENT_START)) { + if (line.startsWith(Gff3Constants.COMMENT_START)) { header.add(line); lineIterator.next(); } else { @@ -425,29 +473,75 @@ public TabixFormat getTabixFormat() { /** * Enum for 
parsing directive lines. If information in directive line needs to be parsed beyond specifying directive type, decode method should be overriden */ - enum Gff3Directive { + public enum Gff3Directive { - VERSION3_DIRECTIVE("##gff-version\\s+3(?:.\\d)*(?:\\.\\d)*$"), + VERSION3_DIRECTIVE("##gff-version\\s+3(?:\\.\\d*)*$") { + @Override + protected Object decode(final String line) throws IOException { + final String[] splitLine = line.split("\\s+"); + return splitLine[1]; + } + + @Override + String encode(final Object object) { + if (object == null) { + throw new TribbleException("Cannot encode null in VERSION3_DIRECTIVE"); + } + if (!(object instanceof String)) { + throw new TribbleException("Cannot encode object of type " + object.getClass() + " in VERSION3_DIRECTIVE"); + } + + final String versionLine = "##gff-version " + (String)object; + if (!regexPattern.matcher(versionLine).matches()) { + throw new TribbleException("Version " + (String)object + " is not a valid version"); + } + + return versionLine; + } + }, SEQUENCE_REGION_DIRECTIVE("##sequence-region\\s+.+ \\d+ \\d+$") { - private int CONTIG_INDEX = 1; - private int START_INDEX = 2; - private int END_INDEX = 3; + final private int CONTIG_INDEX = 1; + final private int START_INDEX = 2; + final private int END_INDEX = 3; @Override - public Object decode(final String line) throws IOException { + protected Object decode(final String line) throws IOException { final String[] splitLine = line.split("\\s+"); final String contig = URLDecoder.decode(splitLine[CONTIG_INDEX], "UTF-8"); final int start = Integer.parseInt(splitLine[START_INDEX]); final int end = Integer.parseInt(splitLine[END_INDEX]); return new SequenceRegion(contig, start, end); } + + @Override + String encode(final Object object) { + if (object == null) { + throw new TribbleException("Cannot encode null in SEQUENCE_REGION_DIRECTIVE"); + } + if (!(object instanceof SequenceRegion)) { + throw new TribbleException("Cannot encode object of type " + 
object.getClass() + " in SEQUENCE_REGION_DIRECTIVE"); + } + + final SequenceRegion sequenceRegion = (SequenceRegion) object; + return "##sequence-region " + Gff3Writer.encodeString(sequenceRegion.getContig()) + " " + sequenceRegion.getStart() + " " + sequenceRegion.getEnd(); + } }, - FLUSH_DIRECTIVE("###$"), + FLUSH_DIRECTIVE("###$") { + @Override + String encode(final Object object) { + return "###"; + } + }, - FASTA_DIRECTIVE("##FASTA$"); + FASTA_DIRECTIVE("##FASTA$") { + @Override + String encode(final Object object) { + return "##FASTA"; + } + }; - private final Pattern regexPattern; + protected final Pattern regexPattern; Gff3Directive(String regex) { this.regexPattern = Pattern.compile(regex); @@ -462,9 +556,11 @@ public static Gff3Directive toDirective(final String line) { return null; } - public Object decode(final String line) throws IOException { + protected Object decode(final String line) throws IOException { return null; } + + abstract String encode(final Object object); } } diff --git a/src/main/java/htsjdk/tribble/gff/Gff3Constants.java b/src/main/java/htsjdk/tribble/gff/Gff3Constants.java new file mode 100644 index 0000000000..6d399c1598 --- /dev/null +++ b/src/main/java/htsjdk/tribble/gff/Gff3Constants.java @@ -0,0 +1,13 @@ +package htsjdk.tribble.gff; + +public class Gff3Constants { + public static final char FIELD_DELIMITER = '\t'; + public static final char ATTRIBUTE_DELIMITER = ';'; + public static final char KEY_VALUE_SEPARATOR = '='; + public static final char VALUE_DELIMITER = ','; + public static final String COMMENT_START = "#"; + public static final String DIRECTIVE_START = "##"; + public static final String UNDEFINED_FIELD_VALUE = "."; + public static final String PARENT_ATTRIBUTE_KEY = "Parent"; + public final static char END_OF_LINE_CHARACTER = '\n'; +} diff --git a/src/main/java/htsjdk/tribble/gff/Gff3Feature.java b/src/main/java/htsjdk/tribble/gff/Gff3Feature.java index 78f1ef8107..53ac1ea92e 100644 --- 
a/src/main/java/htsjdk/tribble/gff/Gff3Feature.java +++ b/src/main/java/htsjdk/tribble/gff/Gff3Feature.java @@ -3,6 +3,7 @@ import htsjdk.tribble.Feature; import htsjdk.tribble.annotation.Strand; +import java.util.List; import java.util.Map; import java.util.Set; @@ -50,17 +51,18 @@ default int getStart() { return getBaseData().getStart(); } - default String getAttribute(final String key) { - return getBaseData().getAttributes().get(key); + + default List getAttribute(final String key) { + return getBaseData().getAttribute(key); } - default Map getAttributes() { return getBaseData().getAttributes();} + default Map> getAttributes() { return getBaseData().getAttributes();} default String getID() { return getBaseData().getId();} default String getName() { return getBaseData().getName();} - default String getAlias() { return getBaseData().getAlias();} + default List getAliases() { return getBaseData().getAliases();} default double getScore() { return getBaseData().getScore();} diff --git a/src/main/java/htsjdk/tribble/gff/Gff3FeatureImpl.java b/src/main/java/htsjdk/tribble/gff/Gff3FeatureImpl.java index 6c331c32e5..9ac33360fa 100644 --- a/src/main/java/htsjdk/tribble/gff/Gff3FeatureImpl.java +++ b/src/main/java/htsjdk/tribble/gff/Gff3FeatureImpl.java @@ -31,7 +31,7 @@ public class Gff3FeatureImpl implements Gff3Feature { public Gff3FeatureImpl(final String contig, final String source, final String type, final int start, final int end, final Double score, final Strand strand, final int phase, - final Map attributes) { + final Map> attributes) { baseData = new Gff3BaseData(contig, source, type, start, end, score, strand, phase, attributes); } @@ -84,15 +84,15 @@ public Gff3BaseData getBaseData() { public Set getAncestors() { final List ancestors = new ArrayList<>(parents); for (final Gff3FeatureImpl parent : parents) { - ancestors.addAll(baseData.getAttributes().containsKey(DERIVES_FROM_ATTRIBUTE_KEY)? 
parent.getAncestors(baseData.getAttributes().get(DERIVES_FROM_ATTRIBUTE_KEY)) : parent.getAncestors()); + ancestors.addAll(getAttribute(DERIVES_FROM_ATTRIBUTE_KEY).isEmpty()? parent.getAncestors() : parent.getAncestors(new HashSet<>(baseData.getAttributes().get(DERIVES_FROM_ATTRIBUTE_KEY)))); } return new LinkedHashSet<>(ancestors); } - private Set getAncestors(final String derivingFrom) { + private Set getAncestors(final Collection derivingFrom) { final List ancestors = new ArrayList<>(); for (final Gff3FeatureImpl parent : parents) { - if (parent.getID().equals(derivingFrom) || parent.getAncestors().stream().anyMatch(f -> f.getID().equals(derivingFrom))) { + if (derivingFrom.contains(parent.getID()) || parent.getAncestors().stream().anyMatch(f -> derivingFrom.contains(f.getID()))) { ancestors.add(parent); ancestors.addAll(parent.getAncestors()); } @@ -116,8 +116,8 @@ public Set getDescendents() { } private Set getDescendents(final Set idsInLineage) { - final List childrenToAdd = children.stream().filter(c -> c.getAttribute(DERIVES_FROM_ATTRIBUTE_KEY) == null || - idsInLineage.contains(c.getAttribute(DERIVES_FROM_ATTRIBUTE_KEY))). + final List childrenToAdd = children.stream().filter(c -> c.getAttribute(DERIVES_FROM_ATTRIBUTE_KEY).isEmpty() || + !Collections.disjoint(idsInLineage, c.getAttribute(DERIVES_FROM_ATTRIBUTE_KEY))). collect(Collectors.toList()); final List descendants = new ArrayList<>(childrenToAdd); @@ -148,8 +148,8 @@ private Set getDescendents(final Set idsInLineage) { public void addParent(final Gff3FeatureImpl parent) { final Set topLevelFeaturesToAdd = new HashSet<>(parent.getTopLevelFeatures()); - if (baseData.getAttributes().containsKey(DERIVES_FROM_ATTRIBUTE_KEY)) { - topLevelFeaturesToAdd.removeIf(f -> !f.getID().equals(baseData.getAttributes().get(DERIVES_FROM_ATTRIBUTE_KEY)) && f.getDescendents().stream().noneMatch(f2 -> f2.getID()== null? 
false:f2.getID().equals(baseData.getAttributes().get(DERIVES_FROM_ATTRIBUTE_KEY)))); + if (!getAttribute(DERIVES_FROM_ATTRIBUTE_KEY).isEmpty()) { + topLevelFeaturesToAdd.removeIf(f -> !getAttribute(DERIVES_FROM_ATTRIBUTE_KEY).contains(f.getID()) && f.getDescendents().stream().noneMatch(f2 -> f2.getID()!= null && getAttribute(DERIVES_FROM_ATTRIBUTE_KEY).contains(f2.getID()))); } parents.add(parent); parent.addChild(this); diff --git a/src/main/java/htsjdk/tribble/gff/Gff3Writer.java b/src/main/java/htsjdk/tribble/gff/Gff3Writer.java new file mode 100644 index 0000000000..823557cdf0 --- /dev/null +++ b/src/main/java/htsjdk/tribble/gff/Gff3Writer.java @@ -0,0 +1,175 @@ +package htsjdk.tribble.gff; + +import htsjdk.samtools.util.BlockCompressedOutputStream; +import htsjdk.samtools.util.FileExtensions; +import htsjdk.samtools.util.IOUtil; +import htsjdk.tribble.TribbleException; + +import java.io.BufferedOutputStream; +import java.io.Closeable; +import java.io.IOException; +import java.io.OutputStream; +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.function.Consumer; + + +/** + * A class to write out gff3 files. Features are added using {@link #addFeature(Gff3Feature)}, directives using {@link #addDirective(Gff3Codec.Gff3Directive)}, + * and comments using {@link #addComment(String)}. Note that the version 3 directive is automatically added at creation, so should not be added separately. 
+ */ +public class Gff3Writer implements Closeable { + + private final OutputStream out; + private final static String version = "3.1.25"; + + public Gff3Writer(final Path path) throws IOException { + if (FileExtensions.GFF3.stream().noneMatch(e -> path.toString().endsWith(e))) { + throw new TribbleException("File " + path + " does not have extension consistent with gff3"); + } + + final OutputStream outputStream = IOUtil.hasGzipFileExtension(path)? new BlockCompressedOutputStream(path.toFile()) : Files.newOutputStream(path); + out = new BufferedOutputStream(outputStream); + //start with version directive + initialize(); + } + + public Gff3Writer(final OutputStream stream) { + out = stream; + initialize(); + } + + private void initialize() { + try { + writeWithNewLine(Gff3Codec.Gff3Directive.VERSION3_DIRECTIVE.encode(version)); + } catch (final IOException ex) { + throw new TribbleException("Error writing version directive", ex); + } + } + + private void writeWithNewLine(final String txt) throws IOException { + out.write(txt.getBytes()); + out.write(Gff3Constants.END_OF_LINE_CHARACTER); + } + + private void tryToWrite(final String string) { + try { + out.write(string.getBytes()); + } catch (final IOException ex) { + throw new TribbleException("Error writing out string " + string, ex); + } + } + + private void writeFirstEightFields(final Gff3Feature feature) throws IOException { + writeJoinedByDelimiter(Gff3Constants.FIELD_DELIMITER, this::tryToWrite, Arrays.asList( + encodeString(feature.getContig()), + encodeString(feature.getSource()), + encodeString(feature.getType()), + Integer.toString(feature.getStart()), + Integer.toString(feature.getEnd()), + feature.getScore() < 0 ? Gff3Constants.UNDEFINED_FIELD_VALUE : Double.toString(feature.getScore()), + feature.getStrand().toString(), + feature.getPhase() < 0 ? 
Gff3Constants.UNDEFINED_FIELD_VALUE : Integer.toString(feature.getPhase()) + ) + ); + } + + void writeAttributes(final Map> attributes) throws IOException { + if (attributes.isEmpty()) { + out.write(Gff3Constants.UNDEFINED_FIELD_VALUE.getBytes()); + } + + writeJoinedByDelimiter(Gff3Constants.ATTRIBUTE_DELIMITER, e -> writeKeyValuePair(e.getKey(), e.getValue()), attributes.entrySet()); + } + + void writeKeyValuePair(final String key, final List values) { + try { + tryToWrite(key); + out.write(Gff3Constants.KEY_VALUE_SEPARATOR); + writeJoinedByDelimiter(Gff3Constants.VALUE_DELIMITER, v -> tryToWrite(encodeString(v)), values); + } catch (final IOException ex) { + throw new TribbleException("error writing out key value pair " + key + " " + values); + } + } + + private void writeJoinedByDelimiter(final char delimiter, final Consumer consumer, final Collection fields) throws IOException { + boolean isNotFirstField = false; + for (final T field : fields) { + if (isNotFirstField) { + out.write(delimiter); + } else { + isNotFirstField = true; + } + + consumer.accept(field); + } + } + + /*** + * add a feature + * @param feature the feature to be added + * @throws IOException + */ + public void addFeature(final Gff3Feature feature) throws IOException { + writeFirstEightFields(feature); + out.write(Gff3Constants.FIELD_DELIMITER); + writeAttributes(feature.getAttributes()); + out.write(Gff3Constants.END_OF_LINE_CHARACTER); + } + + static String encodeString(final String s) { + try { + //URLEncoder.encode is hardcoded to change all spaces to +, but we want spaces left unchanged so have to do this + //+ is escaped to %2B, so no loss of information + return URLEncoder.encode(s, "UTF-8").replace("+", " "); + } catch (final UnsupportedEncodingException ex) { + throw new TribbleException("Encoding failure", ex); + } + } + + /** + * Add a directive with an object + * @param directive the directive to be added + * @param object the object to be encoded with the directive + * @throws 
IOException + */ + public void addDirective(final Gff3Codec.Gff3Directive directive, final Object object) throws IOException { + if (directive == Gff3Codec.Gff3Directive.VERSION3_DIRECTIVE) { + throw new TribbleException("VERSION3_DIRECTIVE is automatically added and should not be added manually."); + } + writeWithNewLine(directive.encode(object)); + } + + /** + * Add a directive + * @param directive the directive to be added + * @throws IOException + */ + public void addDirective(final Gff3Codec.Gff3Directive directive) throws IOException { + if (directive == Gff3Codec.Gff3Directive.VERSION3_DIRECTIVE) { + throw new TribbleException("VERSION3_DIRECTIVE is automatically added and should not be added manually."); + } + addDirective(directive, null); + } + + /** + * Add comment line + * @param comment the comment line (not including leading #) + * @throws IOException + */ + public void addComment(final String comment) throws IOException { + out.write(Gff3Constants.COMMENT_START.getBytes()); + writeWithNewLine(comment); + } + + @Override + public void close() throws IOException { + out.close(); + } +} \ No newline at end of file diff --git a/src/main/java/htsjdk/tribble/gff/SequenceRegion.java b/src/main/java/htsjdk/tribble/gff/SequenceRegion.java index 42798c433e..13dc4f924e 100644 --- a/src/main/java/htsjdk/tribble/gff/SequenceRegion.java +++ b/src/main/java/htsjdk/tribble/gff/SequenceRegion.java @@ -9,7 +9,8 @@ public class SequenceRegion implements Locatable { private final int start; private final int end; private final String contig; - private boolean isCircular; + private Boolean isCircular; + private int hashCode; SequenceRegion(final String contig, final int start, final int end) { this(contig, start, end, false); @@ -20,10 +21,12 @@ public class SequenceRegion implements Locatable { this.start = start; this.end = end; this.isCircular = isCircular; + hashCode = computeHashCode(); } void setCircular(final boolean isCircular) { this.isCircular = isCircular; + 
hashCode = computeHashCode(); } void setCircular() { @@ -41,9 +44,28 @@ void setCircular() { public boolean isCircular(){return isCircular;} - public boolean equals(final SequenceRegion other) { - return other.start == start && other.end==end && other.contig.equals(contig) && other.isCircular == isCircular; + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (!(other instanceof SequenceRegion)) { + return false; + } + + final SequenceRegion otherSequenceRegion = (SequenceRegion) other; + return otherSequenceRegion.start == start && otherSequenceRegion.end==end && otherSequenceRegion.contig.equals(contig) && otherSequenceRegion.isCircular == isCircular; } + private int computeHashCode() { + int hash = contig.hashCode(); + hash = 31 * hash + start; + hash = 31 * hash + end; + hash = 31 * hash + isCircular.hashCode(); + return hash; + } + @Override + public int hashCode() { return hashCode;} } diff --git a/src/test/java/htsjdk/tribble/gff/Gff3CodecTest.java b/src/test/java/htsjdk/tribble/gff/Gff3CodecTest.java index 14fe9e9794..0d617e8b01 100644 --- a/src/test/java/htsjdk/tribble/gff/Gff3CodecTest.java +++ b/src/test/java/htsjdk/tribble/gff/Gff3CodecTest.java @@ -4,6 +4,7 @@ import htsjdk.HtsjdkTest; import htsjdk.tribble.AbstractFeatureReader; import htsjdk.tribble.TestUtils; +import htsjdk.tribble.TribbleException; import htsjdk.tribble.annotation.Strand; import htsjdk.tribble.readers.LineIterator; import org.testng.Assert; @@ -14,6 +15,7 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.List; @@ -104,7 +106,6 @@ public void testGZipped(final Path inputGff3, final Path inputGff3GZipped) throw final AbstractFeatureReader reader = AbstractFeatureReader.getFeatureReader(inputGff3.toAbsolutePath().toString(), null, new Gff3Codec(), false); final AbstractFeatureReader readerGZipped = 
AbstractFeatureReader.getFeatureReader(inputGff3GZipped.toAbsolutePath().toString(), null, new Gff3Codec(), false); - final Set topLevelFeatures = new HashSet<>(); final Set topLevelFeaturesGZipped = new HashSet<>(); @@ -178,7 +179,7 @@ public void urlDecodingTest() throws IOException { Assert.assertEquals(feature.getSource(), "a source & also a str*)%nge source"); Assert.assertEquals(feature.getType(), "a region"); Assert.assertEquals(feature.getID(), "this is the ID of this wacky feature^&%##$%*&>,. ,."); - Assert.assertEquals(feature.getAttribute("Another key"), "Another=value"); + Assert.assertEquals(feature.getAttribute("Another key"), Arrays.asList("Another=value", "And a second, value")); } @@ -197,113 +198,113 @@ public Object[][] examplesDataProvider() { final Set canonicalGeneFeatures = new HashSet<>(); - final Gff3FeatureImpl canonicalGene_gene00001 = new Gff3FeatureImpl("ctg123", ".", "gene", 1000, 9000, 1030d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "gene00001", "Name", "EDEN")); + final Gff3FeatureImpl canonicalGene_gene00001 = new Gff3FeatureImpl("ctg123", ".", "gene", 1000, 9000, 1030d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene00001"), "Name", Collections.singletonList("EDEN"))); canonicalGeneFeatures.add(canonicalGene_gene00001); - final Gff3FeatureImpl canonicalGene_tfbs00001 = new Gff3FeatureImpl("ctg123", ".", "TF_binding_site", 1000, 1012, 0.999d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "tfbs00001", "Parent", "gene00001")); + final Gff3FeatureImpl canonicalGene_tfbs00001 = new Gff3FeatureImpl("ctg123", ".", "TF_binding_site", 1000, 1012, 0.999d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("tfbs00001"), "Parent", Collections.singletonList("gene00001"))); canonicalGene_tfbs00001.addParent(canonicalGene_gene00001); canonicalGeneFeatures.add(canonicalGene_tfbs00001); - final Gff3FeatureImpl canonicalGene_mRNA00001 = new Gff3FeatureImpl("ctg123", ".", "mRNA", 1050, 9000, 1.37d, 
Strand.POSITIVE, -1, ImmutableMap.of("ID", "mRNA00001", "Name", "EDEN.1", "Parent", "gene00001")); + final Gff3FeatureImpl canonicalGene_mRNA00001 = new Gff3FeatureImpl("ctg123", ".", "mRNA", 1050, 9000, 1.37d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("mRNA00001"), "Name", Collections.singletonList("EDEN.1"), "Parent", Collections.singletonList("gene00001"))); canonicalGene_mRNA00001.addParent(canonicalGene_gene00001); canonicalGeneFeatures.add(canonicalGene_mRNA00001); - final Gff3FeatureImpl canonicalGene_mRNA00002 = new Gff3FeatureImpl("ctg123", ".", "mRNA", 1050, 9000, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "mRNA00002", "Name", "EDEN.2", "Parent", "gene00001")); + final Gff3FeatureImpl canonicalGene_mRNA00002 = new Gff3FeatureImpl("ctg123", ".", "mRNA", 1050, 9000, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("mRNA00002"), "Name", Collections.singletonList("EDEN.2"), "Parent", Collections.singletonList("gene00001"))); canonicalGene_mRNA00002.addParent(canonicalGene_gene00001); canonicalGeneFeatures.add(canonicalGene_mRNA00002); - final Gff3FeatureImpl canonicalGene_mRNA00003 = new Gff3FeatureImpl("ctg123", ".", "mRNA", 1300, 9000, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "mRNA00003", "Name", "EDEN.3", "Parent", "gene00001")); + final Gff3FeatureImpl canonicalGene_mRNA00003 = new Gff3FeatureImpl("ctg123", ".", "mRNA", 1300, 9000, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("mRNA00003"), "Name", Collections.singletonList("EDEN.3"), "Parent", Collections.singletonList("gene00001"))); canonicalGene_mRNA00003.addParent(canonicalGene_gene00001); canonicalGeneFeatures.add(canonicalGene_mRNA00003); - final Gff3FeatureImpl canonicalGene_exon00001 = new Gff3FeatureImpl("ctg123", ".", "exon", 1300, 1500, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "exon00001", "Parent", "mRNA00003")); + final Gff3FeatureImpl canonicalGene_exon00001 = new Gff3FeatureImpl("ctg123", 
".", "exon", 1300, 1500, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("exon00001"), "Parent", Collections.singletonList("mRNA00003"))); canonicalGene_exon00001.addParent(canonicalGene_mRNA00003); canonicalGeneFeatures.add(canonicalGene_exon00001); - final Gff3FeatureImpl canonicalGene_exon00002 = new Gff3FeatureImpl("ctg123", ".", "exon", 1050, 1500, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "exon00002", "Parent", "mRNA00001,mRNA00002")); + final Gff3FeatureImpl canonicalGene_exon00002 = new Gff3FeatureImpl("ctg123", ".", "exon", 1050, 1500, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("exon00002"), "Parent", Arrays.asList("mRNA00001", "mRNA00002"))); canonicalGene_exon00002.addParent(canonicalGene_mRNA00001); canonicalGene_exon00002.addParent(canonicalGene_mRNA00002); canonicalGeneFeatures.add(canonicalGene_exon00002); - final Gff3FeatureImpl canonicalGene_exon00003 = new Gff3FeatureImpl("ctg123", ".", "exon", 3000, 3902, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "exon00003", "Parent", "mRNA00001,mRNA00003")); + final Gff3FeatureImpl canonicalGene_exon00003 = new Gff3FeatureImpl("ctg123", ".", "exon", 3000, 3902, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("exon00003"), "Parent", Arrays.asList("mRNA00001", "mRNA00003"))); canonicalGene_exon00003.addParent(canonicalGene_mRNA00001); canonicalGene_exon00003.addParent(canonicalGene_mRNA00003); canonicalGeneFeatures.add(canonicalGene_exon00003); - final Gff3FeatureImpl canonicalGene_exon00004 = new Gff3FeatureImpl("ctg123", ".", "exon", 5000, 5500, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "exon00004", "Parent", "mRNA00001,mRNA00002,mRNA00003")); + final Gff3FeatureImpl canonicalGene_exon00004 = new Gff3FeatureImpl("ctg123", ".", "exon", 5000, 5500, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("exon00004"), "Parent", Arrays.asList("mRNA00001", "mRNA00002", "mRNA00003"))); 
canonicalGene_exon00004.addParent(canonicalGene_mRNA00001); canonicalGene_exon00004.addParent(canonicalGene_mRNA00002); canonicalGene_exon00004.addParent(canonicalGene_mRNA00003); canonicalGeneFeatures.add(canonicalGene_exon00004); - final Gff3FeatureImpl canonicalGene_exon00005 = new Gff3FeatureImpl("ctg123", ".", "exon", 7000, 9000, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "exon00005", "Parent", "mRNA00001,mRNA00002,mRNA00003")); + final Gff3FeatureImpl canonicalGene_exon00005 = new Gff3FeatureImpl("ctg123", ".", "exon", 7000, 9000, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("exon00005"), "Parent", Arrays.asList("mRNA00001", "mRNA00002", "mRNA00003"))); canonicalGene_exon00005.addParent(canonicalGene_mRNA00001); canonicalGene_exon00005.addParent(canonicalGene_mRNA00002); canonicalGene_exon00005.addParent(canonicalGene_mRNA00003); canonicalGeneFeatures.add(canonicalGene_exon00005); - final Gff3FeatureImpl canonicalGene_cds00001_1 = new Gff3FeatureImpl("ctg123", ".", "CDS", 1201, 1500, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", "cds00001", "Parent", "mRNA00001", "Name", "edenprotein.1")); + final Gff3FeatureImpl canonicalGene_cds00001_1 = new Gff3FeatureImpl("ctg123", ".", "CDS", 1201, 1500, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds00001"), "Parent", Collections.singletonList("mRNA00001"), "Name", Collections.singletonList("edenprotein.1"))); canonicalGene_cds00001_1.addParent(canonicalGene_mRNA00001); canonicalGeneFeatures.add(canonicalGene_cds00001_1); - final Gff3FeatureImpl canonicalGene_cds00001_2 = new Gff3FeatureImpl("ctg123", ".", "CDS", 3000, 3902, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", "cds00001", "Parent", "mRNA00001", "Name", "edenprotein.1")); + final Gff3FeatureImpl canonicalGene_cds00001_2 = new Gff3FeatureImpl("ctg123", ".", "CDS", 3000, 3902, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds00001"), "Parent", 
Collections.singletonList("mRNA00001"), "Name", Collections.singletonList("edenprotein.1"))); canonicalGene_cds00001_2.addParent(canonicalGene_mRNA00001); canonicalGene_cds00001_2.addCoFeature(canonicalGene_cds00001_1); canonicalGeneFeatures.add(canonicalGene_cds00001_2); - final Gff3FeatureImpl canonicalGene_cds00001_3 = new Gff3FeatureImpl("ctg123", ".", "CDS", 5000, 5500, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", "cds00001", "Parent", "mRNA00001", "Name", "edenprotein.1")); + final Gff3FeatureImpl canonicalGene_cds00001_3 = new Gff3FeatureImpl("ctg123", ".", "CDS", 5000, 5500, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds00001"), "Parent", Collections.singletonList("mRNA00001"), "Name", Collections.singletonList("edenprotein.1"))); canonicalGene_cds00001_3.addParent(canonicalGene_mRNA00001); canonicalGene_cds00001_3.addCoFeature(canonicalGene_cds00001_1); canonicalGene_cds00001_3.addCoFeature(canonicalGene_cds00001_2); canonicalGeneFeatures.add(canonicalGene_cds00001_3); - final Gff3FeatureImpl canonicalGene_cds00001_4 = new Gff3FeatureImpl("ctg123", ".", "CDS", 7000, 7600, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", "cds00001", "Parent", "mRNA00001", "Name", "edenprotein.1")); + final Gff3FeatureImpl canonicalGene_cds00001_4 = new Gff3FeatureImpl("ctg123", ".", "CDS", 7000, 7600, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds00001"), "Parent", Collections.singletonList("mRNA00001"), "Name", Collections.singletonList("edenprotein.1"))); canonicalGene_cds00001_4.addParent(canonicalGene_mRNA00001); canonicalGene_cds00001_4.addCoFeature(canonicalGene_cds00001_1); canonicalGene_cds00001_4.addCoFeature(canonicalGene_cds00001_2); canonicalGene_cds00001_4.addCoFeature(canonicalGene_cds00001_3); canonicalGeneFeatures.add(canonicalGene_cds00001_4); - final Gff3FeatureImpl canonicalGene_cds00002_1 = new Gff3FeatureImpl("ctg123", ".", "CDS", 1201, 1500, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", 
"cds00002", "Parent", "mRNA00002", "Name", "edenprotein.2")); + final Gff3FeatureImpl canonicalGene_cds00002_1 = new Gff3FeatureImpl("ctg123", ".", "CDS", 1201, 1500, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds00002"), "Parent", Collections.singletonList("mRNA00002"), "Name", Collections.singletonList("edenprotein.2"))); canonicalGene_cds00002_1.addParent(canonicalGene_mRNA00002); canonicalGeneFeatures.add(canonicalGene_cds00002_1); - final Gff3FeatureImpl canonicalGene_cds00002_2 = new Gff3FeatureImpl("ctg123", ".", "CDS", 5000, 5500, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", "cds00002", "Parent", "mRNA00002", "Name", "edenprotein.2")); + final Gff3FeatureImpl canonicalGene_cds00002_2 = new Gff3FeatureImpl("ctg123", ".", "CDS", 5000, 5500, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds00002"), "Parent", Collections.singletonList("mRNA00002"), "Name", Collections.singletonList("edenprotein.2"))); canonicalGene_cds00002_2.addParent(canonicalGene_mRNA00002); canonicalGene_cds00002_2.addCoFeature(canonicalGene_cds00002_1); canonicalGeneFeatures.add(canonicalGene_cds00002_2); - final Gff3FeatureImpl canonicalGene_cds00002_3 = new Gff3FeatureImpl("ctg123", ".", "CDS", 7000, 7600, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", "cds00002", "Parent", "mRNA00002", "Name", "edenprotein.2")); + final Gff3FeatureImpl canonicalGene_cds00002_3 = new Gff3FeatureImpl("ctg123", ".", "CDS", 7000, 7600, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds00002"), "Parent", Collections.singletonList("mRNA00002"), "Name", Collections.singletonList("edenprotein.2"))); canonicalGene_cds00002_3.addParent(canonicalGene_mRNA00002); canonicalGene_cds00002_3.addCoFeature(canonicalGene_cds00002_1); canonicalGene_cds00002_3.addCoFeature(canonicalGene_cds00002_2); canonicalGeneFeatures.add(canonicalGene_cds00002_3); - final Gff3FeatureImpl canonicalGene_cds00003_1 = new Gff3FeatureImpl("ctg123", ".", 
"CDS", 3301, 3902, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", "cds00003", "Parent", "mRNA00003", "Name", "edenprotein.3")); + final Gff3FeatureImpl canonicalGene_cds00003_1 = new Gff3FeatureImpl("ctg123", ".", "CDS", 3301, 3902, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds00003"), "Parent", Collections.singletonList("mRNA00003"), "Name", Collections.singletonList("edenprotein.3"))); canonicalGene_cds00003_1.addParent(canonicalGene_mRNA00003); canonicalGeneFeatures.add(canonicalGene_cds00003_1); - final Gff3FeatureImpl canonicalGene_cds00003_2 = new Gff3FeatureImpl("ctg123", ".", "CDS", 5000, 5500, -1d, Strand.POSITIVE, 1, ImmutableMap.of("ID", "cds00003", "Parent", "mRNA00003", "Name", "edenprotein.3")); + final Gff3FeatureImpl canonicalGene_cds00003_2 = new Gff3FeatureImpl("ctg123", ".", "CDS", 5000, 5500, -1d, Strand.POSITIVE, 1, ImmutableMap.of("ID", Collections.singletonList("cds00003"), "Parent", Collections.singletonList("mRNA00003"), "Name", Collections.singletonList("edenprotein.3"))); canonicalGene_cds00003_2.addParent(canonicalGene_mRNA00003); canonicalGene_cds00003_2.addCoFeature(canonicalGene_cds00003_1); canonicalGeneFeatures.add(canonicalGene_cds00003_2); - final Gff3FeatureImpl canonicalGene_cds00003_3 = new Gff3FeatureImpl("ctg123", ".", "CDS", 7000, 7600, -1d, Strand.POSITIVE, 1, ImmutableMap.of("ID", "cds00003", "Parent", "mRNA00003", "Name", "edenprotein.3")); + final Gff3FeatureImpl canonicalGene_cds00003_3 = new Gff3FeatureImpl("ctg123", ".", "CDS", 7000, 7600, -1d, Strand.POSITIVE, 1, ImmutableMap.of("ID", Collections.singletonList("cds00003"), "Parent", Collections.singletonList("mRNA00003"), "Name", Collections.singletonList("edenprotein.3"))); canonicalGene_cds00003_3.addParent(canonicalGene_mRNA00003); canonicalGene_cds00003_3.addCoFeature(canonicalGene_cds00003_1); canonicalGene_cds00003_3.addCoFeature(canonicalGene_cds00003_2); canonicalGeneFeatures.add(canonicalGene_cds00003_3); - final 
Gff3FeatureImpl canonicalGene_cds00004_1 = new Gff3FeatureImpl("ctg123", ".", "CDS", 3391, 3902, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", "cds00004", "Parent", "mRNA00003", "Name", "edenprotein.4")); + final Gff3FeatureImpl canonicalGene_cds00004_1 = new Gff3FeatureImpl("ctg123", ".", "CDS", 3391, 3902, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds00004"), "Parent", Collections.singletonList("mRNA00003"), "Name", Collections.singletonList("edenprotein.4"))); canonicalGene_cds00004_1.addParent(canonicalGene_mRNA00003); canonicalGeneFeatures.add(canonicalGene_cds00004_1); - final Gff3FeatureImpl canonicalGene_cds00004_2 = new Gff3FeatureImpl("ctg123", ".", "CDS", 5000, 5500, -1d, Strand.POSITIVE, 1, ImmutableMap.of("ID", "cds00004", "Parent", "mRNA00003", "Name", "edenprotein.4")); + final Gff3FeatureImpl canonicalGene_cds00004_2 = new Gff3FeatureImpl("ctg123", ".", "CDS", 5000, 5500, -1d, Strand.POSITIVE, 1, ImmutableMap.of("ID", Collections.singletonList("cds00004"), "Parent", Collections.singletonList("mRNA00003"), "Name", Collections.singletonList("edenprotein.4"))); canonicalGene_cds00004_2.addParent(canonicalGene_mRNA00003); canonicalGene_cds00004_2.addCoFeature(canonicalGene_cds00004_1); canonicalGeneFeatures.add(canonicalGene_cds00004_2); - final Gff3FeatureImpl canonicalGene_cds00004_3 = new Gff3FeatureImpl("ctg123", ".", "CDS", 7000, 7600, -1d, Strand.POSITIVE, 1, ImmutableMap.of("ID", "cds00004", "Parent", "mRNA00003", "Name", "edenprotein.4")); + final Gff3FeatureImpl canonicalGene_cds00004_3 = new Gff3FeatureImpl("ctg123", ".", "CDS", 7000, 7600, -1d, Strand.POSITIVE, 1, ImmutableMap.of("ID", Collections.singletonList("cds00004"), "Parent", Collections.singletonList("mRNA00003"), "Name", Collections.singletonList("edenprotein.4"))); canonicalGene_cds00004_3.addParent(canonicalGene_mRNA00003); canonicalGene_cds00004_3.addCoFeature(canonicalGene_cds00004_1); 
canonicalGene_cds00004_3.addCoFeature(canonicalGene_cds00004_2); @@ -317,40 +318,40 @@ public Object[][] examplesDataProvider() { final Set polycisctronicTranscriptFeatures = new HashSet<>(); - final Gff3FeatureImpl polycistronicTranscript_gene01 = new Gff3FeatureImpl("chrX", ".", "gene", 100, 200, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "gene01", "name", "resA")); - final Gff3FeatureImpl polycistronicTranscript_gene02 = new Gff3FeatureImpl("chrX", ".", "gene", 250, 350, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "gene02", "name", "resB")); - final Gff3FeatureImpl polycistronicTranscript_gene03 = new Gff3FeatureImpl("chrX", ".", "gene", 400, 500, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "gene03", "name", "resX")); - final Gff3FeatureImpl polycistronicTranscript_gene04 = new Gff3FeatureImpl("chrX", ".", "gene", 550, 650, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "gene04", "name", "resZ")); + final Gff3FeatureImpl polycistronicTranscript_gene01 = new Gff3FeatureImpl("chrX", ".", "gene", 100, 200, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene01"), "name", Collections.singletonList("resA"))); + final Gff3FeatureImpl polycistronicTranscript_gene02 = new Gff3FeatureImpl("chrX", ".", "gene", 250, 350, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene02"), "name", Collections.singletonList("resB"))); + final Gff3FeatureImpl polycistronicTranscript_gene03 = new Gff3FeatureImpl("chrX", ".", "gene", 400, 500, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene03"), "name", Collections.singletonList("resX"))); + final Gff3FeatureImpl polycistronicTranscript_gene04 = new Gff3FeatureImpl("chrX", ".", "gene", 550, 650, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene04"), "name", Collections.singletonList("resZ"))); polycisctronicTranscriptFeatures.add(polycistronicTranscript_gene01); 
polycisctronicTranscriptFeatures.add(polycistronicTranscript_gene02); polycisctronicTranscriptFeatures.add(polycistronicTranscript_gene03); polycisctronicTranscriptFeatures.add(polycistronicTranscript_gene04); - final Gff3FeatureImpl polycistronicTranscript_mRNA = new Gff3FeatureImpl("chrX", ".", "mRNA", 100, 650, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "tran01", "Parent", "gene01,gene02,gene03,gene04")); + final Gff3FeatureImpl polycistronicTranscript_mRNA = new Gff3FeatureImpl("chrX", ".", "mRNA", 100, 650, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("tran01"), "Parent", Arrays.asList("gene01", "gene02", "gene03", "gene04"))); polycistronicTranscript_mRNA.addParent(polycistronicTranscript_gene01); polycistronicTranscript_mRNA.addParent(polycistronicTranscript_gene02); polycistronicTranscript_mRNA.addParent(polycistronicTranscript_gene03); polycistronicTranscript_mRNA.addParent(polycistronicTranscript_gene04); polycisctronicTranscriptFeatures.add(polycistronicTranscript_mRNA); - final Gff3FeatureImpl polycistronicTranscript_exon = new Gff3FeatureImpl("chrX", ".", "exon", 100, 650, -1d, Strand.POSITIVE, -1, ImmutableMap.of("Parent", "tran01")); + final Gff3FeatureImpl polycistronicTranscript_exon = new Gff3FeatureImpl("chrX", ".", "exon", 100, 650, -1d, Strand.POSITIVE, -1, ImmutableMap.of("Parent", Collections.singletonList("tran01"))); polycistronicTranscript_exon.addParent(polycistronicTranscript_mRNA); polycisctronicTranscriptFeatures.add(polycistronicTranscript_exon); - final Gff3FeatureImpl polycistronicTranscript_CDS1 = new Gff3FeatureImpl("chrX", ".", "CDS", 100, 200, -1d, Strand.POSITIVE, 0, ImmutableMap.of("Parent", "tran01", "Derives_from", "gene01")); + final Gff3FeatureImpl polycistronicTranscript_CDS1 = new Gff3FeatureImpl("chrX", ".", "CDS", 100, 200, -1d, Strand.POSITIVE, 0, ImmutableMap.of("Parent", Collections.singletonList("tran01"), "Derives_from", Collections.singletonList("gene01"))); 
polycistronicTranscript_CDS1.addParent(polycistronicTranscript_mRNA); polycisctronicTranscriptFeatures.add(polycistronicTranscript_CDS1); - final Gff3FeatureImpl polycistronicTranscript_CDS2 = new Gff3FeatureImpl("chrX", ".", "CDS", 250, 350, -1d, Strand.POSITIVE, 0, ImmutableMap.of("Parent", "tran01", "Derives_from", "gene02")); + final Gff3FeatureImpl polycistronicTranscript_CDS2 = new Gff3FeatureImpl("chrX", ".", "CDS", 250, 350, -1d, Strand.POSITIVE, 0, ImmutableMap.of("Parent", Collections.singletonList("tran01"), "Derives_from", Collections.singletonList("gene02"))); polycistronicTranscript_CDS2.addParent(polycistronicTranscript_mRNA); polycisctronicTranscriptFeatures.add(polycistronicTranscript_CDS2); - final Gff3FeatureImpl polycistronicTranscript_CDS3 = new Gff3FeatureImpl("chrX", ".", "CDS", 400, 500, -1d, Strand.POSITIVE, 0, ImmutableMap.of("Parent", "tran01", "Derives_from", "gene03")); + final Gff3FeatureImpl polycistronicTranscript_CDS3 = new Gff3FeatureImpl("chrX", ".", "CDS", 400, 500, -1d, Strand.POSITIVE, 0, ImmutableMap.of("Parent", Collections.singletonList("tran01"), "Derives_from", Collections.singletonList("gene03"))); polycistronicTranscript_CDS3.addParent(polycistronicTranscript_mRNA); polycisctronicTranscriptFeatures.add(polycistronicTranscript_CDS3); - final Gff3FeatureImpl polycistronicTranscript_CDS4 = new Gff3FeatureImpl("chrX", ".", "CDS", 550, 650, -1d, Strand.POSITIVE, 0, ImmutableMap.of("Parent", "tran01", "Derives_from", "gene04")); + final Gff3FeatureImpl polycistronicTranscript_CDS4 = new Gff3FeatureImpl("chrX", ".", "CDS", 550, 650, -1d, Strand.POSITIVE, 0, ImmutableMap.of("Parent", Collections.singletonList("tran01"), "Derives_from", Collections.singletonList("gene04"))); polycistronicTranscript_CDS4.addParent(polycistronicTranscript_mRNA); polycisctronicTranscriptFeatures.add(polycistronicTranscript_CDS4); @@ -362,23 +363,23 @@ public Object[][] examplesDataProvider() { final Set programmedFrameshiftFeatures = new 
HashSet<>(); - final Gff3FeatureImpl programmedFrameshift_gene = new Gff3FeatureImpl("chrX", ".", "gene", 100, 200, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "gene01", "name", "my_gene")); + final Gff3FeatureImpl programmedFrameshift_gene = new Gff3FeatureImpl("chrX", ".", "gene", 100, 200, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene01"), "name", Collections.singletonList("my_gene"))); programmedFrameshiftFeatures.add(programmedFrameshift_gene); - final Gff3FeatureImpl programmedFrameshift_mRNA = new Gff3FeatureImpl("chrX", ".", "mRNA", 100, 200, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "tran01", "Parent", "gene01", "Ontology_term", "SO:1000069")); + final Gff3FeatureImpl programmedFrameshift_mRNA = new Gff3FeatureImpl("chrX", ".", "mRNA", 100, 200, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("tran01"), "Parent", Collections.singletonList("gene01"), "Ontology_term", Collections.singletonList("SO:1000069"))); programmedFrameshift_mRNA.addParent(programmedFrameshift_gene); programmedFrameshiftFeatures.add(programmedFrameshift_mRNA); - final Gff3FeatureImpl programmedFrameshift_exon = new Gff3FeatureImpl("chrX", ".", "exon", 100, 200, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "exon01", "Parent", "tran01")); + final Gff3FeatureImpl programmedFrameshift_exon = new Gff3FeatureImpl("chrX", ".", "exon", 100, 200, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("exon01"), "Parent", Collections.singletonList("tran01"))); programmedFrameshift_exon.addParent(programmedFrameshift_mRNA); programmedFrameshiftFeatures.add(programmedFrameshift_exon); - final Gff3FeatureImpl programmedFrameshift_CDS1_1 = new Gff3FeatureImpl("chrX", ".", "CDS", 100, 150, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", "cds01", "Parent", "tran01")); + final Gff3FeatureImpl programmedFrameshift_CDS1_1 = new Gff3FeatureImpl("chrX", ".", "CDS", 100, 150, -1d, Strand.POSITIVE, 0, 
ImmutableMap.of("ID", Collections.singletonList("cds01"), "Parent",Collections.singletonList( "tran01"))); programmedFrameshift_CDS1_1.addParent(programmedFrameshift_mRNA); programmedFrameshiftFeatures.add(programmedFrameshift_CDS1_1); - final Gff3FeatureImpl programmedFrameshift_CDS1_2 = new Gff3FeatureImpl("chrX", ".", "CDS", 149, 200, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", "cds01", "Parent", "tran01")); + final Gff3FeatureImpl programmedFrameshift_CDS1_2 = new Gff3FeatureImpl("chrX", ".", "CDS", 149, 200, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds01"), "Parent", Collections.singletonList("tran01"))); programmedFrameshift_CDS1_2.addParent(programmedFrameshift_mRNA); programmedFrameshift_CDS1_2.addCoFeature(programmedFrameshift_CDS1_1); programmedFrameshiftFeatures.add(programmedFrameshift_CDS1_2); @@ -391,17 +392,17 @@ public Object[][] examplesDataProvider() { final Set multipleGenesFeatures = new HashSet<>(); - final Gff3FeatureImpl multipleGenes_gene1 = new Gff3FeatureImpl("ctg123", ".", "gene", 1000, 1500, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "gene00001")); + final Gff3FeatureImpl multipleGenes_gene1 = new Gff3FeatureImpl("ctg123", ".", "gene", 1000, 1500, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene00001"))); multipleGenesFeatures.add(multipleGenes_gene1); - final Gff3FeatureImpl multipleGenes_mRNA1 = new Gff3FeatureImpl("ctg123", ".", "mRNA", 1050, 1400, -1d, Strand.POSITIVE, -1, ImmutableMap.of("Parent", "gene00001")); + final Gff3FeatureImpl multipleGenes_mRNA1 = new Gff3FeatureImpl("ctg123", ".", "mRNA", 1050, 1400, -1d, Strand.POSITIVE, -1, ImmutableMap.of("Parent", Collections.singletonList("gene00001"))); multipleGenes_mRNA1.addParent(multipleGenes_gene1); multipleGenesFeatures.add(multipleGenes_mRNA1); - final Gff3FeatureImpl multipleGenes_gene2 = new Gff3FeatureImpl("ctg123", ".", "gene", 2000, 2500, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "gene00002")); 
+ final Gff3FeatureImpl multipleGenes_gene2 = new Gff3FeatureImpl("ctg123", ".", "gene", 2000, 2500, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene00002"))); multipleGenesFeatures.add(multipleGenes_gene2); - final Gff3FeatureImpl multipleGenes_mRNA2 = new Gff3FeatureImpl("ctg123", ".", "mRNA", 2050, 2400, -1d, Strand.POSITIVE, -1, ImmutableMap.of("Parent", "gene00002")); + final Gff3FeatureImpl multipleGenes_mRNA2 = new Gff3FeatureImpl("ctg123", ".", "mRNA", 2050, 2400, -1d, Strand.POSITIVE, -1, ImmutableMap.of("Parent", Collections.singletonList("gene00002"))); multipleGenes_mRNA2.addParent(multipleGenes_gene2); multipleGenesFeatures.add(multipleGenes_mRNA2); @@ -423,4 +424,98 @@ public void examplesTest(final String inputGff, final Set expectedF Assert.assertEquals(observedFeatures, expectedFeatures.size()); } + + @DataProvider(name = "directiveDataProvider") + public Object[][] directiveDataProvider() { + return new Object[][] { + {"##gff-version 3.1.25", Gff3Codec.Gff3Directive.VERSION3_DIRECTIVE, "3.1.25"}, + {"##gff-version 3.7", Gff3Codec.Gff3Directive.VERSION3_DIRECTIVE, "3.7"}, + {"##gff-version 3", Gff3Codec.Gff3Directive.VERSION3_DIRECTIVE, "3"}, + {"##gff-version 3.112.25.4.2", Gff3Codec.Gff3Directive.VERSION3_DIRECTIVE, "3.112.25.4.2"}, + {"##gff-version 2.7", null, null}, + {"##sequence-region chr10 250 277", Gff3Codec.Gff3Directive.SEQUENCE_REGION_DIRECTIVE, new SequenceRegion("chr10", 250, 277)}, + {"###", Gff3Codec.Gff3Directive.FLUSH_DIRECTIVE, null}, + {"####", null, null}, + {"##FASTA", Gff3Codec.Gff3Directive.FASTA_DIRECTIVE, null} + }; + } + + @Test(dataProvider = "directiveDataProvider") + public void directiveTest(final String line, final Gff3Codec.Gff3Directive expectedDirectiveType, final Object expectedDecodedDirective) throws IOException { + final Gff3Codec.Gff3Directive directive = Gff3Codec.Gff3Directive.toDirective(line); + Assert.assertEquals(directive, expectedDirectiveType); + if (directive != null) 
{ + Assert.assertEquals(directive.decode(line), expectedDecodedDirective); + if (expectedDecodedDirective != null) { + Assert.assertEquals(directive.encode(expectedDecodedDirective), line); + } + } + } + + @DataProvider(name = "directiveEncodingDataProvider") + public Object [][] directiveEncodingDataProvider() { + return new Object[][] { + {Gff3Codec.Gff3Directive.VERSION3_DIRECTIVE, "3.1.3", "##gff-version 3.1.3"}, + {Gff3Codec.Gff3Directive.SEQUENCE_REGION_DIRECTIVE, new SequenceRegion("theContig", 101, 170), "##sequence-region theContig 101 170"}, + {Gff3Codec.Gff3Directive.FLUSH_DIRECTIVE, null, "###"}, + {Gff3Codec.Gff3Directive.FASTA_DIRECTIVE, null, "##FASTA"} + }; + } + + @Test(dataProvider = "directiveEncodingDataProvider") + public void directiveEncodingTest(final Gff3Codec.Gff3Directive directive, final Object object, final String expectedEncoding) { + final String encoding = directive.encode(object); + + Assert.assertEquals(encoding, expectedEncoding); + } + + @DataProvider(name = "version3InvalidDirectives") + public Object[][] version3InvalidDirectivesDataProvider() { + return new Object[][] { + {"3.1.a"}, + {"2"}, + {"2.1"}, + {".3.1"} + }; + } + + @Test(dataProvider = "version3InvalidDirectives", expectedExceptions = TribbleException.class) + public void version3InvalidDirectivesTest(final String v3Directive) { + Gff3Codec.Gff3Directive.VERSION3_DIRECTIVE.encode(v3Directive); + } + + @DataProvider(name = "decodeAttributeValueDataProvider") + public Object[][] decodeAttributeValueDataProvider() { + return new Object[][] { + {"value1, value2, value3", Arrays.asList("value1", "value2", "value3")}, + {"value1, value %3B with %3D special %26 encoded %2C characters, value3", Arrays.asList("value1", "value ; with = special & encoded , characters", "value3")} + }; + } + + @Test(dataProvider = "decodeAttributeValueDataProvider") + public void decodeAttributeValueTest(final String attributeValueString, final List expectedAttributeValues) { + final List 
attributeValues = Gff3Codec.decodeAttributeValue(attributeValueString); + + Assert.assertEquals(attributeValues, expectedAttributeValues); + } + + @DataProvider(name = "extractSingleAttributeDataProvider") + public Object[][] extractSingleAttributeDataProvider() { + return new Object[][] { + {null, null, false}, //null returns null + {Collections.emptyList(), null, false}, //empty returns null + {Collections.singletonList("single value"), "single value", false}, //single value returns single value + {Arrays.asList("value1", "value2"), null, true} //multiple values throws exception + }; + } + + @Test(dataProvider = "extractSingleAttributeDataProvider") + public void extractSingleAttributeTest(final List attributes, final String expectedSingleAttribute, final boolean expectException) { + if (expectException) { + Assert.assertThrows(() -> Gff3Codec.extractSingleAttribute(attributes)); + } else { + final String singleAttribute = Gff3Codec.extractSingleAttribute(attributes); + Assert.assertEquals(singleAttribute, expectedSingleAttribute); + } + } } \ No newline at end of file diff --git a/src/test/java/htsjdk/tribble/gff/Gff3FeatureTest.java b/src/test/java/htsjdk/tribble/gff/Gff3FeatureTest.java index 8894c79790..a0d2ad31b8 100644 --- a/src/test/java/htsjdk/tribble/gff/Gff3FeatureTest.java +++ b/src/test/java/htsjdk/tribble/gff/Gff3FeatureTest.java @@ -23,42 +23,42 @@ public class Gff3FeatureTest extends HtsjdkTest { @DataProvider(name = "equalityTestDataProvider") public Object[][] equalityTestDatProvider() { final ArrayList examples = new ArrayList<>(); - examples.add(new Object[] {new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", "gene01")), - new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", "gene01")), true}); - examples.add(new Object[] {new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", "cds01","Parent", 
"gene01")), - new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", "cds01","Parent", "gene01")), true}); + examples.add(new Object[] {new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene01"))), + new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene01"))), true}); + examples.add(new Object[] {new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds01"),"Parent", Collections.singletonList("gene01"))), + new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds01"),"Parent", Collections.singletonList("gene01"))), true}); //two features with same baseData, one with child (or parent) feature, one without - final Gff3FeatureImpl feature1_1 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", "gene01")); - final Gff3FeatureImpl feature2_1 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", "gene01")); - final Gff3FeatureImpl feature3_1 = new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", "cds01","Parent", "gene01")); + final Gff3FeatureImpl feature1_1 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene01"))); + final Gff3FeatureImpl feature2_1 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene01"))); + final Gff3FeatureImpl feature3_1 = new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds01"),"Parent", Collections.singletonList("gene01"))); 
feature3_1.addParent(feature1_1); - final Gff3FeatureImpl feature4_1 = new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", "cds01","Parent", "gene01")); + final Gff3FeatureImpl feature4_1 = new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds01"),"Parent", Collections.singletonList("gene01"))); examples.add(new Object[] {feature1_1, feature2_1, false}); examples.add(new Object[] {feature3_1, feature4_1, false}); //give both genes child feature - final Gff3FeatureImpl feature1_2 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", "gene01")); - final Gff3FeatureImpl feature2_2 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", "gene01")); - final Gff3FeatureImpl feature3_2 = new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, -0, ImmutableMap.of("ID", "cds01","Parent", "gene01")); + final Gff3FeatureImpl feature1_2 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene01"))); + final Gff3FeatureImpl feature2_2 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene01"))); + final Gff3FeatureImpl feature3_2 = new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, -0, ImmutableMap.of("ID", Collections.singletonList("cds01"),"Parent", Collections.singletonList("gene01"))); feature3_2.addParent(feature1_2); - final Gff3FeatureImpl feature4_2 = new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", "cds01","Parent", "gene01")); + final Gff3FeatureImpl feature4_2 = new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds01"),"Parent", 
Collections.singletonList("gene01"))); feature4_2.addParent(feature2_2); examples.add(new Object[] {feature1_2, feature2_2, true}); examples.add(new Object[] {feature3_2, feature4_2, true}); //give one cds a co-feature - final Gff3FeatureImpl feature1_3 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", "gene01")); - final Gff3FeatureImpl feature2_3 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", "gene01")); - final Gff3FeatureImpl feature3_3 = new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", "cds01","Parent", "gene01")); + final Gff3FeatureImpl feature1_3 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene01"))); + final Gff3FeatureImpl feature2_3 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene01"))); + final Gff3FeatureImpl feature3_3 = new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds01"),"Parent", Collections.singletonList("gene01"))); feature3_3.addParent(feature1_3); - final Gff3FeatureImpl feature4_3 = new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", "cds01","Parent", "gene01")); + final Gff3FeatureImpl feature4_3 = new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds01"),"Parent", Collections.singletonList("gene01"))); feature4_3.addParent(feature2_3); - final Gff3FeatureImpl feature5_3 = new Gff3FeatureImpl("chr1", ".", "CDS", 1080, 1150, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", "cds01","Parent", "gene01")); + final Gff3FeatureImpl feature5_3 = new Gff3FeatureImpl("chr1", ".", "CDS", 1080, 1150, -1d, Strand.NEGATIVE, 0, 
ImmutableMap.of("ID", Collections.singletonList("cds01"),"Parent", Collections.singletonList("gene01"))); feature5_3.addParent(feature1_3); - final Gff3FeatureImpl feature6_3 = new Gff3FeatureImpl("chr1", ".", "CDS", 1080, 1150, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", "cds01","Parent", "gene01")); + final Gff3FeatureImpl feature6_3 = new Gff3FeatureImpl("chr1", ".", "CDS", 1080, 1150, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds01"),"Parent", Collections.singletonList("gene01"))); feature3_3.addCoFeature(feature5_3); @@ -67,15 +67,15 @@ public Object[][] equalityTestDatProvider() { examples.add(new Object[] {feature5_3, feature6_3, false}); //give both cds co-features - final Gff3FeatureImpl feature1_4 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", "gene01")); - final Gff3FeatureImpl feature2_4 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", "gene01")); - final Gff3FeatureImpl feature3_4 = new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", "cds01","Parent", "gene01")); + final Gff3FeatureImpl feature1_4 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene01"))); + final Gff3FeatureImpl feature2_4 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene01"))); + final Gff3FeatureImpl feature3_4 = new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds01"),"Parent", Collections.singletonList("gene01"))); feature3_4.addParent(feature1_4); - final Gff3FeatureImpl feature4_4 = new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", "cds01","Parent", "gene01")); + final Gff3FeatureImpl feature4_4 = new 
Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds01"),"Parent", Collections.singletonList("gene01"))); feature4_4.addParent(feature2_4); - final Gff3FeatureImpl feature5_4 = new Gff3FeatureImpl("chr1", ".", "CDS", 1080, 1150, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", "cds01","Parent", "gene01")); + final Gff3FeatureImpl feature5_4 = new Gff3FeatureImpl("chr1", ".", "CDS", 1080, 1150, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds01"),"Parent", Collections.singletonList("gene01"))); feature5_4.addParent(feature1_4); - final Gff3FeatureImpl feature6_4 = new Gff3FeatureImpl("chr1", ".", "CDS", 1080, 1150, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", "cds01","Parent", "gene01")); + final Gff3FeatureImpl feature6_4 = new Gff3FeatureImpl("chr1", ".", "CDS", 1080, 1150, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds01"),"Parent", Collections.singletonList("gene01"))); feature6_4.addParent(feature2_4); feature3_4.addCoFeature(feature5_4); @@ -99,8 +99,8 @@ public void testEquals(final Gff3FeatureImpl feature1, final Gff3FeatureImpl fea @Test public void testChildren() { //test that when a feature has a parent it is added as it's parent's child - final Gff3FeatureImpl feature1 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", "gene01")); - final Gff3FeatureImpl feature2 = new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", "cds01","Parent", "gene01")); + final Gff3FeatureImpl feature1 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene01"))); + final Gff3FeatureImpl feature2 = new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", Collections.singletonList("cds01"),"Parent", 
Collections.singletonList("gene01"))); feature2.addParent(feature1); Assert.assertTrue(feature1.getChildren().contains(feature2)); @@ -110,12 +110,12 @@ public void testChildren() { @Test public void testCofeatures() { //test that when a adding a cofeature it is reciprocated - final Gff3FeatureImpl region = new Gff3FeatureImpl("chr1", ".", "region", 1, 10000, -1d, Strand.NONE, -1, ImmutableMap.of("ID", "region01")); - final Gff3FeatureImpl feature1 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", "gene01", "Parent", "region01")); + final Gff3FeatureImpl region = new Gff3FeatureImpl("chr1", ".", "region", 1, 10000, -1d, Strand.NONE, -1, ImmutableMap.of("ID", Collections.singletonList("region01"))); + final Gff3FeatureImpl feature1 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene01"), "Parent", Collections.singletonList("region01"))); feature1.addParent(region); - final Gff3FeatureImpl feature2 = new Gff3FeatureImpl("chr1", ".", "gene", 1300, 1600, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", "gene01", "Parent", "region01")); + final Gff3FeatureImpl feature2 = new Gff3FeatureImpl("chr1", ".", "gene", 1300, 1600, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene01"), "Parent", Collections.singletonList("region01"))); feature2.addParent(region); - final Gff3FeatureImpl feature3 = new Gff3FeatureImpl("chr1", ".", "gene", 1700, 1900, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", "gene01", "Parent", "region01")); + final Gff3FeatureImpl feature3 = new Gff3FeatureImpl("chr1", ".", "gene", 1700, 1900, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene01"), "Parent", Collections.singletonList("region01"))); feature3.addParent(region); feature1.addCoFeature(feature2); @@ -128,11 +128,11 @@ public void testCofeatures() { @Test(expectedExceptions = TribbleException.class) 
public void testCofeautresDifferentParents() { - final Gff3FeatureImpl feature1 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", "gene01")); - final Gff3FeatureImpl feature2 = new Gff3FeatureImpl("chr1", ".", "gene", 1300, 1600, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", "gene02")); - final Gff3FeatureImpl feature3 = new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", "cds01","Parent", "gene01")); + final Gff3FeatureImpl feature1 = new Gff3FeatureImpl("chr1", ".", "gene", 1000, 1200, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene01"))); + final Gff3FeatureImpl feature2 = new Gff3FeatureImpl("chr1", ".", "gene", 1300, 1600, -1d, Strand.NEGATIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene02"))); + final Gff3FeatureImpl feature3 = new Gff3FeatureImpl("chr1", ".", "CDS", 1010, 1050, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds01"),"Parent", Collections.singletonList("gene01"))); feature3.addParent(feature1); - final Gff3FeatureImpl feature4 = new Gff3FeatureImpl("chr1", ".", "CDS", 1310, 1350, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", "cds01","Parent", "gene02")); + final Gff3FeatureImpl feature4 = new Gff3FeatureImpl("chr1", ".", "CDS", 1310, 1350, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds01"),"Parent", Collections.singletonList("gene02"))); feature4.addParent(feature2); //should throw exception because feature3 and feature4 have different parents so should not be co-features @@ -144,7 +144,7 @@ public void testAncestorsAndDescendents() { final int nGenerations = 10; - final Gff3FeatureImpl topLevelFeature = new Gff3FeatureImpl("chrX", ".", "type0", 1, 100, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", "feature0")); + final Gff3FeatureImpl topLevelFeature = new Gff3FeatureImpl("chrX", ".", "type0", 1, 100, -1d, Strand.NEGATIVE, 0, 
ImmutableMap.of("ID", Collections.singletonList("feature0"))); Gff3FeatureImpl prevFeature = topLevelFeature; final Map> ancestorsMap = new HashMap<>(); @@ -156,7 +156,7 @@ public void testAncestorsAndDescendents() { final List features = new ArrayList<>(Arrays.asList(topLevelFeature)); for (int i=1; i(ancestorsMap.get(prevFeature))); ancestorsMap.get(newFeature).add(prevFeature); @@ -179,13 +179,13 @@ public void testAncestorsAndDescendents() { public void testFlatten() { final int nGenerations = 10; - final Gff3FeatureImpl topLevelFeature = new Gff3FeatureImpl("chrX", ".", "type0", 1, 100, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", "feature0")); + final Gff3FeatureImpl topLevelFeature = new Gff3FeatureImpl("chrX", ".", "type0", 1, 100, -1d, Strand.NEGATIVE, 0, ImmutableMap.of("ID", Collections.singletonList("feature0"))); Gff3FeatureImpl prevFeature = topLevelFeature; final Map> flattenMap = new HashMap<>(Collections.singletonMap(topLevelFeature, new HashSet<>(Collections.singleton(topLevelFeature)))); for (int i=1; i v.add(newFeature)); flattenMap.put(newFeature, new HashSet<>(Collections.singleton(newFeature))); @@ -197,22 +197,22 @@ public void testFlatten() { @Test public void testDerivesFrom() { - final Gff3FeatureImpl region01 = new Gff3FeatureImpl("chrX", ".", "gene", 65, 1000, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "region01")); + final Gff3FeatureImpl region01 = new Gff3FeatureImpl("chrX", ".", "gene", 65, 1000, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("region01"))); - final Gff3FeatureImpl gene01 = new Gff3FeatureImpl("chrX", ".", "gene", 1, 35, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "gene01", "Parent", "region01")); + final Gff3FeatureImpl gene01 = new Gff3FeatureImpl("chrX", ".", "gene", 1, 35, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene01"), "Parent", Collections.singletonList("region01"))); gene01.addParent(region01); - final Gff3FeatureImpl gene02 = new 
Gff3FeatureImpl("chrX", ".", "gene", 70, 100, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "gene02")); + final Gff3FeatureImpl gene02 = new Gff3FeatureImpl("chrX", ".", "gene", 70, 100, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene02"))); - final Gff3FeatureImpl mRNA01 = new Gff3FeatureImpl("chrX", ".", "mRNA", 1, 100, -1d, Strand.POSITIVE, -1 , ImmutableMap.of("ID", "mRNA01", "Parent", "gene01, gene02")); + final Gff3FeatureImpl mRNA01 = new Gff3FeatureImpl("chrX", ".", "mRNA", 1, 100, -1d, Strand.POSITIVE, -1 , ImmutableMap.of("ID", Collections.singletonList("mRNA01"), "Parent", Arrays.asList("gene01", "gene02"))); mRNA01.addParent(gene01); mRNA01.addParent(gene02); - final Gff3FeatureImpl cds01 = new Gff3FeatureImpl("chrX", ".", "CDS", 1, 35, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", "cds01", "Parent", "mRNA01", "Derives_from", "gene01")); + final Gff3FeatureImpl cds01 = new Gff3FeatureImpl("chrX", ".", "CDS", 1, 35, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds01"), "Parent", Collections.singletonList("mRNA01"), "Derives_from", Collections.singletonList("gene01"))); cds01.addParent(mRNA01); - final Gff3FeatureImpl cds02 = new Gff3FeatureImpl("chrX", ".", "CDS", 70, 100, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", "cds02", "Parent", "mRNA01", "Derives_from", "gene02")); + final Gff3FeatureImpl cds02 = new Gff3FeatureImpl("chrX", ".", "CDS", 70, 100, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", Collections.singletonList("cds02"), "Parent", Collections.singletonList("mRNA01"), "Derives_from", Collections.singletonList("gene02"))); cds02.addParent(mRNA01); - final Gff3FeatureImpl codon01 = new Gff3FeatureImpl("chrX", ".", "codon", 1, 3, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", "codon01", "Parent", "cds01")); + final Gff3FeatureImpl codon01 = new Gff3FeatureImpl("chrX", ".", "codon", 1, 3, -1d, Strand.POSITIVE, 0, ImmutableMap.of("ID", Collections.singletonList("codon01"), "Parent", 
Collections.singletonList("cds01"))); codon01.addParent(cds01); Assert.assertEquals(cds01.getAncestors(), ImmutableSet.of(mRNA01, gene01, region01)); @@ -232,12 +232,12 @@ public void testDerivesFrom() { @Test public void testFeatureWithUnLoadedParent() { - final Gff3FeatureImpl gene01 = new Gff3FeatureImpl("chrX", ".", "gene", 1, 35, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "gene01", "Parent", "region01")); - final Gff3FeatureImpl gene02 = new Gff3FeatureImpl("chrX", ".", "gene", 1, 35, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", "gene01", "Parent", "region01")); + final Gff3FeatureImpl gene01 = new Gff3FeatureImpl("chrX", ".", "gene", 1, 35, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene01"), "Parent", Collections.singletonList("region01"))); + final Gff3FeatureImpl gene02 = new Gff3FeatureImpl("chrX", ".", "gene", 1, 35, -1d, Strand.POSITIVE, -1, ImmutableMap.of("ID", Collections.singletonList("gene01"), "Parent", Collections.singletonList("region01"))); Assert.assertEquals(gene01, gene02); - final Gff3FeatureImpl mRNA01 = new Gff3FeatureImpl("chrX", ".", "mRNA", 1, 100, -1d, Strand.POSITIVE, -1 , ImmutableMap.of("ID", "mRNA01", "Parent", "gene01, gene02")); + final Gff3FeatureImpl mRNA01 = new Gff3FeatureImpl("chrX", ".", "mRNA", 1, 100, -1d, Strand.POSITIVE, -1 , ImmutableMap.of("ID", Collections.singletonList("mRNA01"), "Parent", Arrays.asList("gene01", "gene02"))); mRNA01.addParent(gene01); Assert.assertNotEquals(gene01, gene02); diff --git a/src/test/java/htsjdk/tribble/gff/Gff3WriterTest.java b/src/test/java/htsjdk/tribble/gff/Gff3WriterTest.java new file mode 100644 index 0000000000..62178a0bb6 --- /dev/null +++ b/src/test/java/htsjdk/tribble/gff/Gff3WriterTest.java @@ -0,0 +1,205 @@ +package htsjdk.tribble.gff; + +import com.google.common.collect.ImmutableMap; +import htsjdk.HtsjdkTest; +import htsjdk.samtools.util.IOUtil; +import htsjdk.tribble.AbstractFeatureReader; +import htsjdk.tribble.TestUtils; +import 
htsjdk.tribble.TribbleException; +import htsjdk.tribble.readers.LineIterator; +import org.testng.Assert; +import org.testng.TestException; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.zip.GZIPInputStream; + +public class Gff3WriterTest extends HtsjdkTest { + private final static String DATA_DIR = TestUtils.DATA_DIR + "/gff/"; + private final Path ensembl_human_small = Paths.get(DATA_DIR + "Homo_sapiens.GRCh38.97.chromosome.1.small.gff3"); + private final Path gencode_mouse_small = Paths.get(DATA_DIR + "gencode.vM22.annotation.small.gff3"); + private final Path ncbi_woodpecker_small = Paths.get(DATA_DIR + "ref_ASM69900v1_top_level.small.gff3"); + private final Path feature_extends_past_circular_region = Paths.get(DATA_DIR + "feature_extends_past_circular_region.gff3"); + private final Path with_fasta = Paths.get(DATA_DIR + "fasta_test.gff3"); + private final Path with_fasta_artemis = Paths.get(DATA_DIR + "fasta_test_artemis.gff3"); + private final Path ordered_cofeature = Paths.get(DATA_DIR, "ordered_cofeatures.gff3"); + private final Path child_before_parent = Paths.get(DATA_DIR, "child_before_parent.gff3"); + private final Path url_encoding = Paths.get(DATA_DIR, "url_encoding.gff3"); + private final static Path[] tmpDir = new Path[] {IOUtil.getDefaultTmpDirPath()}; + private final static String version3Directive = "##gff-version 3.1.25\n"; + + @DataProvider(name = "roundTripDataProvider") + public Object[][] roundTripDataProvider() { + return new Object[][] { + {ensembl_human_small}, {gencode_mouse_small}, 
{ncbi_woodpecker_small}, {feature_extends_past_circular_region}, {with_fasta}, {with_fasta_artemis}, + {ordered_cofeature}, {child_before_parent}, {url_encoding} + }; + } + + @Test(dataProvider = "roundTripDataProvider") + public void testRoundTrip(final Path path) { + + final List comments1 = new ArrayList<>(); + final HashSet regions1 = new HashSet<>(); + final LinkedHashSet features1 = readFromFile(path, comments1, regions1); + + //write out to temp files (one gzipped, on not) + try { + final Path tempFile = IOUtil.newTempPath("gff3Writer", ".gff3", tmpDir); + final Path tempFileGzip = IOUtil.newTempPath("gff3Writer", ".gff3.gz", tmpDir); + + writeToFile(tempFile, comments1, regions1, features1); + writeToFile(tempFileGzip, comments1, regions1, features1); + + //read temp files back in + + Assert.assertTrue(isGZipped(tempFileGzip.toFile())); + final List comments2 = new ArrayList<>(); + final HashSet regions2 = new HashSet<>(); + final LinkedHashSet features2 = readFromFile(tempFile, comments2, regions2); + + + final List comments3 = new ArrayList<>(); + final HashSet regions3 = new HashSet<>(); + final LinkedHashSet features3 = readFromFile(path, comments3, regions3); + + Assert.assertEquals(features1, features2); + Assert.assertEquals(features1, features3); + + Assert.assertEquals(comments1, comments2); + Assert.assertEquals(comments1, comments3); + Assert.assertEquals(regions1, regions2); + Assert.assertEquals(regions1, regions3); + } catch (final IOException ex) { + throw new TribbleException("Error creating temp files", ex); + } + } + + private void writeToFile(final Path path, final List comments, final Set regions, final Set features) { + try (final Gff3Writer writer = new Gff3Writer(path)) { + for (final String comment : comments) { + writer.addComment(comment); + } + + for (final SequenceRegion region : regions) { + writer.addDirective(Gff3Codec.Gff3Directive.SEQUENCE_REGION_DIRECTIVE, region); + } + + for (final Gff3Feature feature : features) { + 
writer.addFeature(feature); + } + } catch (final IOException ex) { + throw new TribbleException("Error writing to file " + path, ex); + } + } + + private LinkedHashSet readFromFile(final Path path, List commentsStore, Set regionsStore) { + final Gff3Codec codec = new Gff3Codec(); + final LinkedHashSet features = new LinkedHashSet<>(); + try (final AbstractFeatureReader reader = AbstractFeatureReader.getFeatureReader(path.toAbsolutePath().toString(), null, codec, false)) { + for (final Gff3Feature feature : reader.iterator()) { + features.add(feature); + } + + commentsStore.addAll(codec.getCommentTexts()); + regionsStore.addAll(codec.getSequenceRegions()); + } catch (final IOException ex) { + throw new TribbleException("Error reading gff3 file " + path); + } + + return features; + } + + @DataProvider(name = "writeKeyValuePairDataProvider") + public Object[][] writeKeyValuePairDataProvider() { + return new Object[][] { + {"key",Arrays.asList("value1", "value2", "value3"), "key=value1,value2,value3"}, + {"key",Arrays.asList("value1", "value ; with = special & encoded , characters", "value3"), "key=value1,value %3B with %3D special %26 encoded %2C characters,value3"} + }; + } + + @Test(dataProvider = "writeKeyValuePairDataProvider") + public void testWriteKeyValuePair(final String key, final List values, final String expectedOutput) { + final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + try(final Gff3Writer writer = new Gff3Writer(outputStream)) { + writer.writeKeyValuePair(key, values); + } catch (final IOException ex) { + throw new TestException("Error writing key value pair", ex); + } + + final byte[] expectedBytes = (version3Directive + expectedOutput).getBytes(); + + Assert.assertEquals(outputStream.toByteArray(), expectedBytes); + } + + @DataProvider(name = "writeAttributesDataProvider") + public Object[][] writeAttributesDataProvider() { + return new Object[][] { + {ImmutableMap.of("key1", Arrays.asList("value1", "value2"), "key2", 
Collections.singletonList("another value"), "key3", Arrays.asList("thisValue")), + "key1=value1,value2;key2=another value;key3=thisValue"}, + {ImmutableMap.of("singleKey", Arrays.asList("multipleValue1", "multipleValue2")), "singleKey=multipleValue1,multipleValue2"}, + {ImmutableMap.of("singleKey", Collections.singletonList("singleValue")), "singleKey=singleValue"}, + {Collections.emptyMap(), "."} + }; + } + + @Test(dataProvider = "writeAttributesDataProvider") + public void testWriteAttributes(final Map> attributes, final String expectedOutput) { + final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + try(final Gff3Writer writer = new Gff3Writer(outputStream)) { + writer.writeAttributes(attributes); + } catch (final IOException ex) { + throw new TestException("Error writing key value pair", ex); + } + + final byte[] expectedBytes = (version3Directive + expectedOutput).getBytes(); + + Assert.assertEquals(outputStream.toByteArray(), expectedBytes); + } + + @DataProvider(name = "encodeStringDataProvider") + public Object[][] encodeStringDataProvider() { + return new Object[][] { + {"%", "%25"}, + {";", "%3B"}, + {"=", "%3D"}, + {"&", "%26"}, + {",", "%2C"}, + {" ", " "}, + {"qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM ", "qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM "} //these should remain unchanged + }; + } + + @Test(dataProvider = "encodeStringDataProvider") + public void testEncodeString(final String decoded, final String expectedEncoded) { + final String encoded = Gff3Writer.encodeString(decoded); + + Assert.assertEquals(encoded, expectedEncoded); + } + + private static boolean isGZipped(final File f) { + int magic = 0; + try { + RandomAccessFile raf = new RandomAccessFile(f, "r"); + magic = raf.read() & 0xff | ((raf.read() << 8) & 0xff00); + raf.close(); + } catch (Throwable e) { + e.printStackTrace(System.err); + } + return magic == GZIPInputStream.GZIP_MAGIC; + } +} \ No newline at end of file diff --git 
a/src/test/resources/htsjdk/tribble/gff/feature_extends_past_circular_region.gff3 b/src/test/resources/htsjdk/tribble/gff/feature_extends_past_circular_region.gff3 index 7d327a38e0..0c2e57c243 100644 --- a/src/test/resources/htsjdk/tribble/gff/feature_extends_past_circular_region.gff3 +++ b/src/test/resources/htsjdk/tribble/gff/feature_extends_past_circular_region.gff3 @@ -1,4 +1,4 @@ ##gff-version 3 ##sequence-region 1 1 10 1 . region 1 10 . . . ID=chromosome:1; Is_circular=true -1 . biological_region 7 11 . . . \ No newline at end of file +1 . biological_region 7 11 . . . . \ No newline at end of file diff --git a/src/test/resources/htsjdk/tribble/gff/url_encoding.gff3 b/src/test/resources/htsjdk/tribble/gff/url_encoding.gff3 index 4106c4c8f9..8d45d12d08 100644 --- a/src/test/resources/htsjdk/tribble/gff/url_encoding.gff3 +++ b/src/test/resources/htsjdk/tribble/gff/url_encoding.gff3 @@ -1,3 +1,3 @@ ##gff-version 3 ##sequence-region 1 1 10 -the%20contig a%20source%20%26%20also%20a%20str*)%25nge%20source a%20region 1 10 . . . ID=this%20is%20the%20ID%20of%20this%20wacky%20feature%5E%26%25%23%23%24%25*%26%3E%2C.%20%2C.; Another%20key=Another%3Dvalue \ No newline at end of file +the contig a source %26 also a str*)%25nge source a region 1 10 . . . ID=this is the ID of this wacky feature%5E%26%25%23%23%24%25*%26%3E%2C. %2C.; Another key=Another%3Dvalue,And a second%2C value \ No newline at end of file