diff --git a/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java b/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java new file mode 100644 index 0000000000..7f1db9fd94 --- /dev/null +++ b/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java @@ -0,0 +1,505 @@ +package org.broadinstitute.hellbender.utils; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.exceptions.UserException; + +import java.util.*; +import java.util.stream.Collectors; + +/** + * + * A series of utility functions that enable the GATK to compare two sequence dictionaries -- from the reference, + * from BAMs, or from feature sources -- for consistency. The system supports two basic modes: get an enum state that + * describes at a high level the consistency between two dictionaries, or a validateDictionaries that will + * blow up with a UserException if the dicts are too incompatible. + * + * Dictionaries are tested for contig name overlaps, consistency in ordering in these overlap set, and length, + * if available. + */ +public final class SequenceDictionaryUtils { + + private SequenceDictionaryUtils(){} + + /** + * Compares sequence records by their order + */ + private static final Comparator SEQUENCE_INDEX_ORDER = Comparator.comparing(SAMSequenceRecord::getSequenceIndex); + + // The following sets of contig records are used to perform the non-canonical human ordering check. + // This check ensures that the order is 1,2,3... instead of 1, 10, 11, 12...2, 20, 21... + + // hg18 + protected static final SAMSequenceRecord CHR1_HG18 = new SAMSequenceRecord("chr1", 247249719); + protected static final SAMSequenceRecord CHR2_HG18 = new SAMSequenceRecord("chr2", 242951149); + protected static final SAMSequenceRecord CHR10_HG18 = new SAMSequenceRecord("chr10", 135374737); + + // hg19 + protected static final SAMSequenceRecord CHR1_HG19 = new SAMSequenceRecord("chr1", 249250621); + protected static final SAMSequenceRecord CHR2_HG19 = new SAMSequenceRecord("chr2", 243199373); + protected static final SAMSequenceRecord CHR10_HG19 = new SAMSequenceRecord("chr10", 135534747); + + // b36 + protected static final SAMSequenceRecord CHR1_B36 = new SAMSequenceRecord("1", 247249719); + protected static final SAMSequenceRecord CHR2_B36 = new SAMSequenceRecord("2", 242951149); + protected static final SAMSequenceRecord CHR10_B36 = new SAMSequenceRecord("10", 135374737); + + // b37 + protected static final SAMSequenceRecord CHR1_B37 = new SAMSequenceRecord("1", 249250621); + protected static final SAMSequenceRecord CHR2_B37 = new SAMSequenceRecord("2", 243199373); + protected static final SAMSequenceRecord CHR10_B37 = new SAMSequenceRecord("10", 135534747); + + + public enum SequenceDictionaryCompatibility { + IDENTICAL, // the dictionaries are identical + COMMON_SUBSET, // there exists a common subset of equivalent contigs + SUPERSET, // the first dict's set of contigs supersets the second dict's set + NO_COMMON_CONTIGS, // no overlap between dictionaries + UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths + NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for example) + OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different + // orders with respect to each other + DIFFERENT_INDICES // the two dictionaries overlap and the overlapping contigs occur in the same + // order with respect to each other, but one or more of them have different + // indices in the two dictionaries. Eg., { chrM, chr1, chr2 } vs. { chr1, chr2 } + } + + /** + * Tests for compatibility between two sequence dictionaries, using standard validation settings appropriate + * for the GATK. If the dictionaries are incompatible, then UserExceptions are thrown with detailed error messages. + * + * The standard validation settings used by this method are: + * + * -Require the dictionaries to share a common subset of equivalent contigs + * + * -Do not require dict1 to be a superset of dict2. + * + * -Do not perform checks related to contig ordering: don't throw if the common contigs are in + * different orders with respect to each other, occur at different absolute indices, or are + * lexicographically sorted human dictionaries. GATK uses contig names rather than contig + * indices, and so should not be sensitive to contig ordering issues. + * + * For comparing a CRAM dictionary against a reference dictionary, call + * {@link #validateCRAMDictionaryAgainstReference(SAMSequenceDictionary, SAMSequenceDictionary)} instead. + * + * @param name1 name associated with dict1 + * @param dict1 the sequence dictionary dict1 + * @param name2 name associated with dict2 + * @param dict2 the sequence dictionary dict2 + */ + public static void validateDictionaries( final String name1, + final SAMSequenceDictionary dict1, + final String name2, + final SAMSequenceDictionary dict2) { + final boolean requireSuperset = false; + final boolean checkContigOrdering = false; + + validateDictionaries(name1, dict1, name2, dict2, requireSuperset, checkContigOrdering); + } + + /** + * Tests for compatibility between a reference dictionary and a CRAM dictionary, using appropriate + * validation settings. If the dictionaries are incompatible, then UserExceptions are thrown with + * detailed error messages. + * + * The standard validation settings used by this method are: + * + * -Require the reference dictionary to be a superset of the cram dictionary + * + * -Do not perform checks related to contig ordering: don't throw if the common contigs are in + * different orders with respect to each other, occur at different absolute indices, or are + * lexicographically sorted human dictionaries. GATK uses contig names rather than contig + * indices, and so should not be sensitive to contig ordering issues. + * + * @param referenceDictionary the sequence dictionary for the reference + * @param cramDictionary sequence dictionary from a CRAM file + */ + public static void validateCRAMDictionaryAgainstReference( final SAMSequenceDictionary referenceDictionary, + final SAMSequenceDictionary cramDictionary ) { + // For CRAM, we require the reference dictionary to be a superset of the reads dictionary + final boolean requireSuperset = true; + final boolean checkContigOrdering = false; + + validateDictionaries("reference", referenceDictionary, "reads", cramDictionary, requireSuperset, checkContigOrdering); + } + + + /** + * Tests for compatibility between two sequence dictionaries. If the dictionaries are incompatible, then + * UserExceptions are thrown with detailed error messages. + * + * Two sequence dictionaries are compatible if they share a common subset of equivalent contigs, + * where equivalent contigs are defined as having the same name and length. + * + * @param name1 name associated with dict1 + * @param dict1 the sequence dictionary dict1 + * @param name2 name associated with dict2 + * @param dict2 the sequence dictionary dict2 + * @param requireSuperset if true, require that dict1 be a superset of dict2, rather than dict1 and dict2 sharing a common subset + * @param checkContigOrdering if true, require common contigs to be in the same relative order with respect to each other + * and occur at the same absolute indices, and forbid lexicographically-sorted human dictionaries + */ + public static void validateDictionaries( final String name1, + final SAMSequenceDictionary dict1, + final String name2, + final SAMSequenceDictionary dict2, + final boolean requireSuperset, + final boolean checkContigOrdering ) { + Utils.nonNull(dict1, "Something went wrong with sequence dictionary detection, check that "+name1+" has a valid sequence dictionary"); + Utils.nonNull(dict2, "Something went wrong with sequence dictionary detection, check that "+name2+" has a valid sequence dictionary"); + + final SequenceDictionaryCompatibility type = compareDictionaries(dict1, dict2, checkContigOrdering); + + switch ( type ) { + case IDENTICAL: + return; + case SUPERSET: + return; + case COMMON_SUBSET: + if ( requireSuperset ) { + final Set contigs1 = dict1.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toSet()); + final List missingContigs = dict2.getSequences().stream() + .map(SAMSequenceRecord::getSequenceName) + .filter(contig -> !contigs1.contains(contig)) + .collect(Collectors.toList()); + throw new UserException.IncompatibleSequenceDictionaries(String.format("Dictionary %s is missing contigs found in dictionary %s. Missing contigs: \n %s \n", name1, name2, String.join(", ", missingContigs)), name1, dict1, name2, dict2); + } + return; + case NO_COMMON_CONTIGS: + throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2); + + case UNEQUAL_COMMON_CONTIGS: { + final List x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2); + final SAMSequenceRecord elt1 = x.get(0); + final SAMSequenceRecord elt2 = x.get(1); + throw new UserException.IncompatibleSequenceDictionaries( + String.format("Found contigs with the same name but different lengths:\n contig %s = %s / %d\n contig %s = %s / %d", + name1, elt1.getSequenceName(), elt1.getSequenceLength(), + name2, elt2.getSequenceName(), elt2.getSequenceLength()), + name1, dict1, name2, dict2 + ); + } + + case NON_CANONICAL_HUMAN_ORDER: { + // We only get NON_CANONICAL_HUMAN_ORDER if the caller explicitly requested that we check contig ordering, + // so we should always throw when we see it. + final UserException ex; + if ( nonCanonicalHumanContigOrder(dict1) ) { + ex = new UserException.LexicographicallySortedSequenceDictionary(name1, dict1); + } + else { + ex = new UserException.LexicographicallySortedSequenceDictionary(name2, dict2); + } + + throw ex; + } + + case OUT_OF_ORDER: { + // We only get OUT_OF_ORDER if the caller explicitly requested that we check contig ordering, + // so we should always throw when we see it. + throw new UserException.IncompatibleSequenceDictionaries( + "The relative ordering of the common contigs in " + name1 + " and " + name2 + + " is not the same; to fix this please see: " + + "(https://www.broadinstitute.org/gatk/guide/article?id=1328), " + + " which describes reordering contigs in BAM and VCF files.", + name1, dict1, name2, dict2); + } + + case DIFFERENT_INDICES: { + // We only get DIFFERENT_INDICES if the caller explicitly requested that we check contig ordering, + // so we should always throw when we see it. + final String msg = "One or more contigs common to both dictionaries have " + + "different indices (ie., absolute positions) in each dictionary. Code " + + "that is sensitive to contig ordering can fail when this is the case. " + + "You should fix the sequence dictionaries so that all shared contigs " + + "occur at the same absolute positions in both dictionaries."; + throw new UserException.IncompatibleSequenceDictionaries(msg, name1, dict1, name2, dict2); + } + default: + throw new GATKException("Unexpected SequenceDictionaryComparison type: " + type); + } + } + + /** + * Workhorse routine that takes two dictionaries and returns their compatibility. + * + * @param dict1 first sequence dictionary + * @param dict2 second sequence dictionary + * @param checkContigOrdering if true, perform checks related to contig ordering: forbid lexicographically-sorted + * dictionaries, and require common contigs to be in the same relative order and at the + * same absolute indices + * @return A SequenceDictionaryCompatibility enum value describing the compatibility of the two dictionaries + */ + public static SequenceDictionaryCompatibility compareDictionaries( final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2, final boolean checkContigOrdering ) { + if ( checkContigOrdering && (nonCanonicalHumanContigOrder(dict1) || nonCanonicalHumanContigOrder(dict2)) ) { + return SequenceDictionaryCompatibility.NON_CANONICAL_HUMAN_ORDER; + } + + final Set commonContigs = getCommonContigsByName(dict1, dict2); + + if (commonContigs.isEmpty()) { + return SequenceDictionaryCompatibility.NO_COMMON_CONTIGS; + } + else if ( ! commonContigsHaveSameLengths(commonContigs, dict1, dict2) ) { + return SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS; + } + + final boolean commonContigsAreInSameRelativeOrder = commonContigsAreInSameRelativeOrder(commonContigs, dict1, dict2); + + if ( checkContigOrdering && ! commonContigsAreInSameRelativeOrder ) { + return SequenceDictionaryCompatibility.OUT_OF_ORDER; + } + else if ( commonContigsAreInSameRelativeOrder && commonContigs.size() == dict1.size() && commonContigs.size() == dict2.size() ) { + return SequenceDictionaryCompatibility.IDENTICAL; + } + else if ( checkContigOrdering && ! commonContigsAreAtSameIndices(commonContigs, dict1, dict2) ) { + return SequenceDictionaryCompatibility.DIFFERENT_INDICES; + } + else if ( supersets(dict1, dict2) ) { + return SequenceDictionaryCompatibility.SUPERSET; + } + else { + return SequenceDictionaryCompatibility.COMMON_SUBSET; + } + } + + + /** + * Utility function that tests whether dict1's set of contigs is a superset of dict2's + * + * @param dict1 first sequence dictionary + * @param dict2 second sequence dictionary + * @return true if dict1's set of contigs supersets dict2's + */ + private static boolean supersets( SAMSequenceDictionary dict1, SAMSequenceDictionary dict2 ) { + // Cannot rely on SAMSequenceRecord.equals() as it's too strict (takes extended attributes into account). + for ( final SAMSequenceRecord dict2Record : dict2.getSequences() ) { + final SAMSequenceRecord dict1Record = dict1.getSequence(dict2Record.getSequenceName()); + if ( dict1Record == null || ! sequenceRecordsAreEquivalent(dict2Record, dict1Record) ) { + return false; + } + } + + return true; + } + + + + /** + * Utility function that tests whether the commonContigs in both dicts are equivalent. Equivalence means + * that the seq records have the same length, if both are non-zero. + * + * @param commonContigs + * @param dict1 + * @param dict2 + * @return true if all of the common contigs are equivalent + */ + private static boolean commonContigsHaveSameLengths(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + return findDisequalCommonContigs(commonContigs, dict1, dict2) == null; + } + + /** + * Returns a List(x,y) that contains two disequal sequence records among the common contigs in both dicts. Returns + * null if all common contigs are equivalent + * + * @param commonContigs + * @param dict1 + * @param dict2 + * @return + */ + private static List findDisequalCommonContigs(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + for ( String name : commonContigs ) { + SAMSequenceRecord elt1 = dict1.getSequence(name); + SAMSequenceRecord elt2 = dict2.getSequence(name); + if ( ! sequenceRecordsAreEquivalent(elt1, elt2) ) + return Arrays.asList(elt1,elt2); + } + + return null; + } + + /** + * Helper routine that returns whether two sequence records are equivalent, defined as having the same name and + * lengths. + * + * NOTE: we allow the lengths to differ if one or both are UNKNOWN_SEQUENCE_LENGTH + * + * @param first first sequence record to compare + * @param second second sequence record to compare + * @return true if first and second have the same names and lengths, otherwise false + */ + public static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord first, final SAMSequenceRecord second) { + if ( first == second ) { + return true; + } + if ( first == null || second == null ) { + return false; + } + final int length1 = first.getSequenceLength(); + final int length2 = second.getSequenceLength(); + + if (length1 != length2 && length1 != SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH && length2 != SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH){ + return false; + } + if (! first.getSequenceName().equals(second.getSequenceName())){ + return false; + } + return true; + } + + /** + * A very simple (and naive) algorithm to determine (1) if the dict is a human reference (hg18, hg19, b36, or b37) and if it's + * lexicographically sorted. Works by matching lengths of the static chr1, chr10, and chr2, and then if these + * are all matched, requiring that the order be chr1, chr2, chr10. + * + * @param dict + * @return + */ + private static boolean nonCanonicalHumanContigOrder(SAMSequenceDictionary dict) { + SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null; + for ( SAMSequenceRecord elt : dict.getSequences() ) { + if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19, CHR1_B36, CHR1_B37) ) chr1 = elt; + if ( isHumanSeqRecord(elt, CHR2_HG18, CHR2_HG19, CHR2_B36, CHR2_B37) ) chr2 = elt; + if ( isHumanSeqRecord(elt, CHR10_HG18, CHR10_HG19, CHR10_B36, CHR10_B37) ) chr10 = elt; + } + if ( chr1 != null && chr2 != null && chr10 != null) { + return ! ( chr1.getSequenceIndex() < chr2.getSequenceIndex() && chr2.getSequenceIndex() < chr10.getSequenceIndex() ); + } + + return false; + } + + /** + * Trivial helper that returns true if elt has the same name and length as rec1 or rec2 + * @param elt record to test + * @param recs the list of records to check for name and length equivalence + * @return true if elt has the same name and length as any of the recs + */ + private static boolean isHumanSeqRecord(SAMSequenceRecord elt, SAMSequenceRecord... recs) { + for (SAMSequenceRecord rec : recs) { + if (elt.getSequenceLength() == rec.getSequenceLength() && elt.getSequenceName().equals(rec.getSequenceName())) { + return true; + } + } + return false; + } + + /** + * Returns true if the common contigs in dict1 and dict2 are in the same relative order, without regard to + * absolute index position. This is accomplished by getting the common contigs in both dictionaries, sorting + * these according to their indices, and then walking through the sorted list to ensure that each ordered contig + * is equivalent + * + * @param commonContigs names of the contigs common to both dictionaries + * @param dict1 first SAMSequenceDictionary + * @param dict2 second SAMSequenceDictionary + * @return true if the common contigs occur in the same relative order in both dict1 and dict2, otherwise false + */ + private static boolean commonContigsAreInSameRelativeOrder(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + final List list1 = getSequencesOfName(commonContigs, dict1); + final List list2 = getSequencesOfName(commonContigs, dict2); + list1.sort(SEQUENCE_INDEX_ORDER); + list2.sort(SEQUENCE_INDEX_ORDER); + + for ( int i = 0; i < list1.size(); i++ ) { + SAMSequenceRecord elt1 = list1.get(i); + SAMSequenceRecord elt2 = list2.get(i); + if ( ! elt1.getSequenceName().equals(elt2.getSequenceName()) ) + return false; + } + + return true; + } + + /** + * Gets the subset of SAMSequenceRecords in commonContigs in dict + * + * @param commonContigs + * @param dict + * @return + */ + private static List getSequencesOfName(Set commonContigs, SAMSequenceDictionary dict) { + List l = new ArrayList<>(commonContigs.size()); + for ( String name : commonContigs ) { + l.add(dict.getSequence(name) ); + } + + return l; + } + + /** + * Checks whether the common contigs in the given sequence dictionaries occur at the same indices + * in both dictionaries + * + * @param commonContigs Set of names of the contigs that occur in both dictionaries + * @param dict1 first sequence dictionary + * @param dict2 second sequence dictionary + * @return true if the contigs common to dict1 and dict2 occur at the same indices in both dictionaries, + * otherwise false + */ + private static boolean commonContigsAreAtSameIndices( final Set commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2 ) { + for ( String commonContig : commonContigs ) { + SAMSequenceRecord dict1Record = dict1.getSequence(commonContig); + SAMSequenceRecord dict2Record = dict2.getSequence(commonContig); + + // Each common contig must have the same index in both dictionaries + if ( dict1Record.getSequenceIndex() != dict2Record.getSequenceIndex() ) { + return false; + } + } + + return true; + } + + /** + * Returns the set of contig names found in both dicts. + * @param dict1 + * @param dict2 + * @return + */ + public static Set getCommonContigsByName(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + Set intersectingSequenceNames = getContigNames(dict1); + intersectingSequenceNames.retainAll(getContigNames(dict2)); + return intersectingSequenceNames; + } + + public static Set getContigNames(SAMSequenceDictionary dict) { + Set contigNames = new LinkedHashSet(Utils.optimumHashSize(dict.size())); + for (SAMSequenceRecord dictionaryEntry : dict.getSequences()) + contigNames.add(dictionaryEntry.getSequenceName()); + return contigNames; + } + + public static List getContigNamesList(final SAMSequenceDictionary refSeqDict) { + Utils.nonNull(refSeqDict, "provided reference sequence ditionary is null"); + return refSeqDict.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toList()); + } + + /** + * Returns a compact String representation of the sequence dictionary it's passed + * + * The format of the returned String is: + * [ contig1Name(length: contig1Length) contig2Name(length: contig2Length) ... ] + * + * @param dict a non-null SAMSequenceDictionary + * @return A String containing all of the contig names and lengths from the sequence dictionary it's passed + */ + public static String getDictionaryAsString( final SAMSequenceDictionary dict ) { + Utils.nonNull(dict, "Sequence dictionary must be non-null"); + + StringBuilder s = new StringBuilder("[ "); + + for ( SAMSequenceRecord dictionaryEntry : dict.getSequences() ) { + s.append(dictionaryEntry.getSequenceName()); + s.append("(length:"); + s.append(dictionaryEntry.getSequenceLength()); + s.append(") "); + } + + s.append("]"); + + return s.toString(); + } + +} diff --git a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java new file mode 100644 index 0000000000..37842f8a9a --- /dev/null +++ b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java @@ -0,0 +1,357 @@ +package org.broadinstitute.hellbender.utils; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.GATKBaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.broadinstitute.hellbender.utils.SequenceDictionaryUtils.*; +import static org.broadinstitute.hellbender.utils.SequenceDictionaryUtils.SequenceDictionaryCompatibility.*; + +public final class SequenceDictionaryUtilsUnitTest extends GATKBaseTest { + + private static Logger logger = LogManager.getLogger(SequenceDictionaryUtilsUnitTest.class); + + @DataProvider( name = "testSequenceRecordsAreEquivalentDataProvider" ) + public Object[][] testSequenceRecordsAreEquivalentDataProvider() { + final SAMSequenceRecord CHRM_HG19 = new SAMSequenceRecord("chrM", 16571); + final SAMSequenceRecord CHR_NONSTANDARD1 = new SAMSequenceRecord("NonStandard1", 8675309); + final SAMSequenceRecord CHR1_HG19_WITH_UNKNOWN_LENGTH = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH); + final SAMSequenceRecord CHR1_HG19_WITH_DIFFERENT_LENGTH = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), 123456); + return new Object[][]{ + {CHR1_HG19, CHR1_HG19, true}, + {CHR1_HG19, CHRM_HG19, false}, + {CHR1_HG19, CHR_NONSTANDARD1, false}, + {null, null, true}, + {CHR1_HG19, null, false}, + {null, CHR1_HG19, false}, + {CHR1_HG19, CHR1_HG19_WITH_UNKNOWN_LENGTH, true}, + {CHR1_HG19, CHR1_HG19_WITH_DIFFERENT_LENGTH, false}, + {CHR1_HG19_WITH_UNKNOWN_LENGTH, CHR1_HG19, true}, + {CHR1_HG19_WITH_DIFFERENT_LENGTH, CHR1_HG19, false}, + }; + } + + @Test(dataProvider = "testSequenceRecordsAreEquivalentDataProvider") + public void testSequenceRecordsAreEquivalent(final SAMSequenceRecord one, final SAMSequenceRecord two, final boolean expected){ + final boolean actual = SequenceDictionaryUtils.sequenceRecordsAreEquivalent(one, two); + Assert.assertEquals(actual, expected); + } + + @DataProvider( name = "SequenceDictionaryDataProvider" ) + public Object[][] generateSequenceDictionaryTestData() { + final SAMSequenceRecord CHRM_HG19 = new SAMSequenceRecord("chrM", 16571); + final SAMSequenceRecord CHR_NONSTANDARD1 = new SAMSequenceRecord("NonStandard1", 8675309); + final SAMSequenceRecord CHR_NONSTANDARD2 = new SAMSequenceRecord("NonStandard2", 8675308); + final SAMSequenceRecord CHR1_HG19_WITH_UNKNOWN_LENGTH = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH); + final SAMSequenceRecord CHR1_HG19_WITH_DIFFERENT_LENGTH = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), 123456); + + final SAMSequenceRecord CHR1_HG19_WITH_ATTRIBUTES = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), CHR1_HG19.getSequenceLength()); + CHR1_HG19_WITH_ATTRIBUTES.setAttribute("M5", "0dec9660ec1efaaf33281c0d5ea2560f"); + CHR1_HG19_WITH_ATTRIBUTES.setAttribute("UR", "file:/foo/bar"); + + final Class NO_COMMON_CONTIGS_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; + final Class UNEQUAL_COMMON_CONTIGS_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; + final Class NON_CANONICAL_HUMAN_ORDER_EXCEPTION = UserException.LexicographicallySortedSequenceDictionary.class; + final Class OUT_OF_ORDER_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; + final Class DIFFERENT_INDICES_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; + + final List hg19AllContigsIntervalSet = Arrays.asList( + new SimpleInterval("chrM", 1, 1), + new SimpleInterval("chr1", 1, 1), + new SimpleInterval("chr2", 1, 1), + new SimpleInterval("chr10", 1, 1)); + final List hg19PartialContigsIntervalSet = Arrays.asList( + new SimpleInterval("chrM", 1, 1), + new SimpleInterval("chr1", 1, 1)); + + return new Object[][] { + // Identical dictionaries: + {Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, true, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, false, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, true, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), IDENTICAL, null, false, false}, + { Arrays.asList(CHR1_B37), Arrays.asList(CHR1_B37), IDENTICAL, null, false, false}, + { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), IDENTICAL, null, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19_WITH_UNKNOWN_LENGTH), IDENTICAL, null, false, false}, + { Arrays.asList(CHR1_HG19_WITH_UNKNOWN_LENGTH), Arrays.asList(CHR1_HG19), IDENTICAL, null, false, false}, + + // Dictionaries with a common subset: + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, true}, + // If requireSuperset == true, we should get an exception upon COMMON_SUBSET: + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR10_HG19), COMMON_SUBSET, UserException.IncompatibleSequenceDictionaries.class, true, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD2), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHRM_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD2), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19, CHRM_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD1), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD2), COMMON_SUBSET, null, false, false}, + // If requireSuperset == true, we should get an exception upon COMMON_SUBSET: + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, UserException.IncompatibleSequenceDictionaries.class, true, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, null, false, false}, + // If checkContigOrdering == false, ordering of the common contigs should not matter: + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR10_HG19, CHR1_HG19), COMMON_SUBSET, null, false, false}, + + // Dictionaries with no common contigs: + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, true, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, true, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_B37), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, false}, + + // Dictionaries with unequal common contigs: + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19_WITH_DIFFERENT_LENGTH), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19_WITH_DIFFERENT_LENGTH), Arrays.asList(CHR1_HG19), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, true, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, true, true}, + { Arrays.asList(CHR1_B36), Arrays.asList(CHR1_B37), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B36, CHR2_B36, CHR10_B36), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18, CHR_NONSTANDARD2), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + + // One or both dictionaries in non-canonical human order: + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, true, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, true, true}, + { Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + // If checkContigOrdering == false, we should not get NON_CANONICAL_HUMAN_ORDER: + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), IDENTICAL, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, + + // Dictionaries with a common subset, but different relative ordering within that subset + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, true, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, true, true}, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHRM_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHRM_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_B37, CHR2_B37), Arrays.asList(CHR2_B37, CHR1_B37), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, + // If checkContigOrdering == false, we should not get OUT_OF_ORDER: + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, + + // Dictionaries with a common subset in the same relative order, but with different indices. + // This will only throw an exception during validation if checkContigOrdering is true + + // These have checkContigOrdering == true, so we expect DIFFERENT_INDICES and an exception: + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + // Setting requireSuperset == true should make no difference here (we should still get DIFFERENT_INDICES and an exception): + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + + // Same test cases as above, but these have checkContigOrdering == false, so we expect SUPERSET or COMMON_SUBSET instead of DIFFERENT_INDICES, and no exception: + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, true, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), SUPERSET, null, false, false}, + + // tests for SUPERSET + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, true, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, true, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), SUPERSET, null, false, false}, + // Extended attributes should be ignored when determining whether a superset exists: + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19_WITH_ATTRIBUTES), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHRM_HG19), Arrays.asList(CHR1_HG19_WITH_ATTRIBUTES, CHR10_HG19), SUPERSET, null, false, false} + }; + } + + @Test( dataProvider = "SequenceDictionaryDataProvider" ) + public void testSequenceDictionaryValidation( final List firstDictionaryContigs, + final List secondDictionaryContigs, + final SequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility, //not needed by this test + final Class expectedExceptionUponValidation, + final boolean requireSuperset, + final boolean checkContigOrdering) { + final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); + final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); + final String testDescription = String.format("First dictionary: %s Second dictionary: %s", + SequenceDictionaryUtils.getDictionaryAsString(firstDictionary), + SequenceDictionaryUtils.getDictionaryAsString(secondDictionary)); + Exception exceptionThrown = null; + try { + SequenceDictionaryUtils.validateDictionaries( + "firstDictionary", + firstDictionary, + "secondDictionary", + secondDictionary, + requireSuperset, + checkContigOrdering); + } + catch ( Exception e ) { + exceptionThrown = e; + } + if ( expectedExceptionUponValidation != null ) { + Assert.assertTrue(exceptionThrown != null && expectedExceptionUponValidation.isInstance(exceptionThrown), + String.format("Expected exception %s but saw %s instead. %s", + expectedExceptionUponValidation.getSimpleName(), + exceptionThrown == null ? "no exception" : exceptionThrown.getClass().getSimpleName(), + testDescription)); + } + else { + Assert.assertTrue(exceptionThrown == null, + String.format("Expected no exception but saw exception %s instead. %s", + exceptionThrown != null ? exceptionThrown.getClass().getSimpleName() : "none", + testDescription)); + } + } + + @Test( dataProvider = "SequenceDictionaryDataProvider" ) + public void testSequenceDictionaryComparison( final List firstDictionaryContigs, + final List secondDictionaryContigs, + final SequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility, + final Class expectedExceptionUponValidation, + final boolean requireSuperset, + final boolean checkContigOrdering) { + + final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); + final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); + final String testDescription = String.format("First dictionary: %s Second dictionary: %s", + SequenceDictionaryUtils.getDictionaryAsString(firstDictionary), + SequenceDictionaryUtils.getDictionaryAsString(secondDictionary)); + + final SequenceDictionaryUtils.SequenceDictionaryCompatibility reportedCompatibility = + SequenceDictionaryUtils.compareDictionaries(firstDictionary, secondDictionary, checkContigOrdering); + + Assert.assertTrue(reportedCompatibility == dictionaryCompatibility, + String.format("Dictionary comparison should have returned %s but instead returned %s. %s", + dictionaryCompatibility, reportedCompatibility, testDescription)); + } + + @DataProvider(name = "StandardValidationIgnoresContigOrderData") + public Object[][] getStandardValidationIgnoresContigOrderData() { + return new Object[][] { + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19) }, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19) }, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19) }, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR10_HG19, CHR1_HG19) }, + + }; + } + + @Test(dataProvider = "StandardValidationIgnoresContigOrderData") + public void testStandardValidationIgnoresContigOrder( final List firstDictionaryContigs, final List secondDictionaryContigs ) { + final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); + final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); + + // Standard validation (the overload of validateDictionaries() that doesn't take any boolean args) + // should ignore differences in ordering of common contigs, so we shouldn't get an exception here + SequenceDictionaryUtils.validateDictionaries("first", firstDictionary, "second", secondDictionary); + } + + @DataProvider(name = "NonSupersetData") + public Object[][] getNonSupersetData() { + return new Object[][] { + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19) }, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19) } + }; + } + + @Test(dataProvider = "NonSupersetData") + public void testStandardValidationDoesNotRequireSuperset( final List firstDictionaryContigs, final List secondDictionaryContigs ) { + final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); + final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); + + // Standard validation (the overload of validateDictionaries() that doesn't take any boolean args) + // should not require a superset relationship, so we shouldn't get an exception here + SequenceDictionaryUtils.validateDictionaries("first", firstDictionary, "second", secondDictionary); + } + + @Test(dataProvider = "NonSupersetData", expectedExceptions = UserException.IncompatibleSequenceDictionaries.class) + public void testCRAMValidationDoesRequireSuperset( final List refDictionaryContigs, final List cramDictionaryContigs ) { + final SAMSequenceDictionary refDictionary = createSequenceDictionary(refDictionaryContigs); + final SAMSequenceDictionary cramDictionary = createSequenceDictionary(cramDictionaryContigs); + + // CRAM validation against the reference SHOULD require a superset relationship, so we should + // get an exception here + SequenceDictionaryUtils.validateCRAMDictionaryAgainstReference(refDictionary, cramDictionary); + } + + @DataProvider(name = "SupersetData") + public Object[][] getSupersetData() { + return new Object[][] { + { Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19)}, //exactly same + { Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19) }, + { Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19) } + }; + } + + @Test(dataProvider = "SupersetData") + public void testCRAMValidationDoesAcceptSuperset( final List refDictionaryContigs, final List cramDictionaryContigs ) { + final SAMSequenceDictionary refDictionary = createSequenceDictionary(refDictionaryContigs); + final SAMSequenceDictionary cramDictionary = createSequenceDictionary(cramDictionaryContigs); + + //In these inputs , cram contigs are subsets of ref contigs and so it should be accepted + SequenceDictionaryUtils.validateCRAMDictionaryAgainstReference(refDictionary, cramDictionary); + } + + private SAMSequenceDictionary createSequenceDictionary( final List contigs ) { + final List clonedContigs = new ArrayList(contigs.size()); + + // Clone the individual SAMSequenceRecords to avoid contig-index issues with shared objects + // across multiple dictionaries in tests + for ( SAMSequenceRecord contig : contigs ) { + clonedContigs.add(contig.clone()); + } + + return new SAMSequenceDictionary(clonedContigs); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testGetContigNamesListExpectingException() { + getContigNamesList(null); + } + + @Test + public void testGetContigNamesList() { + + final SAMSequenceDictionary samSequenceDictionary = new SAMSequenceDictionary(Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37)); + + Assert.assertEquals(getContigNamesList(samSequenceDictionary), Arrays.asList("1", "2", "10")); + } +} \ No newline at end of file