From 2223f66978fdcf9affa0b6fb365ba853fc8eff9b Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Tue, 18 Jun 2024 10:26:54 -0400 Subject: [PATCH] ALS-6330: Fix variant metadata merging bug, refactor variant model, add tests --- .../data/genotype/VariantMetadataIndex.java | 26 +++++++++++-------- .../hpds/processing/GenomicProcessor.java | 2 +- .../hpds/processing/GenomicProcessorNoOp.java | 2 +- .../processing/GenomicProcessorNodeImpl.java | 2 +- .../GenomicProcessorParentImpl.java | 14 +++------- ...omicProcessorPatientMergingParentImpl.java | 4 +-- .../hpds/processing/VariantListProcessor.java | 12 ++++----- .../genomic/GenomicProcessorRestClient.java | 2 +- .../GenomicProcessorParentImplTest.java | 25 +++++++++++++----- 9 files changed, 50 insertions(+), 39 deletions(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndex.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndex.java index 12315e96..f401c3b0 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndex.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMetadataIndex.java @@ -63,7 +63,7 @@ public VariantMetadataIndex() throws IOException { * @param variantSpec * @return */ - public String[] findBySingleVariantSpec(String variantSpec, VariantBucketHolder bucketCache) { + public Set findBySingleVariantSpec(String variantSpec, VariantBucketHolder bucketCache) { try { String[] segments = variantSpec.split(","); if (segments.length < 2) { @@ -78,7 +78,7 @@ public String[] findBySingleVariantSpec(String variantSpec, VariantBucketHolder< || chrOffset != bucketCache.lastChunkOffset) { FileBackedByteIndexedStorage> ContigFbbis = indexMap.get(contig); if(ContigFbbis == null) { - return new String[0]; + return Set.of(); } bucketCache.lastValue = ContigFbbis.get(chrOffset); bucketCache.lastContig = contig; @@ -88,20 +88,20 @@ public String[] findBySingleVariantSpec(String variantSpec, VariantBucketHolder< if( bucketCache.lastValue != null) { if(bucketCache.lastValue.get(variantSpec) == null) { log.warn("No variant data found for spec " + variantSpec); - return new String[0]; + return Set.of(); } - return bucketCache.lastValue.get(variantSpec); + return Set.of(bucketCache.lastValue.get(variantSpec)); } log.warn("No bucket found for spec " + variantSpec + " in bucket " + chrOffset); - return new String[0]; + return Set.of(); } catch (UncheckedIOException e) { log.warn("IOException caught looking up variantSpec : " + variantSpec, e); - return new String[0]; + return Set.of(); } } - public Map findByMultipleVariantSpec(Collection varientSpecList) { + public Map> findByMultipleVariantSpec(Collection varientSpecList) { // log.debug("SPEC list " + varientSpecList.size() + " :: " + Arrays.deepToString(varientSpecList.toArray())); VariantBucketHolder bucketCache = new VariantBucketHolder(); @@ -164,7 +164,7 @@ public synchronized void flush() throws IOException { if(contigFbbis == null) { log.info("creating new file for " + contig); String filePath = fileStoragePrefix + "_" + contig + ".bin"; - contigFbbis = new FileBackedJavaIndexedStorage(Integer.class, (Class>)(Class) ConcurrentHashMap.class, new File(filePath)); + contigFbbis = new FileBackedJavaIndexedStorage(Integer.class, ConcurrentHashMap.class, new File(filePath)); indexMap.put(contig, contigFbbis); } @@ -221,20 +221,24 @@ public static void merge(VariantMetadataIndex variantMetadataIndex1, VariantMeta String filePath = outputDirectory + VARIANT_METADATA_STORAGE_FILE_PREFIX + "_" + contig + ".bin"; FileBackedByteIndexedStorage> mergedFbbis = new FileBackedJavaIndexedStorage(Integer.class, ConcurrentHashMap.class, new File(filePath)); + Map> mergedMockFbbis = new HashMap<>(); + FileBackedByteIndexedStorage> fbbis1 = variantMetadataIndex1.indexMap.get(contig); FileBackedByteIndexedStorage> fbbis2 = variantMetadataIndex2.indexMap.get(contig); fbbis1.keys().forEach(key -> { - mergedFbbis.put(key, fbbis1.get(key)); + mergedMockFbbis.put(key, fbbis1.get(key)); }); fbbis2.keys().forEach(key -> { - ConcurrentHashMap metadataMap = mergedFbbis.get(key); + ConcurrentHashMap metadataMap = mergedMockFbbis.get(key); if (metadataMap == null) { - mergedFbbis.put(key, fbbis2.get(key)); + mergedMockFbbis.put(key, fbbis2.get(key)); } else { metadataMap.putAll(fbbis2.get(key)); } }); + + mergedMockFbbis.forEach(mergedFbbis::put); mergedFbbis.complete(); merged.indexMap.put(contig, mergedFbbis); } diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessor.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessor.java index f2a2c7f2..157b6b11 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessor.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessor.java @@ -30,5 +30,5 @@ public interface GenomicProcessor { List getInfoColumnMeta(); // todo: make the map value a Set instead of array - Map getVariantMetadata(Collection variantList); + Map> getVariantMetadata(Collection variantList); } diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNoOp.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNoOp.java index 653a031d..66addb0c 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNoOp.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNoOp.java @@ -57,7 +57,7 @@ public List getInfoColumnMeta() { } @Override - public Map getVariantMetadata(Collection variantList) { + public Map> getVariantMetadata(Collection variantList) { return null; } } diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNodeImpl.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNodeImpl.java index 64ba4f10..3321f4ca 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNodeImpl.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorNodeImpl.java @@ -402,7 +402,7 @@ public List getInfoColumnMeta() { } @Override - public Map getVariantMetadata(Collection variantList) { + public Map> getVariantMetadata(Collection variantList) { return variantMetadataIndex.findByMultipleVariantSpec(variantList); } } diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImpl.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImpl.java index 67754f06..e94b8687 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImpl.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImpl.java @@ -141,9 +141,7 @@ public List getInfoColumnMeta() { } @Override - public Map getVariantMetadata(Collection variantList) { - // this is overly complicated because of the array type. - // todo: update this when we change the method signature from array to set + public Map> getVariantMetadata(Collection variantList) { ConcurrentHashMap> result = new ConcurrentHashMap<>(); nodes.stream() .map(node -> node.getVariantMetadata(variantList)) @@ -151,17 +149,13 @@ public Map getVariantMetadata(Collection variantList) variantMap.entrySet().forEach(entry -> { Set metadata = result.get(entry.getKey()); if (metadata != null) { - metadata.addAll(Set.of(entry.getValue())); + metadata.addAll(entry.getValue()); } else { - result.put(entry.getKey(), new HashSet<>(Set.of(entry.getValue()))); + result.put(entry.getKey(), new HashSet<>(entry.getValue())); } }); }); - return result.entrySet().stream() - .collect(Collectors.toMap( - Map.Entry::getKey, - entry -> entry.getValue().toArray(new String[] {}) - )); + return result; } private List initInfoColumnsMeta() { diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImpl.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImpl.java index 5c658267..42d65570 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImpl.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorPatientMergingParentImpl.java @@ -160,11 +160,11 @@ public List getInfoColumnMeta() { } @Override - public Map getVariantMetadata(Collection variantList) { + public Map> getVariantMetadata(Collection variantList) { return nodes.parallelStream() .map(node -> node.getVariantMetadata(variantList)) .reduce((p1, p2) -> { - Map mapCopy = new HashMap<>(p1); + Map> mapCopy = new HashMap<>(p1); mapCopy.putAll(p2); return mapCopy; }).orElseGet(Map::of); diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantListProcessor.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantListProcessor.java index 99113111..de77d1fe 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantListProcessor.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/VariantListProcessor.java @@ -79,7 +79,7 @@ public void runQuery(Query query, AsyncResult asyncResult) * * This should not actually do any filtering based on bitmasks, just INFO columns. * - * @param incomingQuery + * @param query * @return a List of VariantSpec strings that would be eligible to filter patients if the incomingQuery was run as a COUNT query. * @throws IOException */ @@ -96,7 +96,7 @@ public String runVariantListQuery(Query query) throws IOException { /** * Process only variantInfoFilters to count the number of variants that would be included in evaluating the query. * - * @param incomingQuery + * @param query * @return the number of variants that would be used to filter patients if the incomingQuery was run as a COUNT query. * @throws IOException */ @@ -119,7 +119,7 @@ public int runVariantCount(Query query) throws IOException { * The default patientId header value can be overridden by passing the ID_CUBE_NAME environment variable to * the java VM. * - * @param Query A VCF_EXCERPT type query + * @param query A VCF_EXCERPT type query * @param includePatientData whether to include patient specific data * @return A Tab-separated string with one line per variant and one column per patient (plus variant data columns) * @throws IOException @@ -141,12 +141,12 @@ public String runVcfExcerptQuery(Query query, boolean includePatientData) throws log.debug("variantList Size " + variantList.size()); - Map metadata = genomicProcessor.getVariantMetadata(variantList); + Map> metadata = genomicProcessor.getVariantMetadata(variantList); log.debug("metadata size " + metadata.size()); // Sort the variantSpecs so that the user doesn't lose their mind - TreeMap metadataSorted = new TreeMap<>((o1, o2) -> { + TreeMap> metadataSorted = new TreeMap<>((o1, o2) -> { return new VariantSpec(o1).compareTo(new VariantSpec(o2)); }); metadataSorted.putAll(metadata); @@ -217,7 +217,7 @@ public String runVcfExcerptQuery(Query query, boolean includePatientData) throws VariantBucketHolder variantMaskBucketHolder = new VariantBucketHolder(); //loop over the variants identified, and build an output row - metadata.forEach((String variantSpec, String[] variantMetadata)->{ + metadata.forEach((String variantSpec, Set variantMetadata)->{ String[] variantDataColumns = variantSpec.split(","); //4 fixed columns in variant ID (CHROM POSITION REF ALT) diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/genomic/GenomicProcessorRestClient.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/genomic/GenomicProcessorRestClient.java index f751b74b..8ecfc0f4 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/genomic/GenomicProcessorRestClient.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/genomic/GenomicProcessorRestClient.java @@ -114,7 +114,7 @@ public List getInfoColumnMeta() { } @Override - public Map getVariantMetadata(Collection variantList) { + public Map> getVariantMetadata(Collection variantList) { throw new RuntimeException("Not implemented yet"); } } diff --git a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImplTest.java b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImplTest.java index 1a015b88..8d8c2fff 100644 --- a/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImplTest.java +++ b/processing/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/processing/GenomicProcessorParentImplTest.java @@ -129,16 +129,29 @@ public void getVariantList_oneNode_returnVariants() { assertEquals(Set.of("variant1", "variant2"), variantList); } + @Test + public void getVariantMetadata_mixedEmptyVariants_mergedCorrectly() { + List variantList = List.of("variant1", "variant2", "variant3"); + when(mockProcessor1.getVariantMetadata(variantList)).thenReturn(Map.of()); + when(mockProcessor2.getVariantMetadata(variantList)).thenReturn(Map.of("variant1", Set.of("metadata1", "metadata2"))); + when(mockProcessor3.getVariantMetadata(variantList)).thenReturn(Map.of("variant3", Set.of("metadata31", "metadata32"))); + + Map> variantMetadata = parentProcessor.getVariantMetadata(variantList); + assertEquals(Set.of("metadata1", "metadata2"), variantMetadata.get("variant1")); + assertEquals(Set.of("metadata31", "metadata32"), variantMetadata.get("variant3")); + assertEquals(2, variantMetadata.size()); + } + @Test public void getVariantMetadata_overlappingVariants_mergedCorrectly() { List variantList = List.of("variant1", "variant2", "variant3"); - when(mockProcessor1.getVariantMetadata(variantList)).thenReturn(Map.of("variant1", new String[]{"metadata1", "metadata2"})); - when(mockProcessor2.getVariantMetadata(variantList)).thenReturn(Map.of("variant1", new String[]{"metadata1", "metadata3"})); - when(mockProcessor3.getVariantMetadata(variantList)).thenReturn(Map.of("variant3", new String[]{"metadata31", "metadata32"})); + when(mockProcessor1.getVariantMetadata(variantList)).thenReturn(Map.of("variant1", Set.of("metadata1", "metadata2"))); + when(mockProcessor2.getVariantMetadata(variantList)).thenReturn(Map.of("variant1", Set.of("metadata1", "metadata3"))); + when(mockProcessor3.getVariantMetadata(variantList)).thenReturn(Map.of("variant3", Set.of("metadata31", "metadata32"))); - Map variantMetadata = parentProcessor.getVariantMetadata(variantList); - assertEquals(Set.of("metadata1", "metadata2", "metadata3"), Set.of(variantMetadata.get("variant1"))); - assertEquals(Set.of("metadata31", "metadata32"), Set.of(variantMetadata.get("variant3"))); + Map> variantMetadata = parentProcessor.getVariantMetadata(variantList); + assertEquals(Set.of("metadata1", "metadata2", "metadata3"), variantMetadata.get("variant1")); + assertEquals(Set.of("metadata31", "metadata32"), variantMetadata.get("variant3")); assertEquals(2, variantMetadata.size()); } } \ No newline at end of file