From 8cf10bd9dc3bc1cdf7c65448f70ba0c79510b9c2 Mon Sep 17 00:00:00 2001 From: Ryan Amari Date: Wed, 21 Feb 2024 09:43:53 -0500 Subject: [PATCH] ALS-5819: Update vcf loader to use variable variant indecies --- .../data/genotype/BucketIndexBySample.java | 20 +- .../data/genotype/VariableVariantMasks.java | 206 ++++++++---------- .../hpds/data/genotype/VariantMask.java | 2 +- .../data/genotype/VariantMaskSparseImpl.java | 35 +-- .../FileBackedStorageVariantMasksImpl.java | 9 +- .../genotype/VariantMaskPerformanceTest.java | 17 +- .../VariantMasksSerializationTest.java | 6 +- .../etl/genotype/GenomicDatasetMerger.java | 28 ++- .../hpds/etl/genotype/NewVCFLoader.java | 49 +++-- 9 files changed, 179 insertions(+), 193 deletions(-) diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java index 341c4794..3b89a329 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/BucketIndexBySample.java @@ -43,7 +43,7 @@ public BucketIndexBySample(VariantStore variantStore, String storageDir) throws //Create a bucketList, containing keys for all buckets in the variantStore for(String contig: contigSet){ - FileBackedByteIndexedStorage> contigStore = variantStore.getVariantMaskStorage().get(contig); + FileBackedByteIndexedStorage> contigStore = variantStore.getVariantMaskStorage().get(contig); if(contigStore != null && contigStore.keys() != null) { bucketList.addAll(contigStore.keys().stream().map( (Integer bucket)->{ @@ -74,7 +74,7 @@ public BucketIndexBySample(VariantStore variantStore, String storageDir) throws patientBucketCharMasks[x] = emptyBucketMaskChar(); } contigSet.parallelStream().forEach((contig)->{ - FileBackedByteIndexedStorage> contigStore = + FileBackedByteIndexedStorage> contigStore = variantStore.getVariantMaskStorage().get(contig); if(contigStore != null && contigStore.keys() != null) { contigStore.keys().stream().forEach( @@ -82,22 +82,24 @@ public BucketIndexBySample(VariantStore variantStore, String storageDir) throws String bucketKey = contig + ":" + bucket; // Create a bitmask with 1 values for each patient who has any variant in this bucket - BigInteger[] patientMaskForBucket = {variantStore.emptyBitmask()}; - contigStore.get(bucket).values().forEach((VariantMasks masks)->{ + VariantMask[] patientMaskForBucket = {new VariantMaskSparseImpl(Set.of())}; + contigStore.get(bucket).values().forEach((VariableVariantMasks masks)->{ if(masks.heterozygousMask!=null) { - patientMaskForBucket[0] = patientMaskForBucket[0].or(masks.heterozygousMask); + patientMaskForBucket[0] = patientMaskForBucket[0].union(masks.heterozygousMask); } //add hetreo no call bits to mask if(masks.heterozygousNoCallMask!=null) { - patientMaskForBucket[0] = patientMaskForBucket[0].or(masks.heterozygousNoCallMask); + patientMaskForBucket[0] = patientMaskForBucket[0].union(masks.heterozygousNoCallMask); } if(masks.homozygousMask!=null) { - patientMaskForBucket[0] = patientMaskForBucket[0].or(masks.homozygousMask); + patientMaskForBucket[0] = patientMaskForBucket[0].union(masks.homozygousMask); } }); // For each patient set the patientBucketCharMask entry to 0 or 1 if they have a variant in the bucket. - int maxIndex = patientMaskForBucket[0].bitLength() - 1; + + // todo: implement for variant explorer + /*int maxIndex = patientMaskForBucket[0].bitLength() - 1; int indexOfBucket = Collections.binarySearch(bucketList, bucketKey) + 2; //patientBucketCharMasks has bookend bits for(int x = 2;x emptyBitmaskMap = new ConcurrentHashMap<>(); - private static int SPARSE_VARIANT_THRESHOLD = 5; + public static int SPARSE_VARIANT_THRESHOLD = 5; /* * These indices result from the char values of the 3 characters in a VCF sample @@ -68,121 +63,45 @@ public class VariableVariantMasks implements Serializable { // .|. = (46 + 124 + 46) % 7 = 6 public static final int HOMO_NOCALL_CHAR_INDEX = 6; - public VariableVariantMasks(String[] values) { - char[] homozygousBits = new char[values.length]; - char[] heterozygousBits = new char[values.length]; - char[] homozygousNoCallBits = new char[values.length]; - char[] heterozygousNoCallBits = new char[values.length]; - length = values.length; - boolean hasHetero = false; - boolean hasHomo = false; - boolean hasHeteroNoCall = false; - boolean hasHomoNoCall = false; - - for(int x = 0;x VariableVariantMasks.SPARSE_VARIANT_THRESHOLD) { + variantMask = new VariantMaskBitmaskImpl(bitmask); + } else { + Set patientIndexes = new HashSet<>(); + for(int i = 2; i < bitmask.bitLength() - 2; i++) { + if (bitmask.testBit(i)) { + patientIndexes.add(i); + } + } + variantMask = new VariantMaskSparseImpl(patientIndexes); + } + return variantMask; } public VariableVariantMasks() { } + public VariableVariantMasks(int length) { + this.length = length; + } @JsonProperty("ho") public VariantMask homozygousMask; @@ -252,14 +171,69 @@ public BigInteger appendMask(BigInteger mask1, int mask1Length, BigInteger mask2 public VariableVariantMasks append(VariableVariantMasks variantMasks) { VariableVariantMasks appendedMasks = new VariableVariantMasks(); - appendedMasks.homozygousMask = appendMask(this.homozygousMask, variantMasks.homozygousMask); - appendedMasks.heterozygousMask = appendMask(this.heterozygousMask, variantMasks.heterozygousMask); - appendedMasks.homozygousNoCallMask = appendMask(this.homozygousNoCallMask, variantMasks.homozygousNoCallMask); - appendedMasks.heterozygousNoCallMask = appendMask(this.heterozygousNoCallMask, variantMasks.heterozygousNoCallMask); + appendedMasks.homozygousMask = appendMask(this.homozygousMask, variantMasks.homozygousMask, this.length, variantMasks.length); + appendedMasks.heterozygousMask = appendMask(this.heterozygousMask, variantMasks.heterozygousMask, this.length, variantMasks.length); + appendedMasks.homozygousNoCallMask = appendMask(this.homozygousNoCallMask, variantMasks.homozygousNoCallMask, this.length, variantMasks.length); + appendedMasks.heterozygousNoCallMask = appendMask(this.heterozygousNoCallMask, variantMasks.heterozygousNoCallMask, this.length, variantMasks.length); return appendedMasks; } - private VariantMask appendMask(VariantMask homozygousMask, VariantMask homozygousMask1) { - return null; + private VariantMask appendMask(VariantMask variantMask1, VariantMask variantMask2, int length1, int length2) { + if (variantMask1 instanceof VariantMaskSparseImpl) { + if (variantMask2 instanceof VariantMaskSparseImpl) { + return append((VariantMaskSparseImpl) variantMask1, (VariantMaskSparseImpl) variantMask2, length1, length2); + } else if (variantMask2 instanceof VariantMaskBitmaskImpl) { + return append((VariantMaskSparseImpl) variantMask1, (VariantMaskBitmaskImpl) variantMask2, length1, length2); + } else { + throw new RuntimeException("Unknown VariantMask implementation"); + } + } + // todo: bitmask + else { + throw new RuntimeException("Unknown VariantMask implementation"); + } + } + + private VariantMask append(VariantMaskSparseImpl variantMask1, VariantMaskBitmaskImpl variantMask2, int length1, int length2) { + BigInteger mask1 = emptyBitmask(length1); + for (Integer patientId : variantMask1.patientIndexes) { + mask1 = mask1.setBit(patientId); + } + String binaryMask1 = mask1.toString(2); + String binaryMask2 = variantMask2.bitmask.toString(2); + String appendedString = binaryMask1.substring(0, binaryMask1.length() - 2) + + binaryMask2.substring(2); + return new VariantMaskBitmaskImpl(new BigInteger(appendedString, 2)); + } + private VariantMask append(VariantMaskSparseImpl variantMask1, VariantMaskSparseImpl variantMask2, int length1, int length2) { + if (variantMask1.patientIndexes.size() + variantMask2.patientIndexes.size() > SPARSE_VARIANT_THRESHOLD) { + // todo: performance test this vs byte array + BigInteger mask = emptyBitmask(length1 + length2); + for (Integer patientId : variantMask1.patientIndexes) { + mask = mask.setBit(patientId + 2); + } + // todo: explain this. it is not intuitive + for (Integer patientId : variantMask2.patientIndexes) { + mask = mask.setBit(patientId + length1 + 2); + } + return new VariantMaskBitmaskImpl(mask); + } + else { + Set patientIndexSet = new HashSet<>(); + patientIndexSet.addAll(variantMask1.patientIndexes); + patientIndexSet.addAll(variantMask2.patientIndexes); + return new VariantMaskSparseImpl(patientIndexSet); + } + } + +/* if (mask1 == null) { + mask1 = variantStore1.emptyBitmask(); + } + if (mask2 == null) { + mask2 = variantStore2.emptyBitmask(); } + String binaryMask1 = mask1.toString(2); + String binaryMask2 = mask2.toString(2); + String appendedString = binaryMask1.substring(0, binaryMask1.length() - 2) + + binaryMask2.substring(2);*/ } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java index 8379734c..ed828d67 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java @@ -23,7 +23,7 @@ public interface VariantMask { static VariantMask union(VariantMaskSparseImpl variantMaskSparse, VariantMaskBitmaskImpl variantMaskBitmask) { BigInteger union = variantMaskBitmask.bitmask; - for (Integer patientId : variantMaskSparse.patientIds) { + for (Integer patientId : variantMaskSparse.patientIndexes) { union = union.setBit(patientId + 2); } return new VariantMaskBitmaskImpl(union); diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java index b88efc33..8f0af1af 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java @@ -2,22 +2,20 @@ import com.fasterxml.jackson.annotation.JsonProperty; -import java.math.BigInteger; import java.util.HashSet; -import java.util.Objects; import java.util.Set; public class VariantMaskSparseImpl implements VariantMask { - @JsonProperty("ids") - protected Set patientIds; + @JsonProperty("p") + protected Set patientIndexes; - @JsonProperty("size") - private int size; + public VariantMaskSparseImpl(@JsonProperty("p") Set patientIndexes) { + this.patientIndexes = patientIndexes; + } - public VariantMaskSparseImpl(@JsonProperty("ids") Set patientIds, @JsonProperty("size") int size) { - this.patientIds = patientIds; - this.size = size; + public Set getPatientIndexes() { + return patientIndexes; } @Override @@ -37,21 +35,8 @@ public VariantMask union(VariantMask variantMask) { } private VariantMask union(VariantMaskSparseImpl variantMask) { - HashSet union = new HashSet<>(variantMask.patientIds); - union.addAll(this.patientIds); - return new VariantMaskSparseImpl(union, Math.max(variantMask.size, this.size)); - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - VariantMaskSparseImpl that = (VariantMaskSparseImpl) o; - return size == that.size && Objects.equals(patientIds, that.patientIds); - } - - @Override - public int hashCode() { - return Objects.hash(patientIds, size); + HashSet union = new HashSet<>(variantMask.patientIndexes); + union.addAll(this.patientIndexes); + return new VariantMaskSparseImpl(union); } } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/storage/FileBackedStorageVariantMasksImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/storage/FileBackedStorageVariantMasksImpl.java index 9d326a8c..bdd94a51 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/storage/FileBackedStorageVariantMasksImpl.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/storage/FileBackedStorageVariantMasksImpl.java @@ -1,6 +1,7 @@ package edu.harvard.hms.dbmi.avillach.hpds.data.storage; import com.fasterxml.jackson.core.type.TypeReference; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariableVariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJsonIndexStorage; @@ -9,17 +10,17 @@ import java.io.Serializable; import java.util.concurrent.ConcurrentHashMap; -public class FileBackedStorageVariantMasksImpl extends FileBackedJsonIndexStorage> implements Serializable { +public class FileBackedStorageVariantMasksImpl extends FileBackedJsonIndexStorage> implements Serializable { private static final long serialVersionUID = -1086729119489479152L; public FileBackedStorageVariantMasksImpl(File storageFile) throws FileNotFoundException { super(storageFile); } - private static final TypeReference> typeRef - = new TypeReference>() {}; + private static final TypeReference> typeRef + = new TypeReference>() {}; @Override - public TypeReference> getTypeReference() { + public TypeReference> getTypeReference() { return typeRef; } } diff --git a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskPerformanceTest.java b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskPerformanceTest.java index 803c9b94..aa470219 100644 --- a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskPerformanceTest.java +++ b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskPerformanceTest.java @@ -1,7 +1,5 @@ package edu.harvard.hms.dbmi.avillach.hpds.data.genotype; -import org.junit.jupiter.api.Test; - import java.math.BigInteger; import java.util.Random; import java.util.Set; @@ -18,9 +16,9 @@ public void test() { VariantMaskBitmaskImpl mask100k2 = new VariantMaskBitmaskImpl(generateRandomBitmask(100_000)); VariantMaskBitmaskImpl mask1m = new VariantMaskBitmaskImpl(generateRandomBitmask(1_000_000)); VariantMaskBitmaskImpl mask1m2 = new VariantMaskBitmaskImpl(generateRandomBitmask(1_000_000)); - VariantMaskSparseImpl sparseMask100k = new VariantMaskSparseImpl(Set.of(100, 200, 400, 50_000, 90_000), 100_000); - VariantMaskSparseImpl sparseMask100k2 = new VariantMaskSparseImpl(Set.of(100, 101, 200, 300, 400, 1000, 20_000, 30_000, 50_000, 90_000), 100_000); - VariantMaskSparseImpl sparseMask1m = new VariantMaskSparseImpl(Set.of(100, 200, 400, 50_000, 90_000, 300_000, 420_000, 555_555, 867_530, 999_999), 1_000_000); + VariantMaskSparseImpl sparseMask100k = new VariantMaskSparseImpl(Set.of(100, 200, 400, 50_000, 90_000)); + VariantMaskSparseImpl sparseMask100k2 = new VariantMaskSparseImpl(Set.of(100, 101, 200, 300, 400, 1000, 20_000, 30_000, 50_000, 90_000)); + VariantMaskSparseImpl sparseMask1m = new VariantMaskSparseImpl(Set.of(100, 200, 400, 50_000, 90_000, 300_000, 420_000, 555_555, 867_530, 999_999)); long time = System.currentTimeMillis(); for (int k = 0; k < 1000; k++) { @@ -32,13 +30,13 @@ public void test() { for (int k = 0; k < 1000; k++) { VariantMask and = sparseMask100k.union(mask100k); } - System.out.println(mask100k.getBitmask().bitLength() + " bitmask and " + sparseMask100k.patientIds.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); + System.out.println(mask100k.getBitmask().bitLength() + " bitmask and " + sparseMask100k.patientIndexes.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); time = System.currentTimeMillis(); for (int k = 0; k < 1000; k++) { VariantMask and = sparseMask100k2.union(mask100k); } - System.out.println(mask100k.getBitmask().bitLength() + " bitmask and " + sparseMask100k2.patientIds.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); + System.out.println(mask100k.getBitmask().bitLength() + " bitmask and " + sparseMask100k2.patientIndexes.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); time = System.currentTimeMillis(); for (int k = 0; k < 1000; k++) { @@ -50,17 +48,18 @@ public void test() { for (int k = 0; k < 1000; k++) { VariantMask and = sparseMask100k.union(mask1m); } - System.out.println(mask1m.getBitmask().bitLength() + " bitmask and " + sparseMask100k.patientIds.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); + System.out.println(mask1m.getBitmask().bitLength() + " bitmask and " + sparseMask100k.patientIndexes.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); time = System.currentTimeMillis(); for (int k = 0; k < 1000; k++) { VariantMask and = sparseMask1m.union(mask1m); } - System.out.println(mask1m.getBitmask().bitLength() + " bitmask and " + sparseMask1m.patientIds.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); + System.out.println(mask1m.getBitmask().bitLength() + " bitmask and " + sparseMask1m.patientIndexes.size() + " sparse union completed in " + (System.currentTimeMillis() - time) + " ms"); } private BigInteger generateRandomBitmask(int patients) { return new BigInteger(patients + 4, new Random()); } + } diff --git a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMasksSerializationTest.java b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMasksSerializationTest.java index 5879b809..6bdbb3fc 100644 --- a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMasksSerializationTest.java +++ b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMasksSerializationTest.java @@ -52,9 +52,9 @@ public void testFieldMaxLength() throws JsonProcessingException { @Test public void testVariableVariantMasks() throws JsonProcessingException { VariableVariantMasks variableVariantMasks = new VariableVariantMasks(); - variableVariantMasks.heterozygousMask = new VariantMaskSparseImpl(Set.of(1, 2, 3), 1000); - variableVariantMasks.homozygousMask = new VariantMaskBitmaskImpl(new BigInteger("1010101010101010")); - variableVariantMasks.heterozygousNoCallMask = new VariantMaskSparseImpl(Set.of(), 1000); + variableVariantMasks.heterozygousMask = new VariantMaskSparseImpl(Set.of(1, 2, 3)); + variableVariantMasks.homozygousMask = new VariantMaskBitmaskImpl(new BigInteger("1101010101010101011")); + variableVariantMasks.heterozygousNoCallMask = new VariantMaskSparseImpl(Set.of()); variableVariantMasks.homozygousNoCallMask = null; ObjectMapper objectMapper = new ObjectMapper(); diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index d4f51026..09501345 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -235,7 +235,7 @@ public FileBackedJsonIndexStorage entry : masks2.entrySet()) { if (!mergedMasks.containsKey(entry.getKey())) { - // todo: store empty variant masks as sparse to avoid this awkward null check - mergedMasks.put(entry.getKey(), append(new VariantMasks(), entry.getValue())); + mergedMasks.put(entry.getKey(), new VariableVariantMasks(variantStore1.getPatientIds().length).append(entry.getValue())); } } merged.put(key, mergedMasks); @@ -255,10 +254,31 @@ public FileBackedJsonIndexStorage masks2 = variantMaskStorage2.get(key); for (Map.Entry entry : masks2.entrySet()) { if (!mergedMasks.containsKey(entry.getKey())) { - mergedMasks.put(entry.getKey(), append(new VariableVariantMasks(), entry.getValue())); + mergedMasks.put(entry.getKey(), new VariableVariantMasks(variantStore1.getPatientIds().length).append(entry.getValue())); } } }); return merged; } + + /** + * Appends one mask to another. This assumes the masks are both padded with '11' on each end + * to prevent overflow issues. + */ + /*public BigInteger appendMask(BigInteger mask1, BigInteger mask2) { + if (mask1 == null && mask2 == null) { + return null; + } + if (mask1 == null) { + mask1 = variantStore1.emptyBitmask(); + } + if (mask2 == null) { + mask2 = variantStore2.emptyBitmask(); + } + String binaryMask1 = mask1.toString(2); + String binaryMask2 = mask2.toString(2); + String appendedString = binaryMask1.substring(0, binaryMask1.length() - 2) + + binaryMask2.substring(2); + return new BigInteger(appendedString, 2); + }*/ } diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java index 7727aba2..43f6c351 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java @@ -8,6 +8,7 @@ import java.util.stream.Collectors; import java.util.zip.GZIPOutputStream; +import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.*; import edu.harvard.hms.dbmi.avillach.hpds.data.storage.FileBackedStorageVariantMasksImpl; import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJsonIndexStorage; import org.apache.commons.csv.CSVFormat; @@ -16,9 +17,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.InfoStore; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantMasks; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.VariantStore; import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedByteIndexedStorage; import htsjdk.samtools.util.BlockCompressedInputStream; @@ -60,7 +58,7 @@ public static void main(String[] args) throws FileNotFoundException, IOException private static HashMap zygosityMaskStrings; - private static TreeMap>> variantMaskStorage = new TreeMap<>(); + private static TreeMap>> variantMaskStorage = new TreeMap<>(); private static long startTime; @@ -180,7 +178,7 @@ private static void loadVCFs(File indexFile) throws IOException { int[] count = { 0 }; for (String contig : store.getVariantMaskStorage().keySet()) { ArrayList chunkIds = new ArrayList<>(); - FileBackedByteIndexedStorage> chromosomeStorage = store.getVariantMaskStorage() + FileBackedByteIndexedStorage> chromosomeStorage = store.getVariantMaskStorage() .get(contig); if (chromosomeStorage != null) { // print out the top and bottom 50 variants in the store (that have masks) @@ -188,11 +186,11 @@ private static void loadVCFs(File indexFile) throws IOException { for (Integer chunkId : chunkIds) { for (String variantSpec : chromosomeStorage.get(chunkId).keySet()) { count[0]++; - VariantMasks variantMasks = chromosomeStorage.get(chunkId).get(variantSpec); + VariableVariantMasks variantMasks = chromosomeStorage.get(chunkId).get(variantSpec); if (variantMasks != null) { - BigInteger heterozygousMask = variantMasks.heterozygousMask; + VariantMask heterozygousMask = variantMasks.heterozygousMask; String heteroIdList = sampleIdsForMask(allSampleIds, heterozygousMask); - BigInteger homozygousMask = variantMasks.homozygousMask; + VariantMask homozygousMask = variantMasks.homozygousMask; String homoIdList = sampleIdsForMask(allSampleIds, homozygousMask); if (!heteroIdList.isEmpty() && heteroIdList.length() < 1000) @@ -210,11 +208,11 @@ private static void loadVCFs(File indexFile) throws IOException { int chunkId = chunkIds.get(x); chromosomeStorage.get(chunkId).keySet().forEach((variantSpec) -> { count[0]++; - VariantMasks variantMasks = chromosomeStorage.get(chunkId).get(variantSpec); + VariableVariantMasks variantMasks = chromosomeStorage.get(chunkId).get(variantSpec); if (variantMasks != null) { - BigInteger heterozygousMask = variantMasks.heterozygousMask; + VariantMask heterozygousMask = variantMasks.heterozygousMask; String heteroIdList = sampleIdsForMask(allSampleIds, heterozygousMask); - BigInteger homozygousMask = variantMasks.homozygousMask; + VariantMask homozygousMask = variantMasks.homozygousMask; String homoIdList = sampleIdsForMask(allSampleIds, homozygousMask); if (!heteroIdList.isEmpty() && heteroIdList.length() < 1000) @@ -234,12 +232,19 @@ private static void loadVCFs(File indexFile) throws IOException { saveVariantStore(store, variantMaskStorage); } - private static String sampleIdsForMask(String[] sampleIds, BigInteger heterozygousMask) { + private static String sampleIdsForMask(String[] sampleIds, VariantMask variantMask) { String idList = ""; - if (heterozygousMask != null) { - for (int x = 2; x < heterozygousMask.bitLength() - 2; x++) { - if (heterozygousMask.testBit(heterozygousMask.bitLength() - 1 - x)) { - idList += sampleIds[x - 2] + ","; + if (variantMask != null) { + if (variantMask instanceof VariantMaskBitmaskImpl) { + BigInteger mask = ((VariantMaskBitmaskImpl) variantMask).getBitmask(); + for (int x = 2; x < mask.bitLength() - 2; x++) { + if (mask.testBit(mask.bitLength() - 1 - x)) { + idList += sampleIds[x - 2] + ","; + } + } + } else if (variantMask instanceof VariantMaskSparseImpl) { + for (Integer patientIndex : ((VariantMaskSparseImpl) variantMask).getPatientIndexes()) { + idList += sampleIds[patientIndex] + ","; } } } @@ -276,7 +281,7 @@ private static void flipChunk(String lastContigProcessed, int lastChunkProcessed } if (!currentContig.contentEquals(lastContigProcessed) || currentChunk > lastChunkProcessed || isLastChunk) { // flip chunk - TreeMap>> variantMaskStorage_f = variantMaskStorage; + TreeMap>> variantMaskStorage_f = variantMaskStorage; HashMap zygosityMaskStrings_f = zygosityMaskStrings; String lastContigProcessed_f = lastContigProcessed; int lastChunkProcessed_f = lastChunkProcessed; @@ -306,10 +311,10 @@ private static void flipChunk(String lastContigProcessed, int lastChunkProcessed } private static void saveVariantStore(VariantStore store, - TreeMap>> variantMaskStorage) + TreeMap>> variantMaskStorage) throws IOException, FileNotFoundException { store.setVariantMaskStorage(variantMaskStorage); - for (FileBackedByteIndexedStorage> storage : variantMaskStorage + for (FileBackedByteIndexedStorage> storage : variantMaskStorage .values()) { if (storage != null) storage.complete(); @@ -361,11 +366,11 @@ private static void shutdownChunkWriteExecutor() { } } - private static ConcurrentHashMap convertLoadingMapToMaskMap( + private static ConcurrentHashMap convertLoadingMapToMaskMap( HashMap zygosityMaskStrings_f) { - ConcurrentHashMap maskMap = new ConcurrentHashMap<>(zygosityMaskStrings_f.size()); + ConcurrentHashMap maskMap = new ConcurrentHashMap<>(zygosityMaskStrings_f.size()); zygosityMaskStrings_f.entrySet().parallelStream().forEach((entry) -> { - maskMap.put(entry.getKey(), new VariantMasks(entry.getValue())); + maskMap.put(entry.getKey(), new VariableVariantMasks(entry.getValue())); }); return maskMap; }