diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java index 563df590..6afb62f1 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasks.java @@ -150,15 +150,6 @@ public static BigInteger emptyBitmask(int length) { return emptyBitmask; } - /*public VariableVariantMasks append(VariableVariantMasks variantMasks) { - VariableVariantMasks appendedMasks = new VariableVariantMasks(); - appendedMasks.homozygousMask = appendMask(this.homozygousMask, variantMasks.homozygousMask, this.length, variantMasks.length); - appendedMasks.heterozygousMask = appendMask(this.heterozygousMask, variantMasks.heterozygousMask, this.length, variantMasks.length); - appendedMasks.homozygousNoCallMask = appendMask(this.homozygousNoCallMask, variantMasks.homozygousNoCallMask, this.length, variantMasks.length); - appendedMasks.heterozygousNoCallMask = appendMask(this.heterozygousNoCallMask, variantMasks.heterozygousNoCallMask, this.length, variantMasks.length); - return appendedMasks; - }*/ - public static VariableVariantMasks append(VariableVariantMasks masks1, int length1, VariableVariantMasks masks2, int length2) { VariableVariantMasks appendedMasks = new VariableVariantMasks(); appendedMasks.homozygousMask = appendMask(masks1.homozygousMask, masks2.homozygousMask, length1, length2); @@ -204,11 +195,16 @@ else if (variantMask1 instanceof VariantMaskBitmaskImpl) { private static VariantMask append(VariantMaskSparseImpl variantMask1, VariantMaskBitmaskImpl variantMask2, int length1, int length2) { BigInteger mask1 = emptyBitmask(length1); - for (Integer patientId : variantMask1.patientIndexes) { - mask1 = mask1.setBit(patientId); + for (Integer patientIndex : variantMask1.patientIndexes) { + mask1 = mask1.setBit(patientIndex + 2); } String binaryMask1 = mask1.toString(2); + String binaryMask2 = variantMask2.bitmask.toString(2); + if (binaryMask2.length() - 4 != length2) { + throw new IllegalArgumentException("Bitmask does not match length (" + length2 + "): " + variantMask2.bitmask); + } + String appendedString = binaryMask2.substring(0, binaryMask2.length() - 2) + binaryMask1.substring(2); return new VariantMaskBitmaskImpl(new BigInteger(appendedString, 2)); @@ -216,10 +212,13 @@ private static VariantMask append(VariantMaskSparseImpl variantMask1, VariantMas private static VariantMask append(VariantMaskBitmaskImpl variantMask1, VariantMaskSparseImpl variantMask2, int length1, int length2) { String binaryMask1 = variantMask1.bitmask.toString(2); + if (binaryMask1.length() - 4 != length1) { + throw new IllegalArgumentException("Bitmask does not match length (" + length1 + "): " + variantMask1.bitmask); + } BigInteger mask2 = emptyBitmask(length2); for (Integer patientId : variantMask2.patientIndexes) { - mask2 = mask2.setBit(patientId); + mask2 = mask2.setBit(patientId + 2); } String binaryMask2 = mask2.toString(2); @@ -232,9 +231,17 @@ private static VariantMask append(VariantMaskBitmaskImpl variantMask1, VariantMa String binaryMask1 = variantMask1.bitmask.toString(2); String binaryMask2 = variantMask2.bitmask.toString(2); + if (binaryMask1.length() - 4 != length1) { + throw new IllegalArgumentException("Bitmask does not match length (" + length1 + "): " + variantMask1.bitmask); + } + if (binaryMask2.length() - 4 != length2) { + throw new IllegalArgumentException("Bitmask does not match length (" + length2 + "): " + variantMask2.bitmask); + } + String appendedString = binaryMask2.substring(0, binaryMask2.length() - 2) + binaryMask1.substring(2); - return new VariantMaskBitmaskImpl(new BigInteger(appendedString, 2)); + BigInteger bitmask = new BigInteger(appendedString, 2); + return new VariantMaskBitmaskImpl(bitmask); } private static VariantMask append(VariantMaskSparseImpl variantMask1, VariantMaskSparseImpl variantMask2, int length1, int length2) { @@ -259,27 +266,4 @@ private static VariantMask append(VariantMaskSparseImpl variantMask1, VariantMas } } - public static Set patientMaskToPatientIdSet(VariantMask patientMask, List patientIds) { - if (patientMask instanceof VariantMaskBitmaskImpl) { - Set ids = new HashSet<>(); - BigInteger bigInteger = ((VariantMaskBitmaskImpl) patientMask).getBitmask(); - for(int x = 0;x < bigInteger.bitLength()-4;x++) { - if(patientMask.testBit(x)) { - String patientId = patientIds.get(x).trim(); - ids.add(Integer.parseInt(patientId)); - } - } - return ids; - } else if (patientMask instanceof VariantMaskSparseImpl) { - return ((VariantMaskSparseImpl) patientMask).getPatientIndexes().stream() - .map(patientIds::get) - .map(String::trim) - .map(Integer::parseInt) - .collect(Collectors.toSet()); - } - throw new IllegalArgumentException("Unknown VariantMask implementation"); - } - - - } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java index c170dce6..86d06472 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMask.java @@ -4,6 +4,7 @@ import com.fasterxml.jackson.annotation.JsonTypeInfo; import java.math.BigInteger; +import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -30,4 +31,6 @@ public interface VariantMask { static VariantMask emptyInstance() { return new VariantMaskSparseImpl(Set.of()); } + + Set patientMaskToPatientIdSet(List patientIds); } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java index 316f9e9c..3df430bc 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskBitmaskImpl.java @@ -6,7 +6,10 @@ import com.fasterxml.jackson.databind.ser.std.ToStringSerializer; import java.math.BigInteger; +import java.util.HashSet; +import java.util.List; import java.util.Objects; +import java.util.Set; public class VariantMaskBitmaskImpl implements VariantMask { @@ -60,6 +63,18 @@ public int bitCount() { return bitmask.bitCount(); } + @Override + public Set patientMaskToPatientIdSet(List patientIds) { + Set ids = new HashSet<>(); + for(int x = 0;x < bitmask.bitLength()-4;x++) { + if(testBit(x + 2)) { + String patientId = patientIds.get(x).trim(); + ids.add(Integer.parseInt(patientId)); + } + } + return ids; + } + private VariantMask union(VariantMaskBitmaskImpl variantMaskBitmask) { return new VariantMaskBitmaskImpl(variantMaskBitmask.bitmask.or(this.bitmask)); } diff --git a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java index 0b72c63a..fb5e2a6a 100644 --- a/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java +++ b/data/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariantMaskSparseImpl.java @@ -4,6 +4,7 @@ import java.math.BigInteger; import java.util.HashSet; +import java.util.List; import java.util.Objects; import java.util.Set; import java.util.stream.Collectors; @@ -49,6 +50,15 @@ public int bitCount() { return patientIndexes.size(); } + @Override + public Set patientMaskToPatientIdSet(List patientIds) { + return patientIndexes.stream() + .map(patientIds::get) + .map(String::trim) + .map(Integer::parseInt) + .collect(Collectors.toSet()); + } + private VariantMask union(VariantMaskSparseImpl variantMask) { HashSet union = new HashSet<>(variantMask.patientIndexes); union.addAll(this.patientIndexes); diff --git a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasksTest.java b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasksTest.java index 6b0c0011..6d00f538 100644 --- a/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasksTest.java +++ b/data/src/test/java/edu/harvard/hms/dbmi/avillach/hpds/data/genotype/VariableVariantMasksTest.java @@ -3,20 +3,93 @@ import org.junit.jupiter.api.Test; import java.math.BigInteger; +import java.util.Set; import static org.junit.jupiter.api.Assertions.*; class VariableVariantMasksTest { @Test - void appendMask() { + void appendMask_bitmaskImpl() { BigInteger bitmask1 = new BigInteger("141128266537977664750676276621047930360647583370951385898970922432171641921591683601425661017874251620531446414001258481539117879583763813143907296662202568713257464719895142475222938754588184582833582945368733841208297077207159012919514593576142503835920156756519924916168242089239884330492690734761017788145472426679068150988914755013607752960816306592042734726724967292489382922774340753414934719473579642086577795740766564256929075326123036793589965882761953873677505969126336030099232419417531157749845223962002657158936982730111297974575206081858525359109608528262092083167977823315667519770531233235502931655993522963196732342798609807510543882273123542784003837220002062547832711681922161218010724226003150184777238854967623299970205873235909960398972219305810619458430315886318451687126866584390511424196256376003247717440484524668930504599413091194844245946142655802347109092922312106655798606759916931458730259588797621736772069847398957694392864416289605234581063232645315000563794292589544477569850783743632035729362609737922579552116678396740867191143283379987783383546964540333832022555110531730782339840166704631278663393147496643156685747417468473443887224102514355150110201463928311558935840684668441340082266117306282514075566574248767903320202249460451453420775463375086967869642395460246215345716051454824012618762098616749770441010729935100690497726813080659572063353154599964806728241328892738034442472863741591198849392657539785115946383059858121434147389809099727185407701653069953116828987459279642700621389676861517273949473071786975738343769782464350475716221573322857987917669867491560145456280976454077958400126376423873081869441464609193216436634914151839730185428390533788824490041131307997850622739795832813220163889874229711053167483400593799762457890098932032534121358716659875839129363445294836150352418351095797845291006220927734160887236196809213101473795"); VariantMask mask1 = new VariantMaskBitmaskImpl(bitmask1); BigInteger bitmask2 = new BigInteger("30986998537042903080350519548408047200882037374471443267447678987916097706689940814664693588471382927433911322922595655683"); VariantMask mask2 = new VariantMaskBitmaskImpl(bitmask2); BigInteger expected = new BigInteger("364428449062309565827240887371711117987574656019257508470944944605332301145049360947562613021825062352285550919581217473685332023988542714833984833584283993391279871691386067405572061472644828810015780133411925154335939452908415258964249206427971383684062861395477403040282307619178797392994855011010859741065713642937722555386553472167057412349499978682577859360114784650082282703011601887907648540462571623667509259030122346885116531561265495702316349933315144109402391832141090339015610768872781857606433548246400953322494469603219947770538558126916689855755477351331412849701993441724311536424509298383050084506206211767917732037267909332303742403672014600658318302593423194411973298402659688783532220840192809967726287753517722498547930875524876017589557370530361981799944335286739781131150443454009586308444310074980028948209472950776329093810066130759876455221806685349198404676070598652587630369814761497643023695920944554335463560556407863991874650393356359742532476840044738380631950001000889884841365536456755318891308758448828432322546827241786950989285280451703830359048521243704376576346418597615317052525572950500220422542100551941365526524572447973251506942320379483402120949518538372004537133062107782710822000749441206410146350956094013529342546401955361956752745578930129151472060056501664176391306323733630924234020572479814310832686356211870629168524382442619274880885230138170761194502147197237097179031772802871639774718532255582374871483153552524216278948609829889298505566944209429410943844015411379772232511986448333683276938151761604643176820833900087800015896793760190738573447504231241253241247411492029282990960833205840126804844971397564707229362865231720213379844987578324282466174232636950775929742330163059024377711272017786328444659381735736216404131832568759328992585474109193095809959345744181054394252160618715444166335074662636279289524790651946664188322635565248044921668500523867959008028963420119948417112017996816651287475428019005246560397815063629879127215220996440067"); - VariantMask merged = VariableVariantMasks.appendMask(mask1, mask2, 0, 0); + // length should equal the length of the binary representation - 4. Bitmasks are padded with 11 on each end + VariantMask merged = VariableVariantMasks.appendMask(mask1, mask2, 6282, 400); assertEquals(expected, ((VariantMaskBitmaskImpl)merged).getBitmask()); } + @Test + void appendMask_bitmaskImpl_invalidLength() { + BigInteger bitmask1 = new BigInteger("141128266537977664750676276621047930360647583370951385898970922432171641921591683601425661017874251620531446414001258481539117879583763813143907296662202568713257464719895142475222938754588184582833582945368733841208297077207159012919514593576142503835920156756519924916168242089239884330492690734761017788145472426679068150988914755013607752960816306592042734726724967292489382922774340753414934719473579642086577795740766564256929075326123036793589965882761953873677505969126336030099232419417531157749845223962002657158936982730111297974575206081858525359109608528262092083167977823315667519770531233235502931655993522963196732342798609807510543882273123542784003837220002062547832711681922161218010724226003150184777238854967623299970205873235909960398972219305810619458430315886318451687126866584390511424196256376003247717440484524668930504599413091194844245946142655802347109092922312106655798606759916931458730259588797621736772069847398957694392864416289605234581063232645315000563794292589544477569850783743632035729362609737922579552116678396740867191143283379987783383546964540333832022555110531730782339840166704631278663393147496643156685747417468473443887224102514355150110201463928311558935840684668441340082266117306282514075566574248767903320202249460451453420775463375086967869642395460246215345716051454824012618762098616749770441010729935100690497726813080659572063353154599964806728241328892738034442472863741591198849392657539785115946383059858121434147389809099727185407701653069953116828987459279642700621389676861517273949473071786975738343769782464350475716221573322857987917669867491560145456280976454077958400126376423873081869441464609193216436634914151839730185428390533788824490041131307997850622739795832813220163889874229711053167483400593799762457890098932032534121358716659875839129363445294836150352418351095797845291006220927734160887236196809213101473795"); + VariantMask mask1 = new VariantMaskBitmaskImpl(bitmask1); + BigInteger bitmask2 = new BigInteger("30986998537042903080350519548408047200882037374471443267447678987916097706689940814664693588471382927433911322922595655683"); + VariantMask mask2 = new VariantMaskBitmaskImpl(bitmask2); + BigInteger expected = new BigInteger("364428449062309565827240887371711117987574656019257508470944944605332301145049360947562613021825062352285550919581217473685332023988542714833984833584283993391279871691386067405572061472644828810015780133411925154335939452908415258964249206427971383684062861395477403040282307619178797392994855011010859741065713642937722555386553472167057412349499978682577859360114784650082282703011601887907648540462571623667509259030122346885116531561265495702316349933315144109402391832141090339015610768872781857606433548246400953322494469603219947770538558126916689855755477351331412849701993441724311536424509298383050084506206211767917732037267909332303742403672014600658318302593423194411973298402659688783532220840192809967726287753517722498547930875524876017589557370530361981799944335286739781131150443454009586308444310074980028948209472950776329093810066130759876455221806685349198404676070598652587630369814761497643023695920944554335463560556407863991874650393356359742532476840044738380631950001000889884841365536456755318891308758448828432322546827241786950989285280451703830359048521243704376576346418597615317052525572950500220422542100551941365526524572447973251506942320379483402120949518538372004537133062107782710822000749441206410146350956094013529342546401955361956752745578930129151472060056501664176391306323733630924234020572479814310832686356211870629168524382442619274880885230138170761194502147197237097179031772802871639774718532255582374871483153552524216278948609829889298505566944209429410943844015411379772232511986448333683276938151761604643176820833900087800015896793760190738573447504231241253241247411492029282990960833205840126804844971397564707229362865231720213379844987578324282466174232636950775929742330163059024377711272017786328444659381735736216404131832568759328992585474109193095809959345744181054394252160618715444166335074662636279289524790651946664188322635565248044921668500523867959008028963420119948417112017996816651287475428019005246560397815063629879127215220996440067"); + // length should equal the length of the binary representation - 4. Bitmasks are padded with 11 on each end + VariantMask merged = VariableVariantMasks.appendMask(mask1, mask2, 6282, 400); + + assertThrows( + IllegalArgumentException.class, + () -> VariableVariantMasks.appendMask(mask1, mask2, 6283, 400) + ); + assertThrows( + IllegalArgumentException.class, + () -> VariableVariantMasks.appendMask(mask1, mask2, 6282, 399) + ); + } + + @Test + void appendMask_sparseImpl() { + VariantMask mask1 = new VariantMaskSparseImpl(Set.of(1, 10, 99)); + VariantMask mask2 = new VariantMaskSparseImpl(Set.of(1, 99)); + VariantMask merged = VariableVariantMasks.appendMask(mask1, mask2, 100, 100); + + VariantMask expected = new VariantMaskSparseImpl(Set.of(1, 10, 99, 101, 199)); + assertEquals(expected, merged); + } + @Test + void appendMask_sparseImplOneEmpty() { + VariantMask mask1 = new VariantMaskSparseImpl(Set.of()); + VariantMask mask2 = new VariantMaskSparseImpl(Set.of(1, 99)); + VariantMask merged = VariableVariantMasks.appendMask(mask1, mask2, 100, 100); + + VariantMask expected = new VariantMaskSparseImpl(Set.of(101, 199)); + assertEquals(expected, merged); + + mask1 = new VariantMaskSparseImpl(Set.of(1, 10, 99)); + mask2 = new VariantMaskSparseImpl(Set.of()); + merged = VariableVariantMasks.appendMask(mask1, mask2, 100, 100); + + expected = new VariantMaskSparseImpl(Set.of(1, 10, 99)); + assertEquals(expected, merged); + } + + @Test + void appendMask_sparseBitmask() { + VariantMask mask1 = new VariantMaskSparseImpl(Set.of(1, 9)); + BigInteger bitmask1 = new BigInteger("11100000010011", 2); + VariantMask mask2 = new VariantMaskBitmaskImpl(bitmask1); + VariantMask merged = VariableVariantMasks.appendMask(mask1, mask2, 10, 10); + + BigInteger bitmask = new BigInteger("111000000100" + "100000001011", 2); + VariantMask expected = new VariantMaskBitmaskImpl(bitmask); + System.out.println(((VariantMaskBitmaskImpl) merged).getBitmask().toString(2)); + assertEquals(expected, merged); + } + + @Test + void appendMask_bitmaskSparse() { + BigInteger bitmask1 = new BigInteger("11100000010011", 2); + VariantMask mask1 = new VariantMaskBitmaskImpl(bitmask1); + VariantMask mask2 = new VariantMaskSparseImpl(Set.of(1, 9)); + VariantMask merged = VariableVariantMasks.appendMask(mask1, mask2, 10, 10); + + BigInteger bitmask = new BigInteger("111000000010" + "100000010011", 2); + VariantMask expected = new VariantMaskBitmaskImpl(bitmask); + System.out.println(((VariantMaskBitmaskImpl) merged).getBitmask().toString(2)); + assertEquals(expected, merged); + } } \ No newline at end of file diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java index 5ae18bdf..00577e90 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/GenomicDatasetMerger.java @@ -295,17 +295,17 @@ public FileBackedJsonIndexStorage { ConcurrentHashMap maskMap = merged.get(key); maskMap.keySet().stream().sorted().limit(5).forEach(variantSpec -> { VariableVariantMasks variableVariantMasks = maskMap.get(variantSpec); VariantMask heteroMask = Optional.ofNullable(variableVariantMasks.heterozygousMask).orElse(VariantMask.emptyInstance()); Set patientsWithVariant = VariableVariantMasks.patientMaskToPatientIdSet(heteroMask, Arrays.asList(mergedVariantStore.getPatientIds())); - log.debug("Patients with variant [" + variantSpec + "]: " + Joiner.on(",").join(patientsWithVariant)); + log.trace("Patients with variant [" + variantSpec + "]: " + Joiner.on(",").join(patientsWithVariant)); }); }); - }*/ + } return merged; } } diff --git a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java index 996c4078..1aed473e 100644 --- a/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java +++ b/etl/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/etl/genotype/NewVCFLoader.java @@ -233,22 +233,22 @@ private static void loadVCFs(File indexFile) throws IOException { } private static String sampleIdsForMask(String[] sampleIds, VariantMask variantMask) { - String idList = ""; + StringBuilder idList = new StringBuilder(); if (variantMask != null) { if (variantMask instanceof VariantMaskBitmaskImpl) { BigInteger mask = ((VariantMaskBitmaskImpl) variantMask).getBitmask(); - for (int x = 2; x < mask.bitLength() - 2; x++) { - if (mask.testBit(mask.bitLength() - 1 - x)) { - idList += sampleIds[x - 2] + ","; + for (int x = 0; x < mask.bitLength() - 4; x++) { + if (variantMask.testBit(x)) { + idList.append(sampleIds[x]).append(","); } } } else if (variantMask instanceof VariantMaskSparseImpl) { for (Integer patientIndex : ((VariantMaskSparseImpl) variantMask).getPatientIndexes()) { - idList += sampleIds[patientIndex] + ","; + idList.append(sampleIds[patientIndex]).append(","); } } } - return idList; + return idList.toString(); } private static void flipChunk(String lastContigProcessed, int lastChunkProcessed, int currentChunk,