Skip to content

Commit

Permalink
Allow AnnotateNovelSites to be run in a mode without a prior VCF (#348)
Browse files Browse the repository at this point in the history
  • Loading branch information
bbimber authored Nov 2, 2024
1 parent 0a14e97 commit 1738c47
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 11 deletions.
30 changes: 23 additions & 7 deletions src/main/java/com/github/discvrseq/walkers/AnnotateNovelSites.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.*;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
Expand Down Expand Up @@ -74,7 +77,7 @@ public class AnnotateNovelSites extends ExtendedMultiVariantWalkerGroupedOnStart
public void onTraversalStart() {
Utils.nonNull(outFile);

if (getMultiVariantInputArgumentCollection().getDrivingVariantPaths().size() > 1) {
if (getAnnotateNovelSitesArgumentCollection().getInputVcfCount() > 1) {
throw new IllegalArgumentException("Cannot provide more than one input VCF (-V)");
}

Expand Down Expand Up @@ -105,7 +108,7 @@ public void onTraversalStart() {
@Override
public void apply(List<VariantContext> variantContexts, ReferenceContext referenceContext, List<ReadsContext> readsContexts) {
Map<FeatureInput<VariantContext>, List<VariantContext>> variants = groupVariantsByFeatureInput(variantContexts);
List<VariantContext> refVariants = variants.get(getAnnotateNovelSitesArgumentCollection().referenceVcf);
List<VariantContext> refVariants = getAnnotateNovelSitesArgumentCollection().referenceVcf == null ? Collections.emptyList() : variants.get(getAnnotateNovelSitesArgumentCollection().referenceVcf);

List<VariantContext> inputVariants = variants.get(getDrivingVariantsFeatureInputs().get(0));
if (inputVariants == null || inputVariants.isEmpty()) {
Expand Down Expand Up @@ -141,14 +144,14 @@ public void apply(List<VariantContext> variantContexts, ReferenceContext referen
// Prior site exists, alleles identical. No action needed:
if (refVariant.getReference().equals(vc.getReference())) {
if (refVariant.getAlternateAlleles().equals(vc.getAlternateAlleles())) {
List<String> val = refVariant.getAttributeAsStringList(novelSiteAnnotationName, existingSiteDefaultAnnotationValue);
List<String> val = refVariant.getAttributeAsStringList(novelSiteAnnotationName, refVariant.getAttributeAsString(novelSiteAnnotationName, existingSiteDefaultAnnotationValue));
if (val != null && !val.isEmpty()) {
vcb.attribute(novelSiteAnnotationName, new ArrayList<>(val));
}
}
else {
// Alleles are different, so annotate with both
Set<String> anns = new LinkedHashSet<>(refVariant.getAttributeAsStringList(novelSiteAnnotationName, existingSiteDefaultAnnotationValue));
Set<String> anns = new LinkedHashSet<>(refVariant.getAttributeAsStringList(novelSiteAnnotationName, refVariant.getAttributeAsString(novelSiteAnnotationName, existingSiteDefaultAnnotationValue)));
anns.add(novelSiteAnnotationValue);
isNovel = true;

Expand Down Expand Up @@ -202,15 +205,28 @@ private AnnotateNovelSitesArgumentCollection getAnnotateNovelSitesArgumentCollec
private static final class AnnotateNovelSitesArgumentCollection extends MultiVariantInputArgumentCollection.DefaultMultiVariantInputArgumentCollection {
private static final long serialVersionUID = 1L;

@Argument(doc="Reference VCF", fullName = "ref-vcf", shortName = "rv", optional = false)
@Argument(doc="Reference VCF", fullName = "ref-vcf", shortName = "rv", optional = true)
public FeatureInput<VariantContext> referenceVcf = null;

@Argument(doc="Allow Missing Reference", fullName = "allow-missing-ref", optional = true)
public boolean allowMissingRef = false;

@Override
public List<GATKPath> getDrivingVariantPaths() {
List<GATKPath> ret = new ArrayList<>(super.getDrivingVariantPaths());
ret.add(referenceVcf);
if (referenceVcf != null) {
ret.add(referenceVcf);
}
else if (!allowMissingRef) {
throw new GATKException("Must either provide reference VCF or specify --allow-missing-ref");
}

return ret;
}

public int getInputVcfCount() {
return super.getDrivingVariantPaths().size();
}

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ public void testBasicOperation() throws Exception {
args.add("ad", "'The first mGAP version where variants at this site appeared'");
args.add("av", "2.0");

args.addRaw("-ns");
args.addRaw("%s");
args.addRaw("-ms");
args.addRaw("%s");

IntegrationTestSpec spec = new IntegrationTestSpec(
args.getString(),
Arrays.asList(
Expand All @@ -44,13 +49,33 @@ private ArgumentsBuilder getBaseArgs() {

args.addRaw("-O");
args.addRaw("%s");
args.addRaw("-ns");
args.addRaw("%s");
args.addRaw("-ms");
args.addRaw("%s");
args.addRaw("--tmp-dir");
args.addRaw(getTmpDir());

return args;
}

@Test
public void testWithoutRef() throws Exception {
ArgumentsBuilder args = getBaseArgs();

args.addRaw("--variant");
File input = new File(testBaseDir, "ClinvarAnnotator.vcf");
ensureVcfIndex(input);
args.addRaw(normalizePath(input));

args.addRaw("--allow-missing-ref");

args.add("an", "mGAPV");
args.add("ad", "'The first mGAP version where variants at this site appeared'");
args.add("av", "2.0");

IntegrationTestSpec spec = new IntegrationTestSpec(
args.getString(),
Arrays.asList(
getTestFile("outputNoRef.vcf").getPath()
));

spec.executeTest("testBasicOperation", this);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
##fileformat=VCFv4.2
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=FT,Number=.,Type=String,Description="Genotype-level filter">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MV,Number=1,Type=Integer,Description="Number of mendelian violations observed for this sample.">
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##INFO=<ID=MAC,Number=1,Type=Integer,Description="Matching ALT allele count">
##INFO=<ID=PURPOSE,Number=1,Type=String,Description="Purpose of test case">
##INFO=<ID=RN,Number=1,Type=Integer,Description="Random number">
##INFO=<ID=mGAPV,Number=.,Type=String,Description="The first mGAP version where variants at this site appeared">
##contig=<ID=1,length=16000>
##contig=<ID=2,length=16000>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1 sample2
1 2 . G A 2328.69 PASS MAC=1;PURPOSE=diff_pos_same_ref_same_alt;RN=12;mGAPV=2.0 GT:AD:DP:FT:GQ:MV:PGT:PID:PL 0/0:13,0:13:PASS:39:0:.:.:0,39,418 0/1:11,9:20:PASS:99:0:.:.:241,0,379
1 4 . G T 2328.69 PASS MAC=1;PURPOSE=same_pos_diff_ref_same_alt;RN=12;mGAPV=2.0 GT:AD:DP:FT:GQ:MV:PGT:PID:PL 0/0:14,0:14:PASS:42:0:.:.:0,42,473 0/0:19,0:19:PASS:51:0:.:.:0,51,765
1 11 . G A 12328.69 PASS MAC=0;PURPOSE=same_pos_same_ref_diff_alt;RN=12;mGAPV=2.0 GT:AD:DP:FT:GQ:MV:PL 0/0:17,0:17:PASS:51:0:0,51,614 0/0:29,0:29:PASS:69:0:0,69,1035
1 12 . C CG 12328.69 PASS MAC=1;PURPOSE=same_pos_same_ref_same_alt;RN=12;mGAPV=2.0 GT:AD:DP:FT:GQ:MV:PL 0/0:20,0:20:PASS:60:0:0,60,804 0/0:34,0:34:PASS:90:0:0,90,1350
1 221 . G A,T 2328.69 PASS MAC=1;PURPOSE=same_pos_same_ref_same_alt1_diff_alt2;RN=12;mGAPV=2.0 GT:AD:DP:FT:GQ:MV:PGT:PID:PL 0/0:27,0:27:PASS:72:0:.:.:0,72,1007 0/0:15,0:15:PASS:45:0:.:.:0,45,597
1 232 . G T,* 1008.03 PASS MAC=1;PURPOSE=same_pos_same_ref_diff_alt1_same_alt2;RN=12;mGAPV=2.0 GT:AD:DP:FT:GQ:MV:PL 0/1:9,15:24:PASS:99:0:396,0,225 0/0:16,0:16:PASS:36:0:0,36,540
1 244 . A AT,CCC 7.80 PASS MAC=2;PURPOSE=same_pos_same_ref_same_alt1_same_alt2;RN=12;mGAPV=2.0 GT:AD:DP:FT:GQ:MV:PGT:PID:PL 0/0:20,0:20:PASS:51:0:.:.:0,51,765 0/1:9,5:14:PASS:99:0:0|1:1750290_TC_T:183,0,363 0/1:10,15:25:PASS:99:0:0|1:1750290_TC_T:599,0,375
1 3111 . TA AT,G,T 9683.63 PASS MAC=1;PURPOSE=same_pos_same_ref_diff_alt1_diff_alt2_diff_alt3;RN=12;mGAPV=2.0 GT:AD:DP:FT:GQ:MV:PL ./.:10,0:10:GQ-LT30:27:0:0,27,405 ./.:10,0:10:GQ-LT30:27:0:0,27,405
1 3211 . C A,G,T 9683.63 PASS MAC=1;PURPOSE=same_pos_same_ref_same_alt1_diff_alt2_diff_alt3;RN=12;mGAPV=2.0 GT:AD:DP:FT:GQ:MV:PL 0/0:12,0:12:PASS:36:0:0,36,434 0/0:12,0:12:PASS:36:0:0,36,411
1 3323 . G A,C,T 6630.50 PASS MAC=1;PURPOSE=same_pos_same_ref_diff_alt1_same_alt2_diff_alt3;RN=12;mGAPV=2.0 GT:AD:DP:FT:GQ:MV:PL ./.:8,0:8:DP-LT10;GQ-LT30:21:0:0,21,315 0/1:5,8:13:PASS:99:0:247,0,179
1 3332 . C CG,G,TTT 37495.79 PASS MAC=1;PURPOSE=same_pos_same_ref_diff_alt1_diff_alt2_same_alt3;RN=12;mGAPV=2.0 GT:AD:DP:FT:GQ:MV:PL ./.:9,0:9:DP-LT10;GQ-LT30:18:0:0,18,270 0/0:16,0:16:PASS:48:0:0,48,649
1 3441 . ATT A,C,G 110541.03 PASS MAC=2;PURPOSE=same_pos_same_ref_same_alt1_same_alt2_diff_alt3;RN=12;mGAPV=2.0 GT:AD:DP:FT:GQ:MV:PL ./.:4,4:8:DP-LT10:99:0:105,0,141 ./.:9,0:9:DP-LT10;GQ-LT30:24:0:0,24,360 0/0:18,0:18:PASS:48:0:0,48,720
1 3452 . GT CG,GGG,TATA 38369.71 PASS MAC=2;PURPOSE=same_pos_same_ref_same_alt1_diff_alt2_same_alt3;RN=12;mGAPV=2.0 GT:AD:DP:FT:GQ:MV:PGT:PID:PL ./.:3,0:3:DP-LT10;GQ-LT30:9:0:.:.:0,9,112 0/0:13,0:13:PASS:36:0:.:.:0,36,540
1 3522 . C A,GC,TTTAAAA 38369.71 PASS MAC=2;PURPOSE=same_pos_same_ref_diff_alt1_same_alt2_same_alt3;RN=12;mGAPV=2.0 GT:AD:DP:FT:GQ:MV:PGT:PID:PL ./.:11,0:11:GQ-LT30:27:0:.:.:0,27,405 ./.:8,0:8:DP-LT10;GQ-LT30:21:0:.:.:0,21,315
1 3633 . C CTAT,GA,TACGTAC 38369.71 PASS MAC=3;PURPOSE=same_pos_same_ref_same_alt1_same_alt2_same_alt3;RN=12;mGAPV=2.0 GT:AD:DP:FT:GQ:MV:PGT:PID:PL ./.:5,0:5:DP-LT10;GQ-LT30:12:0:.:.:0,12,180 ./.:2,0:2:DP-LT10;GQ-LT30:6:0:.:.:0,6,71
1 3725 . G A,GC,TTTAAAA 38369.71 PASS MAC=0;PURPOSE=overlapping_position;RN=12;mGAPV=2.0 GT:AD:DP:FT:GQ:MV:PL 0/0:13,0:13:PASS:33:0:0,33,495 ./.:10,0:10:GQ-LT30:27:0:0,27,405

0 comments on commit 1738c47

Please sign in to comment.