diff --git a/CHANGELOG.md b/CHANGELOG.md index 18b1e5d..8094f60 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,10 @@ -# 1.4.3 +# v1.4.4 +## Fixed +* Fixed an error where phasing information that was present in input files would be copied through to output files if it was not overwritten by HiPhase phasing results. HiPhase will now automatically remove this phasing information to prevent accidental mixing of phase results. + * For VCF files, any unphased genotypes will be switched to unphased and sorted by allele index (e.g. 1|0 -> 0/1). The "FORMAT:PS" and "FORMAT:PF" tags will either be removed entirely if the whole record is unphased or set to "." for partially phased records. + * For BAM files, the "HP" and "PS" tags will be removed for any unphased records. + +# v1.4.3 ## Fixed * Replaced a panic caused by a chromosome appearing in a VCF but not in the BAM file with a more descriptive error message * Fixed an error caused by a multi-sample VCF with a mixture of haploid and diploid genotypes diff --git a/Cargo.lock b/Cargo.lock index 28deead..01de4e5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -526,7 +526,7 @@ dependencies = [ [[package]] name = "hiphase" -version = "1.4.3" +version = "1.4.4" dependencies = [ "bio", "bit-vec", diff --git a/Cargo.toml b/Cargo.toml index 6b5ab01..a20516e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "hiphase" -version = "1.4.3" +version = "1.4.4" authors = ["J. Matthew Holt "] description = "A tool for jointly phasing small, structural, and tandem repeat variants for PacBio sequencing data" edition = "2021" diff --git a/LICENSE-THIRDPARTY.json b/LICENSE-THIRDPARTY.json index 37f11a4..cf1a9f9 100644 --- a/LICENSE-THIRDPARTY.json +++ b/LICENSE-THIRDPARTY.json @@ -496,7 +496,7 @@ }, { "name": "hiphase", - "version": "1.4.3", + "version": "1.4.4", "authors": "J. Matthew Holt ", "repository": null, "license": null, diff --git a/docs/user_guide.md b/docs/user_guide.md index b186549..bdc4151 100644 --- a/docs/user_guide.md +++ b/docs/user_guide.md @@ -337,3 +337,9 @@ Since TRGT calls often correspond to multiple small variant calls, the total num Additionally, this has the benefit of removing false heterozygous variants in the tandem repeat regions. This can have the side effect of (correctly) reducing block NG50 when those false variants are the only links between two otherwise unlinked phase blocks. Variants that are intentionally excluded from phasing this way will have a FORMAT `PF` tag of `TR_OVERLAP`. + +## Can I provide pre-existing phase information to HiPhase? +Currently no, HiPhase will not use any pre-existing phase information in the provided VCF or BAM files. +To prevent accidental mis-interpretation of results, HiPhase will remove or overwrite any existing phase information from the input files prior to writing the output phased VCFs and/or BAMs. +For VCF files, this includes the "FORMAT:GT" (genotype), "FORMAT:PS" (phase set), and "FORMAT:PF" (phase filter) fields. +For BAM files, this includes the "HP" (haplotype) and "PS" (phase set) tags. diff --git a/src/writers/ordered_bam_writer.rs b/src/writers/ordered_bam_writer.rs index 14c3832..dac3ef0 100644 --- a/src/writers/ordered_bam_writer.rs +++ b/src/writers/ordered_bam_writer.rs @@ -221,7 +221,8 @@ impl OrderedBamWriter { bam_writer.write(&record)?; }, None => { - // no match, so just copy the read over + // no match, so just copy the read over after stripping any phase information + strip_record_phasing(&mut record)?; bam_writer.write(&record)?; } }; @@ -277,7 +278,7 @@ impl OrderedBamWriter { Ok(()) => { for record_result in bam_reader.records() { // set up the record for the new BAM file - let record = record_result?; + let mut record = record_result?; let record_pos = record.pos(); if record_pos < start_pos as i64 { // this can happen when you have reads that overlap the location but don't start *after* the position @@ -286,6 +287,7 @@ impl OrderedBamWriter { } // nothing left should be tagged + strip_record_phasing(&mut record)?; bam_writer.write(&record)?; // adding this last bit should prevent double writes by accident from a user @@ -336,9 +338,10 @@ impl OrderedBamWriter { Ok(()) => { for record_result in bam_reader.records() { // set up the record for the new BAM file - let record = record_result?; + let mut record = record_result?; // nothing left should be tagged + strip_record_phasing(&mut record)?; bam_writer.write(&record)?; } }, @@ -353,3 +356,61 @@ impl OrderedBamWriter { Ok(()) } } + +/// Removes the phase tags that HiPhase will normally write from a record. +/// This is so we erase anything that did not come from the tool. +fn strip_record_phasing(record: &mut bam::Record) -> Result<(), rust_htslib::errors::Error> { + // much simpler than the VCF version since there is no multi-sample to worry about, just drop these tags + match record.remove_aux(b"HP") { + // either it was remove or was not there to begin with + Ok(()) | + Err(rust_htslib::errors::Error::BamAuxTagNotFound) => {}, + // some other crazy error + Err(e) => { + return Err(e); + } + }; + match record.remove_aux(b"PS") { + // either it was remove or was not there to begin with + Ok(()) | + Err(rust_htslib::errors::Error::BamAuxTagNotFound) => Ok(()), + // some other crazy error + Err(e) => Err(e) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Simple empty bam record creator; if we just use a "new()" record, htslib is unhappy when we access Aux + fn dummy_bam_record() -> bam::Record { + let mut record = bam::Record::new(); + record.set( + b"test_qname", + None, + b"A", + &[255] + ); + record + } + + #[test] + fn test_clear_record() { + // make sure running on a record without the tags is fine (this will be *most* cases) + let mut record = dummy_bam_record(); + strip_record_phasing(&mut record).unwrap(); + + // create a record with the tags + let mut record = dummy_bam_record(); + record.push_aux(b"HP", bam::record::Aux::U8(1)).unwrap(); + record.push_aux(b"PS", bam::record::Aux::I32(42)).unwrap(); + assert!(record.aux(b"HP").is_ok()); + assert!(record.aux(b"PS").is_ok()); + + // strip them and verify they are gone + strip_record_phasing(&mut record).unwrap(); + assert_eq!(record.aux(b"HP").err().unwrap(), rust_htslib::errors::Error::BamAuxTagNotFound); + assert_eq!(record.aux(b"PS").err().unwrap(), rust_htslib::errors::Error::BamAuxTagNotFound); + } +} \ No newline at end of file diff --git a/src/writers/ordered_vcf_writer.rs b/src/writers/ordered_vcf_writer.rs index 750c75b..d01ef7b 100644 --- a/src/writers/ordered_vcf_writer.rs +++ b/src/writers/ordered_vcf_writer.rs @@ -97,6 +97,12 @@ impl OrderedVcfWriter { //now setup the outputs, we want to do the header stuff here also let mut output_header: bcf::header::Header = bcf::header::Header::from_template(&vcf_header); + + // we can remove any existing headers without crashing anything + output_header.remove_format(b"PS"); + output_header.remove_format(b"PF"); + + // add in the new headers let cli_string: String = std::env::args().collect::>().join(" "); let cli_version: &str = &crate::cli::FULL_VERSION; output_header.push_record(format!(r#"##HiPhase_version="{cli_version}""#).as_bytes()); @@ -308,12 +314,16 @@ impl OrderedVcfWriter { // we have already written though, so don't write it again continue; } + + // the record will get written, clear out all phasing information first + strip_record_phasing(&mut record)?; // we now have to iterate over each sample and modify the entries as necessary let vcf_sample_indices = &self.sample_indices[vcf_index]; let mut changes_made: bool = false; - // initialize the alleles array to be the same, these may change inside the loop + // initialize the alleles array to be the same as what's currently in the record; this is post-phase stripping + // alleles may get overwritten following this loop depending on the success of the phasing let mut alleles = vec![]; let record_gt = record.genotypes().unwrap(); for si in 0..record.sample_count() { @@ -422,3 +432,139 @@ impl OrderedVcfWriter { Ok(()) } } + +/// This will remove all phasing information from a given VCF record. +/// # Arguments +/// * `record` - the VCF to strip all phasing from +/// # Errors +/// * if it fails to clear the PS tag +/// * if it fails to parse genotypes +/// * if it encounters any weird genotypes (we probably would fail before this) +fn strip_record_phasing(record: &mut bcf::Record) -> Result<(), Box> { + // first remove the "PS" tag + clear_format_tag(record, b"PS")?; + clear_format_tag(record, b"PF")?; + + let record_gt = record.genotypes()?; + let mut alleles = vec![]; + for si in 0..record.sample_count() { + let genotype = record_gt.get(si as usize); + match genotype.len() { + 0 => bail!("Encountered empty genotype record at position {}", record.pos()), + 1 => { + // TRGT can make single-allele GT calls, just copy it over as normal + // it will not be modified below because it is a homozygous allele + let g0 = unphase_genotype_allele(genotype[0]); + alleles.push(i32::from(g0)); + alleles.push(i32::MIN+1); // sentinel value for end + }, + 2 => { + // this is 99.99999999% path + let g0 = i32::from(unphase_genotype_allele(genotype[0])); + let g1 = i32::from(unphase_genotype_allele(genotype[1])); + let min_value = g0.min(g1); + let max_value = g0.max(g1); + alleles.push(min_value); + alleles.push(max_value); + }, + gt_len => { + // we do not have 3+ GT entries implemented + bail!("Encountered GT of length {} at record {}", gt_len, record.desc()) + } + } + } + record.push_format_integer(b"GT", &alleles)?; + + Ok(()) +} + +/// This will remove a specified format field from the provided record. +/// Other approaches seem to set it to "." instead of removing the field entirely. +/// If the tag is absent (from this record or the entire file), this seemingly returns without error. +/// # Arguments +/// * `record` - the record to remove the tag from +/// * `tag` - the tag to remove, e.g. b"PS" +/// # Errors +/// * if we cannot remove the tag for some reason +fn clear_format_tag(record: &mut bcf::Record, tag: &[u8]) -> Result<(), rust_htslib::errors::Error> { + let tag_c_str = std::ffi::CString::new(tag).unwrap(); + unsafe { + if rust_htslib::htslib::bcf_update_format( + record.header().inner, + record.inner, + tag_c_str.into_raw(), + std::ptr::null() as *const ::std::os::raw::c_void, + 0, + rust_htslib::htslib::BCF_HT_INT as i32 + ) == 0 { + Ok(()) + } else { + Err(rust_htslib::errors::Error::BcfSetTag { tag: std::str::from_utf8(tag).unwrap().to_owned() }) + } + } +} + +/// This will convert a genotype allele into an unphased version. +/// # Arguments +/// * `gt` - the genotype allele to unphase +fn unphase_genotype_allele(gt: GenotypeAllele) -> GenotypeAllele { + match gt { + GenotypeAllele::Unphased(i) | + GenotypeAllele::Phased(i) => GenotypeAllele::Unphased(i), + GenotypeAllele::UnphasedMissing | + GenotypeAllele::PhasedMissing => GenotypeAllele::UnphasedMissing + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_clear_record() { + let vcf_fn = PathBuf::from("./test_data/prephased_test/prephased.vcf"); + let mut vcf_reader = bcf::Reader::from_path(&vcf_fn).unwrap(); + + // expectations for sample 1 + let expected_gt1 = vec![ + vec![GenotypeAllele::Unphased(0), GenotypeAllele::Unphased(1)], + vec![GenotypeAllele::Unphased(1), GenotypeAllele::Unphased(1)], + vec![GenotypeAllele::Unphased(1)], + vec![GenotypeAllele::UnphasedMissing, GenotypeAllele::Unphased(1)] + ]; + + // expectations for sample 2 + let expected_gt2 = vec![ + vec![GenotypeAllele::Unphased(0), GenotypeAllele::Unphased(1)], + vec![GenotypeAllele::Unphased(0), GenotypeAllele::Unphased(0)], + vec![GenotypeAllele::Unphased(1), GenotypeAllele::Unphased(2)], + vec![GenotypeAllele::UnphasedMissing, GenotypeAllele::Unphased(0)] + ]; + + for (record_result, (exp_gt1, exp_gt2)) in vcf_reader.records() + .zip(expected_gt1.iter().zip(expected_gt2.iter())) { + // load the record and strip out the phasing information + let mut record = record_result.unwrap(); + println!("Handling {}", record.desc()); + strip_record_phasing(&mut record).unwrap(); + + // verify that PS tag is completely gone + let ps_tag_result = record.format(b"PS").integer(); + assert!(ps_tag_result.is_err()); + assert_eq!(ps_tag_result.err().unwrap(), rust_htslib::errors::Error::BcfMissingTag { tag: "PS".to_string(), record: record.desc() }); + + // verify that PF tag is completely gone + let pf_tag_result = record.format(b"PF").string(); + assert!(pf_tag_result.is_err()); + assert_eq!(pf_tag_result.err().unwrap(), rust_htslib::errors::Error::BcfMissingTag { tag: "PF".to_string(), record: record.desc() }); + + // make sure GT 1 matches our expectation, which will be unphased + let genotype1 = record.genotypes().unwrap().get(0); + assert_eq!(genotype1.as_slice(), exp_gt1.as_slice()); + + // make sure GT 2 matches our expectation, which will be unphased + let genotype2 = record.genotypes().unwrap().get(1); + assert_eq!(genotype2.as_slice(), exp_gt2.as_slice()); + } + } +} \ No newline at end of file diff --git a/test_data/iupac_test/small_variants.vcf.gz b/test_data/iupac_test/small_variants.vcf.gz new file mode 100644 index 0000000..8f3ef1e Binary files /dev/null and b/test_data/iupac_test/small_variants.vcf.gz differ diff --git a/test_data/iupac_test/small_variants.vcf.gz.tbi b/test_data/iupac_test/small_variants.vcf.gz.tbi new file mode 100644 index 0000000..7a54119 Binary files /dev/null and b/test_data/iupac_test/small_variants.vcf.gz.tbi differ diff --git a/test_data/iupac_test/test_reference.fa b/test_data/iupac_test/test_reference.fa new file mode 100644 index 0000000..e86e37d --- /dev/null +++ b/test_data/iupac_test/test_reference.fa @@ -0,0 +1,2 @@ +>chr1 +ACGTARGTACGT diff --git a/test_data/prephased_test/prephased.vcf b/test_data/prephased_test/prephased.vcf new file mode 100644 index 0000000..bbf2b6a --- /dev/null +++ b/test_data/prephased_test/prephased.vcf @@ -0,0 +1,23 @@ +##fileformat=VCFv4.2 +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##DeepVariant_version=1.3.0 +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample_name sample_name2 +chr1 5 . A T 30 PASS . GT:PS:PF 0|1:5:test 1|0:5:. +chr1 15 . A T 30 PASS . GT:PS 1|1:5 0|0:5 +chr1 25 . A T,C 30 PASS . GT:PS 1:5 2|1:5 +chr1 30 . A T 30 PASS . GT:PS .|1:5 0|.:5 \ No newline at end of file