diff --git a/src/consensus.rs b/src/consensus.rs index 0dfd232..56fc0ba 100644 --- a/src/consensus.rs +++ b/src/consensus.rs @@ -197,6 +197,8 @@ mod tests { chrom: "chr1".to_string(), start: 1, end: 100, + id: "".to_string(), + motifs: "CA".to_string(), }, ); println!("Consensus: {}", cons.seq.unwrap()); diff --git a/src/genotype.rs b/src/genotype.rs index 11df3f6..e84a96c 100644 --- a/src/genotype.rs +++ b/src/genotype.rs @@ -330,6 +330,8 @@ mod tests { chrom: String::from("chr7"), start: 154654404, end: 154654432, + id: String::from("test-repeat"), + motifs: "CA".to_string(), }; let flanking = 2000; let minlen = 5; @@ -369,6 +371,8 @@ mod tests { chrom: String::from("chr7"), start: 154654404, end: 154654432, + id: "TEST".to_string(), + motifs: "CA".to_string(), }; let args = Cli { bam: String::from("test_data/small-test-phased.bam"), @@ -397,6 +401,8 @@ mod tests { chrom: String::from("chr7"), start: 154654404, end: 154654432, + id: "TEST".to_string(), + motifs: "CA".to_string(), }; let args = Cli { bam: String::from("test_data/small-test-phased.bam"), @@ -441,6 +447,8 @@ mod tests { chrom: String::from("chr7"), start: 154654404, end: 154654432, + id: "TEST".to_string(), + motifs: "CA".to_string(), }; let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta); let genotype = genotype_repeat(&repeat, &args, &mut bam); @@ -469,6 +477,8 @@ mod tests { chrom: String::from("chr7"), start: 154654404, end: 154654432, + id: "TEST".to_string(), + motifs: "CA".to_string(), }; let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta); let genotype = genotype_repeat(&repeat, &args, &mut bam); @@ -498,6 +508,8 @@ mod tests { chrom: String::from("chr7"), start: 154654404, end: 154654432, + id: "TEST".to_string(), + motifs: "CA".to_string(), }; let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta); let genotype = genotype_repeat(&repeat, &args, &mut bam); diff --git a/src/motif.rs b/src/motif.rs index cdffdb5..123f765 100644 --- a/src/motif.rs +++ b/src/motif.rs @@ -5,6 +5,20 @@ fn create_motif(seq: &str) -> String { unimplemented!() } +pub fn create_mc(motifs_string: &str, repeats: &str) -> String { + let motifs: Vec = motifs_string.split(',').map(|s| s.to_string()).collect(); + + let mut motif_counts = vec![0; motifs.len()]; + for motif in motifs.iter() { + let mut start = 0; + while let Some(index) = repeats[start..].find(motif) { + motif_counts[motifs.iter().position(|x| x == motif).unwrap()] += 1; + start += index + motif.len(); + } + } + motif_counts.iter().map(|count| count.to_string()).collect::>().join("_") + } + #[cfg(test)] mod tests { use super::*; @@ -17,4 +31,22 @@ mod tests { "(CAG)4(CGG)3(CAG)3" ); } + #[test] + fn test_create_mc() { + let motifs = "ATG,CGT,GCA"; + let repeat = "ATGCGTATGCGTAGCGT"; + println!("{:?}", create_mc(&motifs, &repeat)); + assert_eq!( + create_mc(&motifs, repeat), "2_3_0" + ); + } + #[test] + fn test_create_mc_with_n() { + let motifs = "NCG"; + let repeat = "ACGACGACGACG"; + + assert_eq!( + create_mc(&motifs, repeat), "4" + ); + } } diff --git a/src/parse_bam.rs b/src/parse_bam.rs index 1465671..3ae0938 100644 --- a/src/parse_bam.rs +++ b/src/parse_bam.rs @@ -143,6 +143,8 @@ fn test_get_overlapping_reads() { chrom: String::from("chr7"), start: 154654404, end: 154654432, + id: "TEST".to_string(), + motifs: "CA".to_string(), }; let unphased = false; let mut bam = create_bam_reader(&bam, &fasta); @@ -157,6 +159,8 @@ fn test_get_overlapping_reads_url1() { chrom: String::from("chr20"), start: 154654404, end: 154654432, + id: "TEST".to_string(), + motifs: "CA".to_string(), }; let unphased = false; let mut bam = create_bam_reader(&bam, &fasta); @@ -171,6 +175,8 @@ fn test_get_overlapping_reads_url2() { chrom: String::from("chr20"), start: 154654404, end: 154654432, + id: "TEST".to_string(), + motifs: "CA".to_string(), }; let unphased = false; let mut bam = create_bam_reader(&bam, &fasta); @@ -185,6 +191,8 @@ fn test_get_overlapping_reads_url3() { chrom: String::from("chr20"), start: 154654404, end: 154654432, + id: "TEST".to_string(), + motifs: "CA".to_string(), }; let unphased = false; let mut bam = create_bam_reader(&bam, &fasta); @@ -199,6 +207,8 @@ fn test_get_overlapping_reads_url4() { chrom: String::from("chr20"), start: 154654404, end: 154654432, + id: "TEST".to_string(), + motifs: "CA".to_string(), }; let unphased = false; let mut bam = create_bam_reader(&bam, &fasta); diff --git a/src/phase_insertions.rs b/src/phase_insertions.rs index b351139..6b6a3f2 100644 --- a/src/phase_insertions.rs +++ b/src/phase_insertions.rs @@ -332,6 +332,8 @@ mod tests { chrom: "chr7".to_string(), start: 154654404, end: 154654432, + id: "TEST".to_string(), + motifs: "CA".to_string(), }, false, ); @@ -372,6 +374,8 @@ mod tests { chrom: "chr7".to_string(), start: 154654404, end: 154654432, + id: "TEST".to_string(), + motifs: "CA".to_string(), }, false, ); @@ -411,6 +415,8 @@ mod tests { chrom: "chr7".to_string(), start: 154654404, end: 154654432, + id: "TEST".to_string(), + motifs: "CA".to_string(), }, false, ); @@ -453,6 +459,8 @@ mod tests { chrom: "chr7".to_string(), start: 154654404, end: 154654432, + id: "TEST".to_string(), + motifs: "CA".to_string(), }, false, ); @@ -493,6 +501,8 @@ mod tests { chrom: "chr7".to_string(), start: 154654404, end: 154654432, + id: "TEST_RFC1".to_string(), + motifs: "CA".to_string(), }, false, ); @@ -564,6 +574,8 @@ mod tests { chrom: "chr7".to_string(), start: 154654404, end: 154654432, + id: "TEST_RFC1".to_string(), + motifs: "CA".to_string(), }, false, ); diff --git a/src/repeats.rs b/src/repeats.rs index bc8f86a..f799d24 100644 --- a/src/repeats.rs +++ b/src/repeats.rs @@ -21,7 +21,9 @@ impl RepeatIntervalIterator { let end: u32 = interval.split('-').collect::>()[1] .parse() .unwrap(); - let repeat = RepeatInterval::new_interval(chrom, start, end, fasta) + let id = String::new(); + let motifs = String::new(); + let repeat = RepeatInterval::new_interval(chrom, start, end, fasta, id, motifs) .expect("Failed to create repeat interval"); RepeatIntervalIterator { current_index: 0, @@ -74,6 +76,8 @@ impl Clone for RepeatInterval { chrom: self.chrom.clone(), start: self.start, end: self.end, + id: self.id.clone(), + motifs: self.motifs.clone(), } } } @@ -99,6 +103,8 @@ pub struct RepeatInterval { pub chrom: String, pub start: u32, pub end: u32, + pub id: String, + pub motifs: String, } impl fmt::Display for RepeatInterval { @@ -117,10 +123,25 @@ impl RepeatInterval { let chrom = rec.chrom().to_string(); let start = rec.start().try_into().unwrap(); let end = rec.end().try_into().unwrap(); - RepeatInterval::new_interval(chrom, start, end, fasta) + let mut id = String::new(); + let mut motifs = String::new(); + if let Some(info) = rec.name() { + let key_value_pairs: Vec<&str> = info.split(';').collect(); + // Find and extract the value associated with the "ID" key + for pair in key_value_pairs { + if pair.starts_with("ID=") { + id = pair.trim_start_matches("ID=").to_string(); + } + else if pair.starts_with("MOTIFS=") { + motifs = pair.trim_start_matches("MOTIFS=").to_string(); + break; + } + } + } + RepeatInterval::new_interval(chrom, start, end, fasta, id, motifs) } - fn new_interval(chrom: String, start: u32, end: u32, fasta: &str) -> Option { + fn new_interval(chrom: String, start: u32, end: u32, fasta: &str, id: String, motifs: String) -> Option { if end < start { panic!("End coordinate is smaller than start coordinate for {chrom}:{start}-{end}") } @@ -138,7 +159,7 @@ impl RepeatInterval { .expect("Failed parsing chromosome length from fai file") > end { - return Some(Self { chrom, start, end }); + return Some(Self { chrom, start, end, id, motifs}); } } // if the chromosome is not in the fai file or the end does not fit the interval, return None @@ -146,11 +167,13 @@ impl RepeatInterval { "Chromosome {chrom} is not in the fasta file or the end coordinate is out of bounds" ); } - pub fn new(chrom: &str, start: u32, end: u32) -> Self { + pub fn new(chrom: &str, start: u32, end: u32, id: &str, motifs: &str) -> Self { Self { chrom: chrom.to_string(), start, end, + id: id.to_string(), + motifs: motifs.to_string(), } } diff --git a/src/vcf.rs b/src/vcf.rs index 5f5960c..c7d9109 100644 --- a/src/vcf.rs +++ b/src/vcf.rs @@ -7,6 +7,8 @@ use std::cmp::Ordering; use std::fmt; use std::io::Read; +use crate::motif::create_mc; + pub struct Allele { pub length: String, // length of the consensus sequence minus the length of the repeat sequence pub full_length: String, // length of the consensus sequence @@ -55,6 +57,9 @@ pub struct VCFRecord { pub ps: Option, // phase set identifier pub flags: String, pub allele: (String, String), + pub id: String, + pub motifs: String, + pub mc: (String, String), } impl VCFRecord { @@ -87,6 +92,9 @@ impl VCFRecord { "Genotyping {repeat}:{repeat_ref_sequence} with {} and {}", allele1.seq, allele2.seq, ); + let mc1 = create_mc(&repeat.motifs, &allele1.seq); + let mc2 = create_mc(&repeat.motifs, &allele2.seq); + println!("{:?} {:?} {:?} {:?} {:?} {:?}", &repeat, &repeat.motifs, &mc1, &mc2, &allele1.seq, &allele2.seq); // if the consensus is very similar to the reference the variant is considered ref // for this I use a threshold of 5% of the length of the repeat sequence in the reference // e.g. if the repeat is 300bp in the reference this will allow an edit distance of 15 @@ -158,6 +166,9 @@ impl VCFRecord { ps, flags, allele: (genotype1.to_string(), genotype2.to_string()), + id: repeat.id.clone(), + motifs: repeat.motifs.clone(), + mc: (mc1, mc2), } } @@ -182,6 +193,9 @@ impl VCFRecord { ps: None, flags: "".to_string(), allele: (".".to_string(), ".".to_string()), + id: repeat.id.clone(), + motifs: repeat.motifs.clone(), + mc: (".".to_string(), ".".to_string()), } } } @@ -191,12 +205,12 @@ impl fmt::Display for VCFRecord { match &self.alt_seq { Some(alts) => { let (FORMAT, ps) = match self.ps { - Some(ps) => ("GT:RB:FRB:SUP:SC:PS", format!(":{}", ps)), - None => ("GT:RB:FRB:SUP:SC", "".to_string()), + Some(ps) => ("GT:RB:FRB:SUP:MC:SC:PS", format!(":{}", ps)), + None => ("GT:RB:FRB:SUP:MC:SC", "".to_string()), }; write!( f, - "{chrom}\t{start}\t.\t{ref}\t{alt}\t.\t.\t{flags}END={end};STDEV={sd1},{sd2}{somatic}{outliers}\t{FORMAT}\t{genotype1}|{genotype2}:{l1},{l2}:{fl1},{fl2}:{sup1},{sup2}:{score1},{score2}{ps}", + "{chrom}\t{start}\t.\t{ref}\t{alt}\t.\t.\t{flags}REPID={id};MOTIFS={motifs};END={end};STDEV={sd1},{sd2}{somatic}{outliers}\t{FORMAT}\t{genotype1}|{genotype2}:{l1},{l2}:{fl1},{fl2}:{sup1},{sup2}:{mc1},{mc2}:{score1},{score2}{ps}", chrom = self.chrom, start = self.start, flags = self.flags, @@ -217,12 +231,17 @@ impl fmt::Display for VCFRecord { sup2 = self.support.1, score1 = self.score.0, score2 = self.score.1, + id = self.id, + motifs = self.motifs, + mc1 = self.mc.0, + mc2 = self.mc.1, + ) } None => { write!( f, - "{chrom}\t{start}\t.\t{ref}\t.\t.\t.\tEND={end};{somatic}\tGT:SUP\t{genotype1}|{genotype2}:{sup1},{sup2}", + "{chrom}\t{start}\t.\t{ref}\t.\t.\t.\tREPID={id};MOTIFS={motifs};END={end};{somatic}\tGT:SUP\t{genotype1}|{genotype2}:{sup1},{sup2}:{mc1},{mc2}", chrom = self.chrom, start = self.start, end = self.end, @@ -232,6 +251,10 @@ impl fmt::Display for VCFRecord { genotype2 = self.allele.1, sup1 = self.support.0, sup2 = self.support.1, + id = self.id, + motifs = self.motifs, + mc1 = self.mc.0, + mc2 = self.mc.1, ) } } @@ -286,6 +309,12 @@ pub fn write_vcf_header(fasta: &str, bam: &str, sample: &Option) { let length = contig.next().unwrap().parse::().unwrap(); println!(r#"##contig="#, name, length); } + println!( + r#"##INFO="# + ); + println!( + r#"##INFO="# + ); println!( r#"##INFO="# ); @@ -310,6 +339,7 @@ pub fn write_vcf_header(fasta: &str, bam: &str, sample: &Option) { ); println!(r#"##FORMAT="#); println!(r#"##FORMAT="#); + println!(r#"##FORMAT="#); println!(r#"##FORMAT="#); let name = match sample { Some(name) => name,