From 8663a2aa0eaaafc074c64cc7b051fbe3e248bcdd Mon Sep 17 00:00:00 2001 From: fellen31 Date: Thu, 16 May 2024 12:46:19 +0200 Subject: [PATCH 1/6] working code --- src/consensus.rs | 1 + src/genotype.rs | 6 ++++++ src/motif.rs | 22 ++++++++++++++++++++++ src/parse_bam.rs | 5 +++++ src/phase_insertions.rs | 6 ++++++ src/repeats.rs | 34 +++++++++++++++++++++++++++++----- src/vcf.rs | 31 +++++++++++++++++++++++++++---- 7 files changed, 96 insertions(+), 9 deletions(-) diff --git a/src/consensus.rs b/src/consensus.rs index 0dfd232..3744224 100644 --- a/src/consensus.rs +++ b/src/consensus.rs @@ -197,6 +197,7 @@ mod tests { chrom: "chr1".to_string(), start: 1, end: 100, + id: "".to_string(), }, ); println!("Consensus: {}", cons.seq.unwrap()); diff --git a/src/genotype.rs b/src/genotype.rs index 11df3f6..d3b7956 100644 --- a/src/genotype.rs +++ b/src/genotype.rs @@ -330,6 +330,7 @@ mod tests { chrom: String::from("chr7"), start: 154654404, end: 154654432, + id: String::from("test-repeat"), }; let flanking = 2000; let minlen = 5; @@ -369,6 +370,7 @@ mod tests { chrom: String::from("chr7"), start: 154654404, end: 154654432, + id: "TEST".to_string(), }; let args = Cli { bam: String::from("test_data/small-test-phased.bam"), @@ -397,6 +399,7 @@ mod tests { chrom: String::from("chr7"), start: 154654404, end: 154654432, + id: "TEST".to_string(), }; let args = Cli { bam: String::from("test_data/small-test-phased.bam"), @@ -441,6 +444,7 @@ mod tests { chrom: String::from("chr7"), start: 154654404, end: 154654432, + id: "TEST".to_string(), }; let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta); let genotype = genotype_repeat(&repeat, &args, &mut bam); @@ -469,6 +473,7 @@ mod tests { chrom: String::from("chr7"), start: 154654404, end: 154654432, + id: "TEST".to_string(), }; let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta); let genotype = genotype_repeat(&repeat, &args, &mut bam); @@ -498,6 +503,7 @@ mod tests { chrom: String::from("chr7"), start: 154654404, end: 154654432, + id: "TEST".to_string(), }; let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta); let genotype = genotype_repeat(&repeat, &args, &mut bam); diff --git a/src/motif.rs b/src/motif.rs index cdffdb5..85029e4 100644 --- a/src/motif.rs +++ b/src/motif.rs @@ -5,6 +5,21 @@ fn create_motif(seq: &str) -> String { unimplemented!() } +pub fn create_mc(motifs_string: &str, repeats: &str) -> String { + let motifs: Vec = motifs_string.split(',').map(|s| s.to_string()).collect(); + + let mut motif_counts = vec![0; motifs.len()]; + for motif in motifs.iter() { + let mut start = 0; + while let Some(index) = repeats[start..].find(motif) { + motif_counts[motifs.iter().position(|x| x == motif).unwrap()] += 1; + start += index + motif.len(); + } + } + motif_counts.iter().map(|count| count.to_string()).collect::>().join("_") +} + + #[cfg(test)] mod tests { use super::*; @@ -17,4 +32,11 @@ mod tests { "(CAG)4(CGG)3(CAG)3" ); } + fn test_create_mc() { + let motifs = "ATG,CGT,GCA"; + let repeat = "ATGCGTATGCGTAGCGT"; + assert_eq!( + create_mc(&motifs, repeat), "3_1_0" + ); + } } diff --git a/src/parse_bam.rs b/src/parse_bam.rs index 1465671..d2d7da5 100644 --- a/src/parse_bam.rs +++ b/src/parse_bam.rs @@ -143,6 +143,7 @@ fn test_get_overlapping_reads() { chrom: String::from("chr7"), start: 154654404, end: 154654432, + id: "TEST".to_string(), }; let unphased = false; let mut bam = create_bam_reader(&bam, &fasta); @@ -157,6 +158,7 @@ fn test_get_overlapping_reads_url1() { chrom: String::from("chr20"), start: 154654404, end: 154654432, + id: "TEST".to_string(), }; let unphased = false; let mut bam = create_bam_reader(&bam, &fasta); @@ -171,6 +173,7 @@ fn test_get_overlapping_reads_url2() { chrom: String::from("chr20"), start: 154654404, end: 154654432, + id: "TEST".to_string(), }; let unphased = false; let mut bam = create_bam_reader(&bam, &fasta); @@ -185,6 +188,7 @@ fn test_get_overlapping_reads_url3() { chrom: String::from("chr20"), start: 154654404, end: 154654432, + id: "TEST".to_string(), }; let unphased = false; let mut bam = create_bam_reader(&bam, &fasta); @@ -199,6 +203,7 @@ fn test_get_overlapping_reads_url4() { chrom: String::from("chr20"), start: 154654404, end: 154654432, + id: "TEST".to_string(), }; let unphased = false; let mut bam = create_bam_reader(&bam, &fasta); diff --git a/src/phase_insertions.rs b/src/phase_insertions.rs index b351139..d1cb85d 100644 --- a/src/phase_insertions.rs +++ b/src/phase_insertions.rs @@ -332,6 +332,7 @@ mod tests { chrom: "chr7".to_string(), start: 154654404, end: 154654432, + id: "TEST".to_string(), }, false, ); @@ -372,6 +373,7 @@ mod tests { chrom: "chr7".to_string(), start: 154654404, end: 154654432, + id: "TEST".to_string(), }, false, ); @@ -411,6 +413,7 @@ mod tests { chrom: "chr7".to_string(), start: 154654404, end: 154654432, + id: "TEST".to_string(), }, false, ); @@ -453,6 +456,7 @@ mod tests { chrom: "chr7".to_string(), start: 154654404, end: 154654432, + id: "TEST".to_string(), }, false, ); @@ -493,6 +497,7 @@ mod tests { chrom: "chr7".to_string(), start: 154654404, end: 154654432, + id: "TEST_RFC1".to_string(), }, false, ); @@ -564,6 +569,7 @@ mod tests { chrom: "chr7".to_string(), start: 154654404, end: 154654432, + id: "TEST_RFC1".to_string(), }, false, ); diff --git a/src/repeats.rs b/src/repeats.rs index bc8f86a..5f6a8d5 100644 --- a/src/repeats.rs +++ b/src/repeats.rs @@ -21,7 +21,9 @@ impl RepeatIntervalIterator { let end: u32 = interval.split('-').collect::>()[1] .parse() .unwrap(); - let repeat = RepeatInterval::new_interval(chrom, start, end, fasta) + let id = String::new(); + let motifs = String::new(); + let repeat = RepeatInterval::new_interval(chrom, start, end, fasta, id, motifs) .expect("Failed to create repeat interval"); RepeatIntervalIterator { current_index: 0, @@ -74,6 +76,8 @@ impl Clone for RepeatInterval { chrom: self.chrom.clone(), start: self.start, end: self.end, + id: self.id.clone(), + motifs: self.motifs.clone(), } } } @@ -99,6 +103,8 @@ pub struct RepeatInterval { pub chrom: String, pub start: u32, pub end: u32, + pub id: String, + pub motifs: String, } impl fmt::Display for RepeatInterval { @@ -117,10 +123,26 @@ impl RepeatInterval { let chrom = rec.chrom().to_string(); let start = rec.start().try_into().unwrap(); let end = rec.end().try_into().unwrap(); - RepeatInterval::new_interval(chrom, start, end, fasta) + // TODO: Should be ID + let mut id = String::new(); + let mut motifs = String::new(); + if let Some(info) = rec.name() { + let key_value_pairs: Vec<&str> = info.split(';').collect(); + // Find and extract the value associated with the "ID" key + for pair in key_value_pairs { + if pair.starts_with("ID=") { + id = pair.trim_start_matches("ID=").to_string(); + } + else if pair.starts_with("MOTIFS=") { + motifs = pair.trim_start_matches("MOTIFS=").to_string(); + break; + } + } + } + RepeatInterval::new_interval(chrom, start, end, fasta, id, motifs) } - fn new_interval(chrom: String, start: u32, end: u32, fasta: &str) -> Option { + fn new_interval(chrom: String, start: u32, end: u32, fasta: &str, id: String, motifs: String) -> Option { if end < start { panic!("End coordinate is smaller than start coordinate for {chrom}:{start}-{end}") } @@ -138,7 +160,7 @@ impl RepeatInterval { .expect("Failed parsing chromosome length from fai file") > end { - return Some(Self { chrom, start, end }); + return Some(Self { chrom, start, end, id, motifs}); } } // if the chromosome is not in the fai file or the end does not fit the interval, return None @@ -146,11 +168,13 @@ impl RepeatInterval { "Chromosome {chrom} is not in the fasta file or the end coordinate is out of bounds" ); } - pub fn new(chrom: &str, start: u32, end: u32) -> Self { + pub fn new(chrom: &str, start: u32, end: u32, id: &str, motifs: &str) -> Self { Self { chrom: chrom.to_string(), start, end, + id: id.to_string(), + motifs: motifs.to_string(), } } diff --git a/src/vcf.rs b/src/vcf.rs index 5f5960c..08c942f 100644 --- a/src/vcf.rs +++ b/src/vcf.rs @@ -7,6 +7,8 @@ use std::cmp::Ordering; use std::fmt; use std::io::Read; +use crate::motif::create_mc; + pub struct Allele { pub length: String, // length of the consensus sequence minus the length of the repeat sequence pub full_length: String, // length of the consensus sequence @@ -55,6 +57,9 @@ pub struct VCFRecord { pub ps: Option, // phase set identifier pub flags: String, pub allele: (String, String), + pub id: String, + pub motifs: String, + pub mc: (String, String), } impl VCFRecord { @@ -87,6 +92,8 @@ impl VCFRecord { "Genotyping {repeat}:{repeat_ref_sequence} with {} and {}", allele1.seq, allele2.seq, ); + let mc1 = create_mc(&repeat.motifs, &allele1.seq); + let mc2 = create_mc(&repeat.motifs, &allele2.seq); // if the consensus is very similar to the reference the variant is considered ref // for this I use a threshold of 5% of the length of the repeat sequence in the reference // e.g. if the repeat is 300bp in the reference this will allow an edit distance of 15 @@ -158,6 +165,9 @@ impl VCFRecord { ps, flags, allele: (genotype1.to_string(), genotype2.to_string()), + id: repeat.id.clone(), + motifs: repeat.motifs.clone(), + mc: (mc1, mc2), } } @@ -182,6 +192,9 @@ impl VCFRecord { ps: None, flags: "".to_string(), allele: (".".to_string(), ".".to_string()), + id: repeat.id.clone(), + motifs: repeat.motifs.clone(), + mc: (".".to_string(), ".".to_string()), } } } @@ -191,12 +204,12 @@ impl fmt::Display for VCFRecord { match &self.alt_seq { Some(alts) => { let (FORMAT, ps) = match self.ps { - Some(ps) => ("GT:RB:FRB:SUP:SC:PS", format!(":{}", ps)), - None => ("GT:RB:FRB:SUP:SC", "".to_string()), + Some(ps) => ("GT:RB:FRB:SUP:MC:SC:PS", format!(":{}", ps)), + None => ("GT:RB:FRB:SUP:MC:SC", "".to_string()), }; write!( f, - "{chrom}\t{start}\t.\t{ref}\t{alt}\t.\t.\t{flags}END={end};STDEV={sd1},{sd2}{somatic}{outliers}\t{FORMAT}\t{genotype1}|{genotype2}:{l1},{l2}:{fl1},{fl2}:{sup1},{sup2}:{score1},{score2}{ps}", + "{chrom}\t{start}\t.\t{ref}\t{alt}\t.\t.\t{flags}REPID={id};MOTIFS={motifs};END={end};STDEV={sd1},{sd2}{somatic}{outliers}\t{FORMAT}\t{genotype1}|{genotype2}:{l1},{l2}:{fl1},{fl2}:{sup1},{sup2}:{mc1},{mc2}:{score1},{score2}{ps}", chrom = self.chrom, start = self.start, flags = self.flags, @@ -217,12 +230,17 @@ impl fmt::Display for VCFRecord { sup2 = self.support.1, score1 = self.score.0, score2 = self.score.1, + id = self.id, + motifs = self.motifs, + mc1 = self.mc.0, + mc2 = self.mc.1, + ) } None => { write!( f, - "{chrom}\t{start}\t.\t{ref}\t.\t.\t.\tEND={end};{somatic}\tGT:SUP\t{genotype1}|{genotype2}:{sup1},{sup2}", + "{chrom}\t{start}\t.\t{ref}\t.\t.\t.\tREPID={id};MOTIFS={motifs};END={end};{somatic}\tGT:SUP\t{genotype1}|{genotype2}:{sup1},{sup2}:{mc1},{mc2}", chrom = self.chrom, start = self.start, end = self.end, @@ -232,6 +250,10 @@ impl fmt::Display for VCFRecord { genotype2 = self.allele.1, sup1 = self.support.0, sup2 = self.support.1, + id = self.id, + motifs = self.motifs, + mc1 = self.mc.0, + mc2 = self.mc.1, ) } } @@ -310,6 +332,7 @@ pub fn write_vcf_header(fasta: &str, bam: &str, sample: &Option) { ); println!(r#"##FORMAT="#); println!(r#"##FORMAT="#); + println!(r#"##FORMAT="#); println!(r#"##FORMAT="#); let name = match sample { Some(name) => name, From ba2e079c5c00be5e956454310cbb33ff6636cf09 Mon Sep 17 00:00:00 2001 From: fellen31 Date: Thu, 16 May 2024 12:51:42 +0200 Subject: [PATCH 2/6] remove TODO --- src/repeats.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/repeats.rs b/src/repeats.rs index 5f6a8d5..f799d24 100644 --- a/src/repeats.rs +++ b/src/repeats.rs @@ -123,7 +123,6 @@ impl RepeatInterval { let chrom = rec.chrom().to_string(); let start = rec.start().try_into().unwrap(); let end = rec.end().try_into().unwrap(); - // TODO: Should be ID let mut id = String::new(); let mut motifs = String::new(); if let Some(info) = rec.name() { From 8107fa64210549d0d7c7ddfd684592fa425fd8f3 Mon Sep 17 00:00:00 2001 From: fellen31 Date: Thu, 16 May 2024 13:11:05 +0200 Subject: [PATCH 3/6] add header lines --- src/vcf.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/vcf.rs b/src/vcf.rs index 08c942f..b27af12 100644 --- a/src/vcf.rs +++ b/src/vcf.rs @@ -308,6 +308,12 @@ pub fn write_vcf_header(fasta: &str, bam: &str, sample: &Option) { let length = contig.next().unwrap().parse::().unwrap(); println!(r#"##contig="#, name, length); } + println!( + r#"##INFO="# + ); + println!( + r#"##INFO="# + ); println!( r#"##INFO="# ); From 4a9c0609b96586f1c0eca3a443bb33b5e0eba472 Mon Sep 17 00:00:00 2001 From: fellen31 Date: Thu, 16 May 2024 13:25:48 +0200 Subject: [PATCH 4/6] MC should be String --- src/vcf.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vcf.rs b/src/vcf.rs index b27af12..b3eb542 100644 --- a/src/vcf.rs +++ b/src/vcf.rs @@ -338,7 +338,7 @@ pub fn write_vcf_header(fasta: &str, bam: &str, sample: &Option) { ); println!(r#"##FORMAT="#); println!(r#"##FORMAT="#); - println!(r#"##FORMAT="#); + println!(r#"##FORMAT="#); println!(r#"##FORMAT="#); let name = match sample { Some(name) => name, From f08e5b16ff70fa6d7d368544e39d97543bb7261d Mon Sep 17 00:00:00 2001 From: fellen31 Date: Thu, 16 May 2024 14:21:43 +0200 Subject: [PATCH 5/6] make tests pass with mc --- src/vcf.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/vcf.rs b/src/vcf.rs index b3eb542..c7d9109 100644 --- a/src/vcf.rs +++ b/src/vcf.rs @@ -94,6 +94,7 @@ impl VCFRecord { ); let mc1 = create_mc(&repeat.motifs, &allele1.seq); let mc2 = create_mc(&repeat.motifs, &allele2.seq); + println!("{:?} {:?} {:?} {:?} {:?} {:?}", &repeat, &repeat.motifs, &mc1, &mc2, &allele1.seq, &allele2.seq); // if the consensus is very similar to the reference the variant is considered ref // for this I use a threshold of 5% of the length of the repeat sequence in the reference // e.g. if the repeat is 300bp in the reference this will allow an edit distance of 15 From 300be24097c619ea3a3d839d5d1492d67eb53149 Mon Sep 17 00:00:00 2001 From: fellen31 Date: Thu, 16 May 2024 14:21:53 +0200 Subject: [PATCH 6/6] make tests pass with mc --- src/consensus.rs | 1 + src/genotype.rs | 6 ++++++ src/motif.rs | 36 +++++++++++++++++++++++------------- src/parse_bam.rs | 5 +++++ src/phase_insertions.rs | 6 ++++++ 5 files changed, 41 insertions(+), 13 deletions(-) diff --git a/src/consensus.rs b/src/consensus.rs index 3744224..56fc0ba 100644 --- a/src/consensus.rs +++ b/src/consensus.rs @@ -198,6 +198,7 @@ mod tests { start: 1, end: 100, id: "".to_string(), + motifs: "CA".to_string(), }, ); println!("Consensus: {}", cons.seq.unwrap()); diff --git a/src/genotype.rs b/src/genotype.rs index d3b7956..e84a96c 100644 --- a/src/genotype.rs +++ b/src/genotype.rs @@ -331,6 +331,7 @@ mod tests { start: 154654404, end: 154654432, id: String::from("test-repeat"), + motifs: "CA".to_string(), }; let flanking = 2000; let minlen = 5; @@ -371,6 +372,7 @@ mod tests { start: 154654404, end: 154654432, id: "TEST".to_string(), + motifs: "CA".to_string(), }; let args = Cli { bam: String::from("test_data/small-test-phased.bam"), @@ -400,6 +402,7 @@ mod tests { start: 154654404, end: 154654432, id: "TEST".to_string(), + motifs: "CA".to_string(), }; let args = Cli { bam: String::from("test_data/small-test-phased.bam"), @@ -445,6 +448,7 @@ mod tests { start: 154654404, end: 154654432, id: "TEST".to_string(), + motifs: "CA".to_string(), }; let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta); let genotype = genotype_repeat(&repeat, &args, &mut bam); @@ -474,6 +478,7 @@ mod tests { start: 154654404, end: 154654432, id: "TEST".to_string(), + motifs: "CA".to_string(), }; let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta); let genotype = genotype_repeat(&repeat, &args, &mut bam); @@ -504,6 +509,7 @@ mod tests { start: 154654404, end: 154654432, id: "TEST".to_string(), + motifs: "CA".to_string(), }; let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta); let genotype = genotype_repeat(&repeat, &args, &mut bam); diff --git a/src/motif.rs b/src/motif.rs index 85029e4..123f765 100644 --- a/src/motif.rs +++ b/src/motif.rs @@ -6,19 +6,18 @@ fn create_motif(seq: &str) -> String { } pub fn create_mc(motifs_string: &str, repeats: &str) -> String { - let motifs: Vec = motifs_string.split(',').map(|s| s.to_string()).collect(); - - let mut motif_counts = vec![0; motifs.len()]; - for motif in motifs.iter() { - let mut start = 0; - while let Some(index) = repeats[start..].find(motif) { - motif_counts[motifs.iter().position(|x| x == motif).unwrap()] += 1; - start += index + motif.len(); - } - } - motif_counts.iter().map(|count| count.to_string()).collect::>().join("_") -} + let motifs: Vec = motifs_string.split(',').map(|s| s.to_string()).collect(); + let mut motif_counts = vec![0; motifs.len()]; + for motif in motifs.iter() { + let mut start = 0; + while let Some(index) = repeats[start..].find(motif) { + motif_counts[motifs.iter().position(|x| x == motif).unwrap()] += 1; + start += index + motif.len(); + } + } + motif_counts.iter().map(|count| count.to_string()).collect::>().join("_") + } #[cfg(test)] mod tests { @@ -32,11 +31,22 @@ mod tests { "(CAG)4(CGG)3(CAG)3" ); } + #[test] fn test_create_mc() { let motifs = "ATG,CGT,GCA"; let repeat = "ATGCGTATGCGTAGCGT"; + println!("{:?}", create_mc(&motifs, &repeat)); + assert_eq!( + create_mc(&motifs, repeat), "2_3_0" + ); + } + #[test] + fn test_create_mc_with_n() { + let motifs = "NCG"; + let repeat = "ACGACGACGACG"; + assert_eq!( - create_mc(&motifs, repeat), "3_1_0" + create_mc(&motifs, repeat), "4" ); } } diff --git a/src/parse_bam.rs b/src/parse_bam.rs index d2d7da5..3ae0938 100644 --- a/src/parse_bam.rs +++ b/src/parse_bam.rs @@ -144,6 +144,7 @@ fn test_get_overlapping_reads() { start: 154654404, end: 154654432, id: "TEST".to_string(), + motifs: "CA".to_string(), }; let unphased = false; let mut bam = create_bam_reader(&bam, &fasta); @@ -159,6 +160,7 @@ fn test_get_overlapping_reads_url1() { start: 154654404, end: 154654432, id: "TEST".to_string(), + motifs: "CA".to_string(), }; let unphased = false; let mut bam = create_bam_reader(&bam, &fasta); @@ -174,6 +176,7 @@ fn test_get_overlapping_reads_url2() { start: 154654404, end: 154654432, id: "TEST".to_string(), + motifs: "CA".to_string(), }; let unphased = false; let mut bam = create_bam_reader(&bam, &fasta); @@ -189,6 +192,7 @@ fn test_get_overlapping_reads_url3() { start: 154654404, end: 154654432, id: "TEST".to_string(), + motifs: "CA".to_string(), }; let unphased = false; let mut bam = create_bam_reader(&bam, &fasta); @@ -204,6 +208,7 @@ fn test_get_overlapping_reads_url4() { start: 154654404, end: 154654432, id: "TEST".to_string(), + motifs: "CA".to_string(), }; let unphased = false; let mut bam = create_bam_reader(&bam, &fasta); diff --git a/src/phase_insertions.rs b/src/phase_insertions.rs index d1cb85d..6b6a3f2 100644 --- a/src/phase_insertions.rs +++ b/src/phase_insertions.rs @@ -333,6 +333,7 @@ mod tests { start: 154654404, end: 154654432, id: "TEST".to_string(), + motifs: "CA".to_string(), }, false, ); @@ -374,6 +375,7 @@ mod tests { start: 154654404, end: 154654432, id: "TEST".to_string(), + motifs: "CA".to_string(), }, false, ); @@ -414,6 +416,7 @@ mod tests { start: 154654404, end: 154654432, id: "TEST".to_string(), + motifs: "CA".to_string(), }, false, ); @@ -457,6 +460,7 @@ mod tests { start: 154654404, end: 154654432, id: "TEST".to_string(), + motifs: "CA".to_string(), }, false, ); @@ -498,6 +502,7 @@ mod tests { start: 154654404, end: 154654432, id: "TEST_RFC1".to_string(), + motifs: "CA".to_string(), }, false, ); @@ -570,6 +575,7 @@ mod tests { start: 154654404, end: 154654432, id: "TEST_RFC1".to_string(), + motifs: "CA".to_string(), }, false, );