Skip to content

Commit

Permalink
adding --consensus_reads to limit number of reads to use for generati…
Browse files Browse the repository at this point in the history
…ng consensus
  • Loading branch information
wdecoster committed Aug 4, 2024
1 parent b467d86 commit 6406987
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 42 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "STRdust"
version = "0.9.1"
version = "0.10.0"
edition = "2021"


Expand Down
26 changes: 5 additions & 21 deletions src/consensus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ impl fmt::Display for Consensus {
pub fn consensus(
seqs: &[String],
support: usize,
consensus_reads: usize,
repeat: &crate::repeats::RepeatInterval,
) -> Consensus {
if seqs.is_empty() {
Expand Down Expand Up @@ -66,36 +67,18 @@ pub fn consensus(
} else {
// if there are more than 20 reads, downsample to 20 before taking the consensus
// for performance and memory reasons
let seqs = if num_reads > 20 {
debug!("{repeat}: Too many reads, downsampling to 20");
seqs.choose_multiple(&mut rand::thread_rng(), 20)
let seqs = if num_reads > consensus_reads {
debug!("{repeat}: Too many reads, downsampling to {consensus_reads}");
seqs.choose_multiple(&mut rand::thread_rng(), consensus_reads)
.cloned()
.collect::<Vec<&String>>()
} else {
seqs
};
let mut seqs_bytes = vec![];
// code below is for rust-bio, but going back to rust-spoa now to avoid errors
// hopefully able to revert back to rust-bio as soon as those errors are patched
for seq in seqs.iter() {
seqs_bytes.push(seq.to_string().bytes().collect::<Vec<u8>>());
}

// let consensus_max_length = seqs.iter().map(|x| x.len()).max().unwrap_or(0);
// for seq in seqs.iter() {
// seqs_bytes.push(format!("{seq}\0").bytes().collect::<Vec<u8>>());
// }
// let consensus = poa_consensus(
// &seqs_bytes,
// consensus_max_length,
// 1, // 0 = local, 1 = global, 2 = gapped
// 3, // match_score,
// -4, // mismatch_score,
// -12, // gap_open,
// -6, // gap_extend,
// );

// code below is again for rust-bio poa
// I empirically determined the following parameters to be suitable,
// but further testing on other repeats would be good
// mainly have to make sure the consensus does not get longer than the individual insertions
Expand Down Expand Up @@ -196,6 +179,7 @@ mod tests {
let cons = consensus(
&seqs,
10,
20,
&crate::repeats::RepeatInterval {
chrom: "chr1".to_string(),
start: 1,
Expand Down
50 changes: 30 additions & 20 deletions src/genotype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ fn genotype_repeat(
repeat,
"N",
".".to_string(),
args
args,
))
}
};
Expand Down Expand Up @@ -76,7 +76,7 @@ fn genotype_repeat(
repeat,
&repeat_ref_seq,
0.to_string(),
args
args,
));
}
};
Expand Down Expand Up @@ -127,12 +127,12 @@ fn genotype_repeat(
repeat,
&repeat_ref_seq,
insertions.len().to_string(),
args
args,
));
}
// there is only one haplotype, haploid, so this gets duplicated for reporting in the VCF module
// Ideally vcf.rs would explicitly handle haploid chromosomes
let consensus = crate::consensus::consensus(&insertions, args.support, repeat);
let consensus = crate::consensus::consensus(&insertions, args.support, args.consensus_reads, repeat);
consenses.push(consensus.clone());
consenses.push(consensus);
if let Some(ref mut all_ins) = all_insertions {
Expand All @@ -148,12 +148,15 @@ fn genotype_repeat(
if insertions.len() < args.support {
// Return a missing genotype if not enough insertions are found
// this is too lenient - the support parameter is meant to be per haplotype
debug!("{repeat}: Not enough insertions found: {}", insertions.len());
debug!(
"{repeat}: Not enough insertions found: {}",
insertions.len()
);
return Ok(crate::vcf::VCFRecord::missing_genotype(
repeat,
&repeat_ref_seq,
insertions.len().to_string(),
args
args,
));
}
// handle the special case where there is only 1 read for the repeat (but minimal support is 1 so it passes)
Expand All @@ -166,7 +169,7 @@ fn genotype_repeat(
all_insertions,
reads.ps,
flags,
args
args,
));
}

Expand All @@ -177,9 +180,10 @@ fn genotype_repeat(
consenses.push(crate::consensus::consensus(
&phased.hap1,
args.support,
args.consensus_reads,
repeat,
));
consenses.push(crate::consensus::consensus(&phase2, args.support, repeat));
consenses.push(crate::consensus::consensus(&phase2, args.support, args.consensus_reads, repeat));
// store all inserted sequences for identifying somatic variation
if let Some(ref mut all_ins) = all_insertions {
all_ins.extend([phased.hap1.join(":"), phase2.join(":")]);
Expand All @@ -189,7 +193,7 @@ fn genotype_repeat(
// there was only one haplotype, homozygous, so this gets duplicated for reporting
// not sure if cloning is the best approach here, but this is only the case for unphased data
// and therefore is typically for small datasets obtained through capture methods
let consensus = crate::consensus::consensus(&phased.hap1, args.support, repeat);
let consensus = crate::consensus::consensus(&phased.hap1, args.support, args.consensus_reads, repeat);
consenses.push(consensus.clone());
consenses.push(consensus);
// store all inserted sequences for identifying somatic variation
Expand Down Expand Up @@ -225,6 +229,7 @@ fn genotype_repeat(
consenses.push(crate::consensus::consensus(
&insertions,
args.support,
args.consensus_reads,
repeat,
));

Expand All @@ -242,7 +247,7 @@ fn genotype_repeat(
repeat,
reads.ps,
flags,
args
args,
))
}

Expand Down Expand Up @@ -356,7 +361,7 @@ mod tests {
chrom: String::from("chr7"),
start: 154654404,
end: 154654432,
created: None
created: None,
};
let flanking = 2000;
let minlen = 5;
Expand Down Expand Up @@ -396,7 +401,7 @@ mod tests {
chrom: String::from("chr7"),
start: 154654404,
end: 154654432,
created: None
created: None,
};
let args = Cli {
bam: String::from("test_data/small-test-phased.bam"),
Expand All @@ -414,6 +419,7 @@ mod tests {
haploid: None,
debug: false,
sorted: false,
consensus_reads: 20,
};
let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta);
let genotype = genotype_repeat(&repeat, &args, &mut bam);
Expand All @@ -426,7 +432,7 @@ mod tests {
chrom: String::from("chr7"),
start: 154654404,
end: 154654432,
created: None
created: None,
};
let args = Cli {
bam: String::from("test_data/small-test-phased.bam"),
Expand All @@ -443,7 +449,8 @@ mod tests {
sample: None,
haploid: Some(String::from("chr7")),
debug: false,
sorted: false
sorted: false,
consensus_reads: 20,
};
let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta);
let genotype = genotype_repeat(&repeat, &args, &mut bam);
Expand All @@ -467,13 +474,14 @@ mod tests {
sample: None,
haploid: None,
debug: false,
sorted: false
sorted: false,
consensus_reads: 20,
};
let repeat = crate::repeats::RepeatInterval {
chrom: String::from("chr7"),
start: 154654404,
end: 154654432,
created: None
created: None,
};
let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta);
let genotype = genotype_repeat(&repeat, &args, &mut bam);
Expand All @@ -497,13 +505,14 @@ mod tests {
sample: None,
haploid: None,
debug: false,
sorted: false
sorted: false,
consensus_reads: 20,
};
let repeat = crate::repeats::RepeatInterval {
chrom: String::from("chr7"),
start: 154654404,
end: 154654432,
created: None
created: None,
};
let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta);
let genotype = genotype_repeat(&repeat, &args, &mut bam);
Expand All @@ -527,14 +536,15 @@ mod tests {
sample: None,
haploid: None,
debug: false,
sorted: false
sorted: false,
consensus_reads: 20
};

let repeat = crate::repeats::RepeatInterval {
chrom: String::from("chr7"),
start: 154654404,
end: 154654432,
created: None
created: None,
};
let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta);
let genotype = genotype_repeat(&repeat, &args, &mut bam);
Expand Down
4 changes: 4 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ pub struct Cli {
/// Sort output by chrom, start and end
#[clap(long, value_parser, default_value_t = false)]
sorted: bool,

/// Max number of reads to use to generate consensus alt sequence
#[clap(long, value_parser, default_value_t = 20)]
consensus_reads: usize,
}

fn is_file(pathname: &str) -> Result<(), String> {
Expand Down

0 comments on commit 6406987

Please sign in to comment.