Skip to content

Commit

Permalink
Merge pull request #11 from genomewalker:trim-ends-cov
Browse files Browse the repository at this point in the history
Trim-ends-cov
  • Loading branch information
genomewalker authored Nov 18, 2022
2 parents d8521ca + df82847 commit 7e549ea
Show file tree
Hide file tree
Showing 4 changed files with 143 additions and 13 deletions.
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ filterBAM only needs a BAM file. For a complete list of options:
```
$ filterBAM --help
usage: filterBAM [-h] [-t THREADS] [-p PREFIX] [-A MIN_READ_ANI] [-l MIN_READ_LENGTH]
usage: filterBAM [-h] [-t THREADS] [--reference-trim-length TRIM_ENDS] [--trim-min TRIM_MIN]
[--trim-max TRIM_MAX] [-p PREFIX] [-A MIN_READ_ANI] [-l MIN_READ_LENGTH]
[-n MIN_READ_COUNT] [-b MIN_EXPECTED_BREADTH_RATIO] [-e MIN_NORM_ENTROPY]
[-g MIN_NORM_GINI] [-B MIN_BREADTH] [-a MIN_AVG_READ_ANI] [-c MIN_COVERAGE_EVENNESS]
[-m SORT_MEMORY] [-N] [--scale SCALE] [-r REFERENCE_LENGTHS] [--read-length-freqs]
Expand All @@ -75,6 +76,12 @@ optional arguments:
-h, --help show this help message and exit
-t THREADS, --threads THREADS
Number of threads to use (default: 1)
--reference-trim-length TRIM_ENDS
Exclude n bases at the ends of the reference sequences (default: 0)
--trim-min TRIM_MIN Remove coverage that are below this percentile. Used for the Truncated Average
Depth (TAD) calculation (default: 10)
--trim-max TRIM_MAX Remove coverage that are above this percentile. Used for the Truncated Average
Depth (TAD) calculation (default: 90)
-p PREFIX, --prefix PREFIX
Prefix used for the output files (default: None)
-A MIN_READ_ANI, --min-read-ani MIN_READ_ANI
Expand Down
8 changes: 8 additions & 0 deletions bam_filter/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,11 @@ def main():
)

args = get_arguments()

if args.trim_min >= args.trim_max:
log.error("trim_min must be less than trim_max")
exit(1)

logging.getLogger("my_logger").setLevel(
logging.DEBUG if args.debug else logging.INFO
)
Expand All @@ -87,6 +92,9 @@ def main():
reference_lengths=args.reference_lengths,
min_read_count=args.min_read_count,
min_read_ani=args.min_read_ani,
trim_ends=args.trim_ends,
trim_min=args.trim_min,
trim_max=args.trim_max,
scale=args.scale,
sort_memory=args.sort_memory,
plot=args.plot,
Expand Down
86 changes: 83 additions & 3 deletions bam_filter/sam_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,20 @@
sys.setrecursionlimit(10**6)


def get_tad(cov, trim_min=10, trim_max=90):
"""
Get the TAD of a coverage
"""
cov = cov[
(cov >= np.percentile(cov, trim_min)) & (cov <= np.percentile(cov, trim_max))
]

if sum(cov) == 0:
return 0, None
else:
return sum(cov) / len(cov), len(cov)


# Function to calculate evenness of coverage
def coverage_evenness(coverage):
"""
Expand Down Expand Up @@ -97,6 +111,9 @@ def get_bam_stats(
ref_lengths=None,
min_read_ani=90.0,
scale=1e6,
trim_ends=0,
trim_min=10,
trim_max=90,
plot=False,
plots_dir="coverage-plots",
read_length_freqs=False,
Expand Down Expand Up @@ -188,6 +205,21 @@ def get_bam_stats(
# max_depth=100000000,
# )
# ]
if trim_ends > len(cov_np) or trim_ends * 2 > len(cov_np):
log.warning(
f"Trimming ends ({trim_ends}) is larger than reference length ({len(cov_np)}). Disabling trimming."
)
trim_ends = 0

if trim_ends > 0:
cov_np = cov_np[trim_ends:-trim_ends]

mean_coverage_trunc, mean_coverage_trunc_len = get_tad(
cov_np,
trim_min=10,
trim_max=90,
)

cov_pos = cov_np[cov_np > 0]
cov_positions = np.where(cov_np > 0)[0]
# convert datafrane to pyranges
Expand All @@ -205,10 +237,10 @@ def get_bam_stats(
cov_sd = np.std(cov_pos, ddof=1)
cov_var = np.var(cov_pos, ddof=1)
# get average coverage
mean_coverage = sum(cov_pos) / reference_length
mean_coverage = sum(cov_pos) / (reference_length - (2 * trim_ends))
mean_coverage_covered = sum(cov_pos) / bases_covered

breadth = bases_covered / reference_length
breadth = bases_covered / (reference_length - (2 * trim_ends))
exp_breadth = 1 - np.exp(-mean_coverage)
breadth_exp_ratio = breadth / exp_breadth

Expand All @@ -229,6 +261,23 @@ def get_bam_stats(
tax_abund_aln = round((n_alns / reference_length) * scale)
tax_abund_read = round((len(set(read_names)) / reference_length) * scale)

# Using the trimmed mean to estimate number of reads that map to the reference
# This is to avoid the issue of having a very high coverage region that
# skews the mean coverage
# C = LN / G
# • C stands for coverage
# • G is the haploid genome length
# • L is the read length
# • N is the number of reads
#
if mean_coverage_trunc_len > 0 and mean_coverage_trunc > 0:
n_reads_tad = round(
(reference_length * mean_coverage_trunc) / np.mean(read_length)
)
tax_abund_tad = round((n_reads_tad / mean_coverage_trunc_len) * scale)
else:
n_reads_tad = 0
tax_abund_tad = 0
# Analyse site distribution
n_sites = len(cov_pos)
genome_length = bam_reference_length
Expand All @@ -250,6 +299,8 @@ def get_bam_stats(
log.debug(f"Number of alignments: {n_alns:,}")
log.debug(f"Bases covered: {bases_covered:,}")
log.debug(f"Mean coverage: {mean_coverage:.2f}")
log.debug(f"Mean coverage (truncated): {mean_coverage_trunc:.2f}")
log.debug(f"Reference length (truncated): {mean_coverage_trunc_len:.2f}")
log.debug(f"Mean coverage covered: {mean_coverage_covered:.2f}")
log.debug(f"Max covered bases: {max_covered_bases:,}")
log.debug(f"Mean covered bases: {mean_covered_bases:.2f}")
Expand All @@ -270,7 +321,8 @@ def get_bam_stats(
log.debug(f"GC content: {gc_content:.2f}")
log.debug(f"Taxonomic abundance (alns): {tax_abund_aln:.2f}")
log.debug(f"Taxonomic abundance (reads): {tax_abund_read:.2f}")

log.debug(f"Taxonomic abundance (TAD): {tax_abund_tad:.2f}")
log.debug(f"Number of reads (TAD): {n_reads_tad:,}")
if plot:
fig, ax = plt.subplots(nrows=1, ncols=1) # create figure & 1 axis
# infer number of bins using Freedman-Diaconis rule
Expand Down Expand Up @@ -305,6 +357,8 @@ def get_bam_stats(
reference_length=reference_length,
bam_reference_length=bam_reference_length,
mean_coverage=mean_coverage,
mean_coverage_trunc=mean_coverage_trunc,
mean_coverage_trunc_len=mean_coverage_trunc_len,
mean_coverage_covered=mean_coverage_covered,
bases_covered=bases_covered,
max_covered_bases=max_covered_bases,
Expand Down Expand Up @@ -333,6 +387,8 @@ def get_bam_stats(
read_aln_score=read_aln_score,
tax_abund_aln=tax_abund_aln,
tax_abund_read=tax_abund_read,
tax_abund_tad=tax_abund_tad,
n_reads_tad=n_reads_tad,
)
results.append(data)
samfile.close()
Expand Down Expand Up @@ -375,6 +431,8 @@ def __init__(
max_covered_bases,
mean_covered_bases,
mean_coverage,
mean_coverage_trunc,
mean_coverage_trunc_len,
mean_coverage_covered,
reference_length,
bam_reference_length,
Expand All @@ -394,6 +452,8 @@ def __init__(
read_aln_score,
tax_abund_aln,
tax_abund_read,
tax_abund_tad,
n_reads_tad,
):
self.reference = reference
self.n_alns = n_alns
Expand All @@ -410,6 +470,8 @@ def __init__(
self.max_covered_bases = max_covered_bases
self.mean_covered_bases = mean_covered_bases
self.mean_coverage = mean_coverage
self.mean_coverage_trunc = mean_coverage_trunc
self.mean_coverage_trunc_len = mean_coverage_trunc_len
self.mean_coverage_covered = mean_coverage_covered
self.reference_length = reference_length
self.bam_reference_length = bam_reference_length
Expand All @@ -428,6 +490,8 @@ def __init__(
self.read_names = read_names
self.tax_abund_aln = tax_abund_aln
self.tax_abund_read = tax_abund_read
self.tax_abund_tad = tax_abund_tad
self.n_reads_tad = n_reads_tad
# function to convert class to dict

def as_dict(self):
Expand All @@ -448,6 +512,8 @@ def as_dict(self):
"max_covered_bases": self.max_covered_bases,
"mean_covered_bases": self.mean_covered_bases,
"mean_coverage": self.mean_coverage,
"mean_coverage_trunc": self.mean_coverage_trunc,
"mean_coverage_trunc_len": self.mean_coverage_trunc_len,
"mean_coverage_covered": self.mean_coverage_covered,
"reference_length": self.reference_length,
"breadth": self.breadth,
Expand All @@ -464,6 +530,8 @@ def as_dict(self):
"cov_evenness": self.cov_evenness,
"tax_abund_read": self.tax_abund_read,
"tax_abund_aln": self.tax_abund_aln,
"tax_abund_tad": self.tax_abund_tad,
"n_reads_tad": self.n_reads_tad,
}

def to_summary(self):
Expand Down Expand Up @@ -507,6 +575,8 @@ def to_summary(self):
"max_covered_bases": self.max_covered_bases,
"mean_covered_bases": self.mean_covered_bases,
"coverage_mean": self.mean_coverage,
"coverage_mean_trunc": self.mean_coverage_trunc,
"coverage_mean_trunc_len": self.mean_coverage_trunc_len,
"coverage_covered_mean": self.mean_coverage_covered,
"reference_length": self.reference_length,
"bam_reference_length": self.bam_reference_length,
Expand All @@ -524,6 +594,8 @@ def to_summary(self):
"cov_evenness": self.cov_evenness,
"tax_abund_read": self.tax_abund_read,
"tax_abund_aln": self.tax_abund_aln,
"tax_abund_tad": self.tax_abund_tad,
"n_reads_tad": self.n_reads_tad,
}

def get_read_length_freqs(self):
Expand All @@ -542,6 +614,9 @@ def process_bam(
reference_lengths=None,
min_read_ani=90.0,
min_read_count=10,
trim_ends=0,
trim_min=10,
trim_max=90,
scale=1e6,
sort_memory="1G",
plot=False,
Expand Down Expand Up @@ -657,6 +732,9 @@ def process_bam(
get_bam_stats,
ref_lengths=ref_lengths,
min_read_ani=min_read_ani,
trim_ends=0,
trim_min=trim_min,
trim_max=trim_max,
scale=scale,
plot=plot,
plots_dir=plots_dir,
Expand All @@ -680,6 +758,8 @@ def process_bam(
get_bam_stats,
ref_lengths=ref_lengths,
min_read_ani=min_read_ani,
trim_ends=0,
trim_min=trim_min,
scale=scale,
plot=plot,
plots_dir=plots_dir,
Expand Down
Loading

0 comments on commit 7e549ea

Please sign in to comment.