diff --git a/jcvi/formats/bed.py b/jcvi/formats/bed.py index 3a158b37..9b1bef40 100755 --- a/jcvi/formats/bed.py +++ b/jcvi/formats/bed.py @@ -2,9 +2,7 @@ Classes to handle the .bed files """ -import logging import math -import numpy as np import os import os.path as op import shutil @@ -12,30 +10,33 @@ from collections import defaultdict, OrderedDict from itertools import groupby -from typing import Optional +from typing import Optional, Tuple + +import numpy as np from more_itertools import pairwise from natsort import natsorted, natsort_key -from jcvi.formats.base import DictFile, LineFile, must_open, is_number, get_number -from jcvi.formats.sizes import Sizes -from jcvi.utils.cbook import SummaryStats, thousands, percentage -from jcvi.utils.grouper import Grouper -from jcvi.utils.range import ( - Range, - range_union, - range_chain, - range_distance, - range_intersect, -) -from jcvi.apps.base import ( +from ..apps.base import ( OptionParser, ActionDispatcher, cleanup, + logger, need_update, popen, sh, ) +from ..formats.base import DictFile, LineFile, must_open, is_number, get_number +from ..formats.sizes import Sizes +from ..utils.cbook import SummaryStats, thousands, percentage +from ..utils.grouper import Grouper +from ..utils.range import ( + Range, + range_union, + range_chain, + range_distance, + range_intersect, +) class BedLine(object): @@ -176,7 +177,7 @@ def print_to_file(self, filename="stdout", sorted=False): fw = must_open(filename, "w") for b in self: if b.start < 1: - logging.error("Start < 1. Reset start for `{0}`.".format(b.accn)) + logger.error("Start < 1. Reset start for `%s`.", b.accn) b.start = 1 print(b, file=fw) fw.close() @@ -542,7 +543,7 @@ def gaps(args): n_gaps += 1 for seqid, gap_sizes in all_gaps.items(): total_gap_size = sum(gap_sizes) - logging.debug( + logger.debug( "Total gaps in %s: %d, %s", seqid, len(gap_sizes), @@ -622,7 +623,7 @@ def filterbedgraph(args): regions are 1, two copies .5, etc. """ p = OptionParser(filterbedgraph.__doc__) - opts, args = p.parse_args(args) + _, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) @@ -641,10 +642,11 @@ def filterbedgraph(args): print(b, file=fw) nfiltered += 1 fw.close() - logging.debug( - "A total of {} intervals (score >= {}) written to `{}`".format( - percentage(nfiltered, ntotal), cutoff, filteredbed - ) + logger.debug( + "A total of %s intervals (score >= %.2f) written to `%s`", + percentage(nfiltered, ntotal), + cutoff, + filteredbed, ) mergeBed(filteredbed, sorted=True, delim=None) @@ -726,10 +728,8 @@ def tiling(args): tilingbedfile = bedfile.rsplit(".", 1)[0] + ".tiling.bed" selected.print_to_file(filename=tilingbedfile, sorted=True) - logging.debug( - "A total of {} tiling features written to `{}`".format( - len(selected), tilingbedfile - ) + logger.debug( + "A total of %d tiling features written to `%s`", len(selected), tilingbedfile ) @@ -781,7 +781,7 @@ def density(args): Calculates density of features per seqid. """ p = OptionParser(density.__doc__) - opts, args = p.parse_args(args) + _, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) @@ -854,16 +854,14 @@ def sfa_to_fq(sfa, qvchar): name, seq = row.split() qual = len(seq) * qvchar print("\n".join(("@" + name, seq, "+", qual)), file=fw) - logging.debug("A total of {0} sequences written to `{1}`.".format(total, fq)) + logger.debug("A total of %d sequences written to `%s`.", total, fq) return fq def filter_bedpe(bedpe, filtered, ref, rc=False, rlen=None, minlen=2000, maxlen=8000): tag = " after RC" if rc else "" - logging.debug( - "Filter criteria: innie{0}, {1} <= insertsize <= {2}".format( - tag, minlen, maxlen - ) + logger.debug( + "Filter criteria: innie%s, %d <= insertsize <= %d", tag, minlen, maxlen ) sizes = Sizes(ref).mapping fp = must_open(bedpe) @@ -883,10 +881,8 @@ def filter_bedpe(bedpe, filtered, ref, rc=False, rlen=None, minlen=2000, maxlen= if rlen: b.extend(rlen, sizes[b.seqid1]) print(b, file=fw) - logging.debug( - "A total of {0} mates written to `{1}`.".format( - percentage(retained, total), filtered - ) + logger.debug( + "A total of %d mates written to `%s`.", percentage(retained, total), filtered ) fw.close() @@ -896,12 +892,12 @@ def rmdup_bedpe(filtered, rmdup, dupwiggle=10): if need_update(filtered, sortedfiltered): sh("sort -k1,1 -k2,2n -i {0} -o {1}".format(filtered, sortedfiltered)) - logging.debug("Rmdup criteria: wiggle <= {0}".format(dupwiggle)) + logger.debug("Rmdup criteria: wiggle <= %d", dupwiggle) fp = must_open(sortedfiltered) fw = must_open(rmdup, "w") data = [BedpeLine(x) for x in fp] retained = total = 0 - for seqid, ss in groupby(data, key=lambda x: x.seqid1): + for _, ss in groupby(data, key=lambda x: x.seqid1): ss = list(ss) for i, a in enumerate(ss): if a.isdup: @@ -922,10 +918,8 @@ def rmdup_bedpe(filtered, rmdup, dupwiggle=10): continue retained += 1 print(a, file=fw) - logging.debug( - "A total of {0} mates written to `{1}`.".format( - percentage(retained, total), rmdup - ) + logger.debug( + "A total of %s mates written to `%s`.", percentage(retained, total), rmdup ) fw.close() @@ -1102,7 +1096,7 @@ def random(args): outfile = bedfile.rsplit(".", 1)[0] + ".{0}.bed".format(N) new_bed.print_to_file(outfile) - logging.debug("Write {0} features to `{1}`".format(NN, outfile)) + logger.debug("Write %d features to `%s`", NN, outfile) def filter(args): @@ -1154,8 +1148,8 @@ def filter(args): print(b, file=fw) keep.append(span) - logging.debug("Stats: {0} features kept.".format(percentage(len(keep), len(total)))) - logging.debug("Stats: {0} bases kept.".format(percentage(sum(keep), sum(total)))) + logger.debug("Stats: %s features kept.", percentage(len(keep), len(total))) + logger.debug("Stats: %s bases kept.", percentage(sum(keep), sum(total))) def make_bedgraph(bedfile, fastafile): @@ -1284,15 +1278,13 @@ def longest(args): ids.add(max_accn) newids = remove_isoforms(ids) - logging.debug("Remove isoforms: before={0} after={1}".format(len(ids), len(newids))) + logger.debug("Remove isoforms: before=%d after=%d", len(ids), len(newids)) longestidsfile = pf + ".longest.ids" fw = open(longestidsfile, "w") print("\n".join(newids), file=fw) fw.close() - logging.debug( - "A total of {0} records written to `{1}`.".format(len(newids), longestidsfile) - ) + logger.debug("A total of %d records written to `%s`.", len(newids), longestidsfile) longestbedfile = pf + ".longest.bed" some( @@ -1372,9 +1364,9 @@ def fix(args): ntotal += 1 if nfixed: - logging.debug("Total fixed: {0}".format(percentage(nfixed, ntotal))) + logger.debug("Total fixed: %s".format(percentage(nfixed, ntotal))) if nfiltered: - logging.debug("Total filtered: {0}".format(percentage(nfiltered, ntotal))) + logger.debug("Total filtered: %s".format(percentage(nfiltered, ntotal))) def some(args): @@ -1422,7 +1414,7 @@ def some(args): print(b, file=fw) fw.close() - logging.debug("Stats: {0} features kept.".format(percentage(nkeep, ntotal))) + logger.debug("Stats: %s features kept.".format(percentage(nkeep, ntotal))) def uniq(args): @@ -1478,7 +1470,7 @@ def uniq(args): leftoverbed.extend(notselected) leftoverbed.print_to_file(leftoverfile, sorted=True) - logging.debug("Imported: {0}, Exported: {1}".format(len(bed), len(newbed))) + logger.debug("Imported: %d, Exported: %d", len(bed), len(newbed)) return uniqbedfile @@ -1505,6 +1497,16 @@ def subtractbins(binfile1, binfile2): return binfile1 +def get_nbins(clen: int, shift: int) -> Tuple[int, int]: + """ + Get the number of bins for a given chromosome length and shift. + """ + nbins, last_bin = divmod(clen, shift) + if last_bin: + nbins += 1 + return nbins, last_bin + + def bins(args): """ %prog bins bedfile fastafile @@ -1512,7 +1514,6 @@ def bins(args): Bin bed lengths into each consecutive window. Use --subtract to remove bases from window, e.g. --subtract gaps.bed ignores the gap sequences. """ - from jcvi.formats.sizes import Sizes p = OptionParser(bins.__doc__) p.add_option("--binsize", default=100000, type="int", help="Size of the bins") @@ -1562,10 +1563,7 @@ def bins(args): for chr, chr_len in sorted(sizes.items()): chr_len = sizes[chr] subbeds = sbdict.get(chr, []) - nbins = chr_len / binsize - last_bin = chr_len % binsize - if last_bin: - nbins += 1 + nbins, last_bin = get_nbins(chr_len, binsize) a = np.zeros(nbins) # values b = np.zeros(nbins, dtype="int") # bases @@ -1575,8 +1573,8 @@ def bins(args): for bb in subbeds: start, end = bb.start, bb.end - startbin = start / binsize - endbin = end / binsize + startbin = start // binsize + endbin = end // binsize assert startbin <= endbin c[startbin : endbin + 1] += 1 @@ -1638,7 +1636,7 @@ def pile(args): ngroups += 1 print("|".join(group)) - logging.debug("A total of {0} piles (>= 2 members)".format(ngroups)) + logger.debug("A total of %d piles (>= 2 members)", ngroups) def index(args): @@ -1718,7 +1716,7 @@ def mergeBed( if nms: nargs = len(open(bedfile).readline().split()) if nargs <= 3: - logging.debug("Only {0} columns detected... set nms=True".format(nargs)) + logger.debug("Only %d columns detected... set nms=True", nargs) else: cmd += " -c 4 -o collapse" if s: @@ -1952,7 +1950,7 @@ def distance(args): print(dist) valid += 1 - logging.debug("Total valid (> 0) distances: {0}.".format(percentage(valid, total))) + logger.debug("Total valid (> 0) distances: %s.", percentage(valid, total)) def sample(args): @@ -2009,10 +2007,11 @@ def sample(args): reverse.append(b) for tag, L in zip(("forward", "reverse"), (forward, reverse)): - logging.debug( - "Selected {0} features in {1} direction, span: {2}".format( - len(L), tag, sum(x.span for x in L) - ) + logger.debug( + "Selected %d features in %s direction, span: %d", + len(L), + tag, + sum(x.span for x in L), ) selected = Bed() @@ -2032,7 +2031,7 @@ def sample(args): for b in sub_bed: print(b, file=fw) - logging.debug("File written to `%s`.", samplebed) + logger.debug("File written to `%s`.", samplebed) return c = Coverage(bedfile, sizesfile) @@ -2050,7 +2049,7 @@ def sample(args): samplebedfile = pf + ".sample.bed" cmd = "intersectBed -a {0} -b {1} -wa -u".format(bedfile, samplecoveragefile) sh(cmd, outfile=samplebedfile) - logging.debug("Sampled bedfile written to `{0}`.".format(samplebedfile)) + logger.debug("Sampled bedfile written to `%s`.", samplebedfile) def bedpe(args): @@ -2111,7 +2110,7 @@ def sizes(args): b = Bed(bedfile) for s, sbeds in b.sub_beds(): print("{0}\t{1}".format(s, max(x.end for x in sbeds)), file=fw) - logging.debug("Sizes file written to `{0}`.".format(sizesfile)) + logger.debug("Sizes file written to `%s`.", sizesfile) return sizesfile @@ -2126,19 +2125,19 @@ def analyze_dists(dists, cutoff=1000, alpha=0.1): peak0 = [d for d in dists if d < cutoff] peak1 = [d for d in dists if d >= cutoff] c0, c1 = len(peak0), len(peak1) - logging.debug("Component counts: {0} {1}".format(c0, c1)) + logger.debug("Component counts: %d %d", c0, c1) if c0 == 0 or c1 == 0 or float(c1) / len(dists) < alpha: - logging.debug( - "Single peak identified ({0} / {1} < {2})".format(c1, len(dists), alpha) - ) + logger.debug("Single peak identified (%d / %d < %.1f)", c1, len(dists), alpha) return np.median(dists) peak0_median = np.median(peak0) peak1_median = np.median(peak1) - logging.debug( - "Dual peaks identified: {0}bp ({1}), {2}bp ({3}) (selected)".format( - int(peak0_median), c0, int(peak1_median), c1 - ) + logger.debug( + "Dual peaks identified: %dbp (%d), %dbp (%d) (selected)", + int(peak0_median), + c0, + int(peak1_median), + c1, ) return peak1_median @@ -2215,10 +2214,7 @@ def report_pairs( p0 = analyze_dists(dists, cutoff=mpcutoff) cutoff = int(2 * p0) # initial estimate cutoff = int(math.ceil(cutoff / bins)) * bins - logging.debug( - "Insert size cutoff set to {0}, ".format(cutoff) - + "use '--cutoff' to override" - ) + logger.debug("Insert size cutoff set to %d, use '--cutoff' to override", cutoff) for dist, orientation, aquery, bquery in all_dist: if dist > cutoff: @@ -2354,7 +2350,7 @@ def summary(args): for span, accn in bs.mspans: print(span, file=fw) fw.close() - logging.debug("Spans written to `{0}`.".format(sizesfile)) + logger.debug("Spans written to `%s`.", sizesfile) return bs if not opts.all: @@ -2496,7 +2492,7 @@ def mates(args): num_fragments = num_pairs = 0 matesbedfile = matesfile + ".bed" fwm = open(matesbedfile, "w") - for pe, lines in groupby(bed, key=key): + for _, lines in groupby(bed, key=key): lines = list(lines) if len(lines) != 2: num_fragments += len(lines) @@ -2524,10 +2520,12 @@ def mates(args): print(a, file=fwm) print(b, file=fwm) - logging.debug( - "Discard {0} frags and write {1} pairs to `{2}` and `{3}`.".format( - num_fragments, num_pairs, matesfile, matesbedfile - ) + logger.debug( + "Discard %d frags and write %d pairs to `%s` and `%s`.", + num_fragments, + num_pairs, + matesfile, + matesbedfile, ) fw.close() diff --git a/jcvi/graphics/landscape.py b/jcvi/graphics/landscape.py index b0322b6a..a53bc1b1 100644 --- a/jcvi/graphics/landscape.py +++ b/jcvi/graphics/landscape.py @@ -18,7 +18,7 @@ from ..algorithms.matrix import moving_sum from ..apps.base import OptionParser, ActionDispatcher, logger from ..formats.base import BaseFile, DictFile, LineFile, must_open -from ..formats.bed import Bed, bins +from ..formats.bed import Bed, bins, get_nbins from ..formats.sizes import Sizes from ..utils.cbook import human_size, autoscale @@ -35,7 +35,7 @@ set_human_axis, ticker, ) - +from .chromosome import HorizontalChromosome # Colors picked from Schmutz soybean genome paper using ColorPic palette = ["#ACABD5", "#DBF0F5", "#3EA77A", "#FBF5AB", "#C162A6"] + list("rgbymck") @@ -543,6 +543,9 @@ def depth(args): def add_window_options(p): + """ + Add options for window plotting. + """ p.add_option("--window", default=500000, type="int", help="Size of window") p.add_option("--shift", default=100000, type="int", help="Size of shift") p.add_option("--subtract", help="Subtract bases from window") @@ -552,12 +555,15 @@ def add_window_options(p): def check_window_options(opts): + """ + Check the window options, and return the values. + """ window = opts.window shift = opts.shift subtract = opts.subtract assert window % shift == 0, "--window must be divisible by --shift" logger.debug( - "Line/stack-plot options: window=%d shift=%d subtract=%d", + "Line/stack-plot options: window=%d shift=%d subtract=%s", window, shift, subtract, @@ -571,13 +577,6 @@ def get_beds(s, binned=False): return [x + ".bed" for x in s] if not binned else [x for x in s] -def get_nbins(clen, shift): - nbins = clen / shift - if clen % shift: - nbins += 1 - return nbins - - def linearray(binfile, chr, window, shift): mn = binfile.mapping[chr] m, n = zip(*mn) @@ -636,8 +635,6 @@ def composite(args): --bars: show where the extent of features are --altbars: similar to bars, yet in two alternating tracks, e.g. scaffolds """ - from jcvi.graphics.chromosome import HorizontalChromosome - p = OptionParser(composite.__doc__) p.add_option("--lines", help="Features to plot in lineplot") p.add_option("--bars", help="Features to plot in bars") @@ -661,7 +658,7 @@ def composite(args): sys.exit(not p.print_help()) fastafile, chr = args - window, shift, subtract, merge = check_window_options(opts) + window, shift, _, merge = check_window_options(opts) linebeds, barbeds, altbarbeds = [], [], [] fatten = opts.fatten if opts.lines: @@ -678,7 +675,7 @@ def composite(args): margin = 0.12 clen = Sizes(fastafile).mapping[chr] - nbins = get_nbins(clen, shift) + nbins, _ = get_nbins(clen, shift) plt.rcParams["xtick.major.size"] = 0 plt.rcParams["ytick.major.size"] = 0 @@ -775,7 +772,7 @@ def multilineplot(args): sys.exit(not p.print_help()) fastafile, chr = args - window, shift, subtract, merge = check_window_options(opts) + window, shift, _, merge = check_window_options(opts) linebeds = [] colors = opts.colors if opts.lines: @@ -790,7 +787,7 @@ def multilineplot(args): ) clen = Sizes(fastafile).mapping[chr] - nbins = get_nbins(clen, shift) + nbins, _ = get_nbins(clen, shift) plt.rcParams["xtick.major.size"] = 0 plt.rcParams["ytick.major.size"] = 0 @@ -882,7 +879,7 @@ def heatmap(args): root.add_patch(Rectangle((xx, yy), xlen, yinterval - inner, color=gray)) ax = fig.add_axes((xx, yy, xlen, yinterval - inner)) - nbins = get_nbins(clen, shift) + nbins, _ = get_nbins(clen, shift) owindow = clen / 100 if owindow > window: @@ -987,11 +984,14 @@ def draw_gauge(ax, margin, maxl, rightmargin=None): def get_binfiles( inputfiles, fastafile, shift, mode="span", subtract=None, binned=False, merge=True ): + """ + Get binfiles from input files. If not binned, then bin them first. + """ if not binned: - binopts = ["--binsize={0}".format(shift)] - binopts.append("--mode={0}".format(mode)) + binopts = [f"--binsize={shift}"] + binopts.append(f"--mode={mode}") if subtract: - binopts.append("--subtract={0}".format(subtract)) + binopts.append(f"--subtract={subtract}") if not merge: binopts.append("--nomerge") binfiles = [bins([x, fastafile] + binopts) for x in inputfiles if op.exists(x)] @@ -1002,14 +1002,17 @@ def get_binfiles( return binfiles -def stackarray(binfile, chr, window, shift): +def stackarray(binfile: BinFile, chr: str, window: int, shift: int): + """ + Get stack array from binfile for the given chr. + """ mn = binfile.mapping[chr] m, n = zip(*mn) m = np.array(m, dtype="float") n = np.array(n, dtype="float") - w = window / shift + w = window // shift m = moving_sum(m, window=w) n = moving_sum(n, window=w) m /= n @@ -1017,7 +1020,18 @@ def stackarray(binfile, chr, window, shift): return m -def stackplot(ax, binfiles, nbins, palette, chr, window, shift): +def stackplot( + ax, + binfiles: List[BinFile], + nbins: int, + palette: List[str], + chr: str, + window: int, + shift: int, +): + """ + Plot stackplot on the given axes, using data from binfiles. + """ t = np.arange(nbins, dtype="float") + 0.5 m = np.zeros(nbins, dtype="float") zorders = range(10)[::-1] @@ -1094,9 +1108,7 @@ def stack(args): root.add_patch(Rectangle((xx, yy), xlen, yinterval - inner, color=gray)) ax = fig.add_axes((xx, yy, xlen, yinterval - inner)) - nbins = clen / shift - if clen % shift: - nbins += 1 + nbins, _ = get_nbins(clen, shift) stackplot(ax, binfiles, nbins, palette, chr, window, shift) root.text( diff --git a/jcvi/utils/cbook.py b/jcvi/utils/cbook.py index 1f482c17..4dda9439 100644 --- a/jcvi/utils/cbook.py +++ b/jcvi/utils/cbook.py @@ -260,7 +260,7 @@ def human_size(size, a_kilobyte_is_1024_bytes=False, precision=1, target=None): return "{0:.{1}f}{2}".format(size, precision, suffix) -def autoscale(bp, optimal=6): +def autoscale(bp: int, optimal: int = 6): """ >>> autoscale(150000000) 20000000 @@ -272,7 +272,7 @@ def autoscale(bp, optimal=6): precision = len(slen) - 2 # how many zeros we need to pad? bp_len_scaled = int(tlen) # scale bp_len to range (0, 100) tick_diffs = [(x, abs(bp_len_scaled / x - optimal)) for x in [1, 2, 5, 10]] - best_stride, best_tick_diff = min(tick_diffs, key=lambda x: x[1]) + best_stride, _ = min(tick_diffs, key=lambda x: x[1]) while precision > 0: best_stride *= 10