diff --git a/jcvi/algorithms/ml.py b/jcvi/algorithms/ml.py deleted file mode 100644 index 10079759..00000000 --- a/jcvi/algorithms/ml.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- - -""" -Machine learning algorithms. -""" -import sys - -from jcvi.apps.base import OptionParser, ActionDispatcher - - -def main(): - - actions = (("libsvm", "convert csv file to LIBSVM format"),) - p = ActionDispatcher(actions) - p.dispatch(globals()) - - -def libsvm(args): - """ - %prog libsvm csvfile prefix.ids - - Convert csv file to LIBSVM format. `prefix.ids` contains the prefix mapping. - Ga -1 - Gr 1 - - So the feature in the first column of csvfile get scanned with the prefix - and mapped to different classes. Formatting spec: - - http://svmlight.joachims.org/ - """ - from jcvi.formats.base import DictFile - - p = OptionParser(libsvm.__doc__) - opts, args = p.parse_args(args) - - if len(args) != 2: - sys.exit(not p.print_help()) - - csvfile, prefixids = args - d = DictFile(prefixids) - fp = open(csvfile) - next(fp) - for row in fp: - atoms = row.split() - klass = atoms[0] - kp = klass.split("_")[0] - klass = d.get(kp, "0") - feats = ["{0}:{1}".format(i + 1, x) for i, x in enumerate(atoms[1:])] - print(" ".join([klass] + feats)) - - -if __name__ == "__main__": - main() diff --git a/jcvi/apps/script.py b/jcvi/apps/script.py deleted file mode 100644 index 28972abe..00000000 --- a/jcvi/apps/script.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- - - -import sys -import logging - -from jcvi.formats.base import write_file -from jcvi.apps.base import OptionParser - - -default_template = """ -\"\"\" - -\"\"\" - -import sys - -{} -from jcvi.apps.base import OptionParser, ActionDispatcher - - -def main(): - - actions = ( - ('app', ''), - ) - p = ActionDispatcher(actions) - p.dispatch(globals()) - -{} - -if __name__ == '__main__': - main() -""" - -default_imports = "" - -graphic_imports = """ -from jcvi.graphics.base import plt, savefig, normalize_axes""" - -default_app = """ -def app(args): - \"\"\" - %prog app datafile - - \"\"\" - p = OptionParser(app.__doc__) - opts, args = p.parse_args(args) - - if len(args) != 1: - sys.exit(not p.print_help()) -""" - -graphic_app = """ -def app(args): - \"\"\" - %prog app datafile - - \"\"\" - p = OptionParser(app.__doc__) - opts, args, iopts = p.set_image_options(args) - - if len(args) != 1: - sys.exit(not p.print_help()) - - datafile, = args - pf = datafile.rsplit(".", 1)[0] - fig = plt.figure(1, (iopts.w, iopts.h)) - root = fig.add_axes([0, 0, 1, 1]) - - normalize_axes(root) - - image_name = pf + "." + iopts.format - savefig(image_name, dpi=iopts.dpi, iopts=iopts) -""" - - -def main(): - """ - %prog scriptname.py - - create a minimal boilerplate for a new script - """ - p = OptionParser(main.__doc__) - p.add_option( - "-g", - "--graphic", - default=False, - action="store_true", - help="Create boilerplate for a graphic script", - ) - - opts, args = p.parse_args() - if len(args) != 1: - sys.exit(not p.print_help()) - - (script,) = args - imports = graphic_imports if opts.graphic else default_imports - app = graphic_app if opts.graphic else default_app - template = default_template.format(imports, app) - write_file(script, template) - - message = "template writes to `{0}`".format(script) - if opts.graphic: - message = "graphic " + message - message = message[0].upper() + message[1:] - logging.debug(message) - - -if __name__ == "__main__": - main() diff --git a/jcvi/graphics/logo.py b/jcvi/graphics/logo.py deleted file mode 100644 index 60a240c8..00000000 --- a/jcvi/graphics/logo.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- - -""" -%prog text - -Generate logo with certain color and font. Font must be present in: -jcvi/graphics/fonts - -You can download a number of free TTF fonts from: - -""" - - -import sys - -from jcvi.graphics.base import plt, savefig, fontprop, available_fonts -from jcvi.apps.base import OptionParser - - -def main(): - p = OptionParser(__doc__) - p.add_option( - "--customfont", - default="Airswing.ttf", - choices=available_fonts, - help="Custom font name", - ) - p.add_option("--color", default="limegreen", help="Font color") - p.add_option("--size", default=36, type="int", help="Font size") - opts, args, iopts = p.set_image_options(figsize="2x1", dpi=60, format="png") - - if len(args) != 1: - sys.exit(not p.print_help()) - - (text,) = args - - plt.rcdefaults() - fig = plt.figure(1, (iopts.w, iopts.h)) - ax = fig.add_axes([0, 0, 1, 1]) - - ax.text(0.5, 0.5, text, color=opts.color, ha="center", va="center") - fontprop(ax, opts.customfont, size=opts.size) - - ax.set_xlim(0, 1) - ax.set_ylim(0, 1) - ax.set_axis_off() - - image_name = text + "." + iopts.format - savefig(image_name, dpi=iopts.dpi, iopts=iopts) - - -if __name__ == "__main__": - main() diff --git a/jcvi/graphics/whisker.py b/jcvi/graphics/whisker.py deleted file mode 100644 index be67d2c9..00000000 --- a/jcvi/graphics/whisker.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- - -""" -%prog datafile - -Generate whisker plot for two column file. The first column contains group, the -second column is values. -""" - - -import sys - -from jcvi.apps.r import RTemplate -from jcvi.apps.base import OptionParser - - -whisker_template = """ -library(ggplot2) -data <- read.table('$datafile', header=T, sep='\t') -data$$$x <- factor(data$$$x, c($levels)) -m <- ggplot(data, aes($x, $y)) -m + geom_boxplot(colour="darkgreen", notch='$notch') + opts(title='$title') + -scale_x_discrete('$lx') + scale_y_continuous('$ly') + -theme(text=element_text(size=$fontsize)) -ggsave('$outfile') -""" - - -def main(): - p = OptionParser(__doc__) - p.add_option( - "--levels", help="Reorder factors, comma-delimited [default: alphabetical]" - ) - p.add_option( - "--notch", - default=False, - action="store_true", - help="notched box plot", - ) - p.add_option("--title", default=" ", help="Title of the figure") - p.add_option("--xlabel", help="X-axis label") - p.add_option("--ylabel", help="Y-axis label") - p.add_option("--fontsize", default=16, help="Font size") - opts, args = p.parse_args() - - if len(args) != 1: - sys.exit(not p.print_help()) - - (datafile,) = args - header = open(datafile) - levels = opts.levels - xlabel = opts.xlabel - ylabel = opts.ylabel - lx, ly = next(header).rstrip().split("\t") - - lx = xlabel or lx - ly = ylabel or ly - - if levels: - levels = levels.split(",") - levels = ", ".join("'{0}'".format(d) for d in levels) - rtemplate = RTemplate(whisker_template, locals()) - rtemplate.run() - - -if __name__ == "__main__": - main() diff --git a/jcvi/projects/heterosis.py b/jcvi/projects/heterosis.py deleted file mode 100644 index d6cd3cf3..00000000 --- a/jcvi/projects/heterosis.py +++ /dev/null @@ -1,188 +0,0 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- - -""" -List of routines to analyze rice heterosis datasets. -Starting from a list of .count files. The .count files are generated by HT-seq. -""" -import os.path as op -import os -import sys -import logging - -import numpy as np -from collections import defaultdict - -from jcvi.formats.base import BaseFile -from jcvi.apps.base import OptionParser, ActionDispatcher, glob, mkdir - - -class RiceSample(BaseFile): - """ - Examples: - 1. LCS48-3_GTCCGC_L006_R_tophat_accepted_hits.count - 2. RF18-1_GTTTCG_L003_R_tophat_accepted_hits.count - - First letter is always tissue: L - leaf, R - root - Next componet before dash is sample name: - CS48 (9311) is parent for F, CS66 (nipponbare) is parent for C - Each family has a sequential id, in the second example, family ID is 18. Two - parents for that family is F (CS48) and 18, hybrid progeny is F18. - """ - - def __init__(self, filename): - super(RiceSample, self).__init__(filename) - self.shortname = name = op.basename(filename).split("_")[0] - name, rep = name.split("-") - tissue, ind = name[0], name[1:] - self.tissue, self.ind = tissue, ind - self.rep = rep - self.P1 = self.P2 = "na" - if ind in ("CS48", "CS66"): - self.label = "P1" - self.family = "Recur" - elif ind[0] in ("F", "C"): - self.label = "F1" - self.P1 = "CS48" if ind[0] == "F" else "CS66" - self.P2 = ind[1:] - self.family = int(ind[1:]) - else: - self.label = "P2" - self.family = int(ind) - - fp = open(filename) - data = [row.split() for row in fp] - self.header, self.data = zip(*data) - self.data = np.array(self.data, dtype=np.int32) - self.working = True - - def __str__(self): - return "\t".join( - str(x) - for x in (self.filename, self.tissue, self.label, self.family, self.rep) - ) - - def merge(self, rs): - logging.debug("Merge '{0}' and '{1}'".format(self.filename, rs.filename)) - self.filename = ",".join((self.filename, rs.filename)) - assert self.tissue == rs.tissue - assert self.ind == rs.ind - assert self.rep == rs.rep - assert self.label == rs.label - assert self.family == rs.family - assert self.P1 == rs.P1 - assert self.P2 == rs.P2 - assert self.header == rs.header - self.data += rs.data - - -def main(): - - actions = ( - ("prepare", "parse list of count files and group per family"), - ("mergeclean", "clean redundant merged bam files"), - ) - p = ActionDispatcher(actions) - p.dispatch(globals()) - - -def mergeclean(args): - """ - %prog mergeclean [*.bam|*.count] - - Clean redundant merged bam/count files. This usually happens after running - formats.sam.merge() several times. - """ - from itertools import groupby - from jcvi.apps.base import cleanup - - p = OptionParser(mergeclean.__doc__) - p.set_sep(sep="_", help="Separator to group per prefix") - opts, args = p.parse_args(args) - - if len(args) < 1: - sys.exit(not p.print_help()) - - files = sorted(args) - sep = opts.sep - key = lambda x: x.split(sep)[0] - mtime = lambda x: os.stat(x).st_mtime - for pf, fs in groupby(files, key=key): - fs = list(fs) - if len(fs) == 1: - continue - newest_f = max(fs, key=mtime) - print("|".join(fs), "=>", newest_f, file=sys.stderr) - fs.remove(newest_f) - cleanup(fs) - - -def merge_counts(ss, outfile): - fw = open(outfile, "w") - header = ["Gene"] + [x.shortname for x in ss] - print("\t".join(header), file=fw) - data = [ss[0].header] + [x.data for x in ss] - data = zip(*data) - for a in data: - print("\t".join(str(x) for x in a), file=fw) - logging.debug("File `{0}` written (size={1}).".format(outfile, len(ss))) - - -def prepare(args): - """ - %prog prepare countfolder families - - Parse list of count files and group per family into families folder. - """ - p = OptionParser(prepare.__doc__) - opts, args = p.parse_args(args) - - if len(args) != 2: - sys.exit(not p.print_help()) - - counts, families = args - countfiles = glob(op.join(counts, "*.count")) - countsdb = defaultdict(list) - for c in countfiles: - rs = RiceSample(c) - countsdb[(rs.tissue, rs.ind)].append(rs) - - # Merge duplicates - data sequenced in different batches - key = lambda x: (x.label, x.rep) - for (tissue, ind), rs in sorted(countsdb.items()): - rs.sort(key=key) - nrs = len(rs) - for i in range(nrs): - ri = rs[i] - if not ri.working: - continue - for j in range(i + 1, nrs): - rj = rs[j] - if key(ri) != key(rj): - continue - ri.merge(rj) - rj.working = False - countsdb[(tissue, ind)] = [x for x in rs if x.working] - - # Group into families - mkdir("families") - for (tissue, ind), r in sorted(countsdb.items()): - r = list(r) - if r[0].label != "F1": - continue - P1, P2 = r[0].P1, r[0].P2 - P1, P2 = countsdb[(tissue, P1)], countsdb[(tissue, P2)] - rs = P1 + P2 + r - groups = [1] * len(P1) + [2] * len(P2) + [3] * len(r) - assert len(rs) == len(groups) - - outfile = "-".join((tissue, ind)) - merge_counts(rs, op.join(families, outfile)) - groupsfile = outfile + ".groups" - fw = open(op.join(families, groupsfile), "w") - print(",".join(str(x) for x in groups), file=fw) - fw.close() - - -if __name__ == "__main__": - main() diff --git a/jcvi/variation/tassel.py b/jcvi/variation/tassel.py deleted file mode 100644 index e5360e28..00000000 --- a/jcvi/variation/tassel.py +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- - -""" -Run TASSEL GBS module, following manual here: -http://www.maizegenetics.net/tassel/docs/TasselPipelineGBS.pdf -""" - -import os.path as op -import sys - -from jcvi.formats.base import write_file -from jcvi.apps.base import OptionParser, ActionDispatcher, mkdir, get_abs_path - - -def main(): - - actions = (("prepare", "prepare TASSEL pipeline"),) - p = ActionDispatcher(actions) - p.dispatch(globals()) - - -def run_pipeline(thome, plugin, options, log=False): - cmd = op.join(thome, "run_pipeline.pl") - cmd += " -Xmx16g -fork1 -{0} {1} -endPlugin -runfork1".format(plugin, options) - if log: - cmd += " | tee {0}.log".format(plugin) - return cmd - - -def prepare(args): - """ - %prog prepare barcode_key.csv reference.fasta - - Prepare TASSEL pipeline. - """ - valid_enzymes = ( - "ApeKI|ApoI|BamHI|EcoT22I|HinP1I|HpaII|MseI|MspI|" - "NdeI|PasI|PstI|Sau3AI|SbfI|AsiSI-MspI|BssHII-MspI|" - "FseI-MspI|PaeR7I-HhaI|PstI-ApeKI|PstI-EcoT22I|PstI-MspI" - "PstI-TaqI|SalI-MspI|SbfI-MspI".split("|") - ) - p = OptionParser(prepare.__doc__) - p.add_option( - "--enzyme", - default="ApeKI", - choices=valid_enzymes, - help="Restriction enzyme used", - ) - p.set_home("tassel") - p.set_aligner(aligner="bwa") - p.set_cpus() - opts, args = p.parse_args(args) - - if len(args) != 2: - sys.exit(not p.print_help()) - - barcode, reference = args - thome = opts.tassel_home - reference = get_abs_path(reference) - folders = ( - "fastq", - "tagCounts", - "mergedTagCounts", - "topm", - "tbt", - "mergedTBT", - "hapmap", - "hapmap/raw", - "hapmap/mergedSNPs", - "hapmap/filt", - "hapmap/bpec", - ) - for f in folders: - mkdir(f) - - # Build the pipeline - runsh = [] - o = "-i fastq -k {0} -e {1} -o tagCounts".format(barcode, opts.enzyme) - cmd = run_pipeline(thome, "FastqToTagCountPlugin", o) - runsh.append(cmd) - - o = "-i tagCounts -o mergedTagCounts/myMasterTags.cnt" - o += " -c 5 -t mergedTagCounts/myMasterTags.cnt.fq" - cmd = run_pipeline(thome, "MergeMultipleTagCountPlugin", o) - runsh.append(cmd) - runsh.append("cd mergedTagCounts") - - cmd = "python -m jcvi.apps.{0} align --cpus {1}".format(opts.aligner, opts.cpus) - cmd += " {0} myMasterTags.cnt.fq".format(reference) - runsh.append(cmd) - runsh.append("cd ..") - - o = "-i mergedTagCounts/*.sam -o topm/myMasterTags.topm" - cmd = run_pipeline(thome, "SAMConverterPlugin", o) - runsh.append(cmd) - - o = "-i mergedTBT/myStudy.tbt.byte -y -m topm/myMasterTags.topm" - o += " -mUpd topm/myMasterTagsWithVariants.topm" - o += " -o hapmap/raw/myGBSGenos_chr+.hmp.txt" - o += " -mnF 0.8 -p myPedigreeFile.ped -mnMAF 0.02 -mnMAC 100000" - o += " -ref {0} -sC 1 -eC 10".format(reference) - cmd = run_pipeline(thome, "TagsToSNPByAlignmentPlugin", o) - runsh.append(cmd) - - o = "-hmp hapmap/raw/myGBSGenos_chr+.hmp.txt" - o += " -o hapmap/mergedSNPs/myGBSGenos_mergedSNPs_chr+.hmp.txt" - o += " -misMat 0.1 -p myPedigreeFile.ped -callHets -sC 1 -eC 10" - cmd = run_pipeline(thome, "MergeDuplicateSNPsPlugin", o) - runsh.append(cmd) - - o = "-hmp hapmap/mergedSNPs/myGBSGenos_mergedSNPs_chr+.hmp.txt" - o += " -o hapmap/filt/myGBSGenos_mergedSNPsFilt_chr+.hmp.txt" - o += " -mnTCov 0.01 -mnSCov 0.2 -mnMAF 0.01 -sC 1 -eC 10" - # o += "-hLD -mnR2 0.2 -mnBonP 0.005" - cmd = run_pipeline(thome, "GBSHapMapFiltersPlugin", o) - runsh.append(cmd) - - runfile = "run.sh" - write_file(runfile, "\n".join(runsh)) - - -if __name__ == "__main__": - main()