diff --git a/jcvi/projects/sugarcane.py b/jcvi/projects/sugarcane.py index 887dfb31..1fd63131 100644 --- a/jcvi/projects/sugarcane.py +++ b/jcvi/projects/sugarcane.py @@ -7,13 +7,18 @@ # Created by Haibao Tang on 12/02/19 # Copyright © 2019 Haibao Tang. All rights reserved. # +""" +Simulate sugarcane genomes and analyze the diversity in the progeny genomes. +""" + import os.path as op import sys from collections import Counter, defaultdict +from enum import Enum from itertools import combinations, groupby, product from random import random, sample -from typing import Dict +from typing import Dict, List import numpy as np import matplotlib.pyplot as plt @@ -23,11 +28,22 @@ from ..apps.base import ActionDispatcher, OptionParser, logger, mkdir from ..formats.blast import Blast from ..graphics.base import adjust_spines, markup, normalize_axes, savefig -from ..utils.validator import validate_in_choices SoColor = "#7436a4" # Purple SsColor = "#5a8340" # Green + +class CrossMode(Enum): + """ + How the F1 is generated. + """ + + nplusn = "n+n" + nx2plusn = "nx2+n" + twoplusnFDR = "2n+n_FDR" + twoplusnSDR = "2n+n_SDR" + + # Computed using prepare(), corrected with real sizes ChrSizes = { "SO-chr01": 148750011, @@ -53,7 +69,9 @@ # Simulate genome composition class Genome: - def __init__(self, name, prefix, ploidy, haploid_chromosome_count): + def __init__( + self, name: str, prefix: str, ploidy: int, haploid_chromosome_count: int + ): """ Simulate a genome with given ploidy and haploid_chromosome_count. Example: @@ -72,7 +90,7 @@ def __len__(self): return len(self.chromosomes) @classmethod - def make(cls, name, chromosomes): + def make(cls, name: str, chromosomes: List[str]): genome = Genome(name, "", 0, 0) genome.chromosomes = chromosomes return genome @@ -101,7 +119,7 @@ def prefix(x): return x.split("_", 1)[0] # Randomly assign the rest, singleton chromosomes - for group, chromosomes in groupby(singleton_chromosomes, key=prefix): + for _, chromosomes in groupby(singleton_chromosomes, key=prefix): chromosomes = list(chromosomes) halfn = len(chromosomes) // 2 # Odd number, e.g. 5, equal chance to be 2 or 3 @@ -110,7 +128,7 @@ def prefix(x): gamete_chromosomes += sorted(sample(chromosomes, halfn)) return Genome.make(self.name + " gamete", gamete_chromosomes) - def mate_nplusn(self, name, other_genome, verbose=True): + def mate_nplusn(self, name: str, other_genome: "Genome", verbose: bool = True): if verbose: print( f"Crossing '{self.name}' x '{other_genome.name}' (n+n)", file=sys.stderr @@ -120,7 +138,7 @@ def mate_nplusn(self, name, other_genome, verbose=True): ) return Genome.make(name, f1_chromosomes) - def mate_nx2plusn(self, name, other_genome, verbose=True): + def mate_nx2plusn(self, name: str, other_genome: "Genome", verbose: bool = True): if verbose: print( f"Crossing '{self.name}' x '{other_genome.name}' (2xn+n)", @@ -131,15 +149,23 @@ def mate_nx2plusn(self, name, other_genome, verbose=True): ) return Genome.make(name, f1_chromosomes) - def mate_2nplusn(self, name, other_genome, verbose=True): + def mate_2nplusn_FDR(self, name: str, other_genome: "Genome", verbose: bool = True): if verbose: print( - f"Crossing '{self.name}' x '{other_genome.name}' (2n+n)", + f"Crossing '{self.name}' x '{other_genome.name}' (2n+n_FDR)", file=sys.stderr, ) f1_chromosomes = sorted(self.chromosomes + other_genome.gamete.chromosomes) return Genome.make(name, f1_chromosomes) + def mate_2nplusn_SDR(self, name: str, other_genome: "Genome", verbose: bool = True): + if verbose: + print( + f"Crossing '{self.name}' x '{other_genome.name}' (2n+n_SDR)", + file=sys.stderr, + ) + raise NotImplementedError("2n+n_SDR not yet supported") + def __str__(self): return self.name + ": " + ",".join(self.chromosomes) @@ -186,27 +212,27 @@ def __init__(self, SO_data, SS_data, percent_SO_data): self.percent_SS_data = [100 - x for x in percent_SO_data] def _summary(self, a, tag, precision=0): - mean, min, max = ( + mean, mn, mx = ( round(np.mean(a), precision), round(np.min(a), precision), round(np.max(a), precision), ) s = f"*{tag}* chr: {mean:.0f}" - if min == mean and max == mean: + if mn == mean and mx == mean: return s - return s + f" ({min:.0f}-{max:.0f})" + return s + f" ({mn:.0f}-{mx:.0f})" def _percent_summary(self, a, tag, precision=1): - mean, min, max = ( + mean, mn, mx = ( round(np.mean(a), precision), round(np.min(a), precision), round(np.max(a), precision), ) s = f"*{tag}*%: {mean:.1f}%" print(s) - if min == mean and max == mean: + if mn == mean and mx == mean: return s - return s + f" ({min:.1f}-{max:.1f}%)" + return s + f" ({mn:.1f}-{mx:.1f}%)" @property def percent_SO_summary(self): @@ -225,18 +251,19 @@ def SS_summary(self): return self._summary(self.SS_data, "Ss") -def simulate_F1(SO, SS, mode="nx2+n", verbose=False): - SO_SS_F1 = ( - SO.mate_nx2plusn("SOxSS F1", SS, verbose=verbose) - if mode == "nx2+n" - else SO.mate_2nplusn("SOxSS F1", SS, verbose=verbose) - ) +def simulate_F1(SO: Genome, SS: Genome, mode: CrossMode, verbose: bool = False): + if mode == CrossMode.nx2plusn: + SO_SS_F1 = SO.mate_nx2plusn("SOxSS F1", SS, verbose=verbose) + elif mode == CrossMode.twoplusnFDR: + SO_SS_F1 = SO.mate_2nplusn_FDR("SOxSS F1", SS, verbose=verbose) + elif mode == CrossMode.twoplusnSDR: + SO_SS_F1 = SO.mate_2nplusn_SDR("SOxSS F1", SS, verbose=verbose) if verbose: SO_SS_F1.print_summary() return SO_SS_F1 -def simulate_F2(SO, SS, mode="nx2+n", verbose=False): +def simulate_F2(SO: Genome, SS: Genome, mode: CrossMode, verbose: bool = False): SO_SS_F1 = simulate_F1(SO, SS, mode=mode, verbose=verbose) SO_SS_F2_nplusn = SO_SS_F1.mate_nplusn("SOxSS F2", SO_SS_F1, verbose=verbose) if verbose: @@ -244,7 +271,7 @@ def simulate_F2(SO, SS, mode="nx2+n", verbose=False): return SO_SS_F2_nplusn -def simulate_F1intercross(SO, SS, mode="nx2+n", verbose=False): +def simulate_F1intercross(SO: Genome, SS: Genome, mode: CrossMode, verbose=False): SO_SS_F1_1 = simulate_F1(SO, SS, mode=mode, verbose=verbose) SO_SS_F1_2 = simulate_F1(SO, SS, mode=mode, verbose=verbose) SO_SS_F1intercross_nplusn = SO_SS_F1_1.mate_nplusn( @@ -253,7 +280,7 @@ def simulate_F1intercross(SO, SS, mode="nx2+n", verbose=False): return SO_SS_F1intercross_nplusn -def simulate_BCn(n, SO, SS, mode="nx2+n", verbose=False): +def simulate_BCn(n: int, SO: Genome, SS: Genome, mode: CrossMode, verbose=False): SS_SO_F1 = simulate_F1(SO, SS, mode=mode, verbose=verbose) SS_SO_BC1, SS_SO_BC2_nplusn, SS_SO_BC3_nplusn, SS_SO_BC4_nplusn = ( None, @@ -263,12 +290,12 @@ def simulate_BCn(n, SO, SS, mode="nx2+n", verbose=False): ) # BC1 if n >= 1: - SS_SO_BC1 = ( - # Expecting one more round of female restitution in BC1 - SO.mate_nx2plusn("SSxSO BC1", SS_SO_F1, verbose=verbose) - if mode == "nx2+n" - else SO.mate_2nplusn("SSxSO BC1", SS_SO_F1, verbose=verbose) - ) + if mode == CrossMode.nx2plusn: + SS_SO_BC1 = SO.mate_nx2plusn("SSxSO BC1", SS_SO_F1, verbose=verbose) + elif mode == CrossMode.twoplusnFDR: + SS_SO_BC1 = SO.mate_2nplusn_FDR("SSxSO BC1", SS_SO_F1, verbose=verbose) + elif mode == CrossMode.twoplusnSDR: + SS_SO_BC1 = SO.mate_2nplusn_SDR("SSxSO BC1", SS_SO_F1, verbose=verbose) # BC2 if n >= 2: SS_SO_BC2_nplusn = SO.mate_nplusn("SSxSO BC2", SS_SO_BC1, verbose=verbose) @@ -304,8 +331,8 @@ def plot_summary(ax, samples: list[Genome]) -> GenomeSummary: SO_data = [] SS_data = [] percent_SO_data = [] - for sample in samples: - summary = sample.summary + for s in samples: + summary = s.summary try: _, _, group_unique, _, _ = [x for x in summary if x[0] == "SO"][0] except: @@ -326,7 +353,7 @@ def plot_summary(ax, samples: list[Genome]) -> GenomeSummary: shift = 0.5 # used to offset bars a bit to avoid cluttering if overlaps: for overlap in overlaps: - logger.debug(f"Modify bar offsets at {overlap} due to SS and SO overlaps") + logger.debug("Modify bar offsets at %s due to SS and SO overlaps", overlap) SS_counter[overlap - shift] = SS_counter[overlap] del SS_counter[overlap] SO_counter[overlap + shift] = SO_counter[overlap] @@ -338,7 +365,7 @@ def modify_range_end(d: dict, value: int): # Has data at the range end, but no adjacent data points (i.e. isolated bar) if value in d and (value - 1 in d or value + 1 in d): return - logger.debug(f"Modify bar offsets at {value} due to end of range ends") + logger.debug("Modify bar offsets at %d due to end of range ends", value) d[value - shift if value else value + shift] = d[80] del d[value] @@ -383,7 +410,7 @@ def write_chromosomes(genomes: list[Genome], filename: str): filename (str): File path to write to. """ print(f"Write chromosomes to `{filename}`", file=sys.stderr) - with open(filename, "w") as fw: + with open(filename, "w", encoding="utf-8") as fw: for genome in genomes: print(genome, file=fw) @@ -396,16 +423,17 @@ def write_SO_percent(summary: GenomeSummary, filename: str): filename (str): File path to write to. """ print(f"Write SO percent to `{filename}`", file=sys.stderr) - with open(filename, "w") as fw: + with open(filename, "w", encoding="utf-8") as fw: print("\n".join(str(x) for x in sorted(summary.percent_SO_data)), file=fw) def simulate(args): """ - %prog simulate [2n+n|nx2+n] + %prog simulate [2n+n_FDR|2n+n_SDR|nx2+n] Run simulation on female restitution. There are two modes: - - 2n+n: merger between a somatic and a germline + - 2n+n_FDR: merger between a somatic and a germline + - 2n+n_SDR: merger between a recombined germline and a germline (not yet supported) - nx2+n: merger between a doubled germline and a germline These two modes would impact the sequence diversity in the progeny @@ -428,8 +456,8 @@ def simulate(args): sys.exit(not p.print_help()) (mode,) = args - validate_in_choices(mode, ["2n+n", "nx2+n"], "Mode") - logger.info(f"Transmission: {mode}") + mode = CrossMode(mode) + logger.info("Transmission: %s", mode) # Construct a composite figure with 6 tracks fig = plt.figure(1, (iopts.w, iopts.h)) @@ -512,7 +540,12 @@ def simulate(args): normalize_axes(root) # Title - mode_title = r"$n_1\times2 + n_2$" if mode == "nx2+n" else r"$2n + n$" + if mode == CrossMode.nx2plusn: + mode_title = r"$n_1\times2 + n_2$" + elif mode == CrossMode.twoplusnFDR: + mode_title = r"$2n + n$ (FDR)" + elif mode == CrossMode.twoplusnSDR: + mode_title = r"$2n + n$ (SDR)" root.text(0.5, 0.95, f"Transmission: {mode_title}", ha="center") savefig(f"{mode}.pdf", dpi=120) @@ -548,10 +581,8 @@ def _get_sizes(filename, prefix_length, tag, target_size=None): tag (str): Prepend `tag-` to the seqid. target_size (int): Expected genome size. Defaults to None. """ - from collections import defaultdict - sizes_list = defaultdict(list) - with open(filename) as fp: + with open(filename, encoding="utf-8") as fp: for row in fp: if not row.startswith("Chr"): continue