Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Minor update #684

Merged
merged 4 commits into from
Jul 6, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 75 additions & 44 deletions jcvi/projects/sugarcane.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,18 @@
# Created by Haibao Tang on 12/02/19
# Copyright © 2019 Haibao Tang. All rights reserved.
#
"""
Simulate sugarcane genomes and analyze the diversity in the progeny genomes.
"""

import os.path as op
import sys

from collections import Counter, defaultdict
from enum import Enum
from itertools import combinations, groupby, product
from random import random, sample
from typing import Dict
from typing import Dict, List

import numpy as np
import matplotlib.pyplot as plt
Expand All @@ -23,11 +28,22 @@
from ..apps.base import ActionDispatcher, OptionParser, logger, mkdir
from ..formats.blast import Blast
from ..graphics.base import adjust_spines, markup, normalize_axes, savefig
from ..utils.validator import validate_in_choices

SoColor = "#7436a4" # Purple
SsColor = "#5a8340" # Green


class CrossMode(Enum):
"""
How the F1 is generated.
"""

nplusn = "n+n"
nx2plusn = "nx2+n"
twoplusnFDR = "2n+n_FDR"
twoplusnSDR = "2n+n_SDR"


# Computed using prepare(), corrected with real sizes
ChrSizes = {
"SO-chr01": 148750011,
Expand All @@ -53,7 +69,9 @@

# Simulate genome composition
class Genome:
def __init__(self, name, prefix, ploidy, haploid_chromosome_count):
def __init__(
self, name: str, prefix: str, ploidy: int, haploid_chromosome_count: int
):
"""
Simulate a genome with given ploidy and haploid_chromosome_count. Example:

Expand All @@ -72,7 +90,7 @@ def __len__(self):
return len(self.chromosomes)

@classmethod
def make(cls, name, chromosomes):
def make(cls, name: str, chromosomes: List[str]):
genome = Genome(name, "", 0, 0)
genome.chromosomes = chromosomes
return genome
Expand Down Expand Up @@ -101,7 +119,7 @@ def prefix(x):
return x.split("_", 1)[0]

# Randomly assign the rest, singleton chromosomes
for group, chromosomes in groupby(singleton_chromosomes, key=prefix):
for _, chromosomes in groupby(singleton_chromosomes, key=prefix):
chromosomes = list(chromosomes)
halfn = len(chromosomes) // 2
# Odd number, e.g. 5, equal chance to be 2 or 3
Expand All @@ -110,7 +128,7 @@ def prefix(x):
gamete_chromosomes += sorted(sample(chromosomes, halfn))
return Genome.make(self.name + " gamete", gamete_chromosomes)

def mate_nplusn(self, name, other_genome, verbose=True):
def mate_nplusn(self, name: str, other_genome: "Genome", verbose: bool = True):
if verbose:
print(
f"Crossing '{self.name}' x '{other_genome.name}' (n+n)", file=sys.stderr
Expand All @@ -120,7 +138,7 @@ def mate_nplusn(self, name, other_genome, verbose=True):
)
return Genome.make(name, f1_chromosomes)

def mate_nx2plusn(self, name, other_genome, verbose=True):
def mate_nx2plusn(self, name: str, other_genome: "Genome", verbose: bool = True):
if verbose:
print(
f"Crossing '{self.name}' x '{other_genome.name}' (2xn+n)",
Expand All @@ -131,15 +149,23 @@ def mate_nx2plusn(self, name, other_genome, verbose=True):
)
return Genome.make(name, f1_chromosomes)

def mate_2nplusn(self, name, other_genome, verbose=True):
def mate_2nplusn_FDR(self, name: str, other_genome: "Genome", verbose: bool = True):
if verbose:
print(
f"Crossing '{self.name}' x '{other_genome.name}' (2n+n)",
f"Crossing '{self.name}' x '{other_genome.name}' (2n+n_FDR)",
file=sys.stderr,
)
f1_chromosomes = sorted(self.chromosomes + other_genome.gamete.chromosomes)
return Genome.make(name, f1_chromosomes)

def mate_2nplusn_SDR(self, name: str, other_genome: "Genome", verbose: bool = True):
if verbose:
print(
f"Crossing '{self.name}' x '{other_genome.name}' (2n+n_SDR)",
file=sys.stderr,
)
raise NotImplementedError("2n+n_SDR not yet supported")

def __str__(self):
return self.name + ": " + ",".join(self.chromosomes)

Expand Down Expand Up @@ -186,27 +212,27 @@ def __init__(self, SO_data, SS_data, percent_SO_data):
self.percent_SS_data = [100 - x for x in percent_SO_data]

def _summary(self, a, tag, precision=0):
mean, min, max = (
mean, mn, mx = (
round(np.mean(a), precision),
round(np.min(a), precision),
round(np.max(a), precision),
)
s = f"*{tag}* chr: {mean:.0f}"
if min == mean and max == mean:
if mn == mean and mx == mean:
return s
return s + f" ({min:.0f}-{max:.0f})"
return s + f" ({mn:.0f}-{mx:.0f})"

def _percent_summary(self, a, tag, precision=1):
mean, min, max = (
mean, mn, mx = (
round(np.mean(a), precision),
round(np.min(a), precision),
round(np.max(a), precision),
)
s = f"*{tag}*%: {mean:.1f}%"
print(s)
if min == mean and max == mean:
if mn == mean and mx == mean:
return s
return s + f" ({min:.1f}-{max:.1f}%)"
return s + f" ({mn:.1f}-{mx:.1f}%)"

@property
def percent_SO_summary(self):
Expand All @@ -225,26 +251,27 @@ def SS_summary(self):
return self._summary(self.SS_data, "Ss")


def simulate_F1(SO, SS, mode="nx2+n", verbose=False):
SO_SS_F1 = (
SO.mate_nx2plusn("SOxSS F1", SS, verbose=verbose)
if mode == "nx2+n"
else SO.mate_2nplusn("SOxSS F1", SS, verbose=verbose)
)
def simulate_F1(SO: Genome, SS: Genome, mode: CrossMode, verbose: bool = False):
if mode == CrossMode.nx2plusn:
SO_SS_F1 = SO.mate_nx2plusn("SOxSS F1", SS, verbose=verbose)
elif mode == CrossMode.twoplusnFDR:
SO_SS_F1 = SO.mate_2nplusn_FDR("SOxSS F1", SS, verbose=verbose)
elif mode == CrossMode.twoplusnSDR:
SO_SS_F1 = SO.mate_2nplusn_SDR("SOxSS F1", SS, verbose=verbose)
if verbose:
SO_SS_F1.print_summary()
return SO_SS_F1


def simulate_F2(SO, SS, mode="nx2+n", verbose=False):
def simulate_F2(SO: Genome, SS: Genome, mode: CrossMode, verbose: bool = False):
SO_SS_F1 = simulate_F1(SO, SS, mode=mode, verbose=verbose)
SO_SS_F2_nplusn = SO_SS_F1.mate_nplusn("SOxSS F2", SO_SS_F1, verbose=verbose)
if verbose:
SO_SS_F2_nplusn.print_summary()
return SO_SS_F2_nplusn


def simulate_F1intercross(SO, SS, mode="nx2+n", verbose=False):
def simulate_F1intercross(SO: Genome, SS: Genome, mode: CrossMode, verbose=False):
SO_SS_F1_1 = simulate_F1(SO, SS, mode=mode, verbose=verbose)
SO_SS_F1_2 = simulate_F1(SO, SS, mode=mode, verbose=verbose)
SO_SS_F1intercross_nplusn = SO_SS_F1_1.mate_nplusn(
Expand All @@ -253,7 +280,7 @@ def simulate_F1intercross(SO, SS, mode="nx2+n", verbose=False):
return SO_SS_F1intercross_nplusn


def simulate_BCn(n, SO, SS, mode="nx2+n", verbose=False):
def simulate_BCn(n: int, SO: Genome, SS: Genome, mode: CrossMode, verbose=False):
SS_SO_F1 = simulate_F1(SO, SS, mode=mode, verbose=verbose)
SS_SO_BC1, SS_SO_BC2_nplusn, SS_SO_BC3_nplusn, SS_SO_BC4_nplusn = (
None,
Expand All @@ -263,12 +290,12 @@ def simulate_BCn(n, SO, SS, mode="nx2+n", verbose=False):
)
# BC1
if n >= 1:
SS_SO_BC1 = (
# Expecting one more round of female restitution in BC1
SO.mate_nx2plusn("SSxSO BC1", SS_SO_F1, verbose=verbose)
if mode == "nx2+n"
else SO.mate_2nplusn("SSxSO BC1", SS_SO_F1, verbose=verbose)
)
if mode == CrossMode.nx2plusn:
SS_SO_BC1 = SO.mate_nx2plusn("SSxSO BC1", SS_SO_F1, verbose=verbose)
elif mode == CrossMode.twoplusnFDR:
SS_SO_BC1 = SO.mate_2nplusn_FDR("SSxSO BC1", SS_SO_F1, verbose=verbose)
elif mode == CrossMode.twoplusnSDR:
SS_SO_BC1 = SO.mate_2nplusn_SDR("SSxSO BC1", SS_SO_F1, verbose=verbose)
# BC2
if n >= 2:
SS_SO_BC2_nplusn = SO.mate_nplusn("SSxSO BC2", SS_SO_BC1, verbose=verbose)
Expand Down Expand Up @@ -304,8 +331,8 @@ def plot_summary(ax, samples: list[Genome]) -> GenomeSummary:
SO_data = []
SS_data = []
percent_SO_data = []
for sample in samples:
summary = sample.summary
for s in samples:
summary = s.summary
try:
_, _, group_unique, _, _ = [x for x in summary if x[0] == "SO"][0]
except:
Expand All @@ -326,7 +353,7 @@ def plot_summary(ax, samples: list[Genome]) -> GenomeSummary:
shift = 0.5 # used to offset bars a bit to avoid cluttering
if overlaps:
for overlap in overlaps:
logger.debug(f"Modify bar offsets at {overlap} due to SS and SO overlaps")
logger.debug("Modify bar offsets at %s due to SS and SO overlaps", overlap)
SS_counter[overlap - shift] = SS_counter[overlap]
del SS_counter[overlap]
SO_counter[overlap + shift] = SO_counter[overlap]
Expand All @@ -338,7 +365,7 @@ def modify_range_end(d: dict, value: int):
# Has data at the range end, but no adjacent data points (i.e. isolated bar)
if value in d and (value - 1 in d or value + 1 in d):
return
logger.debug(f"Modify bar offsets at {value} due to end of range ends")
logger.debug("Modify bar offsets at %d due to end of range ends", value)
d[value - shift if value else value + shift] = d[80]
del d[value]

Expand Down Expand Up @@ -383,7 +410,7 @@ def write_chromosomes(genomes: list[Genome], filename: str):
filename (str): File path to write to.
"""
print(f"Write chromosomes to `{filename}`", file=sys.stderr)
with open(filename, "w") as fw:
with open(filename, "w", encoding="utf-8") as fw:
for genome in genomes:
print(genome, file=fw)

Expand All @@ -396,16 +423,17 @@ def write_SO_percent(summary: GenomeSummary, filename: str):
filename (str): File path to write to.
"""
print(f"Write SO percent to `{filename}`", file=sys.stderr)
with open(filename, "w") as fw:
with open(filename, "w", encoding="utf-8") as fw:
print("\n".join(str(x) for x in sorted(summary.percent_SO_data)), file=fw)


def simulate(args):
"""
%prog simulate [2n+n|nx2+n]
%prog simulate [2n+n_FDR|2n+n_SDR|nx2+n]

Run simulation on female restitution. There are two modes:
- 2n+n: merger between a somatic and a germline
- 2n+n_FDR: merger between a somatic and a germline
- 2n+n_SDR: merger between a recombined germline and a germline (not yet supported)
- nx2+n: merger between a doubled germline and a germline

These two modes would impact the sequence diversity in the progeny
Expand All @@ -428,8 +456,8 @@ def simulate(args):
sys.exit(not p.print_help())

(mode,) = args
validate_in_choices(mode, ["2n+n", "nx2+n"], "Mode")
logger.info(f"Transmission: {mode}")
mode = CrossMode(mode)
logger.info("Transmission: %s", mode)

# Construct a composite figure with 6 tracks
fig = plt.figure(1, (iopts.w, iopts.h))
Expand Down Expand Up @@ -512,7 +540,12 @@ def simulate(args):
normalize_axes(root)

# Title
mode_title = r"$n_1\times2 + n_2$" if mode == "nx2+n" else r"$2n + n$"
if mode == CrossMode.nx2plusn:
mode_title = r"$n_1\times2 + n_2$"
elif mode == CrossMode.twoplusnFDR:
mode_title = r"$2n + n$ (FDR)"
elif mode == CrossMode.twoplusnSDR:
mode_title = r"$2n + n$ (SDR)"
root.text(0.5, 0.95, f"Transmission: {mode_title}", ha="center")

savefig(f"{mode}.pdf", dpi=120)
Expand Down Expand Up @@ -548,10 +581,8 @@ def _get_sizes(filename, prefix_length, tag, target_size=None):
tag (str): Prepend `tag-` to the seqid.
target_size (int): Expected genome size. Defaults to None.
"""
from collections import defaultdict

sizes_list = defaultdict(list)
with open(filename) as fp:
with open(filename, encoding="utf-8") as fp:
for row in fp:
if not row.startswith("Chr"):
continue
Expand Down
Loading