Skip to content

Commit

Permalink
Add JCVI demo script (#651)
Browse files Browse the repository at this point in the history
* cleanup graphics.landscape

* Add params

* Add an arrow to the right

* Move ks and pedigree into compara

* Check pedigree contains valid nodes

* Add graph title

* Deprecate alfalfa.py and pistachio.py

* Deprecate graph.py

* Revert set_ticks()

* Move total_bins out

* Refactor into read_matrix()

* Refactor to plot_nbinom_fit()

* Revert back to percent

* logging => logger

* Refactor into read_subsampled_matrix()

* Refactor to draw_geneticmap_heatmap()

* dev sync

* Refactor

* breaks => plot_breaks

* Refactor to draw_ks_histogram()

* Add diversity()

* panels in diversity()

* diversity complete

* Add landscape()
  • Loading branch information
tanghaibao authored Apr 29, 2024
1 parent 66975be commit 1c8816c
Show file tree
Hide file tree
Showing 13 changed files with 775 additions and 617 deletions.
79 changes: 48 additions & 31 deletions jcvi/assembly/geneticmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from itertools import combinations, groupby
from random import sample
from typing import Tuple

import numpy as np
import seaborn as sns
Expand All @@ -22,6 +23,7 @@
from ..graphics.base import (
Rectangle,
draw_cmap,
normalize_axes,
plt,
plot_heatmap,
savefig,
Expand Down Expand Up @@ -336,37 +338,21 @@ def dotplot(args):
fig.clear()


def heatmap(args):
def read_subsampled_matrix(mstmap: str, subsample: int) -> Tuple[np.ndarray, str, int]:
"""
%prog heatmap map
Calculate pairwise linkage disequilibrium given MSTmap.
Read the subsampled matrix from file if it exists, otherwise calculate it.
"""
p = OptionParser(heatmap.__doc__)
p.add_option(
"--subsample",
default=1000,
type="int",
help="Subsample markers to speed up",
)
opts, args, iopts = p.set_image_options(args, figsize="8x8")

if len(args) != 1:
sys.exit(not p.print_help())

(mstmap,) = args
subsample = opts.subsample
data = MSTMap(mstmap)

markerbedfile = mstmap + ".subsample.bed"
ldmatrix = mstmap + ".subsample.matrix"
# Take random subsample while keeping marker order
if subsample < data.nmarkers:
data = [data[x] for x in sorted(sample(range(len(data)), subsample))]
else:
logger.debug("Use all markers, --subsample ignored")

nmarkers = len(data)
markerbedfile = mstmap + ".subsample.bed"
ldmatrix = mstmap + ".subsample.matrix"
if need_update(mstmap, (ldmatrix, markerbedfile)):
with open(markerbedfile, "w", encoding="utf-8") as fw:
print("\n".join(x.bedline for x in data), file=fw)
Expand All @@ -389,21 +375,24 @@ def heatmap(args):
M = np.fromfile(ldmatrix, dtype="float").reshape(nmarkers, nmarkers)
logger.debug("LD matrix `%s` exists (%dx%d).", ldmatrix, nmarkers, nmarkers)

plt.rcParams["axes.linewidth"] = 0
return M, markerbedfile, nmarkers

fig = plt.figure(1, (iopts.w, iopts.h))
root = fig.add_axes((0, 0, 1, 1))
ax = fig.add_axes((0.1, 0.1, 0.8, 0.8)) # the heatmap

def draw_geneticmap_heatmap(root, ax, mstmap: str, subsample: int):
"""
Draw the heatmap of the genetic map.
"""
M, markerbedfile, nmarkers = read_subsampled_matrix(mstmap, subsample)

# Plot chromosomes breaks
bed = Bed(markerbedfile)
xsize = len(bed)
b = Bed(markerbedfile)
xsize = len(b)
extent = (0, nmarkers)
chr_labels = []
ignore_size = 20

breaks = []
for seqid, beg, end in bed.get_breaks():
for seqid, beg, end in b.get_breaks():
ignore = abs(end - beg) < ignore_size
pos = (beg + end) / 2
chr_labels.append((seqid, pos, ignore))
Expand Down Expand Up @@ -434,11 +423,39 @@ def heatmap(args):
m = mstmap.split(".")[0]
root.text(0.5, 0.06, f"Linkage Disequilibrium between {m} markers", ha="center")

root.set_xlim(0, 1)
root.set_ylim(0, 1)
root.set_axis_off()
normalize_axes(root)


def heatmap(args):
"""
%prog heatmap map
Calculate pairwise linkage disequilibrium given MSTmap.
"""
p = OptionParser(heatmap.__doc__)
p.add_option(
"--subsample",
default=1000,
type="int",
help="Subsample markers to speed up",
)
opts, args, iopts = p.set_image_options(args, figsize="8x8")

if len(args) != 1:
sys.exit(not p.print_help())

(mstmap,) = args

plt.rcParams["axes.linewidth"] = 0

fig = plt.figure(1, (iopts.w, iopts.h))
root = fig.add_axes((0, 0, 1, 1))
ax = fig.add_axes((0.1, 0.1, 0.8, 0.8)) # the heatmap

draw_geneticmap_heatmap(root, ax, mstmap, opts.subsample)

image_name = m + ".subsample" + "." + iopts.format
pf = mstmap.split(".")[0]
image_name = pf + ".subsample" + "." + iopts.format
savefig(image_name, dpi=iopts.dpi, iopts=iopts)


Expand Down
173 changes: 114 additions & 59 deletions jcvi/assembly/hic.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@
import os
import os.path as op
import sys

from collections import defaultdict
from functools import partial
from multiprocessing import Pool
from typing import List, Optional, Tuple

import numpy as np

Expand Down Expand Up @@ -683,63 +685,30 @@ def generate_groups(groupsfile):
yield seqids, color


def heatmap(args):
def read_matrix(
npyfile: str,
header: dict,
contig: Optional[str],
groups: List[Tuple[str, str]],
vmin: int,
vmax: int,
plot_breaks: bool,
):
"""
%prog heatmap input.npy genome.json
Plot heatmap based on .npy data file. The .npy stores a square matrix with
bins of genome, and cells inside the matrix represent number of links
between bin i and bin j. The `genome.json` contains the offsets of each
contig/chr so that we know where to draw boundary lines, or extract per
contig/chromosome heatmap.
If a 'groups' file is given (with --groups), we will draw squares on the
heatmap. The 'groups' file has the following format, for example:
seq1,seq2 b
seq1 g
seq2 g
This will first draw a square around seq1+seq2 with blue color, then seq1
and seq2 individually with green color.
Read the matrix from the npy file and apply log transformation and thresholding.
"""
p = OptionParser(heatmap.__doc__)
p.add_option("--title", help="Title of the heatmap")
p.add_option("--groups", help="Groups file, see doc")
p.add_option("--vmin", default=1, type="int", help="Minimum value in the heatmap")
p.add_option("--vmax", default=6, type="int", help="Maximum value in the heatmap")
p.add_option("--chr", help="Plot this contig/chr only")
p.add_option(
"--nobreaks",
default=False,
action="store_true",
help="Do not plot breaks (esp. if contigs are small)",
)
opts, args, iopts = p.set_image_options(
args, figsize="11x11", style="white", cmap="coolwarm", format="png", dpi=120
)

if len(args) != 2:
sys.exit(not p.print_help())

npyfile, jsonfile = args
contig = opts.chr
groups = list(generate_groups(opts.groups)) if opts.groups else []

# Load contig/chromosome starts and sizes
header = json.loads(open(jsonfile).read())
resolution = header.get("resolution")
assert resolution is not None, "`resolution` not found in `{}`".format(jsonfile)
logger.debug("Resolution set to %d", resolution)
# Load the matrix
A = np.load(npyfile)
total_bins = header["total_bins"]

# Select specific submatrix
if contig:
contig_start = header["starts"][contig]
contig_size = header["sizes"][contig]
contig_end = contig_start + contig_size
A = A[contig_start:contig_end, contig_start:contig_end]
else:
A = A[:total_bins, :total_bins]

# Convert seqids to positions for each group
new_groups = []
Expand All @@ -766,40 +735,126 @@ def heatmap(args):
B = A.astype("float64")
B += 1.0
B = np.log(B)
vmin, vmax = opts.vmin, opts.vmax
B[B < vmin] = vmin
B[B > vmax] = vmax
print(B)
logger.debug("Matrix log-transformation and thresholding (%d-%d) done", vmin, vmax)

# Canvas
fig = plt.figure(1, (iopts.w, iopts.h))
root = fig.add_axes((0, 0, 1, 1)) # whole canvas
ax = fig.add_axes((0.05, 0.05, 0.9, 0.9)) # just the heatmap

breaks = list(header["starts"].values())
breaks += [header["total_bins"]] # This is actually discarded
breaks += [total_bins] # This is actually discarded
breaks = sorted(breaks)[1:]
if contig or opts.nobreaks:
if contig or not plot_breaks:
breaks = []

return B, new_groups, breaks


def draw_hic_heatmap(
root,
ax,
npyfile: str,
jsonfile: str,
contig: Optional[str],
groups_file: str,
title: str,
vmin: int,
vmax: int,
plot_breaks: bool,
):
"""
Draw heatmap based on .npy file. The .npy file stores a square matrix with
bins of genome, and cells inside the matrix represent number of links
between bin i and bin j. The `genome.json` contains the offsets of each
contig/chr so that we know where to draw boundary lines, or extract per
contig/chromosome heatmap.
"""
groups = list(generate_groups(groups_file)) if groups_file else []

# Load contig/chromosome starts and sizes
header = json.loads(open(jsonfile, encoding="utf-8").read())
resolution = header.get("resolution")
assert resolution is not None, "`resolution` not found in `{}`".format(jsonfile)
logger.debug("Resolution set to %d", resolution)

B, new_groups, breaks = read_matrix(
npyfile, header, contig, groups, vmin, vmax, plot_breaks
)
plot_heatmap(ax, B, breaks, groups=new_groups, binsize=resolution)

# Title
pf = npyfile.rsplit(".", 1)[0]
title = opts.title
if contig:
title += "-{}".format(contig)
title += f"-{contig}"
root.text(
0.5,
0.98,
0.96,
markup(title),
color="darkslategray",
size=18,
ha="center",
va="center",
)

normalize_axes(root)


def heatmap(args):
"""
%prog heatmap input.npy genome.json
Plot heatmap based on .npy data file. The .npy stores a square matrix with
bins of genome, and cells inside the matrix represent number of links
between bin i and bin j. The `genome.json` contains the offsets of each
contig/chr so that we know where to draw boundary lines, or extract per
contig/chromosome heatmap.
If a 'groups' file is given (with --groups), we will draw squares on the
heatmap. The 'groups' file has the following format, for example:
seq1,seq2 b
seq1 g
seq2 g
This will first draw a square around seq1+seq2 with blue color, then seq1
and seq2 individually with green color.
"""
p = OptionParser(heatmap.__doc__)
p.add_option("--title", help="Title of the heatmap")
p.add_option("--groups", help="Groups file, see doc")
p.add_option("--vmin", default=1, type="int", help="Minimum value in the heatmap")
p.add_option("--vmax", default=6, type="int", help="Maximum value in the heatmap")
p.add_option("--chr", help="Plot this contig/chr only")
p.add_option(
"--nobreaks",
default=False,
action="store_true",
help="Do not plot breaks (esp. if contigs are small)",
)
opts, args, iopts = p.set_image_options(
args, figsize="11x11", style="white", cmap="coolwarm", dpi=120
)

if len(args) != 2:
sys.exit(not p.print_help())

npyfile, jsonfile = args
# Canvas
fig = plt.figure(1, (iopts.w, iopts.h))
root = fig.add_axes((0, 0, 1, 1)) # whole canvas
ax = fig.add_axes((0.05, 0.05, 0.9, 0.9)) # just the heatmap

draw_hic_heatmap(
root,
ax,
npyfile,
jsonfile,
contig=opts.chr,
groups_file=opts.groups,
title=opts.title,
vmin=opts.vmin,
vmax=opts.vmax,
plot_breaks=not opts.nobreaks,
)

pf = npyfile.rsplit(".", 1)[0]
image_name = pf + "." + iopts.format
# macOS sometimes has way too verbose output
savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Expand Down
Loading

0 comments on commit 1c8816c

Please sign in to comment.