Skip to content

Commit

Permalink
Generate ortholog PDFs when data are unavailable (#10)
Browse files Browse the repository at this point in the history
- If there are missing species pairs in the DBs,
generate a PDF figure only if at least one trio has
complete data (Ks list and Ks peak estimate).
- Log warnings when skipping a trio due to
missing data in DBs.
- The missing pairs are listed at the end, as
missing 1) from both databases, 2) from only 
peak DB and 3) from only Ks list DB.
- Note: if the ortholog peak DB isn't present but
the Ks list DB is complete, the distributions
will still be plotted, all without their modes.
  • Loading branch information
Cecilia-Sensalari authored Apr 8, 2021
1 parent 3798044 commit 0113723
Showing 1 changed file with 94 additions and 34 deletions.
128 changes: 94 additions & 34 deletions ksrates/plot_orthologs.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,83 +57,143 @@ def plot_orthologs_distr(config_file, trios_file):

# -----------------------------------------------------------------------------

# GENERATING 6-PANEL FIGURE with ortholog distributions FOR EACH TRIO
# GENERATING PDF FIGURE with ortholog distributions FOR EACH TRIO
outgroups_per_divergent_pair_dict = {}
missing_pairs_ks_list, missing_pairs_peaks = [], []

for __, row in trios.iterrows():
species, sister, out = row['Species'], row['Sister_Species'], row['Out_Species']
# Generate dictionary of divergent pairs linked with their outgroups
divergent_pair_key = f"{species}_{sister}"
if divergent_pair_key not in outgroups_per_divergent_pair_dict.keys():
outgroups_per_divergent_pair_dict[divergent_pair_key] = [out]
else:
outgroups_per_divergent_pair_dict[divergent_pair_key].append(out)


# PLOTTING THE DISTRIBUTIONS
for divergent_pair in outgroups_per_divergent_pair_dict.keys():
with PdfPages(os.path.join("rate_adjustment", f"{species_of_interest}", f"orthologs_{divergent_pair}.pdf")) as pdf:
species = divergent_pair.split("_")[0]
latinSpecies = latin_names[species]
sister = divergent_pair.split("_")[1]
latinSister = latin_names[sister]

species, sister = divergent_pair.split("_")[0], divergent_pair.split("_")[1]
latinSpecies, latinSister = latin_names[species], latin_names[sister]
# Tags (sorted names, e.g. A.filiculoides_S.cucullata)
species_sister = "_".join(sorted([latinSpecies, latinSister], key=str.casefold))

out_list = outgroups_per_divergent_pair_dict[divergent_pair]
available_trios, unavailable_trios = [], []
for out in out_list: # Check if all data are available for this trio
latinOut = latin_names[out]
species_out = "_".join(sorted([latinSpecies, latinOut], key=str.casefold))
sister_out = "_".join(sorted([latinSister, latinOut], key=str.casefold))

available_data = True
for pair in [species_sister, species_out, sister_out]:
if pair not in list(ks_list_db.index):
available_data = False
if pair not in missing_pairs_ks_list:
missing_pairs_ks_list.append(pair)
if not no_peak_db: # If ortholog Ks peak database is available
if pair not in list(db.index):
available_data = False
if pair not in missing_pairs_peaks:
missing_pairs_peaks.append(pair)

if available_data:
available_trios.append(out)
else:
unavailable_trios.append(out)

if len(available_trios) == 0:
logging.info("")
logging.info(f"Plotting ortholog Ks distributions for species pair [{latinSpecies} - {latinSister}]")
logging.warning(f"- Skipping all outspecies: not enough ortholog data available (PDF figure not generated)")
continue

# tags, e.g. A.filiculoides_S.cucullata
species_sister = "_".join(sorted([latinSpecies, latinSister], key=str.casefold))
with PdfPages(os.path.join("rate_adjustment", f"{species_of_interest}", f"orthologs_{divergent_pair}.pdf")) as pdf:
logging.info("")
logging.info(f"Plotting ortholog Ks distributions for species pair [{latinSpecies} - {latinSister}]")

# SPECIES - SISTER
ks_list_species_sister = literal_eval(ks_list_db.at[species_sister, 'Ks_Values'])
# run again the bootstrap, only 20 times (very quick) to get the KDE lines
# to be plotted onto the ortholog distribution
logging.info(f"- Calculating KDEs for the two sister species [{latinSpecies} - {latinSister}]")
bootstrap_kde_species_sister = fcPeak.bootstrap_KDE(ks_list_species_sister, 20, x_lim,
bin_width_ortho)

out_list = outgroups_per_divergent_pair_dict[divergent_pair]
for out in out_list:
# Getting 20 KDE curves through bootstrap
bootstrap_kde_species_sister = fcPeak.bootstrap_KDE(ks_list_species_sister, 20, x_lim, bin_width_ortho)

for out in unavailable_trios:
latinOut = latin_names[out]
logging.info(f"- Processing outspecies [{latinOut}]")
logging.warning(f"- Skipping outspecies [{latinOut}]: not enough ortholog data available")

for out in available_trios:
latinOut = latin_names[out]
logging.info(f"- Using outspecies [{latinOut}]:")
fig, axes = fcPlot.generate_orthologs_figure(latinSpecies, latinSister, latinOut, x_lim)

# tags, e.g. A.filiculoides_S.cucullata
species_out = "_".join(sorted([latinSpecies, latinOut], key=str.casefold))
sister_out = "_".join(sorted([latinSister, latinOut], key=str.casefold))

# SPECIES - SISTER
# Plotting Ks lists and their KDE lines
logging.info(f" Plotting data for the two sister species [{latinSpecies} - {latinSister}]")
fcPlot.plot_orthologs_histogram_kdes(ks_list_species_sister, bin_list_ortho, axes[0],
bootstrap_kde_species_sister)
bootstrap_kde_species_sister)

# SPECIES - OUTGROUP
ks_list = literal_eval(ks_list_db.at[species_out, 'Ks_Values'])
# run again the bootstrap, only 20 times (very quick) to get the KDE lines
# to be plotted onto the ortholog distribution
logging.info(f" Calculating KDEs for focal species and outspecies [{latinSpecies} - {latinOut}]")
# Getting 20 KDE curves through bootstrap
logging.info(f" Plotting data for focal species and outspecies [{latinSpecies} - {latinOut}]")
bootstrap_kde = fcPeak.bootstrap_KDE(ks_list, 20, x_lim, bin_width_ortho)
# Plotting Ks lists and their KDE lines
fcPlot.plot_orthologs_histogram_kdes(ks_list, bin_list_ortho, axes[1], bootstrap_kde)

# SISTER - OUTGROUP
ks_list = literal_eval(ks_list_db.at[sister_out, 'Ks_Values'])
# run again the bootstrap, only 20 times (very quick) to get the KDE lines
# to be plotted onto the ortholog distribution
logging.info(f" Calculating KDEs for sister species and outspecies [{latinSister} - {latinOut}]")
# Getting 20 KDE curves through bootstrap
logging.info(f" Plotting data for sister species and outspecies [{latinSister} - {latinOut}]")
bootstrap_kde = fcPeak.bootstrap_KDE(ks_list, 20, x_lim, bin_width_ortho)
# Plotting Ks lists and their KDE lines
fcPlot.plot_orthologs_histogram_kdes(ks_list, bin_list_ortho, axes[2], bootstrap_kde)

# Plotting estimated mode of the orthologs distributions as vertical lines
y_upper_lim = axes[0].get_ylim()[1]
if not no_peak_db: # if the peak database is available
# Plotting estimated mode of the orthologs distributions as vertical lines
if not no_peak_db: # If ortholog Ks peak database is available
fcPlot.plot_orthologs_peak_lines(db, species_sister, axes[0], y_upper_lim)
fcPlot.plot_orthologs_peak_lines(db, species_out, axes[1], y_upper_lim)
fcPlot.plot_orthologs_peak_lines(db, sister_out, axes[2], y_upper_lim)

object_list = []
for ax in axes:
lgd = ax.get_legend()
object_list.append(lgd)
sup = fig._suptitle
object_list.append(sup)

pdf.savefig(fig, transparent=True, bbox_extra_artists=(sup,), bbox_inches='tight')
pdf.savefig(fig, transparent=True, bbox_extra_artists=(fig._suptitle,), bbox_inches='tight')
plt.close()
logging.info(f"- Saving PDF figure [orthologs_{divergent_pair}.pdf]")

# Report if species are missing from any of the two ortholog databases
if len(missing_pairs_ks_list) != 0 or len(missing_pairs_peaks) != 0:
logging.warning("")
logging.warning("The species pairs listed below are not (yet) available in the ortholog databases")
logging.warning("The trios involving such species pairs have not been plotted")
logging.warning("")

missing_in_both_dbs = list((set(missing_pairs_peaks) & set(missing_pairs_ks_list)))
if len(missing_in_both_dbs) != 0:
logging.warning("Species pairs not yet available in both Ks peak and Ks list ortholog databases:")
for pair in sorted(missing_in_both_dbs):
logging.warning(f" {pair.split('_')[0]} - {pair.split('_')[1]}")
logging.warning("")

missing_pairs_peaks = list(set(missing_pairs_peaks) - set(missing_in_both_dbs))
if len(missing_pairs_peaks) != 0:
logging.warning("Species pairs not yet available in the ortholog Ks peak database:")
for pair in sorted(missing_pairs_peaks):
logging.warning(f" {pair.split('_')[0]} - {pair.split('_')[1]}")
logging.warning("")

missing_pairs_ks_list = list(set(missing_pairs_ks_list) - set(missing_in_both_dbs))
if len(missing_pairs_ks_list) != 0:
logging.warning("Species pairs not yet available in the ortholog Ks list database:")
for pair in sorted(missing_pairs_ks_list):
logging.warning(f" {pair.split('_')[0]} - {pair.split('_')[1]}")
logging.warning("")

logging.warning("Please compute their ortholog Ks data and/or add the ortholog data to the databases,")
logging.warning("then rerun this step.")

logging.info("")
logging.info("All done")

0 comments on commit 0113723

Please sign in to comment.