From 0113723a77958be82af3a41d917e026046f9e48b Mon Sep 17 00:00:00 2001 From: Cecilia-Sensalari <57489957+Cecilia-Sensalari@users.noreply.github.com> Date: Thu, 8 Apr 2021 11:23:20 +0200 Subject: [PATCH] Generate ortholog PDFs when data are unavailable (#10) - If there are missing species pairs in the DBs, generate a PDF figure only if at least one trio has complete data (Ks list and Ks peak estimate). - Log warnings when skipping a trio due to missing data in DBs. - The missing pairs are listed at the end, as missing 1) from both databases, 2) from only peak DB and 3) from only Ks list DB. - Note: if the ortholog peak DB isn't present but the Ks list DB is complete, the distributions will still be plotted, all without their modes. --- ksrates/plot_orthologs.py | 128 ++++++++++++++++++++++++++++---------- 1 file changed, 94 insertions(+), 34 deletions(-) diff --git a/ksrates/plot_orthologs.py b/ksrates/plot_orthologs.py index 53b58e7..7ec9f55 100644 --- a/ksrates/plot_orthologs.py +++ b/ksrates/plot_orthologs.py @@ -57,83 +57,143 @@ def plot_orthologs_distr(config_file, trios_file): # ----------------------------------------------------------------------------- - # GENERATING 6-PANEL FIGURE with ortholog distributions FOR EACH TRIO + # GENERATING PDF FIGURE with ortholog distributions FOR EACH TRIO outgroups_per_divergent_pair_dict = {} + missing_pairs_ks_list, missing_pairs_peaks = [], [] + for __, row in trios.iterrows(): species, sister, out = row['Species'], row['Sister_Species'], row['Out_Species'] + # Generate dictionary of divergent pairs linked with their outgroups divergent_pair_key = f"{species}_{sister}" if divergent_pair_key not in outgroups_per_divergent_pair_dict.keys(): outgroups_per_divergent_pair_dict[divergent_pair_key] = [out] else: outgroups_per_divergent_pair_dict[divergent_pair_key].append(out) + + # PLOTTING THE DISTRIBUTIONS for divergent_pair in outgroups_per_divergent_pair_dict.keys(): - with PdfPages(os.path.join("rate_adjustment", f"{species_of_interest}", f"orthologs_{divergent_pair}.pdf")) as pdf: - species = divergent_pair.split("_")[0] - latinSpecies = latin_names[species] - sister = divergent_pair.split("_")[1] - latinSister = latin_names[sister] + species, sister = divergent_pair.split("_")[0], divergent_pair.split("_")[1] + latinSpecies, latinSister = latin_names[species], latin_names[sister] + # Tags (sorted names, e.g. A.filiculoides_S.cucullata) + species_sister = "_".join(sorted([latinSpecies, latinSister], key=str.casefold)) + + out_list = outgroups_per_divergent_pair_dict[divergent_pair] + available_trios, unavailable_trios = [], [] + for out in out_list: # Check if all data are available for this trio + latinOut = latin_names[out] + species_out = "_".join(sorted([latinSpecies, latinOut], key=str.casefold)) + sister_out = "_".join(sorted([latinSister, latinOut], key=str.casefold)) + + available_data = True + for pair in [species_sister, species_out, sister_out]: + if pair not in list(ks_list_db.index): + available_data = False + if pair not in missing_pairs_ks_list: + missing_pairs_ks_list.append(pair) + if not no_peak_db: # If ortholog Ks peak database is available + if pair not in list(db.index): + available_data = False + if pair not in missing_pairs_peaks: + missing_pairs_peaks.append(pair) + + if available_data: + available_trios.append(out) + else: + unavailable_trios.append(out) + + if len(available_trios) == 0: logging.info("") logging.info(f"Plotting ortholog Ks distributions for species pair [{latinSpecies} - {latinSister}]") + logging.warning(f"- Skipping all outspecies: not enough ortholog data available (PDF figure not generated)") + continue - # tags, e.g. A.filiculoides_S.cucullata - species_sister = "_".join(sorted([latinSpecies, latinSister], key=str.casefold)) + with PdfPages(os.path.join("rate_adjustment", f"{species_of_interest}", f"orthologs_{divergent_pair}.pdf")) as pdf: + logging.info("") + logging.info(f"Plotting ortholog Ks distributions for species pair [{latinSpecies} - {latinSister}]") + # SPECIES - SISTER ks_list_species_sister = literal_eval(ks_list_db.at[species_sister, 'Ks_Values']) - # run again the bootstrap, only 20 times (very quick) to get the KDE lines - # to be plotted onto the ortholog distribution - logging.info(f"- Calculating KDEs for the two sister species [{latinSpecies} - {latinSister}]") - bootstrap_kde_species_sister = fcPeak.bootstrap_KDE(ks_list_species_sister, 20, x_lim, - bin_width_ortho) - - out_list = outgroups_per_divergent_pair_dict[divergent_pair] - for out in out_list: + # Getting 20 KDE curves through bootstrap + bootstrap_kde_species_sister = fcPeak.bootstrap_KDE(ks_list_species_sister, 20, x_lim, bin_width_ortho) + + for out in unavailable_trios: latinOut = latin_names[out] - logging.info(f"- Processing outspecies [{latinOut}]") + logging.warning(f"- Skipping outspecies [{latinOut}]: not enough ortholog data available") + for out in available_trios: + latinOut = latin_names[out] + logging.info(f"- Using outspecies [{latinOut}]:") fig, axes = fcPlot.generate_orthologs_figure(latinSpecies, latinSister, latinOut, x_lim) # tags, e.g. A.filiculoides_S.cucullata species_out = "_".join(sorted([latinSpecies, latinOut], key=str.casefold)) sister_out = "_".join(sorted([latinSister, latinOut], key=str.casefold)) + # SPECIES - SISTER # Plotting Ks lists and their KDE lines + logging.info(f" Plotting data for the two sister species [{latinSpecies} - {latinSister}]") fcPlot.plot_orthologs_histogram_kdes(ks_list_species_sister, bin_list_ortho, axes[0], - bootstrap_kde_species_sister) + bootstrap_kde_species_sister) + # SPECIES - OUTGROUP ks_list = literal_eval(ks_list_db.at[species_out, 'Ks_Values']) - # run again the bootstrap, only 20 times (very quick) to get the KDE lines - # to be plotted onto the ortholog distribution - logging.info(f" Calculating KDEs for focal species and outspecies [{latinSpecies} - {latinOut}]") + # Getting 20 KDE curves through bootstrap + logging.info(f" Plotting data for focal species and outspecies [{latinSpecies} - {latinOut}]") bootstrap_kde = fcPeak.bootstrap_KDE(ks_list, 20, x_lim, bin_width_ortho) # Plotting Ks lists and their KDE lines fcPlot.plot_orthologs_histogram_kdes(ks_list, bin_list_ortho, axes[1], bootstrap_kde) + # SISTER - OUTGROUP ks_list = literal_eval(ks_list_db.at[sister_out, 'Ks_Values']) - # run again the bootstrap, only 20 times (very quick) to get the KDE lines - # to be plotted onto the ortholog distribution - logging.info(f" Calculating KDEs for sister species and outspecies [{latinSister} - {latinOut}]") + # Getting 20 KDE curves through bootstrap + logging.info(f" Plotting data for sister species and outspecies [{latinSister} - {latinOut}]") bootstrap_kde = fcPeak.bootstrap_KDE(ks_list, 20, x_lim, bin_width_ortho) # Plotting Ks lists and their KDE lines fcPlot.plot_orthologs_histogram_kdes(ks_list, bin_list_ortho, axes[2], bootstrap_kde) + # Plotting estimated mode of the orthologs distributions as vertical lines y_upper_lim = axes[0].get_ylim()[1] - if not no_peak_db: # if the peak database is available - # Plotting estimated mode of the orthologs distributions as vertical lines + if not no_peak_db: # If ortholog Ks peak database is available fcPlot.plot_orthologs_peak_lines(db, species_sister, axes[0], y_upper_lim) fcPlot.plot_orthologs_peak_lines(db, species_out, axes[1], y_upper_lim) fcPlot.plot_orthologs_peak_lines(db, sister_out, axes[2], y_upper_lim) - object_list = [] - for ax in axes: - lgd = ax.get_legend() - object_list.append(lgd) - sup = fig._suptitle - object_list.append(sup) - - pdf.savefig(fig, transparent=True, bbox_extra_artists=(sup,), bbox_inches='tight') + pdf.savefig(fig, transparent=True, bbox_extra_artists=(fig._suptitle,), bbox_inches='tight') plt.close() logging.info(f"- Saving PDF figure [orthologs_{divergent_pair}.pdf]") + + # Report if species are missing from any of the two ortholog databases + if len(missing_pairs_ks_list) != 0 or len(missing_pairs_peaks) != 0: + logging.warning("") + logging.warning("The species pairs listed below are not (yet) available in the ortholog databases") + logging.warning("The trios involving such species pairs have not been plotted") + logging.warning("") + + missing_in_both_dbs = list((set(missing_pairs_peaks) & set(missing_pairs_ks_list))) + if len(missing_in_both_dbs) != 0: + logging.warning("Species pairs not yet available in both Ks peak and Ks list ortholog databases:") + for pair in sorted(missing_in_both_dbs): + logging.warning(f" {pair.split('_')[0]} - {pair.split('_')[1]}") + logging.warning("") + + missing_pairs_peaks = list(set(missing_pairs_peaks) - set(missing_in_both_dbs)) + if len(missing_pairs_peaks) != 0: + logging.warning("Species pairs not yet available in the ortholog Ks peak database:") + for pair in sorted(missing_pairs_peaks): + logging.warning(f" {pair.split('_')[0]} - {pair.split('_')[1]}") + logging.warning("") + + missing_pairs_ks_list = list(set(missing_pairs_ks_list) - set(missing_in_both_dbs)) + if len(missing_pairs_ks_list) != 0: + logging.warning("Species pairs not yet available in the ortholog Ks list database:") + for pair in sorted(missing_pairs_ks_list): + logging.warning(f" {pair.split('_')[0]} - {pair.split('_')[1]}") + logging.warning("") + + logging.warning("Please compute their ortholog Ks data and/or add the ortholog data to the databases,") + logging.warning("then rerun this step.") + logging.info("") logging.info("All done") \ No newline at end of file