From f46ce063512633f34ef092424949c47a14566101 Mon Sep 17 00:00:00 2001 From: Fernando Meyer Date: Fri, 11 Jun 2021 09:15:36 +0200 Subject: [PATCH] output genome metrics --- amber.py | 37 +++++++++++++++++++++++-------------- src/binning_classes.py | 20 ++++++++++++++++---- src/plots.py | 20 +++++++++----------- src/utils/load_data.py | 5 +++-- 4 files changed, 51 insertions(+), 31 deletions(-) diff --git a/amber.py b/amber.py index 59f038e..1b0ae62 100755 --- a/amber.py +++ b/amber.py @@ -179,9 +179,6 @@ def evaluate(queries_list, sample_id): df_summary = pd.concat([df_summary, query_metrics_df], ignore_index=True, sort=True) query.precision_df[utils_labels.TOOL] = query.label - - # query.recall_df.to_csv(os.path.join(query.options.output_dir, 'genome', query.label, 'metrics_per_genome.tsv'), sep='\t', index=False) - pd_bins_all = pd.concat([pd_bins_all, query.precision_df.reset_index()], ignore_index=True, sort=True) pd_bins_all['sample_id'] = sample_id @@ -207,10 +204,10 @@ def evaluate_samples_queries(sample_id_to_queries_list): return df_summary_all, pd_bins_all -def save_metrics(df_summary, pd_bins, output_dir, stdout): +def save_metrics(sample_id_to_queries_list, df_summary, pd_bins, output_dir, stdout): logging.getLogger('amber').info('Saving computed metrics') df_summary.to_csv(os.path.join(output_dir, 'results.tsv'), sep='\t', index=False) - # pd_bins.to_csv(os.path.join(output_dir, 'bins.tsv'), index=False, sep='\t') + pd_bins.to_csv(os.path.join(output_dir, 'bin_metrics.tsv'), index=False, sep='\t') if stdout: summary_columns = [utils_labels.TOOL] + [col for col in df_summary if col != utils_labels.TOOL] print(df_summary[summary_columns].to_string(index=False)) @@ -225,6 +222,18 @@ def save_metrics(df_summary, pd_bins, output_dir, stdout): table = pd_group[['sample_id'] + list(bins_columns.keys())].rename(columns=dict(bins_columns)) table.to_csv(os.path.join(output_dir, 'taxonomic', tool, 'metrics_per_bin.tsv'), sep='\t', index=False) + pd_genomes_all = pd.DataFrame() + for sample_id in sample_id_to_queries_list: + pd_genomes_sample = pd.DataFrame() + for query in sample_id_to_queries_list[sample_id]: + if isinstance(query, binning_classes.GenomeQuery): + query.recall_df_cami1[utils_labels.TOOL] = query.label + pd_genomes_sample = pd.concat([pd_genomes_sample, query.recall_df_cami1], ignore_index=True, sort=False) + pd_genomes_sample['sample_id'] = sample_id + pd_genomes_all = pd.concat([pd_genomes_all, pd_genomes_sample], ignore_index=True, sort=False) + if not pd_genomes_all.empty: + pd_genomes_all.to_csv(os.path.join(output_dir, 'genome_metrics_cami1.tsv'), index=False, sep='\t') + def main(args=None): parser = argparse.ArgumentParser(description="AMBER: Assessment of Metagenome BinnERs", @@ -286,16 +295,16 @@ def main(args=None): df_summary, pd_bins = evaluate_samples_queries(sample_id_to_queries_list) - save_metrics(df_summary, pd_bins, output_dir, args.stdout) + save_metrics(sample_id_to_queries_list, df_summary, pd_bins, output_dir, args.stdout) - # plot_genome_binning(args.colors, - # sample_id_to_queries_list, - # df_summary, - # pd_bins[pd_bins['rank'] == 'NA'], - # labels, - # coverages_pd, - # output_dir) - # plot_taxonomic_binning(args.colors, df_summary, pd_bins, labels, output_dir) + plot_genome_binning(args.colors, + sample_id_to_queries_list, + df_summary, + pd_bins[pd_bins['rank'] == 'NA'], + labels, + coverages_pd, + output_dir) + plot_taxonomic_binning(args.colors, df_summary, pd_bins, labels, output_dir) amber_html.create_html(df_summary, pd_bins, diff --git a/src/binning_classes.py b/src/binning_classes.py index ddd2ed9..2ef5a59 100755 --- a/src/binning_classes.py +++ b/src/binning_classes.py @@ -1,4 +1,4 @@ -# Copyright 2020 Department of Computational Biology for Infection Research - Helmholtz Centre for Infection Research +# Copyright 2021 Department of Computational Biology for Infection Research - Helmholtz Centre for Infection Research # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -390,7 +390,7 @@ def __init__(self, label, sample_id, options): self.__gold_standard = None self.__gold_standard_df = None self.__precision_df = pd.DataFrame() - self.__recall_df = None + self.__recall_df = pd.DataFrame() self.__confusion_df = None self.__metrics = None self.__metrics_filtered = None @@ -490,16 +490,25 @@ class GenomeQuery(Query): def __init__(self, df, label, sample_id, options): super().__init__(label, sample_id, options) self.__df = df + self.__recall_df_cami1 = pd.DataFrame() self.metrics = Metrics() @property def df(self): return self.__df + @property + def recall_df_cami1(self): + return self.__recall_df_cami1 + @df.setter def df(self, df): self.__df = df + @recall_df_cami1.setter + def recall_df_cami1(self, recall_df_cami1): + self.__recall_df_cami1 = recall_df_cami1 + def get_metrics_df(self): metrics_dict = self.metrics.get_ordered_dict() metrics_dict[utils_labels.TOOL] = self.label @@ -618,8 +627,7 @@ def compute_metrics(self): if self.options.genome_to_unique_common: unmapped_genomes -= set(self.options.genome_to_unique_common) num_unmapped_genomes = len(unmapped_genomes) - prec_copy = precision_df[['recall_bp', 'recall_seq']].reset_index() - # prec_copy = precision_df[['recall_bp', 'recall_seq', 'genome_id']].loc[precision_df.groupby('genome_id', sort=False)['recall_bp'].idxmax()].reset_index() + prec_copy = precision_df.reset_index() if num_unmapped_genomes: prec_copy = prec_copy.reindex(prec_copy.index.tolist() + list(range(len(prec_copy), len(prec_copy) + num_unmapped_genomes))).fillna(.0) self.metrics.recall_avg_bp_cami1 = prec_copy['recall_bp'].mean() @@ -627,6 +635,7 @@ def compute_metrics(self): self.metrics.recall_avg_bp_sem_cami1 = prec_copy['recall_bp'].sem() self.metrics.recall_avg_seq_sem_cami1 = prec_copy['recall_seq'].sem() self.metrics.recall_avg_bp_var_cami1 = prec_copy['recall_bp'].var() + self.recall_df_cami1 = prec_copy # End Compute recall as in CAMI 1 self.metrics.accuracy_bp = precision_df['tp_length'].sum() / recall_df['length_gs'].sum() @@ -732,6 +741,9 @@ def rank_to_df(self, rank_to_df): self.__rank_to_df = rank_to_df def _create_profile(self, all_bins): + if self.recall_df.empty: + return [], [] + class Prediction: def __init__(self): pass diff --git a/src/plots.py b/src/plots.py index e836f66..ba03882 100755 --- a/src/plots.py +++ b/src/plots.py @@ -84,17 +84,15 @@ def plot_by_genome_coverage(pd_bins, pd_target_column, available_tools, output_d for i, (color, tool) in enumerate(zip(colors_list, available_tools)): pd_tool = pd_bins[pd_bins[utils_labels.TOOL] == tool].sort_values(by=['genome_index']) - axs.scatter(pd_tool['genome_coverage'], pd_tool[pd_target_column], marker='o', color=colors_list[i], s=[2] * pd_tool.shape[0]) + axs.scatter(pd_tool['genome_coverage'], pd_tool[pd_target_column], marker='o', color=colors_list[i], s=[3] * pd_tool.shape[0]) window = 50 rolling_mean = pd_tool[pd_target_column].rolling(window=window, min_periods=10).mean() axs.plot(pd_tool['genome_coverage'], rolling_mean, color=colors_list[i]) - axs.set_xlim([0.0, pd_tool['genome_coverage'].max()]) - axs.set_ylim([0.0, 1.0]) + axs.set_ylim([-0.01, 1.01]) - # transform plot_labels to percentages - vals = axs.get_yticks() - axs.set_yticklabels(['{:3.0f}'.format(x * 100) for x in vals], fontsize=12) + axs.set_xticklabels(['{:,.1f}'.format(np.exp(x)) for x in axs.get_xticks()], fontsize=12) + axs.set_yticklabels(['{:3.0f}'.format(x * 100) for x in axs.get_yticks()], fontsize=12) axs.tick_params(axis='x', labelsize=12) @@ -106,7 +104,7 @@ def plot_by_genome_coverage(pd_bins, pd_target_column, available_tools, output_d file_name = 'completeness_by_genome_coverage' plt.ylabel(ylabel, fontsize=15) - plt.xlabel('log$_{10}$(average genome coverage)', fontsize=15) + plt.xlabel('Average genome coverage', fontsize=15) colors_iter = iter(colors_list) circles = [] @@ -124,7 +122,7 @@ def get_pd_genomes_recall(sample_id_to_queries_list): for query in sample_id_to_queries_list[sample_id]: if not isinstance(query, binning_classes.GenomeQuery): continue - recall_df = query.recall_df[['genome_id', 'recall_bp']].copy() + recall_df = query.recall_df_cami1[['genome_id', 'recall_bp']].copy() recall_df[utils_labels.TOOL] = query.label recall_df['sample_id'] = sample_id recall_df = recall_df.reset_index().set_index(['sample_id', utils_labels.TOOL]) @@ -141,13 +139,13 @@ def plot_precision_recall_by_coverage(sample_id_to_queries_list, pd_bins_g, cove pd_genomes_recall = get_pd_genomes_recall(sample_id_to_queries_list) pd_genomes_recall['genome_index'] = pd_genomes_recall['genome_id'].map(coverages_pd['rank'].to_dict()) - pd_genomes_recall = pd_genomes_recall.groupby([utils_labels.TOOL, 'genome_id']).mean().reset_index() - pd_genomes_recall['genome_coverage'] = np.log10(pd_genomes_recall['genome_id'].map(coverages_pd['COVERAGE'].to_dict())) + pd_genomes_recall = pd_genomes_recall.reset_index() + pd_genomes_recall['genome_coverage'] = np.log(pd_genomes_recall['genome_id'].map(coverages_pd['COVERAGE'].to_dict())) plot_by_genome_coverage(pd_genomes_recall, 'recall_bp', available_tools, output_dir) pd_bins_precision = pd_bins_g[[utils_labels.TOOL, 'precision_bp', 'genome_id']].copy().dropna(subset=['precision_bp']) pd_bins_precision['genome_index'] = pd_bins_precision['genome_id'].map(coverages_pd['rank'].to_dict()) - pd_bins_precision['genome_coverage'] = np.log10(pd_bins_precision['genome_id'].map(coverages_pd['COVERAGE'].to_dict())) + pd_bins_precision['genome_coverage'] = np.log(pd_bins_precision['genome_id'].map(coverages_pd['COVERAGE'].to_dict())) plot_by_genome_coverage(pd_bins_precision, 'precision_bp', available_tools, output_dir) diff --git a/src/utils/load_data.py b/src/utils/load_data.py index bf96440..15b81b7 100755 --- a/src/utils/load_data.py +++ b/src/utils/load_data.py @@ -132,12 +132,13 @@ def read_metadata(file_path_query): def load_binnings(samples_metadata, file_path_query): + columns = ['SEQUENCEID', 'BINID', 'TAXID', 'LENGTH', '_LENGTH'] sample_id_to_query_df = OrderedDict() for metadata in samples_metadata: logging.getLogger('amber').info('Loading ' + metadata[2]['SAMPLEID']) nrows = metadata[1] - metadata[0] + 1 - - df = pd.read_csv(file_path_query, sep='\t', comment='#', skiprows=metadata[0], nrows=nrows, header=None) + col_indices = [k for k, v in metadata[3].items() if v in columns] + df = pd.read_csv(file_path_query, sep='\t', comment='#', skiprows=metadata[0], nrows=nrows, header=None, usecols=col_indices) df.rename(columns=metadata[3], inplace=True) df.rename(columns={'_LENGTH': 'LENGTH'}, inplace=True) sample_id_to_query_df[metadata[2]['SAMPLEID']] = df