Skip to content

Commit

Permalink
use StringIO to read multi-sample files
Browse files Browse the repository at this point in the history
  • Loading branch information
fernandomeyer committed Sep 6, 2024
1 parent 2b4daa5 commit 3d74993
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 31 deletions.
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ AMBER is an evaluation package for the comparative assessment of genome reconstr

## Requirements

AMBER 2.0.4 has been tested with Python 3.11.
AMBER 2.0.7 has been tested with Python 3.11.

See [requirements.txt](requirements.txt) for all dependencies.

Expand Down Expand Up @@ -145,7 +145,7 @@ Binnings of datasets with multiple samples are supported by AMBER. For each binn
## Running _amber.py_

~~~BASH
usage: AMBER [-h] -g GOLD_STANDARD_FILE [-l LABELS] [-p FILTER] [-n MIN_LENGTH] -o OUTPUT_DIR [--stdout] [-d DESC] [--colors COLORS] [--silent] [-v] [-x MIN_COMPLETENESS]
usage: AMBER [-h] -g GOLD_STANDARD_FILE [-l LABELS] [-p FILTER] [-n MIN_LENGTH] -o OUTPUT_DIR [--stdout] [-d DESC] [--colors COLORS] [--silent] [--skip_gs] [-v] [-x MIN_COMPLETENESS]
[-y MAX_CONTAMINATION] [-r REMOVE_GENOMES] [-k KEYWORD] [--genome_coverage GENOME_COVERAGE] [--ncbi_dir NCBI_DIR]
bin_files [bin_files ...]

Expand All @@ -169,6 +169,7 @@ options:
--stdout Print summary to stdout
-d DESC, --desc DESC Description for HTML page
--silent Silent mode
--skip_gs Skip gold standard evaluation vs itself
-v, --version show program's version number and exit
genome binning-specific arguments:
Expand Down
31 changes: 13 additions & 18 deletions amber.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
from cami_amber.utils import argparse_parents
from cami_amber.utils import labels as utils_labels
from version import __version__
from collections import defaultdict
import argparse
import errno
import logging
Expand Down Expand Up @@ -56,10 +55,13 @@ def make_sure_path_exists(path):
raise


def create_output_directories(output_dir, sample_id_to_g_queries_list, sample_id_to_t_queries_list):
    """Create one output directory per (binning type, tool label) for all queries.

    Args:
        output_dir: root directory under which per-tool result directories are created.
        sample_id_to_g_queries_list: mapping of sample id -> list of genome-binning
            query objects (each exposing ``binning_type`` and ``label``).
        sample_id_to_t_queries_list: mapping of sample id -> list of taxonomic-binning
            query objects (same attributes).
    """
    logging.getLogger('amber').info('Creating output directories')
    # The same (binning_type, label) pair may occur for several samples;
    # presumably make_sure_path_exists tolerates an existing path — confirm.
    for sample_id in sample_id_to_g_queries_list:
        for query in sample_id_to_g_queries_list[sample_id]:
            make_sure_path_exists(os.path.join(output_dir, query.binning_type, query.label))
    for sample_id in sample_id_to_t_queries_list:
        for query in sample_id_to_t_queries_list[sample_id]:
            make_sure_path_exists(os.path.join(output_dir, query.binning_type, query.label))


Expand All @@ -77,7 +79,7 @@ def get_labels(labels, bin_files):
return tool_id


def save_metrics(sample_id_to_queries_list, df_summary, pd_bins, output_dir, stdout):
def save_metrics(sample_id_to_g_queries_list, df_summary, pd_bins, output_dir, stdout):
logging.getLogger('amber').info('Saving computed metrics')
df_summary.to_csv(os.path.join(output_dir, 'results.tsv'), sep='\t', index=False)
pd_bins.to_csv(os.path.join(output_dir, 'bin_metrics.tsv'), index=False, sep='\t')
Expand All @@ -95,12 +97,11 @@ def save_metrics(sample_id_to_queries_list, df_summary, pd_bins, output_dir, std
table.to_csv(os.path.join(output_dir, 'taxonomic', tool, 'metrics_per_bin.tsv'), sep='\t', index=False)

pd_genomes_all = pd.DataFrame()
for sample_id in sample_id_to_queries_list:
for sample_id in sample_id_to_g_queries_list:
pd_genomes_sample = pd.DataFrame()
for query in sample_id_to_queries_list[sample_id]:
if isinstance(query, binning_classes.GenomeQuery):
query.recall_df_cami1[utils_labels.TOOL] = query.label
pd_genomes_sample = pd.concat([pd_genomes_sample, query.recall_df_cami1], ignore_index=True, sort=False)
for query in sample_id_to_g_queries_list[sample_id]:
query.recall_df_cami1[utils_labels.TOOL] = query.label
pd_genomes_sample = pd.concat([pd_genomes_sample, query.recall_df_cami1], ignore_index=True, sort=False)
pd_genomes_sample['sample_id'] = sample_id
pd_genomes_all = pd.concat([pd_genomes_all, pd_genomes_sample], ignore_index=True, sort=False)
if not pd_genomes_all.empty:
Expand Down Expand Up @@ -166,17 +167,11 @@ def main(args=None):

coverages_pd = load_data.open_coverages(args.genome_coverage)

sample_id_to_queries_list = defaultdict(list)
for sample_id in sample_id_to_g_queries_list:
sample_id_to_queries_list[sample_id] += sample_id_to_g_queries_list[sample_id]
for sample_id in sample_id_to_t_queries_list:
sample_id_to_queries_list[sample_id] += sample_id_to_t_queries_list[sample_id]

create_output_directories(output_dir, sample_id_to_queries_list)
create_output_directories(output_dir, sample_id_to_g_queries_list, sample_id_to_t_queries_list)

df_summary, pd_bins = evaluate.evaluate_samples_queries(sample_id_to_g_queries_list, sample_id_to_t_queries_list)

save_metrics(sample_id_to_queries_list, df_summary, pd_bins, output_dir, args.stdout)
save_metrics(sample_id_to_g_queries_list, df_summary, pd_bins, output_dir, args.stdout)

plots.plot_genome_binning(args.colors,
sample_id_to_g_queries_list,
Expand Down
16 changes: 13 additions & 3 deletions cami_amber/binning_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,7 +637,8 @@ def safe_divide(x, y):
self.precision_df['sample_id'] = self.sample_id
self.recall_df = recall_df

self.heatmap_sdf = precision_recall_per_bin.transform_confusion_matrix2(query_w_length, confusion_df, precision_df, gs_df, log_scale=True)
if not self.options.skip_heatmap:
self.heatmap_sdf = precision_recall_per_bin.transform_confusion_matrix2(query_w_length, confusion_df, precision_df, gs_df, log_scale=True)

self.eval_success = True

Expand Down Expand Up @@ -690,7 +691,7 @@ def plot_recall_vs_genome_size(self):
plt.close(fig)

def plot_heat_maps(self):
if self.label == utils_labels.GS:
if self.label == utils_labels.GS or self.options.skip_heatmap:
return
plots.plot_heatmap(self.heatmap_sdf, self.sample_id, self.options.output_dir, self.label, log_scale=True)

Expand Down Expand Up @@ -947,7 +948,7 @@ def compute_metrics(self, gs_rank_to_df):
class Options:
def __init__(self, filter_tail_percentage=0, genome_to_unique_common=None, filter_keyword=None, min_length=0,
rank_as_genome_binning=None, output_dir=None, min_completeness=None, max_contamination=None,
ncbi_dir=None, skip_gs=False):
ncbi_dir=None, skip_gs=False, skip_heatmap=False):
self.__filter_tail_percentage = float(filter_tail_percentage) if filter_tail_percentage else .0
self.__genome_to_unique_common = genome_to_unique_common
self.__filter_keyword = filter_keyword
Expand All @@ -967,6 +968,7 @@ def __init__(self, filter_tail_percentage=0, genome_to_unique_common=None, filte
else:
self.__max_contamination = [.1, .05]
self.__skip_gs = skip_gs
self.__skip_heatmap = skip_heatmap
self.__ncbi_dir = ncbi_dir

@property
Expand Down Expand Up @@ -1017,6 +1019,10 @@ def ncbi_dir(self):
def skip_gs(self):
return self.__skip_gs

@property
def skip_heatmap(self):
return self.__skip_heatmap

@filter_tail_percentage.setter
def filter_tail_percentage(self, filter_tail_percentage):
self.__filter_tail_percentage = filter_tail_percentage
Expand Down Expand Up @@ -1064,3 +1070,7 @@ def ncbi_dir(self, ncbi_dir):
@skip_gs.setter
def skip_gs(self, skip_gs):
self.__skip_gs = skip_gs

@skip_heatmap.setter
def skip_heatmap(self, skip_heatmap):
self.__skip_heatmap = skip_heatmap
1 change: 0 additions & 1 deletion cami_amber/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ def evaluate_sample(queries_list):
gs_data = query1.gold_standard_data
for query in queries_list:
query.compute_metrics(gs_data)
del gs_data


def evaluate_samples_queries(sample_id_to_g_queries_list, sample_id_to_t_queries_list):
Expand Down
27 changes: 21 additions & 6 deletions cami_amber/utils/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import io
import tarfile
import zipfile
import itertools
from multiprocessing.pool import ThreadPool
from collections import defaultdict
from collections import OrderedDict
Expand Down Expand Up @@ -167,13 +168,27 @@ def read_metadata(path_label_tuple):
def load_sample(metadata):
    """Load one sample's binning rows from a (possibly multi-sample) file into a DataFrame.

    ``metadata`` is a positional record — TODO confirm field meanings against
    ``read_metadata``: [0] index of the sample's first data line, [1] index of its
    last data line, [2] header dict containing 'SAMPLEID', [3] column names,
    [4] file path, [5] display label.

    Returns:
        pandas.DataFrame with the known columns ('SEQUENCEID', 'BINID', 'TAXID',
        'LENGTH'); a '_LENGTH' column is renamed to 'LENGTH'.
    """
    columns = ['SEQUENCEID', 'BINID', 'TAXID', 'LENGTH', '_LENGTH']
    logging.getLogger('amber').info('Loading %s of %s' % (metadata[2]['SAMPLEID'], metadata[5]))
    # Keep only columns AMBER understands; extra columns in the file are dropped.
    usecols = [v for v in metadata[3] if v in columns]

    dtypes = {'SEQUENCEID': pd.StringDtype(), 'BINID': pd.StringDtype(), 'TAXID': pd.UInt32Dtype(),
              'LENGTH': pd.UInt32Dtype(), '_LENGTH': pd.UInt32Dtype()}
    if metadata[0] < 1000:
        # Small skip offset: let pandas skip the preceding lines directly.
        nrows = metadata[1] - metadata[0] + 1
        df = pd.read_csv(metadata[4], sep='\t', comment='#', skiprows=metadata[0], nrows=nrows,
                         header=None, names=metadata[3], usecols=usecols, dtype=dtypes)
    else:
        # Avoid high memory peak by using StringIO due to possible pandas bug
        # (large skiprows values in read_csv); copy just this sample's slice.
        text = io.StringIO()
        with open_generic(metadata[4]) as f:
            for line in itertools.islice(f, metadata[0], metadata[1] + 1):
                text.write(line)
        text.seek(0)
        df = pd.read_csv(text, sep='\t', comment='#', header=None,
                         names=metadata[3], usecols=usecols, dtype=dtypes)
    df.rename(columns={'_LENGTH': 'LENGTH'}, inplace=True)
    return df

Expand Down
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ def dependencies():
name = 'cami-amber',
version = __version__,
description = 'AMBER: Assessment of Metagenome BinnERs',
long_description = open('README.md').read(),
long_description_content_type="text/markdown",
author = 'CAMI',
author_email = '[email protected]',
url = 'http://cami-challenge.org',
Expand Down
2 changes: 1 addition & 1 deletion version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '2.0.6'
__version__ = '2.0.7'

0 comments on commit 3d74993

Please sign in to comment.