Skip to content

Commit

Permalink
integrate metadata in tabulate-las-results
Browse files Browse the repository at this point in the history
  • Loading branch information
gregcaporaso committed Sep 2, 2024
1 parent e2cec2e commit ab244d8
Show file tree
Hide file tree
Showing 8 changed files with 151 additions and 11 deletions.
4 changes: 3 additions & 1 deletion q2_dwq2/_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def align_and_summarize(

def search_and_summarize(
ctx, query_seqs, reference_seqs,
reference_metadata=None,
n=_las_defaults['n'],
split_size=_split_seqs_defaults['split_size'],
title=_tabulate_las_defaults['title'],
Expand All @@ -53,6 +54,7 @@ def search_and_summarize(
las_results.append(las_result)

las_results, = combine_action(las_results)
result_table, = tabulate_las_results_action(las_results, title=title)
result_table, = tabulate_las_results_action(
las_results, title=title, reference_metadata=reference_metadata)

return (las_results, result_table)
31 changes: 30 additions & 1 deletion q2_dwq2/_visualizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import pandas as pd
from skbio.alignment import TabularMSA

import qiime2


_tabulate_las_defaults = {'title': 'Local Alignment Search Results'}

Expand All @@ -23,8 +25,35 @@ def summarize_alignment(output_dir: str, msa: TabularMSA) -> None:

def tabulate_las_results(output_dir: str,
hits: pd.DataFrame,
title: str = _tabulate_las_defaults['title']) \
title: str = _tabulate_las_defaults['title'],
reference_metadata: qiime2.Metadata = None) \
-> None:
if reference_metadata is not None:
reference_metadata = reference_metadata.to_dataframe()

hits.reset_index(inplace=True)

metadata_index = reference_metadata.index.name
metadata_columns = reference_metadata.columns.to_list()
reference_metadata.reset_index(inplace=True)

missing_ids = \
set(hits['reference id']) - set(reference_metadata[metadata_index])
if len(missing_ids) > 0:
raise KeyError(
f"The following {len(missing_ids)} IDs are missing from "
f"reference metadata: {missing_ids}.")

hits = pd.merge(hits, reference_metadata,
left_on='reference id',
right_on=metadata_index,
how='inner')

hits.set_index(['query id', 'reference id'], inplace=True)
column_order = \
['percent similarity', 'alignment length', 'score'] + \
metadata_columns + ['aligned query', 'aligned reference']
hits = hits.reindex(columns=column_order)
with open(os.path.join(output_dir, "index.html"), "w") as fh:
fh.write(_html_template % (title, hits.to_html()))

Expand Down
8 changes: 8 additions & 0 deletions q2_dwq2/examples/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# flake8: noqa
# ----------------------------------------------------------------------------
# Copyright (c) 2024, Greg Caporaso.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
9 changes: 9 additions & 0 deletions q2_dwq2/examples/_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,17 @@ def reference1_factory():
)


def metadata1_factory():
fp = get_filepath_from_package(
'examples/data/search-and-summarize/reference-metadata.tsv')
return qiime2.Metadata.load(fp)


def search_and_summarize_example_1(use):
query_seqs = use.init_artifact('query_seqs', query1_factory)
reference_seqs = use.init_artifact('reference_seqs', reference1_factory)
reference_metadata = use.init_artifact('reference_metadata',
metadata1_factory)
use.comment("This is an example of running this Pipeline serially.")
use.comment("The modification to run this in parallel depends on the "
"interface you're using (for example, using q2cli you would "
Expand All @@ -86,6 +94,7 @@ def search_and_summarize_example_1(use):
action_id='search_and_summarize'),
use.UsageInputs(query_seqs=query_seqs,
reference_seqs=reference_seqs,
reference_metadata=reference_metadata,
split_size=1),
use.UsageOutputNames(hits='hits', hits_table='hits-table')
)
21 changes: 21 additions & 0 deletions q2_dwq2/examples/data/search-and-summarize/reference-metadata.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
id domain phylum class order family genus species gene location ssu length contig length
RS_GCF_000620805.1~NZ_KK211187.1-#3 Bacteria Bacillota Bacilli Exiguobacterales Exiguobacteraceae Exiguobacterium_A Exiguobacterium_A undae 338855..340301 1447 3126126
RS_GCF_900459785.1~NZ_UHGY01000002.1 Bacteria Bacillota Bacilli Lactobacillales Streptococcaceae Streptococcus Streptococcus pyogenes 16841..18385 1545 1836331
RS_GCF_024758665.1~NZ_CP094950.1-#4 Bacteria Bacillota Bacilli Lactobacillales Lactobacillaceae Lactiplantibacillus Lactiplantibacillus plantarum 2433818..2435382 1565 3230328
RS_GCF_019355415.1~NZ_CP049290.1-#6 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae Enterobacter Enterobacter hormaechei_B 476185..477723 1539 4903326
GB_GCA_001034065.1~KQ089641.1-#4 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae Klebsiella Klebsiella variicola 819912..820216 305 2799861
RS_GCF_000640835.1~NZ_KK227193.1 Bacteria Bacillota Bacilli Staphylococcales Staphylococcaceae Staphylococcus Staphylococcus aureus 412569..414118 1550 416113
RS_GCF_021533575.1~NZ_JAIKUL010000067.1 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae Escherichia Escherichia coli 4..503 500 524
RS_GCF_002283275.1~NZ_NQWV01000054.1 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae Salmonella Salmonella enterica 2..442 441 447
RS_GCF_003368425.2~NZ_QSMS02000004.1-#3 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae Escherichia Escherichia coli 2532173..2533710 1538 3780032
RS_GCF_002547115.1~NZ_NNYU01000079.1 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae Escherichia Escherichia coli 150..1687 1538 1698
GB_GCA_000171035.2~ABDL02000058.1 Bacteria Bacillota Bacilli Bacillales Bacillaceae_G Bacillus_A Bacillus_A paranthracis 4704..6253 1550 6309
RS_GCF_029025045.1~NZ_CP118911.1-#9 Bacteria Bacillota Bacilli Bacillales Bacillaceae Bacillus Bacillus velezensis 3344838..3346385 1548 4371975
GB_GCA_948682535.1~CAPPVW010000159.1 Bacteria Campylobacterota Campylobacteria Campylobacterales Helicobacteraceae Helicobacter_C Helicobacter_C japonicus 1..848 848 848
RS_GCF_000585375.1~NZ_KK073874.1-#2 Bacteria Actinomycetota Actinomycetes Mycobacteriales Cryptosporangiaceae Cryptosporangium Cryptosporangium arvum 1844671..1846183 1513 9195993
RS_GCF_000624895.2~NZ_CP007396.2-#6 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae Salmonella Salmonella enterica 3452081..3453618 1538 4685838
RS_GCF_001036725.1~NZ_LEJZ01000149.1 Bacteria Pseudomonadota Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas Pseudomonas aeruginosa 1..348 348 349
RS_GCF_000949965.1~NZ_JZTC01000078.1 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Vibrionaceae Photobacterium Photobacterium kishitanii 13..1077 1065 1081
GB_GCA_949503465.1~CATBHR010000016.1 Bacteria Bacillota_A Clostridia Christensenellales Borkfalkiaceae Gallimonas Gallimonas sp910585595 30509..30836 328 30836
RS_GCF_011090075.1~NZ_WSUO01000039.1 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae Salmonella Salmonella enterica 429..1967 1539 5374
GB_GCA_000333535.1~BADH01000751.1-#2 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae Kosakonia Kosakonia cowanii 7253..7466 214 34707
8 changes: 5 additions & 3 deletions q2_dwq2/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import importlib

from qiime2.plugin import (Citations, Plugin, Float, Range, Visualization, Int,
Str, Collection)
Str, Collection, Metadata)
from q2_types.feature_data import FeatureData, AlignedSequence, Sequence
from q2_dwq2 import __version__
from q2_dwq2._methods import (nw_align, local_alignment_search,
Expand Down Expand Up @@ -197,9 +197,11 @@
citations=[],
)

_tabulate_las_parameters = {'title': Str}
_tabulate_las_parameters = {'title': Str,
'reference_metadata': Metadata}
_tabulate_las_parameter_descriptions = {
'title': 'Title to use inside visualization.'
'title': 'Title to use inside visualization.',
'reference_metadata': 'Reference metadata to be integrated in output.'
}

plugin.visualizers.register_function(
Expand Down
26 changes: 26 additions & 0 deletions q2_dwq2/tests/test_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,14 @@ def setUp(self):
self.reference_sequences_art = qiime2.Artifact.import_data(
"FeatureData[Sequence]", reference_sequences, view_type=DNAIterator
)
metadata_df = pd.DataFrame([
['r1', 'abc;def', 10101],
['r2', 'ghi;jkl;mno', 101],
['r3', 'pqr; stu', 110011],
['r4', 'vwxyz', 424242]],
columns=['sample-id', 'some column', 'another'])
metadata_df.set_index('sample-id', inplace=True)
self.metadata = qiime2.Metadata(metadata_df)

def _test_simple1_helper(self, observed_hits, observed_viz):
expected_hits = pd.DataFrame([
Expand Down Expand Up @@ -121,6 +129,24 @@ def test_simple1_serial(self):
self.query_sequences_art, self.reference_sequences_art)
self._test_simple1_helper(observed_hits, observed_viz)

def test_metadata1_serial(self):

observed_hits, observed_viz = self.search_and_summarize_pipeline(
self.query_sequences_art, self.reference_sequences_art,
reference_metadata=self.metadata)
self._test_simple1_helper(observed_hits, observed_viz)

# check that metadata is included in output Visualization
index_fp = observed_viz.get_index_paths(relative=False)['html']
with open(index_fp, 'r') as fh:
observed_index = fh.read()
self.assertIn('ghi;jkl;mno', observed_index)
self.assertIn('some column', observed_index)
self.assertIn('110011', observed_index)
# metadata associated with ids not referenced in hits does
# not show up in observed
self.assertNotIn('424242', observed_index)

def test_simple1_parallel(self):
with ParallelConfig():
observed_hits, observed_viz = \
Expand Down
55 changes: 49 additions & 6 deletions q2_dwq2/tests/test_visualizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from skbio.alignment import TabularMSA
from skbio.sequence import DNA

import qiime2
from qiime2.plugin.testing import TestPluginBase

from q2_dwq2._visualizers import summarize_alignment, tabulate_las_results
Expand All @@ -38,8 +39,9 @@ def test_simple1(self):
class TabulateLasResultsTests(TestPluginBase):
package = 'q2_dwq2.tests'

def test_simple1(self):
hits = pd.DataFrame([
def setUp(self):
super().setUp()
self.hits = pd.DataFrame([
['q1', 'r2', 100., 20, 40., 'ACACTCTCCACCCATTTGCT',
'ACACTCTCCACCCATTTGCT'],
['q1', 'r3', 95., 20, 35., 'ACACTCTCCACCCATTTGCT',
Expand All @@ -52,12 +54,21 @@ def test_simple1(self):
'ACACTCTCCACCCATTTGCT'],
['q2', 'r3', 85., 20, 25., 'ACACTCACCACCCAATTGCT',
'ACACTCTCCAGCCATTTGCT']],
columns=['query id', 'reference id', 'percent similarity',
'alignment length', 'score', 'aligned query',
'aligned reference'])
columns=['query id', 'reference id', 'percent similarity',
'alignment length', 'score', 'aligned query',
'aligned reference'])
metadata_df = pd.DataFrame([
['r1', 'abc;def', 10101],
['r2', 'ghi;jkl;mno', 101],
['r3', 'pqr; stu', 110011],
['r4', 'vwxyz', 424242]],
columns=['sample-id', 'some column', 'another'])
metadata_df.set_index('sample-id', inplace=True)
self.metadata = qiime2.Metadata(metadata_df)

def test_simple1(self):
with self.temp_dir as output_dir:
observed = tabulate_las_results(output_dir, hits)
observed = tabulate_las_results(output_dir, self.hits)

with open(os.path.join(output_dir, 'index.html'), 'r') as fh:
observed = fh.read()
Expand All @@ -68,3 +79,35 @@ def test_simple1(self):
self.assertIn('r2', observed)
self.assertIn('ACACTCACCACCCAATTGCT', observed)
self.assertIn('ACACTCTCCACCCATTTGCT', observed)

def test_metadata1(self):
with self.temp_dir as output_dir:
observed = tabulate_las_results(output_dir, self.hits,
reference_metadata=self.metadata)
with open(os.path.join(output_dir, 'index.html'), 'r') as fh:
observed = fh.read()
self.assertIn('q1', observed)
self.assertIn('q2', observed)
self.assertIn('r1', observed)
self.assertIn('r2', observed)
self.assertIn('ACACTCACCACCCAATTGCT', observed)
self.assertIn('ACACTCTCCACCCATTTGCT', observed)
self.assertIn('ghi;jkl;mno', observed)
self.assertIn('some column', observed)
self.assertIn('110011', observed)

# metadata associated with ids not referenced in hits does
# not show up in observed
self.assertNotIn('424242', observed)

def test_missing_metadata(self):
df = pd.DataFrame([
['r1', 'abc;def', 10101],
['r4', 'vwxyz', 424242]],
columns=['sample-id', 'some column', 'another'])
df.set_index('sample-id', inplace=True)
bad_metadata = qiime2.Metadata(df)
with self.assertRaisesRegex(KeyError, "g 2 IDs.*'r3'"):
with self.temp_dir as output_dir:
tabulate_las_results(output_dir, self.hits,
reference_metadata=bad_metadata)

0 comments on commit ab244d8

Please sign in to comment.