From 1961442a82459aa7914c385741aafa7c8b8a8d58 Mon Sep 17 00:00:00 2001 From: Colin Wood Date: Mon, 23 Dec 2024 10:32:58 -0700 Subject: [PATCH 1/3] move tabular json types from q2-stats --- q2_types/plugin_setup.py | 1 + q2_types/tabular/__init__.py | 20 ++ q2_types/tabular/_deferred_setup/__init__.py | 45 +++++ .../tabular/_deferred_setup/_transformers.py | 172 ++++++++++++++++++ .../tabular/_deferred_setup/_validators.py | 46 +++++ q2_types/tabular/formats.py | 49 +++++ q2_types/tabular/tests/__init__.py | 7 + .../tests/data/empty_data_dist.table.jsonl | 1 + .../tests/data/empty_data_dist/data.ndjson | 0 .../data/empty_data_dist/dataresource.json | 33 ++++ .../tests/data/faithpd_refdist.table.jsonl | 29 +++ .../tests/data/faithpd_refdist/data.ndjson | 28 +++ .../data/faithpd_refdist/dataresource.json | 27 +++ .../tests/data/faithpd_timedist.table.jsonl | 88 +++++++++ .../tests/data/faithpd_timedist/data.ndjson | 87 +++++++++ .../data/faithpd_timedist/dataresource.json | 33 ++++ q2_types/tabular/tests/test_transformers.py | 63 +++++++ q2_types/tabular/tests/test_validators.py | 41 +++++ q2_types/tabular/types.py | 29 +++ setup.py | 6 +- 20 files changed, 804 insertions(+), 1 deletion(-) create mode 100644 q2_types/tabular/__init__.py create mode 100644 q2_types/tabular/_deferred_setup/__init__.py create mode 100644 q2_types/tabular/_deferred_setup/_transformers.py create mode 100644 q2_types/tabular/_deferred_setup/_validators.py create mode 100644 q2_types/tabular/formats.py create mode 100644 q2_types/tabular/tests/__init__.py create mode 100644 q2_types/tabular/tests/data/empty_data_dist.table.jsonl create mode 100644 q2_types/tabular/tests/data/empty_data_dist/data.ndjson create mode 100644 q2_types/tabular/tests/data/empty_data_dist/dataresource.json create mode 100644 q2_types/tabular/tests/data/faithpd_refdist.table.jsonl create mode 100644 q2_types/tabular/tests/data/faithpd_refdist/data.ndjson create mode 100644 
q2_types/tabular/tests/data/faithpd_refdist/dataresource.json create mode 100644 q2_types/tabular/tests/data/faithpd_timedist.table.jsonl create mode 100644 q2_types/tabular/tests/data/faithpd_timedist/data.ndjson create mode 100644 q2_types/tabular/tests/data/faithpd_timedist/dataresource.json create mode 100644 q2_types/tabular/tests/test_transformers.py create mode 100644 q2_types/tabular/tests/test_validators.py create mode 100644 q2_types/tabular/types.py diff --git a/q2_types/plugin_setup.py b/q2_types/plugin_setup.py index ed59706c..2d5308b9 100644 --- a/q2_types/plugin_setup.py +++ b/q2_types/plugin_setup.py @@ -153,3 +153,4 @@ importlib.import_module('q2_types.reference_db._deferred_setup') importlib.import_module('q2_types.sample_data._deferred_setup') importlib.import_module('q2_types.tree._deferred_setup') +importlib.import_module('q2_types.tabular._deferred_setup') diff --git a/q2_types/tabular/__init__.py b/q2_types/tabular/__init__.py new file mode 100644 index 00000000..64507a35 --- /dev/null +++ b/q2_types/tabular/__init__.py @@ -0,0 +1,20 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2024, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. 
+# ---------------------------------------------------------------------------- + +from .formats import (TableJSONLFileFormat, TableJSONLDirFmt, + NDJSONFileFormat, DataResourceSchemaFileFormat, + TabularDataResourceDirFmt) +from .types import (StatsTable, Pairwise, Dist1D, Ordered, Unordered, + NestedOrdered, NestedUnordered, Multi, + Matched, Independent) + +__all__ = ['TableJSONLFileFormat', 'TableJSONLDirFmt', + 'NDJSONFileFormat', 'DataResourceSchemaFileFormat', + 'TabularDataResourceDirFmt', 'StatsTable', 'Pairwise', + 'Dist1D', 'Ordered', 'Unordered', 'NestedOrdered', + 'NestedUnordered', 'Multi', 'Matched', 'Independent'] diff --git a/q2_types/tabular/_deferred_setup/__init__.py b/q2_types/tabular/_deferred_setup/__init__.py new file mode 100644 index 00000000..0585a7d0 --- /dev/null +++ b/q2_types/tabular/_deferred_setup/__init__.py @@ -0,0 +1,45 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2024, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + +import importlib + +from .. 
import (NDJSONFileFormat, + DataResourceSchemaFileFormat, + TabularDataResourceDirFmt, + TableJSONLFileFormat, TableJSONLDirFmt, + StatsTable, Pairwise, Dist1D, + Matched, Independent, Ordered, Unordered, Multi, + NestedOrdered, NestedUnordered) + +from ...plugin_setup import plugin + +plugin.register_formats(NDJSONFileFormat, DataResourceSchemaFileFormat, + TabularDataResourceDirFmt) +plugin.register_formats(TableJSONLFileFormat, TableJSONLDirFmt) + + +plugin.register_semantic_types(StatsTable, Pairwise, Dist1D, + NestedOrdered, NestedUnordered, Matched, + Independent, Ordered, Unordered, Multi) + +plugin.register_semantic_type_to_format( + Dist1D[Ordered | Unordered | NestedOrdered | NestedUnordered | Multi, + Matched | Independent] | + StatsTable[Pairwise], + TableJSONLDirFmt) + +importlib.import_module('._transformers', __name__) +importlib.import_module('._validators', __name__) + + +__all__ = [ + 'StatsTable', 'Pairwise', 'Dist1D', 'NestedOrdered', 'NestedUnordered', + 'Matched', 'Independent', 'Ordered', 'Unordered', 'Multi', + 'NDJSONFileFormat', 'DataResourceSchemaFileFormat', + 'TabularDataResourceDirFmt', 'TableJSONLFileFormat', 'TableJSONLDirFmt', +] diff --git a/q2_types/tabular/_deferred_setup/_transformers.py b/q2_types/tabular/_deferred_setup/_transformers.py new file mode 100644 index 00000000..46915435 --- /dev/null +++ b/q2_types/tabular/_deferred_setup/_transformers.py @@ -0,0 +1,172 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2024, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + +import pandas as pd +import frictionless as fls +import json + +from ..formats import TableJSONLFileFormat + +from .. 
import (NDJSONFileFormat, + DataResourceSchemaFileFormat, + TabularDataResourceDirFmt) + +from ...plugin_setup import plugin + + +def table_jsonl_header(df: pd.DataFrame) -> str: + header = {} + header['doctype'] = dict( + name='table.jsonl', format='application/x-json-lines', version='1.0') + header['direction'] = 'row' + header['style'] = 'key:value' + + fields = [] + for name in df.columns: + attrs = df[name].attrs.copy() + title = attrs.pop('title', '') + description = attrs.pop('description', '') + type = attrs.pop('type', None) + missing = attrs.pop('missing', False) + extra = attrs.pop('extra', None) + if extra is None: + extra = attrs + fields.append(dict( + name=name, type=type, missing=missing, title=title, + description=description, extra=extra)) + + header['fields'] = fields + header['index'] = [] + header['title'] = df.attrs.get('title', '') + header['description'] = df.attrs.get('description', '') + header['extra'] = df.attrs.get('extra', {}) + + # prevent whitespace after comma and colon + return json.dumps(header, separators=(',', ':')) + + +@plugin.register_transformer +def table_jsonl_to_df(ff: TableJSONLFileFormat) -> pd.DataFrame: + with ff.open() as fh: + header = json.loads(next(fh)) + df = pd.read_json(fh, lines=True, orient='records') + if df.empty: + df = pd.DataFrame(columns=[ + spec['name'] for spec in header['fields']]) + + # The order of these steps matters. + + # 1. set order of columns + df = df[[spec['name'] for spec in header['fields']]] + + # 2. 
update types + for spec in header['fields']: + col = spec['name'] + if spec['type'] == 'integer': + df[col] = df[col].astype('int64') + elif spec['type'] == 'number': + df[col] = df[col].astype('float64') + elif spec['type'] == 'datetime': + df[col] = pd.to_datetime(df[col], format='iso8601') + elif spec['type'] == 'date': + df[col] = pd.to_datetime(df[col], format='iso8601') + elif spec['type'] == 'time': + df[col] = pd.to_datetime(df[col], format='mixed').dt.time + elif spec['type'] == 'duration': + df[col] = pd.to_timedelta(df[col]) + + # 3. set index + if len(header['index']) > 0: + df = df.set_index(header['index'], drop=False) + + # 4. add metadata to columns + for spec in header['fields']: + df[spec['name']].attrs.update(spec) + + # 5. add metadata to table + attrs = dict(title=header['title'], description=header['description']) + df.attrs.update(attrs) + + return df + + +@plugin.register_transformer +def df_to_table_jsonl(obj: pd.DataFrame) -> TableJSONLFileFormat: + header = table_jsonl_header(obj) + + ff = TableJSONLFileFormat() + with ff.open() as fh: + fh.write(header) + fh.write('\n') + if not obj.empty: + obj.to_json(fh, orient='records', lines=True, date_format='iso') + + return ff + + +@plugin.register_transformer +def _1(obj: pd.DataFrame) -> NDJSONFileFormat: + ff = NDJSONFileFormat() + obj.to_json(str(ff), lines=True, orient='records') + return ff + + +@plugin.register_transformer +def _2(obj: DataResourceSchemaFileFormat) -> fls.Resource: + return fls.Resource(str(obj)) + + +@plugin.register_transformer +def _3(df: TabularDataResourceDirFmt) -> pd.DataFrame: + path = df.data.view(NDJSONFileFormat) + data = pd.read_json(str(path), lines=True) + resource = df.metadata.view(fls.Resource) + + if data.empty: + data = pd.DataFrame( + columns=[c.name for c in resource.schema.fields]) + + for field in resource.schema.fields: + data[field.name].attrs = field.to_dict() + + return data + + +@plugin.register_transformer +def _4(obj: pd.DataFrame) -> 
TabularDataResourceDirFmt: + metadata_obj = [] + + for col in obj.columns: + series = obj[col] + dtype = series.convert_dtypes().dtype + metadata = series.attrs.copy() + + if pd.api.types.is_float_dtype(dtype): + schema_dtype = 'number' + elif pd.api.types.is_integer_dtype(dtype): + schema_dtype = 'integer' + else: + schema_dtype = 'string' + + metadata['name'] = col + metadata['type'] = schema_dtype + + metadata_obj.append(metadata) + + metadata_dict = {'schema': {'fields': metadata_obj}, **obj.attrs} + metadata_dict['format'] = 'ndjson' + metadata_dict['path'] = 'data.ndjson' + metadata_dict['name'] = 'data' + + dir_fmt = TabularDataResourceDirFmt() + + dir_fmt.data.write_data(obj, pd.DataFrame) + with open(dir_fmt.path / 'dataresource.json', 'w') as fh: + fh.write(json.dumps(metadata_dict, indent=4)) + + return dir_fmt diff --git a/q2_types/tabular/_deferred_setup/_validators.py b/q2_types/tabular/_deferred_setup/_validators.py new file mode 100644 index 00000000..6817722c --- /dev/null +++ b/q2_types/tabular/_deferred_setup/_validators.py @@ -0,0 +1,46 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2022-2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + +import pandas as pd + +from qiime2.plugin import ValidationError +from .. 
import (Dist1D, Ordered, Unordered, NestedOrdered, + NestedUnordered, Matched, Independent) +from ...plugin_setup import plugin + + +@plugin.register_validator(Dist1D[Ordered | Unordered, + Matched | Independent]) +def validate_all_dist_columns_present(data: pd.DataFrame, level): + req_cols = ['id', 'measure', 'group'] + for col in req_cols: + if col not in data.columns: + raise ValidationError(f'"{col}" not found in distribution.') + + +@plugin.register_validator(Dist1D[Ordered | Unordered, Matched]) +def validate_unique_subjects_within_group(data: pd.DataFrame, level): + if 'subject' not in data.columns: + raise ValidationError('"subject" not found in distribution.') + + for group_id, group_df in data.groupby('group'): + if group_df['subject'].duplicated().any(): + dupes = list(group_df['subject'][group_df['subject'].duplicated()]) + raise ValidationError( + 'Unique subject found more than once within an individual' + ' group. Group(s) where duplicated subject was found:' + f' [{group_id}] Duplicated subjects: {dupes}') + + +@plugin.register_validator(Dist1D[NestedOrdered | NestedUnordered, + Matched | Independent]) +def validate_all_nesteddist_columns_present(data: pd.DataFrame, level): + req_cols = ['id', 'measure', 'group', 'class', "level"] + for col in req_cols: + if col not in data.columns: + raise ValidationError(f'"{col}" not found in distribution.') diff --git a/q2_types/tabular/formats.py b/q2_types/tabular/formats.py new file mode 100644 index 00000000..6c14bddb --- /dev/null +++ b/q2_types/tabular/formats.py @@ -0,0 +1,49 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2024, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. 
+# ---------------------------------------------------------------------------- + +from qiime2.plugin import ValidationError, model + +from frictionless import validate + + +class TableJSONLFileFormat(model.TextFileFormat): + def _validate_(self, level): + with self.open() as fh: + assert fh.read(33)[:33] == '{"doctype":{"name":"table.jsonl",' + + +TableJSONLDirFmt = model.SingleFileDirectoryFormat( + 'TableJSONLDirFmt', 'data.table.jsonl', TableJSONLFileFormat) + + +class NDJSONFileFormat(model.TextFileFormat): + """Format for newline-delimited (ND) JSON file.""" + def _validate_(self, level): + pass + + +class DataResourceSchemaFileFormat(model.TextFileFormat): + """ + Format for data resource schema. + """ + def _validate_(self, level): + pass + + +class TabularDataResourceDirFmt(model.DirectoryFormat): + data = model.File('data.ndjson', format=NDJSONFileFormat) + metadata = model.File('dataresource.json', + format=DataResourceSchemaFileFormat) + + def _validate_(self, level='min'): + try: + validate(str(self.path/'dataresource.json')) + except ValidationError: + raise model.ValidationError( + 'The dataresource does not completely describe' + ' the data.ndjson file') diff --git a/q2_types/tabular/tests/__init__.py b/q2_types/tabular/tests/__init__.py new file mode 100644 index 00000000..688e933e --- /dev/null +++ b/q2_types/tabular/tests/__init__.py @@ -0,0 +1,7 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2024, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. 
+# ---------------------------------------------------------------------------- diff --git a/q2_types/tabular/tests/data/empty_data_dist.table.jsonl b/q2_types/tabular/tests/data/empty_data_dist.table.jsonl new file mode 100644 index 00000000..20f99054 --- /dev/null +++ b/q2_types/tabular/tests/data/empty_data_dist.table.jsonl @@ -0,0 +1 @@ +{"doctype":{"name":"table.jsonl","format":"application/x-json-lines","version":"1.0"},"direction":"row","style":"key:value","fields":[{"name":"id","type":"string","missing":false,"title":"id","description":"...","extra":{"name":"id"}},{"name":"measure","type":"number","missing":false,"title":"faith_pd","description":"...","extra":{"name":"measure"}},{"name":"group","type":"integer","missing":false,"title":"week","description":"...","extra":{"name":"group"}},{"name":"subject","type":"string","missing":false,"title":"SubjectID","description":"...","extra":{"name":"subject"}}],"index":[],"title":"","description":"","extra":{}} diff --git a/q2_types/tabular/tests/data/empty_data_dist/data.ndjson b/q2_types/tabular/tests/data/empty_data_dist/data.ndjson new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/tabular/tests/data/empty_data_dist/dataresource.json b/q2_types/tabular/tests/data/empty_data_dist/dataresource.json new file mode 100644 index 00000000..dca00995 --- /dev/null +++ b/q2_types/tabular/tests/data/empty_data_dist/dataresource.json @@ -0,0 +1,33 @@ +{ + "schema": { + "fields": [ + { + "title": "id", + "description": "...", + "name": "id", + "type": "string" + }, + { + "title": "faith_pd", + "description": "...", + "name": "measure", + "type": "number" + }, + { + "title": "week", + "description": "...", + "name": "group", + "type": "integer" + }, + { + "title": "SubjectID", + "description": "...", + "name": "subject", + "type": "string" + } + ] + }, + "format": "ndjson", + "path": "data.ndjson", + "name": "data" +} diff --git a/q2_types/tabular/tests/data/faithpd_refdist.table.jsonl 
b/q2_types/tabular/tests/data/faithpd_refdist.table.jsonl new file mode 100644 index 00000000..4c014acb --- /dev/null +++ b/q2_types/tabular/tests/data/faithpd_refdist.table.jsonl @@ -0,0 +1,29 @@ +{"doctype":{"name":"table.jsonl","format":"application/x-json-lines","version":"1.0"},"direction":"row","style":"key:value","fields":[{"name":"id","type":"string","missing":false,"title":"id","description":"...","extra":{"name":"id"}},{"name":"measure","type":"number","missing":false,"title":"faith_pd","description":"faith_pd","extra":{"name":"measure"}},{"name":"group","type":"string","missing":false,"title":"InitialDonorSampleID","description":"...","extra":{"name":"group"}}],"index":[],"title":"","description":"","extra":{}} +{"id":"S167511","measure":10.24883918,"group":"reference"} +{"id":"S396444","measure":8.379513518,"group":"reference"} +{"id":"S575788","measure":10.37997086,"group":"reference"} +{"id":"S94321","measure":9.263042352,"group":"reference"} +{"id":"S952974","measure":10.55198028,"group":"reference"} +{"id":"S953010","measure":12.138687524,"group":"control"} +{"id":"S358668","measure":12.402660535,"group":"control"} +{"id":"S842739","measure":12.771875893,"group":"control"} +{"id":"S691441","measure":8.145593314,"group":"control"} +{"id":"S427307","measure":12.967708926,"group":"control"} +{"id":"S238639","measure":11.649627363,"group":"control"} +{"id":"S611569","measure":7.989246996,"group":"control"} +{"id":"S298842","measure":9.500026592,"group":"control"} +{"id":"S160074","measure":14.460895127,"group":"control"} +{"id":"S211105","measure":11.767674432,"group":"control"} +{"id":"S460372","measure":9.547484785,"group":"control"} +{"id":"S728037","measure":9.597018329,"group":"control"} +{"id":"S282059","measure":9.282152505,"group":"control"} +{"id":"S388055","measure":10.279025972,"group":"control"} +{"id":"S630647","measure":12.086278795,"group":"control"} +{"id":"S59807","measure":11.167537754,"group":"control"} 
+{"id":"S255688","measure":10.029524157,"group":"control"} +{"id":"S889561","measure":4.465584696,"group":"control"} +{"id":"S990074","measure":5.138442046,"group":"control"} +{"id":"S269036","measure":13.110583296,"group":"control"} +{"id":"S684586","measure":12.705425981,"group":"control"} +{"id":"S696581","measure":15.902021065,"group":"control"} +{"id":"S322808","measure":13.960754461,"group":"control"} diff --git a/q2_types/tabular/tests/data/faithpd_refdist/data.ndjson b/q2_types/tabular/tests/data/faithpd_refdist/data.ndjson new file mode 100644 index 00000000..f38a2633 --- /dev/null +++ b/q2_types/tabular/tests/data/faithpd_refdist/data.ndjson @@ -0,0 +1,28 @@ +{"id":"S167511","measure":10.24883918,"group":"reference"} +{"id":"S396444","measure":8.379513518,"group":"reference"} +{"id":"S575788","measure":10.37997086,"group":"reference"} +{"id":"S94321","measure":9.263042352,"group":"reference"} +{"id":"S952974","measure":10.55198028,"group":"reference"} +{"id":"S953010","measure":12.138687523999998,"group":"control"} +{"id":"S358668","measure":12.402660535,"group":"control"} +{"id":"S842739","measure":12.771875893,"group":"control"} +{"id":"S691441","measure":8.145593314,"group":"control"} +{"id":"S427307","measure":12.967708926,"group":"control"} +{"id":"S238639","measure":11.649627363,"group":"control"} +{"id":"S611569","measure":7.989246995999999,"group":"control"} +{"id":"S298842","measure":9.500026592,"group":"control"} +{"id":"S160074","measure":14.460895127,"group":"control"} +{"id":"S211105","measure":11.767674432,"group":"control"} +{"id":"S460372","measure":9.547484785000002,"group":"control"} +{"id":"S728037","measure":9.597018329,"group":"control"} +{"id":"S282059","measure":9.282152505,"group":"control"} +{"id":"S388055","measure":10.279025972,"group":"control"} +{"id":"S630647","measure":12.086278795,"group":"control"} +{"id":"S59807","measure":11.167537754,"group":"control"} +{"id":"S255688","measure":10.029524157,"group":"control"} 
+{"id":"S889561","measure":4.4655846960000005,"group":"control"} +{"id":"S990074","measure":5.138442046,"group":"control"} +{"id":"S269036","measure":13.110583296,"group":"control"} +{"id":"S684586","measure":12.705425981,"group":"control"} +{"id":"S696581","measure":15.902021065,"group":"control"} +{"id":"S322808","measure":13.960754461,"group":"control"} diff --git a/q2_types/tabular/tests/data/faithpd_refdist/dataresource.json b/q2_types/tabular/tests/data/faithpd_refdist/dataresource.json new file mode 100644 index 00000000..c895a008 --- /dev/null +++ b/q2_types/tabular/tests/data/faithpd_refdist/dataresource.json @@ -0,0 +1,27 @@ +{ + "schema": { + "fields": [ + { + "title": "id", + "description": "...", + "name": "id", + "type": "string" + }, + { + "title": "faith_pd", + "description": "faith_pd", + "name": "measure", + "type": "number" + }, + { + "title": "InitialDonorSampleID", + "description": "...", + "name": "group", + "type": "string" + } + ] + }, + "format": "ndjson", + "path": "data.ndjson", + "name": "data" +} diff --git a/q2_types/tabular/tests/data/faithpd_timedist.table.jsonl b/q2_types/tabular/tests/data/faithpd_timedist.table.jsonl new file mode 100644 index 00000000..1c77d033 --- /dev/null +++ b/q2_types/tabular/tests/data/faithpd_timedist.table.jsonl @@ -0,0 +1,88 @@ +{"doctype":{"name":"table.jsonl","format":"application/x-json-lines","version":"1.0"},"direction":"row","style":"key:value","fields":[{"name":"id","type":"string","missing":false,"title":"id","description":"...","extra":{"name":"id"}},{"name":"measure","type":"number","missing":false,"title":"faith_pd","description":"...","extra":{"name":"measure"}},{"name":"group","type":"integer","missing":false,"title":"week","description":"...","extra":{"name":"group"}},{"name":"subject","type":"string","missing":false,"title":"SubjectID","description":"...","extra":{"name":"subject"}}],"index":[],"title":"","description":"","extra":{}} 
+{"id":"S76237","measure":14.95236844,"group":100,"subject":"P266"} +{"id":"S693625","measure":11.68795212,"group":18,"subject":"P266"} +{"id":"S219379","measure":7.662921088,"group":0,"subject":"P266"} +{"id":"S713801","measure":10.83320209,"group":3,"subject":"P266"} +{"id":"S20609","measure":10.83670704,"group":10,"subject":"P266"} +{"id":"S560202","measure":10.65262785,"group":18,"subject":"P580"} +{"id":"S189125","measure":10.26127503,"group":100,"subject":"P580"} +{"id":"S662977","measure":9.8956928,"group":3,"subject":"P580"} +{"id":"S598645","measure":9.172597583,"group":10,"subject":"P580"} +{"id":"S64328","measure":8.431734297,"group":0,"subject":"P580"} +{"id":"S389756","measure":12.24557912,"group":100,"subject":"P222"} +{"id":"S537986","measure":11.86970243,"group":3,"subject":"P222"} +{"id":"S168217","measure":11.60664407,"group":10,"subject":"P222"} +{"id":"S771714","measure":10.94184424,"group":18,"subject":"P222"} +{"id":"S674625","measure":8.513263823,"group":0,"subject":"P222"} +{"id":"S799849","measure":10.78345691,"group":10,"subject":"P675"} +{"id":"S308785","measure":19.96358026,"group":100,"subject":"P675"} +{"id":"S883804","measure":11.47176562,"group":0,"subject":"P675"} +{"id":"S678730","measure":10.12497616,"group":3,"subject":"P675"} +{"id":"S160537","measure":13.26471277,"group":18,"subject":"P675"} +{"id":"S928726","measure":11.48168273,"group":10,"subject":"P565"} +{"id":"S690628","measure":10.82326355,"group":100,"subject":"P565"} +{"id":"S747669","measure":9.478838334,"group":3,"subject":"P565"} +{"id":"S391776","measure":11.69071803,"group":18,"subject":"P565"} +{"id":"S686571","measure":9.914657833,"group":0,"subject":"P565"} +{"id":"S714159","measure":12.95731892,"group":100,"subject":"P702"} +{"id":"S11568","measure":8.502002897,"group":3,"subject":"P702"} +{"id":"S822582","measure":14.96844078,"group":18,"subject":"P702"} +{"id":"S157377","measure":17.43370405,"group":10,"subject":"P702"} 
+{"id":"S370574","measure":12.62560928,"group":0,"subject":"P702"} +{"id":"S871089","measure":14.46464274,"group":100,"subject":"P689"} +{"id":"S186877","measure":10.49866422,"group":18,"subject":"P689"} +{"id":"S272596","measure":9.246871802,"group":3,"subject":"P689"} +{"id":"S623283","measure":15.2623358,"group":10,"subject":"P689"} +{"id":"S775390","measure":9.453417415,"group":0,"subject":"P689"} +{"id":"S697934","measure":9.646052305,"group":0,"subject":"P397"} +{"id":"S525055","measure":13.53128149,"group":10,"subject":"P397"} +{"id":"S488007","measure":14.68263437,"group":100,"subject":"P397"} +{"id":"S500707","measure":12.62000749,"group":3,"subject":"P397"} +{"id":"S350626","measure":11.09989491,"group":18,"subject":"P397"} +{"id":"S667896","measure":10.31360211,"group":18,"subject":"P69"} +{"id":"S335960","measure":15.96814451,"group":10,"subject":"P69"} +{"id":"S707922","measure":7.778748399,"group":0,"subject":"P69"} +{"id":"S727887","measure":15.48569583,"group":100,"subject":"P480"} +{"id":"S853463","measure":10.50579797,"group":3,"subject":"P480"} +{"id":"S998321","measure":11.76570288,"group":18,"subject":"P480"} +{"id":"S798646","measure":13.61043486,"group":10,"subject":"P480"} +{"id":"S31967","measure":12.68877984,"group":0,"subject":"P480"} +{"id":"S352843","measure":16.0492848,"group":100,"subject":"P26"} +{"id":"S489042","measure":10.62683459,"group":10,"subject":"P26"} +{"id":"S116625","measure":11.79632603,"group":0,"subject":"P26"} +{"id":"S25508","measure":7.830091789,"group":3,"subject":"P26"} +{"id":"S389665","measure":13.66553354,"group":18,"subject":"P26"} +{"id":"S777504","measure":10.87510264,"group":100,"subject":"P72"} +{"id":"S844011","measure":8.598030851,"group":18,"subject":"P72"} +{"id":"S745584","measure":7.444009936,"group":10,"subject":"P72"} +{"id":"S346600","measure":6.725748683,"group":3,"subject":"P72"} +{"id":"S714926","measure":7.386190882,"group":0,"subject":"P72"} 
+{"id":"S128715","measure":8.866448342,"group":100,"subject":"P254"} +{"id":"S174270","measure":9.728676749,"group":10,"subject":"P254"} +{"id":"S974982","measure":11.78182748,"group":18,"subject":"P254"} +{"id":"S539974","measure":11.98183356,"group":3,"subject":"P254"} +{"id":"S203181","measure":11.19719621,"group":0,"subject":"P254"} +{"id":"S791754","measure":12.43679596,"group":3,"subject":"P713"} +{"id":"S725002","measure":12.77874569,"group":10,"subject":"P713"} +{"id":"S944586","measure":10.18787122,"group":0,"subject":"P713"} +{"id":"S517303","measure":12.20070358,"group":18,"subject":"P713"} +{"id":"S102980","measure":9.750043462,"group":18,"subject":"P931"} +{"id":"S716254","measure":13.57054725,"group":100,"subject":"P931"} +{"id":"S595428","measure":10.65211873,"group":10,"subject":"P931"} +{"id":"S347599","measure":8.448317067,"group":0,"subject":"P931"} +{"id":"S560594","measure":9.592979726,"group":3,"subject":"P931"} +{"id":"S851838","measure":12.36040446,"group":100,"subject":"P170"} +{"id":"S171189","measure":8.880897012,"group":3,"subject":"P170"} +{"id":"S986392","measure":11.12683676,"group":10,"subject":"P170"} +{"id":"S119501","measure":11.3228199,"group":0,"subject":"P170"} +{"id":"S581897","measure":10.8908099,"group":18,"subject":"P170"} +{"id":"S181906","measure":9.092078686,"group":3,"subject":"P529"} +{"id":"S850352","measure":11.07684349,"group":100,"subject":"P529"} +{"id":"S291324","measure":9.027651356,"group":10,"subject":"P529"} +{"id":"S627638","measure":8.990062701,"group":18,"subject":"P529"} +{"id":"S79536","measure":7.314753809,"group":0,"subject":"P529"} +{"id":"S427098","measure":9.059711997,"group":10,"subject":"P299"} +{"id":"S171037","measure":12.98841452,"group":100,"subject":"P299"} +{"id":"S772179","measure":14.73197158,"group":18,"subject":"P299"} +{"id":"S141248","measure":8.27086375,"group":0,"subject":"P299"} +{"id":"S698747","measure":8.522314493,"group":3,"subject":"P299"} diff --git 
a/q2_types/tabular/tests/data/faithpd_timedist/data.ndjson b/q2_types/tabular/tests/data/faithpd_timedist/data.ndjson new file mode 100644 index 00000000..208b78a9 --- /dev/null +++ b/q2_types/tabular/tests/data/faithpd_timedist/data.ndjson @@ -0,0 +1,87 @@ +{"id":"S76237","measure":14.95236844,"group":100.0,"subject":"P266"} +{"id":"S693625","measure":11.68795212,"group":18.0,"subject":"P266"} +{"id":"S219379","measure":7.662921088,"group":0.0,"subject":"P266"} +{"id":"S713801","measure":10.83320209,"group":3.0,"subject":"P266"} +{"id":"S20609","measure":10.83670704,"group":10.0,"subject":"P266"} +{"id":"S560202","measure":10.65262785,"group":18.0,"subject":"P580"} +{"id":"S189125","measure":10.26127503,"group":100.0,"subject":"P580"} +{"id":"S662977","measure":9.8956928,"group":3.0,"subject":"P580"} +{"id":"S598645","measure":9.172597583,"group":10.0,"subject":"P580"} +{"id":"S64328","measure":8.431734297,"group":0.0,"subject":"P580"} +{"id":"S389756","measure":12.24557912,"group":100.0,"subject":"P222"} +{"id":"S537986","measure":11.86970243,"group":3.0,"subject":"P222"} +{"id":"S168217","measure":11.60664407,"group":10.0,"subject":"P222"} +{"id":"S771714","measure":10.94184424,"group":18.0,"subject":"P222"} +{"id":"S674625","measure":8.513263823,"group":0.0,"subject":"P222"} +{"id":"S799849","measure":10.78345691,"group":10.0,"subject":"P675"} +{"id":"S308785","measure":19.96358026,"group":100.0,"subject":"P675"} +{"id":"S883804","measure":11.47176562,"group":0.0,"subject":"P675"} +{"id":"S678730","measure":10.12497616,"group":3.0,"subject":"P675"} +{"id":"S160537","measure":13.26471277,"group":18.0,"subject":"P675"} +{"id":"S928726","measure":11.48168273,"group":10.0,"subject":"P565"} +{"id":"S690628","measure":10.82326355,"group":100.0,"subject":"P565"} +{"id":"S747669","measure":9.478838334,"group":3.0,"subject":"P565"} +{"id":"S391776","measure":11.69071803,"group":18.0,"subject":"P565"} +{"id":"S686571","measure":9.914657833,"group":0.0,"subject":"P565"} 
+{"id":"S714159","measure":12.95731892,"group":100.0,"subject":"P702"} +{"id":"S11568","measure":8.502002897,"group":3.0,"subject":"P702"} +{"id":"S822582","measure":14.96844078,"group":18.0,"subject":"P702"} +{"id":"S157377","measure":17.43370405,"group":10.0,"subject":"P702"} +{"id":"S370574","measure":12.62560928,"group":0.0,"subject":"P702"} +{"id":"S871089","measure":14.46464274,"group":100.0,"subject":"P689"} +{"id":"S186877","measure":10.49866422,"group":18.0,"subject":"P689"} +{"id":"S272596","measure":9.246871802,"group":3.0,"subject":"P689"} +{"id":"S623283","measure":15.2623358,"group":10.0,"subject":"P689"} +{"id":"S775390","measure":9.453417415,"group":0.0,"subject":"P689"} +{"id":"S697934","measure":9.646052305,"group":0.0,"subject":"P397"} +{"id":"S525055","measure":13.53128149,"group":10.0,"subject":"P397"} +{"id":"S488007","measure":14.68263437,"group":100.0,"subject":"P397"} +{"id":"S500707","measure":12.62000749,"group":3.0,"subject":"P397"} +{"id":"S350626","measure":11.09989491,"group":18.0,"subject":"P397"} +{"id":"S667896","measure":10.31360211,"group":18.0,"subject":"P69"} +{"id":"S335960","measure":15.96814451,"group":10.0,"subject":"P69"} +{"id":"S707922","measure":7.778748399,"group":0.0,"subject":"P69"} +{"id":"S727887","measure":15.48569583,"group":100.0,"subject":"P480"} +{"id":"S853463","measure":10.50579797,"group":3.0,"subject":"P480"} +{"id":"S998321","measure":11.76570288,"group":18.0,"subject":"P480"} +{"id":"S798646","measure":13.61043486,"group":10.0,"subject":"P480"} +{"id":"S31967","measure":12.68877984,"group":0.0,"subject":"P480"} +{"id":"S352843","measure":16.0492848,"group":100.0,"subject":"P26"} +{"id":"S489042","measure":10.62683459,"group":10.0,"subject":"P26"} +{"id":"S116625","measure":11.79632603,"group":0.0,"subject":"P26"} +{"id":"S25508","measure":7.830091789,"group":3.0,"subject":"P26"} +{"id":"S389665","measure":13.66553354,"group":18.0,"subject":"P26"} 
+{"id":"S777504","measure":10.87510264,"group":100.0,"subject":"P72"} +{"id":"S844011","measure":8.598030851,"group":18.0,"subject":"P72"} +{"id":"S745584","measure":7.444009936,"group":10.0,"subject":"P72"} +{"id":"S346600","measure":6.725748683,"group":3.0,"subject":"P72"} +{"id":"S714926","measure":7.386190882,"group":0.0,"subject":"P72"} +{"id":"S128715","measure":8.866448342,"group":100.0,"subject":"P254"} +{"id":"S174270","measure":9.728676749,"group":10.0,"subject":"P254"} +{"id":"S974982","measure":11.78182748,"group":18.0,"subject":"P254"} +{"id":"S539974","measure":11.98183356,"group":3.0,"subject":"P254"} +{"id":"S203181","measure":11.19719621,"group":0.0,"subject":"P254"} +{"id":"S791754","measure":12.43679596,"group":3.0,"subject":"P713"} +{"id":"S725002","measure":12.77874569,"group":10.0,"subject":"P713"} +{"id":"S944586","measure":10.18787122,"group":0.0,"subject":"P713"} +{"id":"S517303","measure":12.20070358,"group":18.0,"subject":"P713"} +{"id":"S102980","measure":9.750043462,"group":18.0,"subject":"P931"} +{"id":"S716254","measure":13.57054725,"group":100.0,"subject":"P931"} +{"id":"S595428","measure":10.65211873,"group":10.0,"subject":"P931"} +{"id":"S347599","measure":8.448317067,"group":0.0,"subject":"P931"} +{"id":"S560594","measure":9.592979726,"group":3.0,"subject":"P931"} +{"id":"S851838","measure":12.36040446,"group":100.0,"subject":"P170"} +{"id":"S171189","measure":8.880897012,"group":3.0,"subject":"P170"} +{"id":"S986392","measure":11.12683676,"group":10.0,"subject":"P170"} +{"id":"S119501","measure":11.3228199,"group":0.0,"subject":"P170"} +{"id":"S581897","measure":10.8908099,"group":18.0,"subject":"P170"} +{"id":"S181906","measure":9.092078686,"group":3.0,"subject":"P529"} +{"id":"S850352","measure":11.07684349,"group":100.0,"subject":"P529"} +{"id":"S291324","measure":9.027651356,"group":10.0,"subject":"P529"} +{"id":"S627638","measure":8.990062701,"group":18.0,"subject":"P529"} 
+{"id":"S79536","measure":7.314753809,"group":0.0,"subject":"P529"} +{"id":"S427098","measure":9.059711997,"group":10.0,"subject":"P299"} +{"id":"S171037","measure":12.98841452,"group":100.0,"subject":"P299"} +{"id":"S772179","measure":14.73197158,"group":18.0,"subject":"P299"} +{"id":"S141248","measure":8.27086375,"group":0.0,"subject":"P299"} +{"id":"S698747","measure":8.522314493,"group":3.0,"subject":"P299"} diff --git a/q2_types/tabular/tests/data/faithpd_timedist/dataresource.json b/q2_types/tabular/tests/data/faithpd_timedist/dataresource.json new file mode 100644 index 00000000..dca00995 --- /dev/null +++ b/q2_types/tabular/tests/data/faithpd_timedist/dataresource.json @@ -0,0 +1,33 @@ +{ + "schema": { + "fields": [ + { + "title": "id", + "description": "...", + "name": "id", + "type": "string" + }, + { + "title": "faith_pd", + "description": "...", + "name": "measure", + "type": "number" + }, + { + "title": "week", + "description": "...", + "name": "group", + "type": "integer" + }, + { + "title": "SubjectID", + "description": "...", + "name": "subject", + "type": "string" + } + ] + }, + "format": "ndjson", + "path": "data.ndjson", + "name": "data" +} diff --git a/q2_types/tabular/tests/test_transformers.py b/q2_types/tabular/tests/test_transformers.py new file mode 100644 index 00000000..5a0e4c1b --- /dev/null +++ b/q2_types/tabular/tests/test_transformers.py @@ -0,0 +1,63 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2024, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. 
+# ---------------------------------------------------------------------------- + +import pandas as pd + +from qiime2.plugin.testing import TestPluginBase +from qiime2.plugin.util import transform + +from q2_types.tabular.formats import ( + TabularDataResourceDirFmt, TableJSONLFileFormat, +) + + +class TestTransformers(TestPluginBase): + package = 'q2_types.tabular.tests' + + def test_empty_tabular_data_resource_to_dataframe(self): + _, obs = self.transform_format(TabularDataResourceDirFmt, + pd.DataFrame, + filename='empty_data_dist') + + exp = pd.DataFrame(columns=['id', 'measure', 'group', 'subject']) + + pd.testing.assert_frame_equal(obs, exp, check_dtype=False) + + def test_empty_table_jsonl_to_dataframe(self): + _, obs = self.transform_format(TableJSONLFileFormat, + pd.DataFrame, + filename='empty_data_dist.table.jsonl') + + exp = pd.DataFrame(columns=['id', 'measure', 'group', 'subject']) + + pd.testing.assert_frame_equal(obs, exp, check_dtype=False) + + def _assert_jsonl_roundtrip(self, path): + exp, df = self.transform_format(TableJSONLFileFormat, + pd.DataFrame, + filename=path) + res = transform(df, to_type=TableJSONLFileFormat) + + exp.validate() + res.validate() + + with exp.open() as fh: + expected = fh.read() + with res.open() as fh: + result = fh.read() + + self.assertEqual(result, expected) + + def test_jsonl_roundtrip_empty(self): + self._assert_jsonl_roundtrip('empty_data_dist.table.jsonl') + + def test_jsonl_roundtrip_refdist(self): + self._assert_jsonl_roundtrip('faithpd_refdist.table.jsonl') + + def test_jsonl_roundtrip_timedist(self): + self._assert_jsonl_roundtrip('faithpd_timedist.table.jsonl') diff --git a/q2_types/tabular/tests/test_validators.py b/q2_types/tabular/tests/test_validators.py new file mode 100644 index 00000000..1e16016b --- /dev/null +++ b/q2_types/tabular/tests/test_validators.py @@ -0,0 +1,41 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2024, QIIME 2 development team. 
+# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + +import unittest + +import pandas as pd + +from qiime2.plugin import ValidationError + +from q2_types.tabular._deferred_setup._validators import ( + validate_all_dist_columns_present, + validate_unique_subjects_within_group, +) + + +class TestValidators(unittest.TestCase): + def test_validators_missing_columns_in_dist(self): + with self.assertRaisesRegex(ValidationError, '"group" not found' + ' in distribution.'): + df = pd.DataFrame({ + 'id': ['S340445', 'S892825', 'S460691'], + 'measure': [7.662921088, 8.431734297, 8.513263823] + }) + validate_all_dist_columns_present(df, level='min') + + def test_validators_unique_subjects_not_duplicated_per_group(self): + with self.assertRaisesRegex(ValidationError, 'Unique subject found' + ' more than once within an individual' + ' group.*0.*P26'): + df = pd.DataFrame({ + 'id': ['S116625', 'S813956'], + 'measure': [7.662921088, 8.431734297], + 'group': [0, 0], + 'subject': ['P26', 'P26'] + }) + validate_unique_subjects_within_group(df, level='min') diff --git a/q2_types/tabular/types.py b/q2_types/tabular/types.py new file mode 100644 index 00000000..49083ab6 --- /dev/null +++ b/q2_types/tabular/types.py @@ -0,0 +1,29 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2024, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. 
+# ---------------------------------------------------------------------------- + +from qiime2.plugin import SemanticType + +StatsTable = SemanticType('StatsTable', field_names=['kind']) + +Pairwise = SemanticType('Pairwise', variant_of=StatsTable.field['kind']) +Global = SemanticType('Global', variant_of=StatsTable.field['kind']) + +Dist1D = SemanticType('Dist1D', field_names=['order', 'dependence']) + +Ordered = SemanticType('Ordered', variant_of=(Dist1D.field['order'])) +Unordered = SemanticType('Unordered', variant_of=(Dist1D.field['order'])) +Multi = SemanticType('Multi', variant_of=Dist1D.field['order']) +NestedOrdered = SemanticType('NestedOrdered', + variant_of=(Dist1D.field['order'])) +NestedUnordered = SemanticType('NestedUnordered', + variant_of=(Dist1D.field['order'])) + +Matched = SemanticType('Matched', + variant_of=(Dist1D.field['dependence'])) +Independent = SemanticType('Independent', + variant_of=(Dist1D.field['dependence'])) diff --git a/setup.py b/setup.py index a71fae39..31a676c6 100644 --- a/setup.py +++ b/setup.py @@ -120,7 +120,11 @@ 'q2_types.reference_db.tests': ['data/*', 'data/*/*', 'data/*/*/*'], 'q2_types.profile_hmms.tests': - ['data/*', 'data/*/*'] + ['data/*', 'data/*/*'], + 'q2_types.tabular.tests': [ + 'data/*', 'data/faithpd_timedist/*', 'data/faithpd_refdist/*', + 'data/empty_data_dist/*' + ], }, zip_safe=False, ) From 6926e7f14f352f2273ce33cbbec09b6c2c135860 Mon Sep 17 00:00:00 2001 From: Colin Wood Date: Mon, 23 Dec 2024 12:17:31 -0700 Subject: [PATCH 2/3] lint --- q2_types/tabular/tests/test_validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/q2_types/tabular/tests/test_validators.py b/q2_types/tabular/tests/test_validators.py index 1e16016b..32531a23 100644 --- a/q2_types/tabular/tests/test_validators.py +++ b/q2_types/tabular/tests/test_validators.py @@ -12,7 +12,7 @@ from qiime2.plugin import ValidationError -from q2_types.tabular._deferred_setup._validators import ( +from 
q2_types.tabular._deferred_setup._validators import ( validate_all_dist_columns_present, validate_unique_subjects_within_group, ) From 17a74060ce90c1b6664135947255f5fc60fe5cca Mon Sep 17 00:00:00 2001 From: Colin Wood Date: Mon, 23 Dec 2024 12:27:18 -0700 Subject: [PATCH 3/3] frictionless dependency --- conda-recipe/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index da71c1ce..0bd7f612 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -23,6 +23,7 @@ requirements: - qiime2 {{ qiime2_epoch }}.* - samtools - pyhmmer + - frictionless<=5.5.0 build: - setuptools - versioningit