
Commit 43773d9

MAINT: Move tabular json types from q2-stats (#351)
1 parent ff058fb commit 43773d9

20 files changed, +800 -0 lines changed

conda-recipe/meta.yaml (+1)

@@ -23,6 +23,7 @@ requirements:
     - qiime2 {{ qiime2_epoch }}.*
     - samtools
     - pyhmmer
+    - frictionless<=5.5.0
   build:
     - python {{ python }}
     - setuptools

q2_types/plugin_setup.py (+1)

@@ -153,3 +153,4 @@
 importlib.import_module('q2_types.reference_db._deferred_setup')
 importlib.import_module('q2_types.sample_data._deferred_setup')
 importlib.import_module('q2_types.tree._deferred_setup')
+importlib.import_module('q2_types.tabular._deferred_setup')

q2_types/tabular/__init__.py (+20)

@@ -0,0 +1,20 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2024, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from .formats import (TableJSONLFileFormat, TableJSONLDirFmt,
                      NDJSONFileFormat, DataResourceSchemaFileFormat,
                      TabularDataResourceDirFmt)
from .types import (StatsTable, Pairwise, Dist1D, Ordered, Unordered,
                    NestedOrdered, NestedUnordered, Multi,
                    Matched, Independent)

__all__ = ['TableJSONLFileFormat', 'TableJSONLDirFmt',
           'NDJSONFileFormat', 'DataResourceSchemaFileFormat',
           'TabularDataResourceDirFmt', 'StatsTable', 'Pairwise',
           'Dist1D', 'Ordered', 'Unordered', 'NestedOrdered',
           'NestedUnordered', 'Multi', 'Matched', 'Independent']
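
The package __init__ above re-exports every tabular format and semantic type, so downstream code needs only a single import. A minimal sketch, assuming q2-types with this commit is installed:

# Names re-exported by q2_types/tabular/__init__.py.
from q2_types.tabular import Dist1D, Ordered, Matched, TableJSONLDirFmt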
@@ -0,0 +1,45 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2024, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import importlib

from .. import (NDJSONFileFormat,
                DataResourceSchemaFileFormat,
                TabularDataResourceDirFmt,
                TableJSONLFileFormat, TableJSONLDirFmt,
                StatsTable, Pairwise, Dist1D,
                Matched, Independent, Ordered, Unordered, Multi,
                NestedOrdered, NestedUnordered)

from ...plugin_setup import plugin

plugin.register_formats(NDJSONFileFormat, DataResourceSchemaFileFormat,
                        TabularDataResourceDirFmt)
plugin.register_formats(TableJSONLFileFormat, TableJSONLDirFmt)


plugin.register_semantic_types(StatsTable, Pairwise, Dist1D,
                               NestedOrdered, NestedUnordered, Matched,
                               Independent, Ordered, Unordered, Multi)

plugin.register_semantic_type_to_format(
    Dist1D[Ordered | Unordered | NestedOrdered | NestedUnordered | Multi,
           Matched | Independent] |
    StatsTable[Pairwise],
    TableJSONLDirFmt)

importlib.import_module('._transformers', __name__)
importlib.import_module('._validators', __name__)


__all__ = [
    'StatsTable', 'Pairwise', 'Dist1D', 'NestedOrdered', 'NestedUnordered',
    'Matched', 'Independent', 'Ordered', 'Unordered', 'Multi',
    'NDJSONFileFormat', 'DataResourceSchemaFileFormat',
    'TabularDataResourceDirFmt', 'TableJSONLFileFormat', 'TableJSONLDirFmt',
]
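
The register_semantic_type_to_format call above ties every Dist1D variant, as well as StatsTable[Pairwise], to TableJSONLDirFmt. A rough round-trip sketch, assuming an environment where q2-types with this commit is installed; the table values are invented:

import pandas as pd
import qiime2

# A toy matched, ordered distribution; 'id', 'measure', 'group' and 'subject'
# are the columns the validators registered for this type expect.
df = pd.DataFrame({'id': ['r1', 'r2'], 'measure': [0.1, 0.2],
                   'group': [1, 2], 'subject': ['a', 'b']})

# Importing routes through the DataFrame -> TableJSONLFileFormat transformer;
# the artifact's payload on disk is a data.table.jsonl file.
dist = qiime2.Artifact.import_data('Dist1D[Ordered, Matched]', df)
round_tripped = dist.view(pd.DataFrame)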
@@ -0,0 +1,172 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2024, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import pandas as pd
import frictionless as fls
import json

from ..formats import TableJSONLFileFormat

from .. import (NDJSONFileFormat,
                DataResourceSchemaFileFormat,
                TabularDataResourceDirFmt)

from ...plugin_setup import plugin


def table_jsonl_header(df: pd.DataFrame) -> str:
    header = {}
    header['doctype'] = dict(
        name='table.jsonl', format='application/x-json-lines', version='1.0')
    header['direction'] = 'row'
    header['style'] = 'key:value'

    fields = []
    for name in df.columns:
        attrs = df[name].attrs.copy()
        title = attrs.pop('title', '')
        description = attrs.pop('description', '')
        type = attrs.pop('type', None)
        missing = attrs.pop('missing', False)
        extra = attrs.pop('extra', None)
        if extra is None:
            extra = attrs
        fields.append(dict(
            name=name, type=type, missing=missing, title=title,
            description=description, extra=extra))

    header['fields'] = fields
    header['index'] = []
    header['title'] = df.attrs.get('title', '')
    header['description'] = df.attrs.get('description', '')
    header['extra'] = df.attrs.get('extra', {})

    # prevent whitespace after comma and colon
    return json.dumps(header, separators=(',', ':'))


@plugin.register_transformer
def table_jsonl_to_df(ff: TableJSONLFileFormat) -> pd.DataFrame:
    with ff.open() as fh:
        header = json.loads(next(fh))
        df = pd.read_json(fh, lines=True, orient='records')
        if df.empty:
            df = pd.DataFrame(columns=[
                spec['name'] for spec in header['fields']])

    # The order of these steps matters.

    # 1. set order of columns
    df = df[[spec['name'] for spec in header['fields']]]

    # 2. update types
    for spec in header['fields']:
        col = spec['name']
        if spec['type'] == 'integer':
            df[col] = df[col].astype('int64')
        elif spec['type'] == 'number':
            df[col] = df[col].astype('float64')
        elif spec['type'] == 'datetime':
            df[col] = pd.to_datetime(df[col], format='iso8601')
        elif spec['type'] == 'date':
            df[col] = pd.to_datetime(df[col], format='iso8601')
        elif spec['type'] == 'time':
            df[col] = pd.to_datetime(df[col], format='mixed').dt.time
        elif spec['type'] == 'duration':
            df[col] = pd.to_timedelta(df[col])

    # 3. set index
    if len(header['index']) > 0:
        df = df.set_index(header['index'], drop=False)

    # 4. add metadata to columns
    for spec in header['fields']:
        df[spec['name']].attrs.update(spec)

    # 5. add metadata to table
    attrs = dict(title=header['title'], description=header['description'])
    df.attrs.update(attrs)

    return df


@plugin.register_transformer
def df_to_table_jsonl(obj: pd.DataFrame) -> TableJSONLFileFormat:
    header = table_jsonl_header(obj)

    ff = TableJSONLFileFormat()
    with ff.open() as fh:
        fh.write(header)
        fh.write('\n')
        if not obj.empty:
            obj.to_json(fh, orient='records', lines=True, date_format='iso')

    return ff


@plugin.register_transformer
def _1(obj: pd.DataFrame) -> NDJSONFileFormat:
    ff = NDJSONFileFormat()
    obj.to_json(str(ff), lines=True, orient='records')
    return ff


@plugin.register_transformer
def _2(obj: DataResourceSchemaFileFormat) -> fls.Resource:
    return fls.Resource(str(obj))


@plugin.register_transformer
def _3(df: TabularDataResourceDirFmt) -> pd.DataFrame:
    path = df.data.view(NDJSONFileFormat)
    data = pd.read_json(str(path), lines=True)
    resource = df.metadata.view(fls.Resource)

    if data.empty:
        data = pd.DataFrame(
            columns=[c.name for c in resource.schema.fields])

    for field in resource.schema.fields:
        data[field.name].attrs = field.to_dict()

    return data


@plugin.register_transformer
def _4(obj: pd.DataFrame) -> TabularDataResourceDirFmt:
    metadata_obj = []

    for col in obj.columns:
        series = obj[col]
        dtype = series.convert_dtypes().dtype
        metadata = series.attrs.copy()

        if pd.api.types.is_float_dtype(dtype):
            schema_dtype = 'number'
        elif pd.api.types.is_integer_dtype(dtype):
            schema_dtype = 'integer'
        else:
            schema_dtype = 'string'

        metadata['name'] = col
        metadata['type'] = schema_dtype

        metadata_obj.append(metadata)

    metadata_dict = {'schema': {'fields': metadata_obj}, **obj.attrs}
    metadata_dict['format'] = 'ndjson'
    metadata_dict['path'] = 'data.ndjson'
    metadata_dict['name'] = 'data'

    dir_fmt = TabularDataResourceDirFmt()

    dir_fmt.data.write_data(obj, pd.DataFrame)
    with open(dir_fmt.path / 'dataresource.json', 'w') as fh:
        fh.write(json.dumps(metadata_dict, indent=4))

    return dir_fmt
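
To make the table.jsonl layout concrete: df_to_table_jsonl writes one compact header line followed by one JSON record per row. A self-contained sketch with plain pandas and json, no QIIME 2 machinery; the field list is abbreviated relative to what table_jsonl_header emits and the values are invented:

import json
import pandas as pd

# Toy table; the field metadata below is made up for illustration.
df = pd.DataFrame({'id': ['s1', 's2'], 'measure': [0.5, 0.7], 'group': [1, 2]})

header = {
    'doctype': {'name': 'table.jsonl',
                'format': 'application/x-json-lines', 'version': '1.0'},
    'direction': 'row',
    'style': 'key:value',
    'fields': [{'name': 'id', 'type': 'string'},
               {'name': 'measure', 'type': 'number'},
               {'name': 'group', 'type': 'integer'}],
    'index': [], 'title': '', 'description': '', 'extra': {},
}

with open('data.table.jsonl', 'w') as fh:
    # compact header line first, then newline-delimited records
    fh.write(json.dumps(header, separators=(',', ':')) + '\n')
    df.to_json(fh, orient='records', lines=True)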
@@ -0,0 +1,46 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2022-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import pandas as pd

from qiime2.plugin import ValidationError
from .. import (Dist1D, Ordered, Unordered, NestedOrdered,
                NestedUnordered, Matched, Independent)
from ...plugin_setup import plugin


@plugin.register_validator(Dist1D[Ordered | Unordered,
                                  Matched | Independent])
def validate_all_dist_columns_present(data: pd.DataFrame, level):
    req_cols = ['id', 'measure', 'group']
    for col in req_cols:
        if col not in data.columns:
            raise ValidationError(f'"{col}" not found in distribution.')


@plugin.register_validator(Dist1D[Ordered | Unordered, Matched])
def validate_unique_subjects_within_group(data: pd.DataFrame, level):
    if 'subject' not in data.columns:
        raise ValidationError('"subject" not found in distribution.')

    for group_id, group_df in data.groupby('group'):
        if group_df['subject'].duplicated().any():
            dupes = list(group_df['subject'][group_df['subject'].duplicated()])
            raise ValidationError(
                'Unique subject found more than once within an individual'
                ' group. Group(s) where duplicated subject was found:'
                f' [{group_id}] Duplicated subjects: {dupes}')


@plugin.register_validator(Dist1D[NestedOrdered | NestedUnordered,
                                  Matched | Independent])
def validate_all_nesteddist_columns_present(data: pd.DataFrame, level):
    req_cols = ['id', 'measure', 'group', 'class', "level"]
    for col in req_cols:
        if col not in data.columns:
            raise ValidationError(f'"{col}" not found in distribution.')
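
As a rough illustration of what validate_unique_subjects_within_group guards against, the following self-contained sketch (toy data, no plugin registration) builds a Dist1D-style table in which subject 'a' appears twice within group 1 and flags it with the same groupby/duplicated logic:

import pandas as pd

# Subject 'a' occurs twice within group 1, which the validator rejects.
data = pd.DataFrame({'id': ['r1', 'r2', 'r3'],
                     'measure': [0.1, 0.2, 0.3],
                     'group': [1, 1, 2],
                     'subject': ['a', 'a', 'b']})

for group_id, group_df in data.groupby('group'):
    dupes = group_df['subject'][group_df['subject'].duplicated()]
    if not dupes.empty:
        print(f'group {group_id}: duplicated subjects {list(dupes)}')
# prints: group 1: duplicated subjects ['a']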

q2_types/tabular/formats.py (+49)

@@ -0,0 +1,49 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2024, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from qiime2.plugin import ValidationError, model

from frictionless import validate


class TableJSONLFileFormat(model.TextFileFormat):
    def _validate_(self, level):
        with self.open() as fh:
            assert fh.read(33)[:33] == '{"doctype":{"name":"table.jsonl",'


TableJSONLDirFmt = model.SingleFileDirectoryFormat(
    'TableJSONLDirFmt', 'data.table.jsonl', TableJSONLFileFormat)


class NDJSONFileFormat(model.TextFileFormat):
    """Format for newline-delimited (ND) JSON file."""
    def _validate_(self, level):
        pass


class DataResourceSchemaFileFormat(model.TextFileFormat):
    """
    Format for data resource schema.
    """
    def _validate_(self, level):
        pass


class TabularDataResourceDirFmt(model.DirectoryFormat):
    data = model.File('data.ndjson', format=NDJSONFileFormat)
    metadata = model.File('dataresource.json',
                          format=DataResourceSchemaFileFormat)

    def _validate_(self, level='min'):
        try:
            validate(str(self.path/'dataresource.json'))
        except ValidationError:
            raise model.ValidationError(
                'The dataresource does not completely describe'
                ' the data.ndjson file')
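
TableJSONLFileFormat's sniff above only asserts the first 33 characters of the file. A stand-alone version of that check, applied to the data.table.jsonl file written in the earlier sketch:

# The 33-character prefix asserted by TableJSONLFileFormat._validate_.
expected = '{"doctype":{"name":"table.jsonl",'

with open('data.table.jsonl') as fh:
    assert fh.read(33) == expected, 'not a table.jsonl file'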

q2_types/tabular/tests/__init__.py (+7)

@@ -0,0 +1,7 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2024, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
@@ -0,0 +1 @@
{"doctype":{"name":"table.jsonl","format":"application/x-json-lines","version":"1.0"},"direction":"row","style":"key:value","fields":[{"name":"id","type":"string","missing":false,"title":"id","description":"...","extra":{"name":"id"}},{"name":"measure","type":"number","missing":false,"title":"faith_pd","description":"...","extra":{"name":"measure"}},{"name":"group","type":"integer","missing":false,"title":"week","description":"...","extra":{"name":"group"}},{"name":"subject","type":"string","missing":false,"title":"SubjectID","description":"...","extra":{"name":"subject"}}],"index":[],"title":"","description":"","extra":{}}

q2_types/tabular/tests/data/empty_data_dist/data.ndjson

Whitespace-only changes.
@@ -0,0 +1,33 @@
{
    "schema": {
        "fields": [
            {
                "title": "id",
                "description": "...",
                "name": "id",
                "type": "string"
            },
            {
                "title": "faith_pd",
                "description": "...",
                "name": "measure",
                "type": "number"
            },
            {
                "title": "week",
                "description": "...",
                "name": "group",
                "type": "integer"
            },
            {
                "title": "SubjectID",
                "description": "...",
                "name": "subject",
                "type": "string"
            }
        ]
    },
    "format": "ndjson",
    "path": "data.ndjson",
    "name": "data"
}
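
For reference, a descriptor like the one above can be loaded and inspected with frictionless, the same library the TabularDataResourceDirFmt transformers use. A minimal sketch, assuming a dataresource.json like this one sits in the working directory:

import frictionless as fls

# Load the Data Resource descriptor and walk its schema, mirroring how the
# TabularDataResourceDirFmt -> DataFrame transformer reattaches field metadata.
resource = fls.Resource('dataresource.json')
for field in resource.schema.fields:
    print(field.name, field.type)
# id string, measure number, group integer, subject string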
