|
| 1 | +# ---------------------------------------------------------------------------- |
| 2 | +# Copyright (c) 2024, QIIME 2 development team. |
| 3 | +# |
| 4 | +# Distributed under the terms of the Modified BSD License. |
| 5 | +# |
| 6 | +# The full license is in the file LICENSE, distributed with this software. |
| 7 | +# ---------------------------------------------------------------------------- |
| 8 | + |
| 9 | +import pandas as pd |
| 10 | +import frictionless as fls |
| 11 | +import json |
| 12 | + |
| 13 | +from ..formats import TableJSONLFileFormat |
| 14 | + |
| 15 | +from .. import (NDJSONFileFormat, |
| 16 | + DataResourceSchemaFileFormat, |
| 17 | + TabularDataResourceDirFmt) |
| 18 | + |
| 19 | +from ...plugin_setup import plugin |
| 20 | + |
| 21 | + |
def table_jsonl_header(df: pd.DataFrame) -> str:
    """Build the table.jsonl header line for *df* as a compact JSON string.

    The header captures per-column metadata (taken from each column's
    ``Series.attrs``) and table-level metadata (from ``df.attrs``).
    Recognized per-column keys are ``title``, ``description``, ``type``,
    ``missing``, and ``extra``; any remaining attrs are folded into
    ``extra`` when no explicit ``extra`` was provided.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose column/table ``attrs`` supply the metadata.

    Returns
    -------
    str
        A single-line JSON document (no whitespace after ``,`` or ``:``)
        suitable for use as the first line of a table.jsonl file.
    """
    header = {}
    header['doctype'] = dict(
        name='table.jsonl', format='application/x-json-lines', version='1.0')
    header['direction'] = 'row'
    header['style'] = 'key:value'

    fields = []
    for name in df.columns:
        attrs = df[name].attrs.copy()
        title = attrs.pop('title', '')
        description = attrs.pop('description', '')
        # renamed from `type` so the builtin is not shadowed
        field_type = attrs.pop('type', None)
        missing = attrs.pop('missing', False)
        extra = attrs.pop('extra', None)
        if extra is None:
            # no explicit "extra": everything unrecognized becomes extra
            extra = attrs
        fields.append(dict(
            name=name, type=field_type, missing=missing, title=title,
            description=description, extra=extra))

    header['fields'] = fields
    header['index'] = []
    header['title'] = df.attrs.get('title', '')
    header['description'] = df.attrs.get('description', '')
    header['extra'] = df.attrs.get('extra', {})

    # prevent whitespace after comma and colon
    return json.dumps(header, separators=(',', ':'))
| 51 | + |
| 52 | + |
@plugin.register_transformer
def table_jsonl_to_df(ff: TableJSONLFileFormat) -> pd.DataFrame:
    """Deserialize a table.jsonl file into a DataFrame.

    The file's first line is a JSON header describing the fields (see
    ``table_jsonl_header``); every following line is one JSON record.
    Column and table metadata from the header are re-attached to the
    result via ``Series.attrs`` / ``DataFrame.attrs``.
    """
    with ff.open() as fh:
        # first line is the header; the rest is read as NDJSON records
        header = json.loads(next(fh))
        df = pd.read_json(fh, lines=True, orient='records')
        if df.empty:
            # no data rows: rebuild the (empty) columns from the header
            # so the steps below still see the declared schema
            df = pd.DataFrame(columns=[
                spec['name'] for spec in header['fields']])

    # The order of these steps matters.

    # 1. set order of columns
    df = df[[spec['name'] for spec in header['fields']]]

    # 2. update types
    for spec in header['fields']:
        col = spec['name']
        if spec['type'] == 'integer':
            df[col] = df[col].astype('int64')
        elif spec['type'] == 'number':
            df[col] = df[col].astype('float64')
        elif spec['type'] == 'datetime':
            df[col] = pd.to_datetime(df[col], format='iso8601')
        elif spec['type'] == 'date':
            # dates were serialized in ISO form as well (date_format='iso')
            df[col] = pd.to_datetime(df[col], format='iso8601')
        elif spec['type'] == 'time':
            # bare times have no single ISO layout; let pandas infer
            df[col] = pd.to_datetime(df[col], format='mixed').dt.time
        elif spec['type'] == 'duration':
            df[col] = pd.to_timedelta(df[col])

    # 3. set index
    if len(header['index']) > 0:
        # drop=False keeps the index columns available as regular columns
        df = df.set_index(header['index'], drop=False)

    # 4. add metadata to columns
    for spec in header['fields']:
        df[spec['name']].attrs.update(spec)

    # 5. add metadata to table
    attrs = dict(title=header['title'], description=header['description'])
    df.attrs.update(attrs)

    return df
| 96 | + |
| 97 | + |
@plugin.register_transformer
def df_to_table_jsonl(obj: pd.DataFrame) -> TableJSONLFileFormat:
    """Serialize a DataFrame to table.jsonl: one header line followed by
    one JSON record per row (dates rendered in ISO form)."""
    ff = TableJSONLFileFormat()
    with ff.open() as fh:
        fh.write(table_jsonl_header(obj))
        fh.write('\n')
        # an empty frame is represented by the header line alone
        if not obj.empty:
            obj.to_json(fh, orient='records', lines=True, date_format='iso')
    return ff
| 110 | + |
| 111 | + |
@plugin.register_transformer
def _1(obj: pd.DataFrame) -> NDJSONFileFormat:
    """Write a DataFrame out as NDJSON (one JSON record per line)."""
    result = NDJSONFileFormat()
    obj.to_json(str(result), orient='records', lines=True)
    return result
| 117 | + |
| 118 | + |
@plugin.register_transformer
def _2(obj: DataResourceSchemaFileFormat) -> fls.Resource:
    """Load a dataresource.json schema file as a frictionless Resource."""
    schema_path = str(obj)
    return fls.Resource(schema_path)
| 122 | + |
| 123 | + |
@plugin.register_transformer
def _3(df: TabularDataResourceDirFmt) -> pd.DataFrame:
    """Read a tabular data-resource directory into a DataFrame, attaching
    each schema field's definition to the matching column's attrs."""
    ndjson_path = df.data.view(NDJSONFileFormat)
    data = pd.read_json(str(ndjson_path), lines=True)
    resource = df.metadata.view(fls.Resource)

    fields = resource.schema.fields
    if data.empty:
        # no rows: rebuild the declared columns from the schema
        data = pd.DataFrame(columns=[field.name for field in fields])

    for field in fields:
        data[field.name].attrs = field.to_dict()

    return data
| 138 | + |
| 139 | + |
@plugin.register_transformer
def _4(obj: pd.DataFrame) -> TabularDataResourceDirFmt:
    """Write a DataFrame as a tabular data-resource directory: a
    data.ndjson payload plus a dataresource.json describing its schema."""
    fields = []
    for column in obj.columns:
        series = obj[column]
        inferred = series.convert_dtypes().dtype
        field_meta = series.attrs.copy()
        field_meta['name'] = column

        # Map the inferred pandas dtype onto a frictionless schema type;
        # anything non-numeric is recorded as a string.
        if pd.api.types.is_float_dtype(inferred):
            field_meta['type'] = 'number'
        elif pd.api.types.is_integer_dtype(inferred):
            field_meta['type'] = 'integer'
        else:
            field_meta['type'] = 'string'

        fields.append(field_meta)

    resource_meta = {'schema': {'fields': fields}, **obj.attrs}
    resource_meta['format'] = 'ndjson'
    resource_meta['path'] = 'data.ndjson'
    resource_meta['name'] = 'data'

    dir_fmt = TabularDataResourceDirFmt()
    dir_fmt.data.write_data(obj, pd.DataFrame)
    with open(dir_fmt.path / 'dataresource.json', 'w') as fh:
        fh.write(json.dumps(resource_meta, indent=4))

    return dir_fmt
0 commit comments