From 5bd2f8cbcc09353e39e798982c8caf16de1e8abd Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Tue, 17 Sep 2024 23:06:24 +0100 Subject: [PATCH 01/33] Create universal parser --- pyproject.toml | 1 + src/rpft/cli.py | 124 +++-- src/rpft/converters.py | 39 +- src/rpft/parsers/common/rowparser.py | 14 +- src/rpft/parsers/common/sheetparser.py | 10 +- .../parsers/creation/contentindexparser.py | 3 +- .../parsers/creation/contentindexrowmodel.py | 33 ++ src/rpft/parsers/sheets.py | 5 + src/rpft/parsers/universal.py | 336 +++++++++++++ tests/test_contentindexparser.py | 81 +--- tests/test_universal.py | 452 ++++++++++++++++++ 11 files changed, 965 insertions(+), 133 deletions(-) create mode 100644 src/rpft/parsers/universal.py create mode 100644 tests/test_universal.py diff --git a/pyproject.toml b/pyproject.toml index df4f6f9..46b1df3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ "networkx~=2.5.1", "openpyxl", "pydantic >= 2", + "python-benedict", "tablib[ods]>=3.1.0", ] diff --git a/src/rpft/cli.py b/src/rpft/cli.py index 1c78946..97cd70a 100644 --- a/src/rpft/cli.py +++ b/src/rpft/cli.py @@ -39,16 +39,25 @@ def flows_to_sheets(args): ) -def save_data_sheets(args): - output = converters.save_data_sheets( +def legacy_sheets_to_uni(args): + data: dict = converters.legacy_sheets_to_uni( args.input, - None, args.format, - data_models=args.datamodels, - tags=args.tags, + data_models=args.models, ) + with open(args.output, "w", encoding="utf-8") as export: - json.dump(output, export, indent=4) + json.dump(data, export, indent=2) + + +def uni_to_sheets(args): + with open(args.output, "wb") as handle: + handle.write(converters.uni_to_sheets(args.input)) + + +def sheets_to_uni(args): + # TODO: convert uni sheets to uni JSON + ... def create_parser(): @@ -64,7 +73,9 @@ def create_parser(): _add_create_command(sub) _add_convert_command(sub) _add_flows_to_sheets_command(sub) - _add_save_data_sheets_command(sub) + _add_legacy_to_uni_command(sub) + _add_uni_to_sheets_command(sub) + _add_sheets_to_uni_command(sub) return parser @@ -77,25 +88,13 @@ def _add_create_command(sub): ) parser.set_defaults(func=create_flows) - _add_content_index_arguments(parser) - - -def _add_content_index_arguments(parser): parser.add_argument( - "--datamodels", + "input", help=( - "name of the module defining user data models underlying the data sheets," - " e.g. if the model definitions reside in" - " ./myfolder/mysubfolder/mymodelsfile.py, then this argument should be" - " myfolder.mysubfolder.mymodelsfile" + "paths to XLSX or JSON files, or directories containing CSV files, or" + " Google Sheets IDs i.e. from the URL; inputs should be of the same format" ), - ) - parser.add_argument( - "-f", - "--format", - choices=["csv", "google_sheets", "json", "xlsx"], - help="input sheet format", - required=True, + nargs="+", ) parser.add_argument( "-o", @@ -114,12 +113,20 @@ def _add_content_index_arguments(parser): nargs="*", ) parser.add_argument( - "input", + "--datamodels", help=( - "paths to XLSX or JSON files, or directories containing CSV files, or" - " Google Sheets IDs i.e. from the URL; inputs should be of the same format" + "name of the module defining user data models underlying the data sheets," + " e.g. 
if the model definitions reside in" + " ./myfolder/mysubfolder/mymodelsfile.py, then this argument should be" + " myfolder.mysubfolder.mymodelsfile" ), - nargs="+", + ) + parser.add_argument( + "-f", + "--format", + choices=["csv", "google_sheets", "json", "xlsx"], + help="input sheet format", + required=True, ) @@ -180,14 +187,67 @@ def _add_flows_to_sheets_command(sub): ) -def _add_save_data_sheets_command(sub): +def _add_legacy_to_uni_command(sub): parser = sub.add_parser( - "save_data_sheets", - help="save data sheets referenced in context index as nested json", + "legacy-to-uni", + help="convert legacy sheets to nested JSON", + ) + + parser.set_defaults(func=legacy_sheets_to_uni) + parser.add_argument( + "input", + help=( + "location of workbook (xlsx, Google Sheets) or directory containing CSVs" + ), + ) + parser.add_argument( + "output", + help=("location where JSON output file will be saved"), + ) + parser.add_argument( + "--models", + help=("name of the Python module defining user data models"), + ) + parser.add_argument( + "-f", + "--format", + choices=["csv", "google_sheets", "xlsx"], + help="input sheet format", + required=True, ) - parser.set_defaults(func=save_data_sheets) - _add_content_index_arguments(parser) + +def _add_uni_to_sheets_command(sub): + parser = sub.add_parser( + "uni-to-sheets", + help="convert JSON to sheets", + ) + parser.set_defaults(func=uni_to_sheets) + parser.add_argument( + "input", + help=("location of input JSON file"), + ) + parser.add_argument( + "output", + help=("location where sheets will be saved"), + ) + + +def _add_sheets_to_uni_command(sub): + parser = sub.add_parser( + "sheets-to-uni", + help="convert sheets to nested JSON", + ) + + parser.set_defaults(func=sheets_to_uni) + parser.add_argument( + "input", + help=("location of workbook"), + ) + parser.add_argument( + "output", + help=("location where JSON will be saved"), + ) if __name__ == "__main__": diff --git a/src/rpft/converters.py b/src/rpft/converters.py index 86a1e40..8f8e689 100644 --- a/src/rpft/converters.py +++ b/src/rpft/converters.py @@ -5,6 +5,7 @@ import sys from pathlib import Path +from rpft.parsers.universal import create_workbook, parse_legacy_sheets from rpft.parsers.creation.contentindexparser import ContentIndexParser from rpft.parsers.creation.tagmatcher import TagMatcher from rpft.parsers.sheets import ( @@ -16,6 +17,7 @@ XLSXSheetReader, ) from rpft.rapidpro.models.containers import RapidProContainer +from tablib import Databook, Dataset LOGGER = logging.getLogger(__name__) @@ -50,30 +52,33 @@ def create_flows(input_files, output_file, sheet_format, data_models=None, tags= return flows -def save_data_sheets(input_files, output_file, sheet_format, data_models=None, tags=[]): +def legacy_sheets_to_uni(in_file, sheet_format, data_models=None) -> dict: """ - Save data sheets as JSON. + Convert legacy data sheets to universal format + """ + reader = create_sheet_reader(sheet_format, in_file) - Collect the data sheets referenced in the source content index spreadsheet(s) and - save this collection in a single JSON file. Returns the output as a dict. 
+ return parse_legacy_sheets(data_models, reader) - :param sources: list of source spreadsheets - :param output_files: (deprecated) path of file to export output to as JSON - :param sheet_format: format of the spreadsheets - :param data_models: name of module containing supporting Python data classes - :param tags: names of tags to be used to filter the source spreadsheets - :returns: dict representing the collection of data sheets. - """ - parser = get_content_index_parser(input_files, sheet_format, data_models, tags) +def uni_to_sheets(infile) -> bytes: + with open(infile, "r") as handle: + data = json.load(handle) - output = parser.data_sheets_to_dict() + sheets = create_workbook(data) + book = Databook( + [ + Dataset(*sheet[1][1:], headers=sheet[1][0], title=sheet[0]) + for sheet in sheets + ] + ) + + return book.export("xlsx") - if output_file: - with open(output_file, "w") as export: - json.dump(output, export, indent=4) - return output +def sheets_to_uni(infile) -> dict: + # TODO: convert uni sheets to dictionary + ... def get_content_index_parser(input_files, sheet_format, data_models, tags): diff --git a/src/rpft/parsers/common/rowparser.py b/src/rpft/parsers/common/rowparser.py index 3971a69..c5f8787 100644 --- a/src/rpft/parsers/common/rowparser.py +++ b/src/rpft/parsers/common/rowparser.py @@ -3,7 +3,7 @@ from collections.abc import Iterable from typing import List -from pydantic import BaseModel +from pydantic import BaseModel, Field from rpft.parsers.common.cellparser import CellParser @@ -34,6 +34,14 @@ def header_name_to_field_name_with_context(header, row): return header +def field_names(model: BaseModel) -> List[str]: + return [field.alias for field in model.model_fields.values()] + + +def get_field(model: BaseModel, name: str) -> Field: + return next(field for field in model.model_fields.values() if field.alias == name) + + def get_list_child_model(model): if is_basic_list_type(model): # If not specified, list elements may be anything. @@ -162,7 +170,7 @@ def assign_value(self, field, key, value, model): # Get the list of keys that are available for the target model # Note: The fields have a well defined ordering. # See https://pydantic-docs.helpmanual.io/usage/models/#field-ordering - model_fields = list(model.model_fields.keys()) + model_fields = field_names(model) if type(value) is not list: # It could be that an object is specified via a single element. 
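            # e.g. a bare string may stand in for an object with a single field.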
@@ -279,7 +287,7 @@ def find_entry(self, model, output_field, field_path): else: assert is_parser_model_type(model) key = model.header_name_to_field_name(field_name) - if key not in model.model_fields: + if key not in field_names(model): raise ValueError(f"Field {key} doesn't exist in target type {model}.") child_model = model.model_fields[key].annotation diff --git a/src/rpft/parsers/common/sheetparser.py b/src/rpft/parsers/common/sheetparser.py index a11f44c..351890d 100644 --- a/src/rpft/parsers/common/sheetparser.py +++ b/src/rpft/parsers/common/sheetparser.py @@ -34,12 +34,12 @@ def __init__(self, table, row_model=None, row_parser=None, context={}): raise ValueError("SheetParser: needs either row_parser or row_model") self.row_parser = row_parser or RowParser(row_model) self.bookmarks = {} - self.input_rows = [] - for row_idx, row in enumerate(table): - row_dict = {h: e for h, e in zip(table.headers, row)} - self.input_rows.append((row_dict, row_idx + 2)) + self.input_rows = [ + ({h: e for h, e in zip(table.headers, row)}, row_idx) + for row_idx, row in enumerate(table, start=2) + ] self.iterator = iter(self.input_rows) - self.context = copy.deepcopy(context) + self.context = context if context is None else copy.deepcopy(context) def add_to_context(self, key, value): self.context[key] = value diff --git a/src/rpft/parsers/creation/contentindexparser.py b/src/rpft/parsers/creation/contentindexparser.py index c4440b9..285da3f 100644 --- a/src/rpft/parsers/creation/contentindexparser.py +++ b/src/rpft/parsers/creation/contentindexparser.py @@ -1,6 +1,7 @@ import importlib import logging from collections import OrderedDict + from rpft.logger.logger import logging_context from rpft.parsers.common.model_inference import model_from_headers from rpft.parsers.common.sheetparser import SheetParser @@ -56,7 +57,7 @@ def __init__( self.tag_matcher = tag_matcher self.template_sheets = {} self.data_sheets = {} - self.flow_definition_rows: list[ContentIndexRowModel] = [] + self.flow_definition_rows = [] self.campaign_parsers: dict[str, tuple[str, CampaignParser]] = {} self.surveys = {} self.trigger_parsers = OrderedDict() diff --git a/src/rpft/parsers/creation/contentindexrowmodel.py b/src/rpft/parsers/creation/contentindexrowmodel.py index ecf196e..2c622f7 100644 --- a/src/rpft/parsers/creation/contentindexrowmodel.py +++ b/src/rpft/parsers/creation/contentindexrowmodel.py @@ -1,6 +1,9 @@ from enum import Enum +from pydantic.v1 import Field + from rpft.parsers.common.rowparser import ParserModel +from rpft.parsers.creation.flowrowmodel import WhatsAppTemplating, Webhook from rpft.parsers.creation.models import SurveyConfig @@ -49,3 +52,33 @@ def header_name_to_field_name_with_context(header, row): return "survey_config" else: return header + + +class CreateFlowRowModel(ParserModel): + audio: str = "" + choices: List[str] = [] + condition: str = "" + condition_name: str = "" + condition_type: str = "" + condition_var: str = "" + data_row_id: str = "" + data_sheet: str = "" + from_: str = Field(alias="from", default="") + image: str = "" + include_if: str = "" + loop_variable: str = "" + mainarg_destination_row_ids: List[str] = [] + mainarg_expression: str = "" + message_text: str = "" + no_response: str = "" + nodeId: str = Field(alias="_nodeId", default="") + node_name: str = "" + obj_id: str = "" + obj_name: str = "" + row_id: str = "" + save_name: str = "" + template_arguments: list = [] + type: str = "" + video: str = "" + wa_template: WhatsAppTemplating = WhatsAppTemplating() + webhook: Webhook 
= Webhook() diff --git a/src/rpft/parsers/sheets.py b/src/rpft/parsers/sheets.py index bc89d4e..fa92308 100644 --- a/src/rpft/parsers/sheets.py +++ b/src/rpft/parsers/sheets.py @@ -156,6 +156,11 @@ def get_sheets_by_name(self, name): return sheets +class DatasetSheetReader(AbstractSheetReader): + def __init__(self, datasets): + self._sheets = {d.title: Sheet(self, d.title, d) for d in datasets} + + def load_csv(path): with open(path, mode="r", encoding="utf-8") as csv: return tablib.import_set(csv, format="csv") diff --git a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py new file mode 100644 index 0000000..4479d68 --- /dev/null +++ b/src/rpft/parsers/universal.py @@ -0,0 +1,336 @@ +import importlib +import logging +from collections import defaultdict +from collections.abc import Sequence +from functools import singledispatch +from typing import Any, List + +from benedict import benedict + +from rpft.parsers.common.cellparser import CellParser +from rpft.parsers.common.rowparser import RowParser +from rpft.parsers.common.sheetparser import SheetParser +from rpft.parsers.creation.campaigneventrowmodel import CampaignEventRowModel +from rpft.parsers.creation.contentindexrowmodel import ( + ContentIndexRowModel, + CreateFlowRowModel, +) +from rpft.parsers.creation.triggerrowmodel import TriggerRowModel +from rpft.parsers.sheets import AbstractSheetReader, Sheet + +LOGGER = logging.getLogger(__name__) + +KEY_VALUE_SEP = ":" +PROP_ACCESSOR = "." +SEQ_ITEM_SEP = "|" + + +def parse_legacy_sheets(models_module: str, reader: AbstractSheetReader) -> dict: + """ + Convert multiple sheets in the legacy format into a nested data structure + """ + content_index: List[ContentIndexRowModel] = parse_content_index( + reader, + "content_index", + ) + model_finder = ModelFinder(models_module) + data = { + "content_index": to_dict( + parse_sheet( + ContentIndexRowModel, + reader.get_sheet("content_index"), + ) + ) + } + + for entry in content_index: + + if len(entry.sheet_name) == 1: + name = entry.sheet_name[0] + model = model_finder.find_for_entry(entry) + sheet = reader.get_sheet(name) + + if sheet and model: + data[name] = to_dict(parse_sheet(model, sheet)) + else: + LOGGER.warning( + "%s not found, %s", + "Sheet" if not sheet else "Model", + {"sheet": name, "model": model}, + ) + + remaining = set(reader.sheets.keys()) - set(data.keys()) + + for name in remaining: + table = reader.get_sheet(name).table + data[name] = [list(table.headers)] + [list(r) for r in table] + + LOGGER.info( + str( + { + "index": {"count": len(data)}, + "sheets": {"count": len(reader.sheets)}, + "unconverted": {"count": len(remaining), "names": remaining}, + } + ) + ) + + return data + + +def parse_content_index(reader, name): + content_index: List[ContentIndexRowModel] = parse_sheet( + ContentIndexRowModel, + reader.get_sheet(name), + ) + acc = [] + + for entry in content_index: + acc += [entry] + + if entry.type == "content_index": + acc += parse_content_index(reader, entry.sheet_name[0]) + + return acc + + +def parse_sheet(model, sheet: Sheet): + try: + return SheetParser( + RowParser(model, CellParser()), + sheet.table, + context=None, + ).parse_all() + except Exception as e: + raise Exception( + "Parse failed", + {"sheet": sheet.name if sheet else None, "model": model}, + e, + ) + + +def to_dict(instances): + return [ + instance.dict( + by_alias=True, + exclude_unset=True, + ) + for instance in instances + ] + + +class ModelFinder: + type_model_map = { + "content_index": ContentIndexRowModel, + "create_campaign": 
CampaignEventRowModel, + "create_flow": CreateFlowRowModel, + "create_triggers": TriggerRowModel, + "template_definition": CreateFlowRowModel, + } + + def __init__(self, module=None): + self._module = importlib.import_module(module) if module else None + + def find_for_entry(self, entry): + if entry.type in self.type_model_map: + return self.type_model_map.get(entry.type) + + if entry.data_model: + try: + return getattr(self._module, entry.data_model) + except AttributeError: + pass + + return None + + +def create_workbook(data: dict) -> list: + return [(k, tabulate(v)) for k, v in data.items()] + + +def tabulate(data, meta: dict = {}) -> List[List[str]]: + """ + Convert a nested data structure to a tabular form + """ + flattened = tabulate_data(data, meta, []) + headers = { + (k, v.get("meta", {}).get("alias")): None + for item in flattened + for k, v in item.items() + } + rows = [ + [item.get(h, {}).get("data", "") for h, _ in headers.keys()] + for item in flattened + ] + + return [[alias or name for name, alias in headers.keys()]] + rows + + +@singledispatch +def tabulate_data(data, meta, path): + return create_prop(path, str(data), meta) + + +@tabulate_data.register +def _(data: list, meta, path): + if len(path) > 0: + if meta.get("layout") == "wide": + out = dict() + + for i, item in enumerate(data, start=1): + out = out | tabulate_data(item, meta | {"alias": path}, path + [str(i)]) + + return out + + return create_prop(path, stringify(data), meta) + + rows = [] + + for item in data: + rows.append(tabulate_data(item, meta, path + ["[]"])) + + return rows + + +@tabulate_data.register +def _(data: dict, meta, path): + if not path: + raise Exception("Cannot tabulate dict to table... yet.") + + if len(path) <= 1 or meta.get("layout") == "wide": + out = dict() + + for k, v in data.items(): + out = out | tabulate_data(v, meta.get(k, {}), path + [k]) + + return out + + return create_prop(path, stringify(data), meta) + + +@tabulate_data.register +def _(data: bool, meta, path): + return create_prop(path, str(data).lower(), meta) + + +def create_prop(path, data, meta={}) -> dict: + if meta.get("alias"): + meta["alias"] = ".".join(meta["alias"][1:]) + + key = ".".join(path[1:]) + + return {key: {"meta": meta, "data": data}} + + +@singledispatch +def stringify(value) -> str: + return str(value) + + +@stringify.register +def _(value: dict) -> str: + return " | ".join( + "{0}: {1}".format(stringify(k), stringify(v)) for k, v in value.items() + ) + + +@stringify.register +def _(value: list) -> str: + return " | ".join(stringify(i) for i in value) + + +@stringify.register +def _(value: bool) -> str: + return str(value).lower() + + +# TODO: create a function to parse a list of tables i.e. 
a workbook +def parse_table( + title: str = None, + headers: Sequence[str] = tuple(), + rows: Sequence[Sequence[str]] = tuple(), +): + """ + Parse data in tabular form into a nested structure + """ + if not headers or not rows: + return {title or "table": []} + + return create_obj(stream(title or "table", headers, rows)) + + +def stream( + title: str = None, + headers: Sequence[str] = tuple(), + rows: Sequence[Sequence[str]] = tuple(), +): + yield [("_idems", "tabulate", title, "headers"), headers] + + counters = defaultdict(int) + hs = [] + + for key in headers: + hs += [(key, counters[key])] + counters[key] += 1 + + hs = [create_keypath(h, i, counters[h]) for h, i in hs] + + for i, row in enumerate(rows): + for h, v in zip(hs, row): + yield [[title, i] + h, convert_cell(v)] + + +def create_keypath(header, index, count): + expanded = header.split(PROP_ACCESSOR) + i = index if index < count else count - 1 + + return expanded + [i] if count > 1 else expanded + + +def create_obj(pairs): + obj = benedict() + + for kp, v in pairs: + obj[*kp] = v + + return obj + + +def convert_cell(s: str, recursive=True) -> Any: + if type(s) is not str: + raise TypeError("Value to convert is not a string") + + clean = s.strip() if s else "" + + try: + return int(clean) + except Exception: + pass + + try: + return float(clean) + except Exception: + pass + + if clean in ("true", "false"): + return clean == "true" + + if recursive and KEY_VALUE_SEP in s: + try: + props = [p.split(KEY_VALUE_SEP, 1) for p in s.split(SEQ_ITEM_SEP) if p] + + return {k.strip(): convert_cell(v, recursive=False) for k, v in props} + except Exception: + pass + + if recursive and SEQ_ITEM_SEP in s: + try: + return [ + convert_cell(item, recursive=False) + for item in s.split(SEQ_ITEM_SEP) + if item + ] + except Exception: + pass + + return clean diff --git a/tests/test_contentindexparser.py b/tests/test_contentindexparser.py index e2bb72e..d711f59 100644 --- a/tests/test_contentindexparser.py +++ b/tests/test_contentindexparser.py @@ -3,8 +3,13 @@ from rpft.parsers.creation.contentindexparser import ContentIndexParser from rpft.parsers.creation.tagmatcher import TagMatcher -from rpft.parsers.sheets import CompositeSheetReader, CSVSheetReader, XLSXSheetReader +from rpft.parsers.sheets import ( + CompositeSheetReader, + CSVSheetReader, + XLSXSheetReader, +) from rpft.rapidpro.models.triggers import RapidProTriggerError + from tests import TESTS_ROOT from tests.mocks import MockSheetReader from tests.utils import Context, csv_join, traverse_flow @@ -1417,77 +1422,3 @@ def test_with_model(self): self.assertFlowMessages(flows, "template - a", ["hello georg"]) self.assertFlowMessages(flows, "template - b", ["hello chiara"]) - - -class TestSaveAsDict(TestCase): - def test_save_as_dict(self): - self.maxDiff = None - ci_sheet = ( - "type,sheet_name,data_sheet,data_row_id,new_name,data_model,status\n" - "data_sheet,simpledata,,,simpledata_renamed,ListRowModel,\n" - "create_flow,my_basic_flow,,,,,\n" - "data_sheet,nesteddata,,,,NestedRowModel,\n" - ) - simpledata = csv_join( - "ID,list_value.1,list_value.2", - "rowID,val1,val2", - ) - nesteddata = ( - "ID,value1,custom_field.happy,custom_field.sad\n" - "row1,Value1,Happy1,Sad1\n" - "row2,Value2,Happy2,Sad2\n" - ) - my_basic_flow = csv_join( - "row_id,type,from,message_text", - ",send_message,start,Some text", - ) - sheet_dict = { - "simpledata": simpledata, - "my_basic_flow": my_basic_flow, - "nesteddata": nesteddata, - } - - output = ContentIndexParser( - MockSheetReader(ci_sheet, sheet_dict), - 
"tests.datarowmodels.nestedmodel", - ).data_sheets_to_dict() - - output["meta"].pop("version") - exp = { - "meta": { - "user_models_module": "tests.datarowmodels.nestedmodel", - }, - "sheets": { - "simpledata_renamed": { - "model": "ListRowModel", - "rows": [ - { - "ID": "rowID", - "list_value": ["val1", "val2"], - } - ], - }, - "nesteddata": { - "model": "NestedRowModel", - "rows": [ - { - "ID": "row1", - "value1": "Value1", - "custom_field": { - "happy": "Happy1", - "sad": "Sad1", - }, - }, - { - "ID": "row2", - "value1": "Value2", - "custom_field": { - "happy": "Happy2", - "sad": "Sad2", - }, - }, - ], - }, - }, - } - self.assertEqual(output, exp) diff --git a/tests/test_universal.py b/tests/test_universal.py new file mode 100644 index 0000000..f6c0b83 --- /dev/null +++ b/tests/test_universal.py @@ -0,0 +1,452 @@ +from unittest import TestCase + +from rpft.parsers.sheets import DatasetSheetReader +from rpft.parsers.universal import ( + convert_cell, + create_workbook, + parse_legacy_sheets, + parse_table, + tabulate, +) +from tablib import Dataset + + +class TestConvertUniversalToTable(TestCase): + def test_headers_must_be_first_row(self): + data = [ + {"type": "create_flow", "sheet_name": "flow1"}, + ] + + table = tabulate(data) + + self.assertEqual( + table[0], + ["type", "sheet_name"], + "First row must be column headers", + ) + self.assertEqual( + table[1], + ["create_flow", "flow1"], + "Subsequent rows must contain values", + ) + + def test_values_must_be_strings(self): + data = [ + { + "boolean": True, + "float": 1.23, + "integer": "123", + "string": "hello", + }, + ] + + table = tabulate(data) + + self.assertEqual(table[1], ["true", "1.23", "123", "hello"]) + + def test_arrays_use_single_cell_layout_by_default(self): + data = [ + { + "choices": ["yes", "no", 1, False], + }, + ] + + table = tabulate(data) + + self.assertEqual(table[1], ["yes | no | 1 | false"]) + + def test_arrays_use_wide_layout_if_indicated_by_metadata(self): + meta = {"choices": {"layout": "wide"}} + data = [ + { + "choices": ["yes", "no", 1, False], + }, + ] + + table = tabulate(data, meta) + + self.assertEqual(table[0], ["choices", "choices", "choices", "choices"]) + self.assertEqual(table[1], ["yes", "no", "1", "false"]) + + def test_objects_use_single_cell_layout_by_default(self): + data = [ + { + "obj": { + "prop1": "val1", + "prop2": "val2", + }, + }, + ] + + table = tabulate(data) + + self.assertEqual(table[1], ["prop1: val1 | prop2: val2"]) + + def test_objects_use_wide_layout_if_indicated_by_metadata(self): + meta = {"obj": {"layout": "wide"}} + data = [ + { + "obj": { + "prop1": "val1", + "prop2": "val2", + }, + }, + ] + + table = tabulate(data, meta) + + self.assertEqual(table[0], ["obj.prop1", "obj.prop2"]) + self.assertEqual(table[1], ["val1", "val2"]) + + # TODO: test pointers/references + # TODO: add explicit type information + # TODO: integrate zero-knowledge type inference + + +class TestUniversalToWorkbook(TestCase): + def test_assembly(self): + data = { + "group1": [{"a": "a1", "b": "b1"}], + "group2": [{"A": "A1", "B": "B1"}], + } + + workbook = create_workbook(data) + + self.assertEqual(len(workbook), 2) + self.assertEqual(workbook[0][0], "group1") + self.assertEqual(workbook[0][1], [["a", "b"], ["a1", "b1"]]) + self.assertEqual(workbook[1][0], "group2") + self.assertEqual(workbook[1][1], [["A", "B"], ["A1", "B1"]]) + + +class TestConvertLegacyToUniversal(TestCase): + def test_skip_non_existent_sheets(self): + datasets = [ + Dataset( + ("create_flow", "my_basic_flow"), + headers=("type", 
"sheet_name"), + title="content_index", + ) + ] + + output = parse_legacy_sheets( + "tests.datarowmodels.nestedmodel", + DatasetSheetReader(datasets), + ) + + self.assertIn("content_index", output) + self.assertNotIn("my_basic_flow", output) + + def test_skip_evaluation_of_cells_with_templates(self): + datasets = [ + Dataset( + ("template_definition", "my_template"), + headers=("type", "sheet_name"), + title="content_index", + ), + Dataset( + ("send_message", "start", "Hello, {{name}}"), + headers=("type", "from", "message_text"), + title="my_template", + ), + ] + + output = parse_legacy_sheets( + "tests.datarowmodels.nestedmodel", + DatasetSheetReader(datasets), + ) + + self.assertEqual( + output["my_template"][0]["message_text"], + "Hello, {{name}}", + "Template notation must remain intact", + ) + + def test_unparseable_sheets_are_converted_to_2d_array(self): + datasets = [ + Dataset( + headers=("type", "sheet_name"), + title="content_index", + ), + Dataset( + ("", "b2"), + ("c1", ""), + ("d1", "d2"), + headers=("", ""), + title="unparseable", + ), + ] + + output = parse_legacy_sheets( + "tests.datarowmodels.nestedmodel", + DatasetSheetReader(datasets), + ) + + self.assertEqual( + output["unparseable"], + [["", ""], ["", "b2"], ["c1", ""], ["d1", "d2"]], + "Data should be 2-dimensional array of strings", + ) + + def test_process_content_indices_recursively(self): + datasets = [ + Dataset( + ("content_index", "sub_index"), + headers=("type", "sheet_name"), + title="content_index", + ), + Dataset( + ("data_sheet", "simpledata", "ListRowModel"), + headers=("type", "sheet_name", "data_model"), + title="sub_index", + ), + Dataset( + ("rowID", "val1", "val2"), + headers=("ID", "list_value.1", "list_value.2"), + title="simpledata", + ), + ] + + output = parse_legacy_sheets( + "tests.datarowmodels.nestedmodel", + DatasetSheetReader(datasets), + ) + + self.assertEqual( + output["simpledata"][0]["list_value"], + ["val1", "val2"], + "Data should be converted to nested form because all content indices should" + " have been processed", + ) + + def test_save_as_dict(self): + # TODO: Break up this test into smaller, more manageable pieces + self.maxDiff = None + datasets = [ + Dataset( + ("data_sheet", "simpledata", "simpledata_new", "ListRowModel", ""), + ("create_flow", "my_basic_flow", "", "", ""), + ("data_sheet", "nesteddata", "", "NestedRowModel", ""), + ("create_campaign", "my_campaign", "", "", "grp1"), + headers=("type", "sheet_name", "new_name", "data_model", "group"), + title="content_index", + ), + Dataset( + ("rowID", "val1", "val2"), + headers=("ID", "list_value.1", "list_value.2"), + title="simpledata", + ), + Dataset( + ("row1", "Value1", "Happy1", "Sad1"), + ("row2", "Value2", "Happy2", "Sad2"), + headers=("ID", "value1", "custom_field.happy", "custom_field.sad"), + title="nesteddata", + ), + Dataset( + ("", "send_message", "start", "Some text"), + headers=("row_id", "type", "from", "message_text"), + title="my_basic_flow", + ), + Dataset( + ("15", "H", "F", "Last Seen On", "I", "my_basic_flow"), + headers=( + "offset", + "unit", + "event_type", + "relative_to", + "start_mode", + "flow", + ), + title="my_campaign", + ), + ] + + output = parse_legacy_sheets( + "tests.datarowmodels.nestedmodel", + DatasetSheetReader(datasets), + ) + exp = { + "content_index": [ + { + "type": "data_sheet", + "sheet_name": ["simpledata"], + "new_name": "simpledata_new", + "data_model": "ListRowModel", + "group": "", + }, + { + "type": "create_flow", + "sheet_name": ["my_basic_flow"], + "new_name": "", + 
"data_model": "", + "group": "", + }, + { + "type": "data_sheet", + "sheet_name": ["nesteddata"], + "new_name": "", + "data_model": "NestedRowModel", + "group": "", + }, + { + "type": "create_campaign", + "sheet_name": ["my_campaign"], + "new_name": "", + "data_model": "", + "group": "grp1", + }, + ], + "simpledata": [ + { + "ID": "rowID", + "list_value": ["val1", "val2"], + } + ], + "nesteddata": [ + { + "ID": "row1", + "value1": "Value1", + "custom_field": { + "happy": "Happy1", + "sad": "Sad1", + }, + }, + { + "ID": "row2", + "value1": "Value2", + "custom_field": { + "happy": "Happy2", + "sad": "Sad2", + }, + }, + ], + "my_basic_flow": [ + { + "row_id": "", + "type": "send_message", + "from": "start", + "message_text": "Some text", + }, + ], + "my_campaign": [ + { + "offset": "15", + "unit": "H", + "event_type": "F", + "relative_to": "Last Seen On", + "start_mode": "I", + "flow": "my_basic_flow", + }, + ], + } + + self.assertEqual(output, exp) + + +class TestConvertTableToNested(TestCase): + + def test_default_type_is_string(self): + self.assertEqual( + parse_table( + title="title", + headers=["a"], + rows=[["a1"]], + ), + { + "_idems": {"tabulate": {"title": {"headers": ["a"]}}}, + "title": [{"a": "a1"}], + }, + ) + + def test_table_must_have_title(self): + self.assertEqual(parse_table(), {"table": []}) + + def test_integer_as_string_is_int(self): + parsed = parse_table(headers=["a"], rows=[["123"]]) + + self.assertEqual(parsed["table"][0]["a"], 123) + + def test_boolean_as_string_is_bool(self): + parsed = parse_table(headers=("a", "b"), rows=[("true", "false")]) + + self.assertEqual(parsed["table"][0]["a"], True) + self.assertEqual(parsed["table"][0]["b"], False) + + def test_delimited_string_is_array(self): + parsed = parse_table(headers=["a"], rows=[["one | 2 | true | 3.4"]]) + + self.assertEqual(parsed["table"][0]["a"], ["one", 2, True, 3.4]) + + def test_columns_with_same_name_are_grouped_into_list(self): + parsed = parse_table(headers=["a"] * 4, rows=[("one", "2", "true", "3.4")]) + + self.assertEqual(parsed["table"][0]["a"], ["one", 2, True, 3.4]) + # self.assertEqual(parsed["_idems"]["tabulate"]["table"]["a"]["layout"], "wide") + + def test_columns_with_same_name_and_delimited_strings_is_2d_array(self): + parsed = parse_table(headers=["a"] * 2, rows=[("one | 2", "true | 3.4")]) + + self.assertEqual(parsed["table"][0]["a"], [["one", 2], [True, 3.4]]) + + def test_column_using_dot_notation_is_nested_object_property(self): + parsed = parse_table( + headers=("obj.prop1", "obj.prop2"), + rows=[("one", "2")], + ) + + self.assertEqual(parsed["table"][0]["obj"], {"prop1": "one", "prop2": 2}) + self.assertEqual( + parsed["_idems"]["tabulate"]["table"]["headers"], + ("obj.prop1", "obj.prop2"), + ) + + def test_nested_object_with_2d_array_property_value(self): + parsed = parse_table(headers=["obj.k1"] * 2, rows=[["1 | 2", "3 | 4"]]) + + self.assertEqual(parsed["table"][0]["obj"], {"k1": [[1, 2], [3, 4]]}) + + def test_nested_object_with_nested_object(self): + parsed = parse_table( + headers=["obj.k1"] * 2, + rows=[["k2: 2 | k3: false", "k4: v4 | k5: true"]], + ) + + self.assertEqual( + parsed["table"][0]["obj"], + {"k1": [{"k2": 2, "k3": False}, {"k4": "v4", "k5": True}]}, + ) + + +class TestCellConversion(TestCase): + + def test_convert_cell_string_to_number(self): + self.assertEqual(convert_cell("123"), 123) + self.assertEqual(convert_cell("1.23"), 1.23) + + def test_output_clean_string_if_no_conversion_possible(self): + self.assertEqual(convert_cell("one"), "one") + 
self.assertEqual(convert_cell(" one "), "one") + self.assertEqual(convert_cell(""), "") + + def test_raises_error_if_not_string_input(self): + self.assertRaises(TypeError, convert_cell, None) + self.assertRaises(TypeError, convert_cell, 123) + + def test_convert_cell_string_to_bool(self): + self.assertEqual(convert_cell("true"), True) + self.assertEqual(convert_cell(" true "), True) + self.assertEqual(convert_cell("false"), False) + + def test_convert_cell_string_to_list(self): + self.assertEqual(convert_cell("one | 2 | false"), ["one", 2, False]) + self.assertEqual(convert_cell("one |"), ["one"]) + self.assertEqual(convert_cell("|"), []) + self.assertEqual(convert_cell("| 2 |"), [2]) + self.assertEqual(convert_cell("k1 | v1 : k2 | v2"), ["k1", "v1 : k2", "v2"]) + + def test_convert_cell_string_to_dict(self): + self.assertEqual(convert_cell("k1: v1"), {"k1": "v1"}) + self.assertEqual(convert_cell(" k1 : v1 "), {"k1": "v1"}) + self.assertEqual(convert_cell("k1: v1 |"), {"k1": "v1"}) + self.assertEqual(convert_cell("k1: k2: v2"), {"k1": "k2: v2"}) + self.assertEqual(convert_cell("k1: 1 | k2: true"), {"k1": 1, "k2": True}) From 70998ba45da7fe94fdb09bd6edec1f1d44b2855e Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Tue, 17 Sep 2024 23:20:10 +0100 Subject: [PATCH 02/33] Fix invalid dict access syntax --- src/rpft/parsers/universal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py index 4479d68..49f2127 100644 --- a/src/rpft/parsers/universal.py +++ b/src/rpft/parsers/universal.py @@ -291,7 +291,7 @@ def create_obj(pairs): obj = benedict() for kp, v in pairs: - obj[*kp] = v + obj[kp] = v return obj From 0ba9d7341e130dfe47cd2146914c5be8383c2359 Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Mon, 23 Sep 2024 17:40:05 +0100 Subject: [PATCH 03/33] Create function to convert uni format workbook into nested --- src/rpft/converters.py | 7 +++---- src/rpft/parsers/universal.py | 19 +++++++++++++++---- tests/test_universal.py | 18 ++++++++++++++++++ 3 files changed, 36 insertions(+), 8 deletions(-) diff --git a/src/rpft/converters.py b/src/rpft/converters.py index 8f8e689..89d341a 100644 --- a/src/rpft/converters.py +++ b/src/rpft/converters.py @@ -5,7 +5,7 @@ import sys from pathlib import Path -from rpft.parsers.universal import create_workbook, parse_legacy_sheets +from rpft.parsers.universal import create_workbook, parse_legacy_sheets, parse_tables from rpft.parsers.creation.contentindexparser import ContentIndexParser from rpft.parsers.creation.tagmatcher import TagMatcher from rpft.parsers.sheets import ( @@ -76,9 +76,8 @@ def uni_to_sheets(infile) -> bytes: return book.export("xlsx") -def sheets_to_uni(infile) -> dict: - # TODO: convert uni sheets to dictionary - ... +def sheets_to_uni(infile, fmt) -> list: + return parse_tables(create_sheet_reader(fmt, infile)) def get_content_index_parser(input_files, sheet_format, data_models, tags): diff --git a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py index 49f2127..52a3a70 100644 --- a/src/rpft/parsers/universal.py +++ b/src/rpft/parsers/universal.py @@ -244,7 +244,16 @@ def _(value: bool) -> str: return str(value).lower() -# TODO: create a function to parse a list of tables i.e. 
a workbook +def parse_tables(reader: AbstractSheetReader) -> dict: + """ + Parse a workbook into a nested structure + """ + return [ + parse_table(title, sheet.table.headers, sheet.table[:]) + for title, sheet in reader.sheets.items() + ] + + def parse_table( title: str = None, headers: Sequence[str] = tuple(), @@ -253,10 +262,12 @@ def parse_table( """ Parse data in tabular form into a nested structure """ + title = title or "table" + if not headers or not rows: - return {title or "table": []} + return {title: []} - return create_obj(stream(title or "table", headers, rows)) + return create_obj(stream(title, headers, rows)) def stream( @@ -293,7 +304,7 @@ def create_obj(pairs): for kp, v in pairs: obj[kp] = v - return obj + return dict(obj) def convert_cell(s: str, recursive=True) -> Any: diff --git a/tests/test_universal.py b/tests/test_universal.py index f6c0b83..2dc0a50 100644 --- a/tests/test_universal.py +++ b/tests/test_universal.py @@ -6,6 +6,7 @@ create_workbook, parse_legacy_sheets, parse_table, + parse_tables, tabulate, ) from tablib import Dataset @@ -343,6 +344,23 @@ def test_save_as_dict(self): self.assertEqual(output, exp) +class TestConvertWorkbookToUniversal(TestCase): + + def test_workbook_converts_to_list_of_objects(self): + workbook = DatasetSheetReader( + [ + Dataset(("t1a1", "t1b1"), headers=("T1A", "T1B"), title="table1"), + Dataset(("t2a1", "t2b1"), headers=("T2A", "T2B"), title="table2"), + ] + ) + + nested = parse_tables(workbook) + + self.assertIsInstance(nested, list) + self.assertEqual(len(nested), 2) + self.assertTrue(all(type(o) is dict for o in nested)) + + class TestConvertTableToNested(TestCase): def test_default_type_is_string(self): From ff6fa77afebb4dac66a0447ff2917ec2eb722a40 Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Tue, 24 Sep 2024 17:58:16 +0100 Subject: [PATCH 04/33] Ensure sheet order when converting legacy sheets --- src/rpft/parsers/universal.py | 44 ++++++++++++++++------------------- tests/test_universal.py | 31 ++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 24 deletions(-) diff --git a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py index 52a3a70..4c816a7 100644 --- a/src/rpft/parsers/universal.py +++ b/src/rpft/parsers/universal.py @@ -29,26 +29,21 @@ def parse_legacy_sheets(models_module: str, reader: AbstractSheetReader) -> dict """ Convert multiple sheets in the legacy format into a nested data structure """ - content_index: List[ContentIndexRowModel] = parse_content_index( - reader, - "content_index", - ) - model_finder = ModelFinder(models_module) - data = { - "content_index": to_dict( - parse_sheet( - ContentIndexRowModel, - reader.get_sheet("content_index"), - ) + content_index: List[ContentIndexRowModel] = { + entry.sheet_name[0]: entry + for entry in parse_content_index( + reader, + "content_index", ) + if len(entry.sheet_name) == 1 } + model_finder = ModelFinder(models_module) + data = {} + unconverted = [] - for entry in content_index: - - if len(entry.sheet_name) == 1: - name = entry.sheet_name[0] - model = model_finder.find_for_entry(entry) - sheet = reader.get_sheet(name) + for name, sheet in reader.sheets.items(): + if name in content_index: + model = model_finder.find_for_entry(content_index[name]) if sheet and model: data[name] = to_dict(parse_sheet(model, sheet)) @@ -58,19 +53,20 @@ def parse_legacy_sheets(models_module: str, reader: AbstractSheetReader) -> dict "Sheet" if not sheet else "Model", {"sheet": name, "model": model}, ) - - remaining = set(reader.sheets.keys()) - set(data.keys()) - 
- for name in remaining: - table = reader.get_sheet(name).table - data[name] = [list(table.headers)] + [list(r) for r in table] + elif name == "content_index": + data[name] = to_dict(parse_sheet(ContentIndexRowModel, sheet)) + else: + data[name] = [list(sheet.table.headers)] + [ + list(r) for r in sheet.table + ] + unconverted += [name] LOGGER.info( str( { "index": {"count": len(data)}, "sheets": {"count": len(reader.sheets)}, - "unconverted": {"count": len(remaining), "names": remaining}, + "unconverted": {"count": len(unconverted), "names": unconverted}, } ) ) diff --git a/tests/test_universal.py b/tests/test_universal.py index 2dc0a50..03d7f01 100644 --- a/tests/test_universal.py +++ b/tests/test_universal.py @@ -220,6 +220,37 @@ def test_process_content_indices_recursively(self): " have been processed", ) + def test_sheet_order_is_preserved(self): + datasets = [ + Dataset( + ("data_sheet", "sheet_2", "SimpleRowModel"), + ("data_sheet", "sheet_3", "SimpleRowModel"), + headers=("type", "sheet_name", "data_model"), + title="content_index", + ), + Dataset( + ("val1", "val2"), + headers=("value1", "value2"), + title="sheet_3", + ), + Dataset( + ("val1", "val2"), + headers=("value1", "value2"), + title="sheet_2", + ), + ] + + output = parse_legacy_sheets( + "tests.datarowmodels.simplemodel", + DatasetSheetReader(datasets), + ) + + self.assertEqual( + list(output.keys()), + ["content_index", "sheet_3", "sheet_2"], + "Order of keys should be same as in workbook", + ) + def test_save_as_dict(self): # TODO: Break up this test into smaller, more manageable pieces self.maxDiff = None From 0477412b2d7fb9e761e2eb411c2610374564f12d Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Wed, 25 Sep 2024 21:15:09 +0100 Subject: [PATCH 05/33] Ensure column headers are preserved accurately --- src/rpft/parsers/universal.py | 114 +++++++++++----------------------- tests/test_universal.py | 86 ++++++++++++++++++++++++- 2 files changed, 119 insertions(+), 81 deletions(-) diff --git a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py index 4c816a7..3b4893d 100644 --- a/src/rpft/parsers/universal.py +++ b/src/rpft/parsers/universal.py @@ -1,5 +1,6 @@ import importlib import logging +import re from collections import defaultdict from collections.abc import Sequence from functools import singledispatch @@ -23,6 +24,7 @@ KEY_VALUE_SEP = ":" PROP_ACCESSOR = "." 
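 # a "." in a column header denotes a nested property, e.g. "obj.prop1" -> {"obj": {"prop1": ...}}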
SEQ_ITEM_SEP = "|" +DEINDEX_PATTERN = re.compile(r"(.*)\.\d+") def parse_legacy_sheets(models_module: str, reader: AbstractSheetReader) -> dict: @@ -39,6 +41,7 @@ def parse_legacy_sheets(models_module: str, reader: AbstractSheetReader) -> dict } model_finder = ModelFinder(models_module) data = {} + meta = {} unconverted = [] for name, sheet in reader.sheets.items(): @@ -46,7 +49,7 @@ def parse_legacy_sheets(models_module: str, reader: AbstractSheetReader) -> dict model = model_finder.find_for_entry(content_index[name]) if sheet and model: - data[name] = to_dict(parse_sheet(model, sheet)) + data[name] = to_dicts(parse_sheet(model, sheet)) else: LOGGER.warning( "%s not found, %s", @@ -54,13 +57,18 @@ def parse_legacy_sheets(models_module: str, reader: AbstractSheetReader) -> dict {"sheet": name, "model": model}, ) elif name == "content_index": - data[name] = to_dict(parse_sheet(ContentIndexRowModel, sheet)) + data[name] = to_dicts(parse_sheet(ContentIndexRowModel, sheet)) else: - data[name] = [list(sheet.table.headers)] + [ - list(r) for r in sheet.table - ] + data[name] = [list(sheet.table.headers)] + [list(r) for r in sheet.table] unconverted += [name] + meta[name] = { + "headers": [ + m[1] if (m := DEINDEX_PATTERN.match(h)) else h + for h in sheet.table.headers + ] + } + LOGGER.info( str( { @@ -71,6 +79,8 @@ def parse_legacy_sheets(models_module: str, reader: AbstractSheetReader) -> dict ) ) + data["_idems"] = {"tabulate": meta} + return data @@ -105,7 +115,7 @@ def parse_sheet(model, sheet: Sheet): ) -def to_dict(instances): +def to_dicts(instances): return [ instance.dict( by_alias=True, @@ -141,81 +151,25 @@ def find_for_entry(self, entry): def create_workbook(data: dict) -> list: - return [(k, tabulate(v)) for k, v in data.items()] + meta = data.pop("_idems", {}) + + return [(k, tabulate(v, meta.get(k, {}))) for k, v in data.items()] def tabulate(data, meta: dict = {}) -> List[List[str]]: """ Convert a nested data structure to a tabular form """ - flattened = tabulate_data(data, meta, []) - headers = { - (k, v.get("meta", {}).get("alias")): None - for item in flattened - for k, v in item.items() - } - rows = [ - [item.get(h, {}).get("data", "") for h, _ in headers.keys()] - for item in flattened - ] - - return [[alias or name for name, alias in headers.keys()]] + rows - - -@singledispatch -def tabulate_data(data, meta, path): - return create_prop(path, str(data), meta) - - -@tabulate_data.register -def _(data: list, meta, path): - if len(path) > 0: - if meta.get("layout") == "wide": - out = dict() - - for i, item in enumerate(data, start=1): - out = out | tabulate_data(item, meta | {"alias": path}, path + [str(i)]) - - return out - - return create_prop(path, stringify(data), meta) - + headers = meta.get("headers", []) or list( + {k: None for item in data for k, v in item.items()}.keys() + ) rows = [] for item in data: - rows.append(tabulate_data(item, meta, path + ["[]"])) - - return rows + obj = benedict(item) + rows += [[stringify(obj[kp]) for kp in keypaths(headers)]] - -@tabulate_data.register -def _(data: dict, meta, path): - if not path: - raise Exception("Cannot tabulate dict to table... 
yet.") - - if len(path) <= 1 or meta.get("layout") == "wide": - out = dict() - - for k, v in data.items(): - out = out | tabulate_data(v, meta.get(k, {}), path + [k]) - - return out - - return create_prop(path, stringify(data), meta) - - -@tabulate_data.register -def _(data: bool, meta, path): - return create_prop(path, str(data).lower(), meta) - - -def create_prop(path, data, meta={}) -> dict: - if meta.get("alias"): - meta["alias"] = ".".join(meta["alias"][1:]) - - key = ".".join(path[1:]) - - return {key: {"meta": meta, "data": data}} + return [headers] + rows @singledispatch @@ -273,21 +227,23 @@ def stream( ): yield [("_idems", "tabulate", title, "headers"), headers] + for i, row in enumerate(rows): + for h, v in zip(keypaths(headers), row): + yield [[title, i] + h, convert_cell(v)] + + +def keypaths(headers): counters = defaultdict(int) - hs = [] + indexed = [] for key in headers: - hs += [(key, counters[key])] + indexed += [(key, counters[key])] counters[key] += 1 - hs = [create_keypath(h, i, counters[h]) for h, i in hs] - - for i, row in enumerate(rows): - for h, v in zip(hs, row): - yield [[title, i] + h, convert_cell(v)] + return [keypath(h, i, counters[h]) for h, i in indexed] -def create_keypath(header, index, count): +def keypath(header, index, count): expanded = header.split(PROP_ACCESSOR) i = index if index < count else count - 1 diff --git a/tests/test_universal.py b/tests/test_universal.py index 03d7f01..90b15ae 100644 --- a/tests/test_universal.py +++ b/tests/test_universal.py @@ -45,6 +45,25 @@ def test_values_must_be_strings(self): self.assertEqual(table[1], ["true", "1.23", "123", "hello"]) + def test_columns_can_be_ordered_by_metadata(self): + meta = {"headers": ["integer", "float", "string", "boolean"]} + data = [ + { + "boolean": True, + "float": 1.23, + "integer": "123", + "string": "hello", + }, + ] + + table = tabulate(data, meta) + + self.assertEqual( + table[1], + ["123", "1.23", "hello", "true"], + "Columns should be in the same order as the headers metadata", + ) + def test_arrays_use_single_cell_layout_by_default(self): data = [ { @@ -57,7 +76,14 @@ def test_arrays_use_single_cell_layout_by_default(self): self.assertEqual(table[1], ["yes | no | 1 | false"]) def test_arrays_use_wide_layout_if_indicated_by_metadata(self): - meta = {"choices": {"layout": "wide"}} + meta = { + "headers": [ + "choices", + "choices", + "choices", + "choices", + ] + } data = [ { "choices": ["yes", "no", 1, False], @@ -84,7 +110,7 @@ def test_objects_use_single_cell_layout_by_default(self): self.assertEqual(table[1], ["prop1: val1 | prop2: val2"]) def test_objects_use_wide_layout_if_indicated_by_metadata(self): - meta = {"obj": {"layout": "wide"}} + meta = {"headers": ["obj.prop1", "obj.prop2"]} data = [ { "obj": { @@ -109,6 +135,10 @@ def test_assembly(self): data = { "group1": [{"a": "a1", "b": "b1"}], "group2": [{"A": "A1", "B": "B1"}], + "_idems": { + "group1": {"headers": ["a", "b"]}, + "group2": {"headers": ["A", "B"]}, + }, } workbook = create_workbook(data) @@ -244,6 +274,7 @@ def test_sheet_order_is_preserved(self): "tests.datarowmodels.simplemodel", DatasetSheetReader(datasets), ) + del output["_idems"] self.assertEqual( list(output.keys()), @@ -251,6 +282,56 @@ def test_sheet_order_is_preserved(self): "Order of keys should be same as in workbook", ) + def test_original_column_headers_are_preserved(self): + datasets = [ + Dataset( + ("data_sheet", "sheet_2", "SimpleRowModel"), + headers=("type", "sheet_name", "data_model"), + title="content_index", + ), + Dataset( + 
("val2", "val1"), + headers=("value2", "value1"), + title="sheet_2", + ), + ] + + output = parse_legacy_sheets( + "tests.datarowmodels.simplemodel", + DatasetSheetReader(datasets), + ) + + self.assertEqual( + output["_idems"]["tabulate"]["sheet_2"]["headers"], + ["value2", "value1"], + "Original column headers should be stored as metadata", + ) + + def test_list_indices_removed_from_headers(self): + datasets = [ + Dataset( + ("data_sheet", "simpledata", "ListRowModel"), + headers=("type", "sheet_name", "data_model"), + title="content_index", + ), + Dataset( + ("rowID", "val1", "val2"), + headers=("ID", "list_value.1", "list_value.2"), + title="simpledata", + ), + ] + + output = parse_legacy_sheets( + "tests.datarowmodels.nestedmodel", + DatasetSheetReader(datasets), + ) + + self.assertEqual( + output["_idems"]["tabulate"]["simpledata"]["headers"], + ["ID", "list_value", "list_value"], + "Column headers should be stored as metadata, without indices", + ) + def test_save_as_dict(self): # TODO: Break up this test into smaller, more manageable pieces self.maxDiff = None @@ -297,6 +378,7 @@ def test_save_as_dict(self): "tests.datarowmodels.nestedmodel", DatasetSheetReader(datasets), ) + del output["_idems"] exp = { "content_index": [ { From 55f0e77a03a29d5796b204a971ec6a53c04015d4 Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Thu, 26 Sep 2024 16:02:41 +0100 Subject: [PATCH 06/33] Support the same dotted path notation as legacy sheets --- src/rpft/parsers/universal.py | 48 +++++++++++++++--------- tests/test_universal.py | 70 ++++++++++++++++++----------------- 2 files changed, 66 insertions(+), 52 deletions(-) diff --git a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py index 3b4893d..a41c37b 100644 --- a/src/rpft/parsers/universal.py +++ b/src/rpft/parsers/universal.py @@ -25,6 +25,9 @@ PROP_ACCESSOR = "." 
SEQ_ITEM_SEP = "|" DEINDEX_PATTERN = re.compile(r"(.*)\.\d+") +META_KEY = "_idems" +TABULATE_KEY = "tabulate" +HEADERS_KEY = "headers" def parse_legacy_sheets(models_module: str, reader: AbstractSheetReader) -> dict: @@ -62,12 +65,7 @@ def parse_legacy_sheets(models_module: str, reader: AbstractSheetReader) -> dict data[name] = [list(sheet.table.headers)] + [list(r) for r in sheet.table] unconverted += [name] - meta[name] = { - "headers": [ - m[1] if (m := DEINDEX_PATTERN.match(h)) else h - for h in sheet.table.headers - ] - } + meta[name] = {HEADERS_KEY: sheet.table.headers} LOGGER.info( str( @@ -79,7 +77,7 @@ def parse_legacy_sheets(models_module: str, reader: AbstractSheetReader) -> dict ) ) - data["_idems"] = {"tabulate": meta} + data[META_KEY] = {TABULATE_KEY: meta} return data @@ -116,13 +114,17 @@ def parse_sheet(model, sheet: Sheet): def to_dicts(instances): - return [ - instance.dict( - by_alias=True, - exclude_unset=True, - ) - for instance in instances - ] + objs = [] + + for instance in instances: + obj = instance.dict(by_alias=True, exclude_unset=True) + + if "template_argument_definitions" in obj: + obj["template_arguments"] = obj.pop("template_argument_definitions") + + objs += [obj] + + return objs class ModelFinder: @@ -151,7 +153,7 @@ def find_for_entry(self, entry): def create_workbook(data: dict) -> list: - meta = data.pop("_idems", {}) + meta = data.pop(META_KEY, {}).get(TABULATE_KEY, {}) return [(k, tabulate(v, meta.get(k, {}))) for k, v in data.items()] @@ -160,7 +162,10 @@ def tabulate(data, meta: dict = {}) -> List[List[str]]: """ Convert a nested data structure to a tabular form """ - headers = meta.get("headers", []) or list( + if all(type(item) is list for item in data): + return data + + headers = meta.get(HEADERS_KEY, []) or list( {k: None for item in data for k, v in item.items()}.keys() ) rows = [] @@ -225,7 +230,7 @@ def stream( headers: Sequence[str] = tuple(), rows: Sequence[Sequence[str]] = tuple(), ): - yield [("_idems", "tabulate", title, "headers"), headers] + yield [(META_KEY, TABULATE_KEY, title, HEADERS_KEY), headers] for i, row in enumerate(rows): for h, v in zip(keypaths(headers), row): @@ -244,12 +249,19 @@ def keypaths(headers): def keypath(header, index, count): - expanded = header.split(PROP_ACCESSOR) + expanded = [normalise_key(k) for k in header.split(PROP_ACCESSOR)] i = index if index < count else count - 1 return expanded + [i] if count > 1 else expanded +def normalise_key(key): + try: + return int(key) - 1 + except ValueError: + return key + + def create_obj(pairs): obj = benedict() diff --git a/tests/test_universal.py b/tests/test_universal.py index 90b15ae..19bceba 100644 --- a/tests/test_universal.py +++ b/tests/test_universal.py @@ -110,20 +110,41 @@ def test_objects_use_single_cell_layout_by_default(self): self.assertEqual(table[1], ["prop1: val1 | prop2: val2"]) def test_objects_use_wide_layout_if_indicated_by_metadata(self): - meta = {"headers": ["obj.prop1", "obj.prop2"]} + meta = {"headers": ["obj1.k1", "obj1.k2", "seq1.1.k1", "seq1.2.k2"]} data = [ { - "obj": { - "prop1": "val1", - "prop2": "val2", + "obj1": { + "k1": "obj1_k1_v", + "k2": "obj1_k2_v", }, + "seq1": [ + {"k1": "seq1_k1_v"}, + {"k2": "seq1_k2_v"}, + ], }, ] table = tabulate(data, meta) - self.assertEqual(table[0], ["obj.prop1", "obj.prop2"]) - self.assertEqual(table[1], ["val1", "val2"]) + self.assertEqual( + table[0], + ["obj1.k1", "obj1.k2", "seq1.1.k1", "seq1.2.k2"], + ) + self.assertEqual( + table[1], + ["obj1_k1_v", "obj1_k2_v", "seq1_k1_v", "seq1_k2_v"], + ) 
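+
+    # A sketch (not an executable assertion) of the intended round trip,
+    # assuming the headers recorded under "_idems" are fed back into tabulate:
+    #
+    #   parsed = parse_table("t", ["obj1.k1", "obj1.k2"], [["v1", "v2"]])
+    #   meta = parsed["_idems"]["tabulate"]["t"]
+    #   assert tabulate(parsed["t"], meta)[1] == ["v1", "v2"]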
+ + def test_2d_arrays_are_passed_through(self): + meta = {"headers": ["A", "B"]} + data = [ + ["A", "B"], + ["a1", "b1"], + ] + + table = tabulate(data, meta) + + self.assertEqual(table, data) # TODO: test pointers/references # TODO: add explicit type information @@ -136,8 +157,10 @@ def test_assembly(self): "group1": [{"a": "a1", "b": "b1"}], "group2": [{"A": "A1", "B": "B1"}], "_idems": { - "group1": {"headers": ["a", "b"]}, - "group2": {"headers": ["A", "B"]}, + "tabulate": { + "group1": {"headers": ["a", "b"]}, + "group2": {"headers": ["B", "A"]}, + }, }, } @@ -147,7 +170,11 @@ def test_assembly(self): self.assertEqual(workbook[0][0], "group1") self.assertEqual(workbook[0][1], [["a", "b"], ["a1", "b1"]]) self.assertEqual(workbook[1][0], "group2") - self.assertEqual(workbook[1][1], [["A", "B"], ["A1", "B1"]]) + self.assertEqual( + workbook[1][1], + [["B", "A"], ["B1", "A1"]], + "Columns should be ordered according to metadata", + ) class TestConvertLegacyToUniversal(TestCase): @@ -307,31 +334,6 @@ def test_original_column_headers_are_preserved(self): "Original column headers should be stored as metadata", ) - def test_list_indices_removed_from_headers(self): - datasets = [ - Dataset( - ("data_sheet", "simpledata", "ListRowModel"), - headers=("type", "sheet_name", "data_model"), - title="content_index", - ), - Dataset( - ("rowID", "val1", "val2"), - headers=("ID", "list_value.1", "list_value.2"), - title="simpledata", - ), - ] - - output = parse_legacy_sheets( - "tests.datarowmodels.nestedmodel", - DatasetSheetReader(datasets), - ) - - self.assertEqual( - output["_idems"]["tabulate"]["simpledata"]["headers"], - ["ID", "list_value", "list_value"], - "Column headers should be stored as metadata, without indices", - ) - def test_save_as_dict(self): # TODO: Break up this test into smaller, more manageable pieces self.maxDiff = None From be4d2a1deaadd8f11d047cdc3c40f157a8c8f2af Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Mon, 30 Sep 2024 14:50:06 +0100 Subject: [PATCH 07/33] Create command to convert universal sheets to JSON --- src/rpft/cli.py | 6 ++++-- src/rpft/converters.py | 17 ++++++++++++++--- src/rpft/parsers/universal.py | 26 ++++++++++++++++++-------- tests/test_universal.py | 24 +++++++++++++++++------- 4 files changed, 53 insertions(+), 20 deletions(-) diff --git a/src/rpft/cli.py b/src/rpft/cli.py index 97cd70a..0a2961f 100644 --- a/src/rpft/cli.py +++ b/src/rpft/cli.py @@ -56,8 +56,10 @@ def uni_to_sheets(args): def sheets_to_uni(args): - # TODO: convert uni sheets to uni JSON - ... 
+ data = converters.sheets_to_uni(args.input) + + with open(args.output, "w", encoding="utf-8") as f: + json.dump(data, f, indent=2) def create_parser(): diff --git a/src/rpft/converters.py b/src/rpft/converters.py index 89d341a..c35a113 100644 --- a/src/rpft/converters.py +++ b/src/rpft/converters.py @@ -1,6 +1,7 @@ import json import logging import os +import re import shutil import sys from pathlib import Path @@ -73,11 +74,11 @@ def uni_to_sheets(infile) -> bytes: ] ) - return book.export("xlsx") + return book.export("ods") -def sheets_to_uni(infile, fmt) -> list: - return parse_tables(create_sheet_reader(fmt, infile)) +def sheets_to_uni(infile) -> list: + return parse_tables(create_sheet_reader(None, infile)) def get_content_index_parser(input_files, sheet_format, data_models, tags): @@ -125,6 +126,8 @@ def flows_to_sheets( def create_sheet_reader(sheet_format, input_file): + sheet_format = sheet_format if sheet_format else detect_format(input_file) + if sheet_format == "csv": sheet_reader = CSVSheetReader(input_file) elif sheet_format == "xlsx": @@ -139,6 +142,14 @@ def create_sheet_reader(sheet_format, input_file): return sheet_reader +def detect_format(fp): + if bool(re.fullmatch(r"[a-z0-9_-]{44}", fp, re.IGNORECASE)): + return "google_sheets" + + if Path(fp).suffix.lower() == ".xlsx": + return "xlsx" + + def sheets_to_csv(path, sheet_ids): prepare_dir(path) diff --git a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py index a41c37b..0768a4c 100644 --- a/src/rpft/parsers/universal.py +++ b/src/rpft/parsers/universal.py @@ -184,10 +184,16 @@ def stringify(value) -> str: @stringify.register def _(value: dict) -> str: - return " | ".join( - "{0}: {1}".format(stringify(k), stringify(v)) for k, v in value.items() + + s = " | ".join( + f"{stringify(k)}{KEY_VALUE_SEP} {stringify(v)}" for k, v in value.items() ) + if len(value) == 1: + s += " " + SEQ_ITEM_SEP + + return s + @stringify.register def _(value: list) -> str: @@ -203,10 +209,12 @@ def parse_tables(reader: AbstractSheetReader) -> dict: """ Parse a workbook into a nested structure """ - return [ - parse_table(title, sheet.table.headers, sheet.table[:]) - for title, sheet in reader.sheets.items() - ] + obj = benedict() + + for title, sheet in reader.sheets.items(): + obj.merge(parse_table(title, sheet.table.headers, sheet.table[:])) + + return obj def parse_table( @@ -266,9 +274,11 @@ def create_obj(pairs): obj = benedict() for kp, v in pairs: + # print("KP:", kp) + # print("V:", v) obj[kp] = v - return dict(obj) + return obj def convert_cell(s: str, recursive=True) -> Any: @@ -290,7 +300,7 @@ def convert_cell(s: str, recursive=True) -> Any: if clean in ("true", "false"): return clean == "true" - if recursive and KEY_VALUE_SEP in s: + if recursive and KEY_VALUE_SEP in s and SEQ_ITEM_SEP in s: try: props = [p.split(KEY_VALUE_SEP, 1) for p in s.split(SEQ_ITEM_SEP) if p] diff --git a/tests/test_universal.py b/tests/test_universal.py index 19bceba..56fedcd 100644 --- a/tests/test_universal.py +++ b/tests/test_universal.py @@ -109,6 +109,13 @@ def test_objects_use_single_cell_layout_by_default(self): self.assertEqual(table[1], ["prop1: val1 | prop2: val2"]) + def test_object_with_single_property_within_cell_has_trailing_separator(self): + data = [{"obj": {"k": "v"}}] + + table = tabulate(data) + + self.assertEqual(table[1], ["k: v |"]) + def test_objects_use_wide_layout_if_indicated_by_metadata(self): meta = {"headers": ["obj1.k1", "obj1.k2", "seq1.1.k1", "seq1.2.k2"]} data = [ @@ -461,7 +468,7 @@ def 
test_save_as_dict(self): class TestConvertWorkbookToUniversal(TestCase): - def test_workbook_converts_to_list_of_objects(self): + def test_workbook_converts_to_object(self): workbook = DatasetSheetReader( [ Dataset(("t1a1", "t1b1"), headers=("T1A", "T1B"), title="table1"), @@ -471,9 +478,12 @@ def test_workbook_converts_to_list_of_objects(self): nested = parse_tables(workbook) - self.assertIsInstance(nested, list) - self.assertEqual(len(nested), 2) - self.assertTrue(all(type(o) is dict for o in nested)) + self.assertIsInstance(nested, dict) + self.assertEqual(list(nested.keys()), ["_idems", "table1", "table2"]) + self.assertEqual( + list(nested["_idems"]["tabulate"].keys()), + ["table1", "table2"], + ) class TestConvertTableToNested(TestCase): @@ -560,6 +570,8 @@ def test_output_clean_string_if_no_conversion_possible(self): self.assertEqual(convert_cell("one"), "one") self.assertEqual(convert_cell(" one "), "one") self.assertEqual(convert_cell(""), "") + self.assertEqual(convert_cell("http://example.com/"), "http://example.com/") + self.assertEqual(convert_cell("k1: v1"), "k1: v1") def test_raises_error_if_not_string_input(self): self.assertRaises(TypeError, convert_cell, None) @@ -578,8 +590,6 @@ def test_convert_cell_string_to_list(self): self.assertEqual(convert_cell("k1 | v1 : k2 | v2"), ["k1", "v1 : k2", "v2"]) def test_convert_cell_string_to_dict(self): - self.assertEqual(convert_cell("k1: v1"), {"k1": "v1"}) - self.assertEqual(convert_cell(" k1 : v1 "), {"k1": "v1"}) self.assertEqual(convert_cell("k1: v1 |"), {"k1": "v1"}) - self.assertEqual(convert_cell("k1: k2: v2"), {"k1": "k2: v2"}) + self.assertEqual(convert_cell("k1: k2: v2 |"), {"k1": "k2: v2"}) self.assertEqual(convert_cell("k1: 1 | k2: true"), {"k1": 1, "k2": True}) From b8a9a19c15b449eb013056ea2b569c21059de8f5 Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Thu, 10 Oct 2024 19:46:20 +0100 Subject: [PATCH 08/33] Create cell parser that preserves templates --- src/rpft/parsers/common/cellparser.py | 14 +++++ .../parsers/creation/contentindexrowmodel.py | 33 ----------- src/rpft/parsers/creation/flowrowmodel.py | 33 +++++++++++ src/rpft/parsers/universal.py | 16 ++---- tests/test_template_preserver.py | 55 +++++++++++++++++++ tests/test_universal.py | 2 +- 6 files changed, 109 insertions(+), 44 deletions(-) create mode 100644 tests/test_template_preserver.py diff --git a/src/rpft/parsers/common/cellparser.py b/src/rpft/parsers/common/cellparser.py index 2aeda14..4192c73 100644 --- a/src/rpft/parsers/common/cellparser.py +++ b/src/rpft/parsers/common/cellparser.py @@ -153,3 +153,17 @@ def unescape(nested_list): if type(nested_list) is str else [unescape(item) for item in nested_list] ) + + +class TemplatePreserver(CellParser): + + def split_into_lists(self, string): + if "{{" in string or "{@" in string: + return [ + item.strip() + for item in re.findall( + r"(\{[{@][^\{\}]*[}@]\}|[^{}@|;]+) *[;|]? 
*", string + ) + ] + else: + return super().split_into_lists(string) diff --git a/src/rpft/parsers/creation/contentindexrowmodel.py b/src/rpft/parsers/creation/contentindexrowmodel.py index 2c622f7..ecf196e 100644 --- a/src/rpft/parsers/creation/contentindexrowmodel.py +++ b/src/rpft/parsers/creation/contentindexrowmodel.py @@ -1,9 +1,6 @@ from enum import Enum -from pydantic.v1 import Field - from rpft.parsers.common.rowparser import ParserModel -from rpft.parsers.creation.flowrowmodel import WhatsAppTemplating, Webhook from rpft.parsers.creation.models import SurveyConfig @@ -52,33 +49,3 @@ def header_name_to_field_name_with_context(header, row): return "survey_config" else: return header - - -class CreateFlowRowModel(ParserModel): - audio: str = "" - choices: List[str] = [] - condition: str = "" - condition_name: str = "" - condition_type: str = "" - condition_var: str = "" - data_row_id: str = "" - data_sheet: str = "" - from_: str = Field(alias="from", default="") - image: str = "" - include_if: str = "" - loop_variable: str = "" - mainarg_destination_row_ids: List[str] = [] - mainarg_expression: str = "" - message_text: str = "" - no_response: str = "" - nodeId: str = Field(alias="_nodeId", default="") - node_name: str = "" - obj_id: str = "" - obj_name: str = "" - row_id: str = "" - save_name: str = "" - template_arguments: list = [] - type: str = "" - video: str = "" - wa_template: WhatsAppTemplating = WhatsAppTemplating() - webhook: Webhook = Webhook() diff --git a/src/rpft/parsers/creation/flowrowmodel.py b/src/rpft/parsers/creation/flowrowmodel.py index baaaadc..ae91ddf 100644 --- a/src/rpft/parsers/creation/flowrowmodel.py +++ b/src/rpft/parsers/creation/flowrowmodel.py @@ -1,5 +1,7 @@ from pydantic import ConfigDict +from pydantic.v1 import Field + from rpft.parsers.common.rowparser import ParserModel from rpft.parsers.creation.models import Condition @@ -175,3 +177,34 @@ def header_name_to_field_name_with_context(header, row): def is_starting_row(self): return len(self.edges) == 1 and self.edges[0].from_ == "start" + + +class FlowTemplateStatement(ParserModel): + attachments: List[str] = [] + audio: str = "" + choices: List[str] = [] + condition: List[str] = Field(default_factory=list) + condition_value: List[str] = Field(default_factory=list) + condition_name: List[str] = Field(default_factory=list) + condition_type: List[str] = Field(default_factory=list) + condition_var: List[str] = Field(default_factory=list) + condition_variable: List[str] = Field(default_factory=list) + data_row_id: str = "" + data_sheet: str = "" + from_: List[str] = Field(alias="from", default_factory=list) + image: str = "" + include_if: str = "true" + loop_variable: List[str] = Field(default_factory=list) + message_text: str = "" + no_response: str = "" + nodeId: str = Field(alias="_nodeId", default="") + node_name: str = "" + obj_id: str = "" + obj_name: str = "" + row_id: str = "" + save_name: str = "" + template_arguments: list = [] + type: str = "" + video: str = "" + wa_template: WhatsAppTemplating = WhatsAppTemplating() + webhook: Webhook = Webhook() diff --git a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py index 0768a4c..9fce288 100644 --- a/src/rpft/parsers/universal.py +++ b/src/rpft/parsers/universal.py @@ -8,14 +8,12 @@ from benedict import benedict -from rpft.parsers.common.cellparser import CellParser +from rpft.parsers.common.cellparser import TemplatePreserver from rpft.parsers.common.rowparser import RowParser from rpft.parsers.common.sheetparser import SheetParser 
from rpft.parsers.creation.campaigneventrowmodel import CampaignEventRowModel -from rpft.parsers.creation.contentindexrowmodel import ( - ContentIndexRowModel, - CreateFlowRowModel, -) +from rpft.parsers.creation.contentindexrowmodel import ContentIndexRowModel +from rpft.parsers.creation.flowrowmodel import FlowTemplateStatement from rpft.parsers.creation.triggerrowmodel import TriggerRowModel from rpft.parsers.sheets import AbstractSheetReader, Sheet @@ -101,7 +99,7 @@ def parse_content_index(reader, name): def parse_sheet(model, sheet: Sheet): try: return SheetParser( - RowParser(model, CellParser()), + RowParser(model, TemplatePreserver()), sheet.table, context=None, ).parse_all() @@ -131,9 +129,9 @@ class ModelFinder: type_model_map = { "content_index": ContentIndexRowModel, "create_campaign": CampaignEventRowModel, - "create_flow": CreateFlowRowModel, + "create_flow": FlowTemplateStatement, "create_triggers": TriggerRowModel, - "template_definition": CreateFlowRowModel, + "template_definition": FlowTemplateStatement, } def __init__(self, module=None): @@ -274,8 +272,6 @@ def create_obj(pairs): obj = benedict() for kp, v in pairs: - # print("KP:", kp) - # print("V:", v) obj[kp] = v return obj diff --git a/tests/test_template_preserver.py b/tests/test_template_preserver.py new file mode 100644 index 0000000..99320a3 --- /dev/null +++ b/tests/test_template_preserver.py @@ -0,0 +1,55 @@ +from unittest import TestCase + +from rpft.parsers.common.cellparser import TemplatePreserver + + +class TestTemplatePreserver(TestCase): + + def test_templates_preserved_when_splitting_into_list(self): + parser = TemplatePreserver() + data = [ + ("{{test}}|", ["{{test}}"], "Single string template with pipe"), + ("{{ test }} | ", ["{{ test }}"], "Single string template pipe and spaces"), + ("{{test}};", ["{{test}}"], "Single string template with semi-colon"), + ( + "{{ test }} ; ", + ["{{ test }}"], + "Single string template with semi-colon and spaces", + ), + ( + "{@ test @} |", + ["{@ test @}"], + "Single native type template with pipe and spaces", + ), + ( + "{@ test @} ;", + ["{@ test @}"], + "Single native type template with semi-colon and spaces", + ), + ( + "{@ something @} | something | {{ blah }}", + ["{@ something @}", "something", "{{ blah }}"], + "Native type, string, string template, with pipes", + ), + ( + "{@ something @} ; something ; {{ blah }}", + ["{@ something @}", "something", "{{ blah }}"], + "Native type, string, string template, with semi-colons", + ), + ( + "{{3*(steps.values()|length -1)}};{{3*(steps.values()|length -1)+2}}", + [ + "{{3*(steps.values()|length -1)}}", + "{{3*(steps.values()|length -1)+2}}", + ], + "Real-life example", + ), + ("A | B | C", ["A", "B", "C"], "No templates"), + ] + + for string, expected, message in data: + self.assertEqual( + parser.split_into_lists(string), + expected, + message, + ) diff --git a/tests/test_universal.py b/tests/test_universal.py index 56fedcd..fb148da 100644 --- a/tests/test_universal.py +++ b/tests/test_universal.py @@ -447,7 +447,7 @@ def test_save_as_dict(self): { "row_id": "", "type": "send_message", - "from": "start", + "from": ["start"], "message_text": "Some text", }, ], From a79259037b5cf355c274d872a174cb0d570f1c80 Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Fri, 17 Jan 2025 18:42:45 +0000 Subject: [PATCH 09/33] Fix tests --- src/rpft/parsers/universal.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py index 9fce288..600ec37 100644 --- 
a/src/rpft/parsers/universal.py +++ b/src/rpft/parsers/universal.py @@ -99,8 +99,8 @@ def parse_content_index(reader, name): def parse_sheet(model, sheet: Sheet): try: return SheetParser( - RowParser(model, TemplatePreserver()), sheet.table, + row_parser=RowParser(model, TemplatePreserver()), context=None, ).parse_all() except Exception as e: @@ -236,11 +236,11 @@ def stream( headers: Sequence[str] = tuple(), rows: Sequence[Sequence[str]] = tuple(), ): - yield [(META_KEY, TABULATE_KEY, title, HEADERS_KEY), headers] + yield [META_KEY, TABULATE_KEY, title, HEADERS_KEY], headers for i, row in enumerate(rows): for h, v in zip(keypaths(headers), row): - yield [[title, i] + h, convert_cell(v)] + yield [title, i] + h, convert_cell(v) def keypaths(headers): From 34c83557c862d248f4308dc279c3f2cb063e4eb0 Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Mon, 27 Jan 2025 17:09:35 +0000 Subject: [PATCH 10/33] Parse cell content from existing sheets to nested JSON without models --- pyproject.toml | 1 + src/rpft/parsers/universal.py | 96 ++++++++++++++++++++++++++++++----- tests/test_universal.py | 84 ++++++++++++++++++++---------- 3 files changed, 140 insertions(+), 41 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 46b1df3..efde8d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ classifiers = [ "Topic :: Utilities", ] dependencies = [ + "lark", "Jinja2~=3.0.3", "google-api-python-client~=2.6.0", "google-auth-oauthlib~=0.4.4", diff --git a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py index 600ec37..730b750 100644 --- a/src/rpft/parsers/universal.py +++ b/src/rpft/parsers/universal.py @@ -7,6 +7,7 @@ from typing import Any, List from benedict import benedict +from lark import Lark, Transformer from rpft.parsers.common.cellparser import TemplatePreserver from rpft.parsers.common.rowparser import RowParser @@ -19,13 +20,42 @@ LOGGER = logging.getLogger(__name__) -KEY_VALUE_SEP = ":" +KEY_VALUE_SEP = ";" PROP_ACCESSOR = "." SEQ_ITEM_SEP = "|" -DEINDEX_PATTERN = re.compile(r"(.*)\.\d+") META_KEY = "_idems" TABULATE_KEY = "tabulate" HEADERS_KEY = "headers" +CELL_GRAMMAR = r""" +?start : TEMPLATE -> template + | seq + | item + +seq : (item? "|" item?)+ + +?item : subseq + | value + +subseq : (value? 
";" value?)+ + +?value : NUMBER -> number + | BOOLEAN -> boolean + | STRING -> string + | -> empty + +NUMBER : SIGNED_NUMBER + +BOOLEAN : "true" + | "false" + +TEMPLATE : /.*{[{@%].*/ + +STRING : /(\\[|;]|[^|;])+/ + +%import common (SIGNED_NUMBER, WS) +%ignore WS +""" +PARSER = Lark(CELL_GRAMMAR) def parse_legacy_sheets(models_module: str, reader: AbstractSheetReader) -> dict: @@ -277,7 +307,7 @@ def create_obj(pairs): return obj -def convert_cell(s: str, recursive=True) -> Any: +def convert_cell(s: str, delimiters=["|", ";"]) -> Any: if type(s) is not str: raise TypeError("Value to convert is not a string") @@ -296,22 +326,60 @@ def convert_cell(s: str, recursive=True) -> Any: if clean in ("true", "false"): return clean == "true" - if recursive and KEY_VALUE_SEP in s and SEQ_ITEM_SEP in s: - try: - props = [p.split(KEY_VALUE_SEP, 1) for p in s.split(SEQ_ITEM_SEP) if p] + if is_template(clean): + return clean + + delim, *delims = delimiters if delimiters else [None] + + if delim and delim in clean: + seq = [convert_cell(item, delimiters=delims) for item in clean.split(delim)] + + return seq[:-1] if clean and clean[-1] == delim else seq + + if any(s in clean for s in delims): + return convert_cell(clean, delimiters=delims) + + return clean - return {k.strip(): convert_cell(v, recursive=False) for k, v in props} + +def is_template(s: str) -> bool: + return bool(re.match("{{.*?}}|{@.*?@}|{%.*?%}", s)) + + +class CellTransformer(Transformer): + + seq = subseq = list + + def boolean(self, tokens): + return (tokens[0]).strip() == "true" + + def empty(self, tokens): + return "" + + def number(self, tokens): + token = (tokens[0]).strip() + + try: + return int(token) except Exception: pass - if recursive and SEQ_ITEM_SEP in s: try: - return [ - convert_cell(item, recursive=False) - for item in s.split(SEQ_ITEM_SEP) - if item - ] + return float(token) except Exception: pass - return clean + raise Exception(f"Conversion to number failed, token={token}") + + def string(self, tokens): + return re.sub(r"\\(.{1})", r"\g<1>", tokens[0].strip()) + + def template(self, tokens): + return self.string(tokens) + + +def parse_cell(cell: str) -> Any: + if type(cell) is not str: + raise TypeError("Value to convert must be a string") + + return CellTransformer().transform(PARSER.parse(cell)) if cell else "" diff --git a/tests/test_universal.py b/tests/test_universal.py index fb148da..3c4c690 100644 --- a/tests/test_universal.py +++ b/tests/test_universal.py @@ -4,6 +4,7 @@ from rpft.parsers.universal import ( convert_cell, create_workbook, + parse_cell, parse_legacy_sheets, parse_table, parse_tables, @@ -107,14 +108,14 @@ def test_objects_use_single_cell_layout_by_default(self): table = tabulate(data) - self.assertEqual(table[1], ["prop1: val1 | prop2: val2"]) + self.assertEqual(table[1], ["prop1; val1 | prop2; val2"]) def test_object_with_single_property_within_cell_has_trailing_separator(self): data = [{"obj": {"k": "v"}}] table = tabulate(data) - self.assertEqual(table[1], ["k: v |"]) + self.assertEqual(table[1], ["k; v |"]) def test_objects_use_wide_layout_if_indicated_by_metadata(self): meta = {"headers": ["obj1.k1", "obj1.k2", "seq1.1.k1", "seq1.2.k2"]} @@ -524,7 +525,6 @@ def test_columns_with_same_name_are_grouped_into_list(self): parsed = parse_table(headers=["a"] * 4, rows=[("one", "2", "true", "3.4")]) self.assertEqual(parsed["table"][0]["a"], ["one", 2, True, 3.4]) - # self.assertEqual(parsed["_idems"]["tabulate"]["table"]["a"]["layout"], "wide") def 
test_columns_with_same_name_and_delimited_strings_is_2d_array(self): parsed = parse_table(headers=["a"] * 2, rows=[("one | 2", "true | 3.4")]) @@ -551,45 +551,75 @@ def test_nested_object_with_2d_array_property_value(self): def test_nested_object_with_nested_object(self): parsed = parse_table( headers=["obj.k1"] * 2, - rows=[["k2: 2 | k3: false", "k4: v4 | k5: true"]], + rows=[["k2; 2 | k3; false", "k4; v4 | k5; true"]], ) self.assertEqual( parsed["table"][0]["obj"], - {"k1": [{"k2": 2, "k3": False}, {"k4": "v4", "k5": True}]}, + {"k1": [[["k2", 2], ["k3", False]], [["k4", "v4"], ["k5", True]]]}, ) class TestCellConversion(TestCase): + def setUp(self): + self.func = convert_cell + def test_convert_cell_string_to_number(self): - self.assertEqual(convert_cell("123"), 123) - self.assertEqual(convert_cell("1.23"), 1.23) + self.assertEqual(self.func("123"), 123) + self.assertEqual(self.func("1.23"), 1.23) def test_output_clean_string_if_no_conversion_possible(self): - self.assertEqual(convert_cell("one"), "one") - self.assertEqual(convert_cell(" one "), "one") - self.assertEqual(convert_cell(""), "") - self.assertEqual(convert_cell("http://example.com/"), "http://example.com/") - self.assertEqual(convert_cell("k1: v1"), "k1: v1") + self.assertEqual(self.func("one"), "one") + self.assertEqual(self.func(" one "), "one") + self.assertEqual(self.func(""), "") + self.assertEqual(self.func("http://example.com/"), "http://example.com/") + self.assertEqual(self.func("k1: v1"), "k1: v1") def test_raises_error_if_not_string_input(self): - self.assertRaises(TypeError, convert_cell, None) - self.assertRaises(TypeError, convert_cell, 123) + self.assertRaises(TypeError, self.func, None) + self.assertRaises(TypeError, self.func, 123) def test_convert_cell_string_to_bool(self): - self.assertEqual(convert_cell("true"), True) - self.assertEqual(convert_cell(" true "), True) - self.assertEqual(convert_cell("false"), False) + self.assertEqual(self.func("true"), True) + self.assertEqual(self.func(" true "), True) + self.assertEqual(self.func("false"), False) def test_convert_cell_string_to_list(self): - self.assertEqual(convert_cell("one | 2 | false"), ["one", 2, False]) - self.assertEqual(convert_cell("one |"), ["one"]) - self.assertEqual(convert_cell("|"), []) - self.assertEqual(convert_cell("| 2 |"), [2]) - self.assertEqual(convert_cell("k1 | v1 : k2 | v2"), ["k1", "v1 : k2", "v2"]) - - def test_convert_cell_string_to_dict(self): - self.assertEqual(convert_cell("k1: v1 |"), {"k1": "v1"}) - self.assertEqual(convert_cell("k1: k2: v2 |"), {"k1": "k2: v2"}) - self.assertEqual(convert_cell("k1: 1 | k2: true"), {"k1": 1, "k2": True}) + self.assertEqual(self.func("one | 2 | false"), ["one", 2, False]) + self.assertEqual(self.func("one ; 2 ; false"), ["one", 2, False]) + self.assertEqual(self.func("one |"), ["one"]) + self.assertEqual(self.func("|"), [""]) + self.assertEqual(self.func("| 2 |"), ["", 2]) + self.assertEqual(self.func("a||"), ["a", ""]) + self.assertEqual(self.func("k1 | v1 : k2 | v2"), ["k1", "v1 : k2", "v2"]) + + def test_convert_cell_string_to_list_of_lists(self): + self.assertEqual(self.func("k1; v1 |"), [["k1", "v1"]]) + self.assertEqual(self.func("k1; k2; v2 |"), [["k1", "k2", "v2"]]) + self.assertEqual(self.func("k1; 1 | k2; true"), [["k1", 1], ["k2", True]]) + + def test_inline_templates_are_preserved(self): + self.assertEqual(self.func("{{ template }}"), "{{ template }}") + self.assertEqual(self.func("{@ template @}"), "{@ template @}") + self.assertEqual( + self.func("{% if other_option!=" 
"%}1wc;1wt;1wb{%endif-%}"), + "{% if other_option!=" "%}1wc;1wt;1wb{%endif-%}", + ) + self.assertEqual(self.func("{{ template }} |"), "{{ template }} |") + self.assertEqual( + self.func("{{ template }} | something | {{ blah }}"), + "{{ template }} | something | {{ blah }}", + ) + self.assertEqual( + self.func( + "{{3*(steps.values()|length -1)}}|{{3*(steps.values()|length -1)+2}}" + ), + "{{3*(steps.values()|length -1)}}|{{3*(steps.values()|length -1)+2}}", + ) + + +class TestLarkCellConversion(TestCellConversion): + + def setUp(self): + self.func = parse_cell From bcd708ffcd0817363db7b54d8e740e1e9e83076a Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Tue, 28 Jan 2025 20:53:16 +0000 Subject: [PATCH 11/33] Remove legacy sheets to universal format using models --- src/rpft/cli.py | 42 ----- src/rpft/converters.py | 11 +- src/rpft/parsers/universal.py | 132 +--------------- tests/test_universal.py | 283 ---------------------------------- 4 files changed, 2 insertions(+), 466 deletions(-) diff --git a/src/rpft/cli.py b/src/rpft/cli.py index 0a2961f..d61f9f0 100644 --- a/src/rpft/cli.py +++ b/src/rpft/cli.py @@ -39,17 +39,6 @@ def flows_to_sheets(args): ) -def legacy_sheets_to_uni(args): - data: dict = converters.legacy_sheets_to_uni( - args.input, - args.format, - data_models=args.models, - ) - - with open(args.output, "w", encoding="utf-8") as export: - json.dump(data, export, indent=2) - - def uni_to_sheets(args): with open(args.output, "wb") as handle: handle.write(converters.uni_to_sheets(args.input)) @@ -75,7 +64,6 @@ def create_parser(): _add_create_command(sub) _add_convert_command(sub) _add_flows_to_sheets_command(sub) - _add_legacy_to_uni_command(sub) _add_uni_to_sheets_command(sub) _add_sheets_to_uni_command(sub) @@ -189,36 +177,6 @@ def _add_flows_to_sheets_command(sub): ) -def _add_legacy_to_uni_command(sub): - parser = sub.add_parser( - "legacy-to-uni", - help="convert legacy sheets to nested JSON", - ) - - parser.set_defaults(func=legacy_sheets_to_uni) - parser.add_argument( - "input", - help=( - "location of workbook (xlsx, Google Sheets) or directory containing CSVs" - ), - ) - parser.add_argument( - "output", - help=("location where JSON output file will be saved"), - ) - parser.add_argument( - "--models", - help=("name of the Python module defining user data models"), - ) - parser.add_argument( - "-f", - "--format", - choices=["csv", "google_sheets", "xlsx"], - help="input sheet format", - required=True, - ) - - def _add_uni_to_sheets_command(sub): parser = sub.add_parser( "uni-to-sheets", diff --git a/src/rpft/converters.py b/src/rpft/converters.py index c35a113..b5df6db 100644 --- a/src/rpft/converters.py +++ b/src/rpft/converters.py @@ -6,7 +6,7 @@ import sys from pathlib import Path -from rpft.parsers.universal import create_workbook, parse_legacy_sheets, parse_tables +from rpft.parsers.universal import create_workbook, parse_tables from rpft.parsers.creation.contentindexparser import ContentIndexParser from rpft.parsers.creation.tagmatcher import TagMatcher from rpft.parsers.sheets import ( @@ -53,15 +53,6 @@ def create_flows(input_files, output_file, sheet_format, data_models=None, tags= return flows -def legacy_sheets_to_uni(in_file, sheet_format, data_models=None) -> dict: - """ - Convert legacy data sheets to universal format - """ - reader = create_sheet_reader(sheet_format, in_file) - - return parse_legacy_sheets(data_models, reader) - - def uni_to_sheets(infile) -> bytes: with open(infile, "r") as handle: data = json.load(handle) diff --git 
a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py index 730b750..e393e1a 100644 --- a/src/rpft/parsers/universal.py +++ b/src/rpft/parsers/universal.py @@ -1,4 +1,3 @@ -import importlib import logging import re from collections import defaultdict @@ -9,14 +8,7 @@ from benedict import benedict from lark import Lark, Transformer -from rpft.parsers.common.cellparser import TemplatePreserver -from rpft.parsers.common.rowparser import RowParser -from rpft.parsers.common.sheetparser import SheetParser -from rpft.parsers.creation.campaigneventrowmodel import CampaignEventRowModel -from rpft.parsers.creation.contentindexrowmodel import ContentIndexRowModel -from rpft.parsers.creation.flowrowmodel import FlowTemplateStatement -from rpft.parsers.creation.triggerrowmodel import TriggerRowModel -from rpft.parsers.sheets import AbstractSheetReader, Sheet +from rpft.parsers.sheets import AbstractSheetReader LOGGER = logging.getLogger(__name__) @@ -58,128 +50,6 @@ PARSER = Lark(CELL_GRAMMAR) -def parse_legacy_sheets(models_module: str, reader: AbstractSheetReader) -> dict: - """ - Convert multiple sheets in the legacy format into a nested data structure - """ - content_index: List[ContentIndexRowModel] = { - entry.sheet_name[0]: entry - for entry in parse_content_index( - reader, - "content_index", - ) - if len(entry.sheet_name) == 1 - } - model_finder = ModelFinder(models_module) - data = {} - meta = {} - unconverted = [] - - for name, sheet in reader.sheets.items(): - if name in content_index: - model = model_finder.find_for_entry(content_index[name]) - - if sheet and model: - data[name] = to_dicts(parse_sheet(model, sheet)) - else: - LOGGER.warning( - "%s not found, %s", - "Sheet" if not sheet else "Model", - {"sheet": name, "model": model}, - ) - elif name == "content_index": - data[name] = to_dicts(parse_sheet(ContentIndexRowModel, sheet)) - else: - data[name] = [list(sheet.table.headers)] + [list(r) for r in sheet.table] - unconverted += [name] - - meta[name] = {HEADERS_KEY: sheet.table.headers} - - LOGGER.info( - str( - { - "index": {"count": len(data)}, - "sheets": {"count": len(reader.sheets)}, - "unconverted": {"count": len(unconverted), "names": unconverted}, - } - ) - ) - - data[META_KEY] = {TABULATE_KEY: meta} - - return data - - -def parse_content_index(reader, name): - content_index: List[ContentIndexRowModel] = parse_sheet( - ContentIndexRowModel, - reader.get_sheet(name), - ) - acc = [] - - for entry in content_index: - acc += [entry] - - if entry.type == "content_index": - acc += parse_content_index(reader, entry.sheet_name[0]) - - return acc - - -def parse_sheet(model, sheet: Sheet): - try: - return SheetParser( - sheet.table, - row_parser=RowParser(model, TemplatePreserver()), - context=None, - ).parse_all() - except Exception as e: - raise Exception( - "Parse failed", - {"sheet": sheet.name if sheet else None, "model": model}, - e, - ) - - -def to_dicts(instances): - objs = [] - - for instance in instances: - obj = instance.dict(by_alias=True, exclude_unset=True) - - if "template_argument_definitions" in obj: - obj["template_arguments"] = obj.pop("template_argument_definitions") - - objs += [obj] - - return objs - - -class ModelFinder: - type_model_map = { - "content_index": ContentIndexRowModel, - "create_campaign": CampaignEventRowModel, - "create_flow": FlowTemplateStatement, - "create_triggers": TriggerRowModel, - "template_definition": FlowTemplateStatement, - } - - def __init__(self, module=None): - self._module = importlib.import_module(module) if module else 
None - - def find_for_entry(self, entry): - if entry.type in self.type_model_map: - return self.type_model_map.get(entry.type) - - if entry.data_model: - try: - return getattr(self._module, entry.data_model) - except AttributeError: - pass - - return None - - def create_workbook(data: dict) -> list: meta = data.pop(META_KEY, {}).get(TABULATE_KEY, {}) diff --git a/tests/test_universal.py b/tests/test_universal.py index 3c4c690..766c4b7 100644 --- a/tests/test_universal.py +++ b/tests/test_universal.py @@ -5,7 +5,6 @@ convert_cell, create_workbook, parse_cell, - parse_legacy_sheets, parse_table, parse_tables, tabulate, @@ -185,288 +184,6 @@ def test_assembly(self): ) -class TestConvertLegacyToUniversal(TestCase): - def test_skip_non_existent_sheets(self): - datasets = [ - Dataset( - ("create_flow", "my_basic_flow"), - headers=("type", "sheet_name"), - title="content_index", - ) - ] - - output = parse_legacy_sheets( - "tests.datarowmodels.nestedmodel", - DatasetSheetReader(datasets), - ) - - self.assertIn("content_index", output) - self.assertNotIn("my_basic_flow", output) - - def test_skip_evaluation_of_cells_with_templates(self): - datasets = [ - Dataset( - ("template_definition", "my_template"), - headers=("type", "sheet_name"), - title="content_index", - ), - Dataset( - ("send_message", "start", "Hello, {{name}}"), - headers=("type", "from", "message_text"), - title="my_template", - ), - ] - - output = parse_legacy_sheets( - "tests.datarowmodels.nestedmodel", - DatasetSheetReader(datasets), - ) - - self.assertEqual( - output["my_template"][0]["message_text"], - "Hello, {{name}}", - "Template notation must remain intact", - ) - - def test_unparseable_sheets_are_converted_to_2d_array(self): - datasets = [ - Dataset( - headers=("type", "sheet_name"), - title="content_index", - ), - Dataset( - ("", "b2"), - ("c1", ""), - ("d1", "d2"), - headers=("", ""), - title="unparseable", - ), - ] - - output = parse_legacy_sheets( - "tests.datarowmodels.nestedmodel", - DatasetSheetReader(datasets), - ) - - self.assertEqual( - output["unparseable"], - [["", ""], ["", "b2"], ["c1", ""], ["d1", "d2"]], - "Data should be 2-dimensional array of strings", - ) - - def test_process_content_indices_recursively(self): - datasets = [ - Dataset( - ("content_index", "sub_index"), - headers=("type", "sheet_name"), - title="content_index", - ), - Dataset( - ("data_sheet", "simpledata", "ListRowModel"), - headers=("type", "sheet_name", "data_model"), - title="sub_index", - ), - Dataset( - ("rowID", "val1", "val2"), - headers=("ID", "list_value.1", "list_value.2"), - title="simpledata", - ), - ] - - output = parse_legacy_sheets( - "tests.datarowmodels.nestedmodel", - DatasetSheetReader(datasets), - ) - - self.assertEqual( - output["simpledata"][0]["list_value"], - ["val1", "val2"], - "Data should be converted to nested form because all content indices should" - " have been processed", - ) - - def test_sheet_order_is_preserved(self): - datasets = [ - Dataset( - ("data_sheet", "sheet_2", "SimpleRowModel"), - ("data_sheet", "sheet_3", "SimpleRowModel"), - headers=("type", "sheet_name", "data_model"), - title="content_index", - ), - Dataset( - ("val1", "val2"), - headers=("value1", "value2"), - title="sheet_3", - ), - Dataset( - ("val1", "val2"), - headers=("value1", "value2"), - title="sheet_2", - ), - ] - - output = parse_legacy_sheets( - "tests.datarowmodels.simplemodel", - DatasetSheetReader(datasets), - ) - del output["_idems"] - - self.assertEqual( - list(output.keys()), - ["content_index", "sheet_3", "sheet_2"], - 
"Order of keys should be same as in workbook", - ) - - def test_original_column_headers_are_preserved(self): - datasets = [ - Dataset( - ("data_sheet", "sheet_2", "SimpleRowModel"), - headers=("type", "sheet_name", "data_model"), - title="content_index", - ), - Dataset( - ("val2", "val1"), - headers=("value2", "value1"), - title="sheet_2", - ), - ] - - output = parse_legacy_sheets( - "tests.datarowmodels.simplemodel", - DatasetSheetReader(datasets), - ) - - self.assertEqual( - output["_idems"]["tabulate"]["sheet_2"]["headers"], - ["value2", "value1"], - "Original column headers should be stored as metadata", - ) - - def test_save_as_dict(self): - # TODO: Break up this test into smaller, more manageable pieces - self.maxDiff = None - datasets = [ - Dataset( - ("data_sheet", "simpledata", "simpledata_new", "ListRowModel", ""), - ("create_flow", "my_basic_flow", "", "", ""), - ("data_sheet", "nesteddata", "", "NestedRowModel", ""), - ("create_campaign", "my_campaign", "", "", "grp1"), - headers=("type", "sheet_name", "new_name", "data_model", "group"), - title="content_index", - ), - Dataset( - ("rowID", "val1", "val2"), - headers=("ID", "list_value.1", "list_value.2"), - title="simpledata", - ), - Dataset( - ("row1", "Value1", "Happy1", "Sad1"), - ("row2", "Value2", "Happy2", "Sad2"), - headers=("ID", "value1", "custom_field.happy", "custom_field.sad"), - title="nesteddata", - ), - Dataset( - ("", "send_message", "start", "Some text"), - headers=("row_id", "type", "from", "message_text"), - title="my_basic_flow", - ), - Dataset( - ("15", "H", "F", "Last Seen On", "I", "my_basic_flow"), - headers=( - "offset", - "unit", - "event_type", - "relative_to", - "start_mode", - "flow", - ), - title="my_campaign", - ), - ] - - output = parse_legacy_sheets( - "tests.datarowmodels.nestedmodel", - DatasetSheetReader(datasets), - ) - del output["_idems"] - exp = { - "content_index": [ - { - "type": "data_sheet", - "sheet_name": ["simpledata"], - "new_name": "simpledata_new", - "data_model": "ListRowModel", - "group": "", - }, - { - "type": "create_flow", - "sheet_name": ["my_basic_flow"], - "new_name": "", - "data_model": "", - "group": "", - }, - { - "type": "data_sheet", - "sheet_name": ["nesteddata"], - "new_name": "", - "data_model": "NestedRowModel", - "group": "", - }, - { - "type": "create_campaign", - "sheet_name": ["my_campaign"], - "new_name": "", - "data_model": "", - "group": "grp1", - }, - ], - "simpledata": [ - { - "ID": "rowID", - "list_value": ["val1", "val2"], - } - ], - "nesteddata": [ - { - "ID": "row1", - "value1": "Value1", - "custom_field": { - "happy": "Happy1", - "sad": "Sad1", - }, - }, - { - "ID": "row2", - "value1": "Value2", - "custom_field": { - "happy": "Happy2", - "sad": "Sad2", - }, - }, - ], - "my_basic_flow": [ - { - "row_id": "", - "type": "send_message", - "from": ["start"], - "message_text": "Some text", - }, - ], - "my_campaign": [ - { - "offset": "15", - "unit": "H", - "event_type": "F", - "relative_to": "Last Seen On", - "start_mode": "I", - "flow": "my_basic_flow", - }, - ], - } - - self.assertEqual(output, exp) - - class TestConvertWorkbookToUniversal(TestCase): def test_workbook_converts_to_object(self): From f6201bb8281739a5f41a434ee3342eab19ad01c2 Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Tue, 28 Jan 2025 23:12:22 +0000 Subject: [PATCH 12/33] Clean up uni to sheets conversion --- src/rpft/parsers/universal.py | 26 ++++++++++++-------------- tests/test_universal.py | 25 ++++++++++++++----------- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git 
a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py index e393e1a..ed53cec 100644 --- a/src/rpft/parsers/universal.py +++ b/src/rpft/parsers/universal.py @@ -12,9 +12,9 @@ LOGGER = logging.getLogger(__name__) -KEY_VALUE_SEP = ";" +DELIM_LVL_1 = "|" +DELIM_LVL_2 = ";" PROP_ACCESSOR = "." -SEQ_ITEM_SEP = "|" META_KEY = "_idems" TABULATE_KEY = "tabulate" HEADERS_KEY = "headers" @@ -51,26 +51,24 @@ def create_workbook(data: dict) -> list: - meta = data.pop(META_KEY, {}).get(TABULATE_KEY, {}) + meta = data.get(META_KEY, {}).get(TABULATE_KEY, {}) - return [(k, tabulate(v, meta.get(k, {}))) for k, v in data.items()] + return [(k, tabulate(v, meta.get(k, {}))) for k, v in data.items() if k != META_KEY] def tabulate(data, meta: dict = {}) -> List[List[str]]: """ Convert a nested data structure to a tabular form """ - if all(type(item) is list for item in data): - return data - headers = meta.get(HEADERS_KEY, []) or list( - {k: None for item in data for k, v in item.items()}.keys() + {k: None for item in data for k, _ in item.items()}.keys() ) + paths = keypaths(headers) rows = [] for item in data: obj = benedict(item) - rows += [[stringify(obj[kp]) for kp in keypaths(headers)]] + rows += [[stringify(obj[kp]) for kp in paths]] return [headers] + rows @@ -83,19 +81,19 @@ def stringify(value) -> str: @stringify.register def _(value: dict) -> str: - s = " | ".join( - f"{stringify(k)}{KEY_VALUE_SEP} {stringify(v)}" for k, v in value.items() + s = f" {DELIM_LVL_1} ".join( + f"{stringify(k)}{DELIM_LVL_2} {stringify(v)}" for k, v in value.items() ) if len(value) == 1: - s += " " + SEQ_ITEM_SEP + s += " " + DELIM_LVL_1 return s @stringify.register def _(value: list) -> str: - return " | ".join(stringify(i) for i in value) + return f" {DELIM_LVL_1} ".join(stringify(i) for i in value) @stringify.register @@ -177,7 +175,7 @@ def create_obj(pairs): return obj -def convert_cell(s: str, delimiters=["|", ";"]) -> Any: +def convert_cell(s: str, delimiters=[DELIM_LVL_1, DELIM_LVL_2]) -> Any: if type(s) is not str: raise TypeError("Value to convert is not a string") diff --git a/tests/test_universal.py b/tests/test_universal.py index 766c4b7..64a98ef 100644 --- a/tests/test_universal.py +++ b/tests/test_universal.py @@ -142,17 +142,6 @@ def test_objects_use_wide_layout_if_indicated_by_metadata(self): ["obj1_k1_v", "obj1_k2_v", "seq1_k1_v", "seq1_k2_v"], ) - def test_2d_arrays_are_passed_through(self): - meta = {"headers": ["A", "B"]} - data = [ - ["A", "B"], - ["a1", "b1"], - ] - - table = tabulate(data, meta) - - self.assertEqual(table, data) - # TODO: test pointers/references # TODO: add explicit type information # TODO: integrate zero-knowledge type inference @@ -182,6 +171,20 @@ def test_assembly(self): [["B", "A"], ["B1", "A1"]], "Columns should be ordered according to metadata", ) + self.assertEqual( + data, + { + "group1": [{"a": "a1", "b": "b1"}], + "group2": [{"A": "A1", "B": "B1"}], + "_idems": { + "tabulate": { + "group1": {"headers": ["a", "b"]}, + "group2": {"headers": ["B", "A"]}, + }, + }, + }, + "Input data should not be mutated" + ) class TestConvertWorkbookToUniversal(TestCase): From c96f511e3333c33075f3d5c52da44032e79c410e Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Tue, 28 Jan 2025 23:21:21 +0000 Subject: [PATCH 13/33] Remove template preserver --- src/rpft/parsers/common/cellparser.py | 14 ------- tests/test_template_preserver.py | 55 --------------------------- 2 files changed, 69 deletions(-) delete mode 100644 tests/test_template_preserver.py diff --git 
a/src/rpft/parsers/common/cellparser.py b/src/rpft/parsers/common/cellparser.py index 4192c73..2aeda14 100644 --- a/src/rpft/parsers/common/cellparser.py +++ b/src/rpft/parsers/common/cellparser.py @@ -153,17 +153,3 @@ def unescape(nested_list): if type(nested_list) is str else [unescape(item) for item in nested_list] ) - - -class TemplatePreserver(CellParser): - - def split_into_lists(self, string): - if "{{" in string or "{@" in string: - return [ - item.strip() - for item in re.findall( - r"(\{[{@][^\{\}]*[}@]\}|[^{}@|;]+) *[;|]? *", string - ) - ] - else: - return super().split_into_lists(string) diff --git a/tests/test_template_preserver.py b/tests/test_template_preserver.py deleted file mode 100644 index 99320a3..0000000 --- a/tests/test_template_preserver.py +++ /dev/null @@ -1,55 +0,0 @@ -from unittest import TestCase - -from rpft.parsers.common.cellparser import TemplatePreserver - - -class TestTemplatePreserver(TestCase): - - def test_templates_preserved_when_splitting_into_list(self): - parser = TemplatePreserver() - data = [ - ("{{test}}|", ["{{test}}"], "Single string template with pipe"), - ("{{ test }} | ", ["{{ test }}"], "Single string template pipe and spaces"), - ("{{test}};", ["{{test}}"], "Single string template with semi-colon"), - ( - "{{ test }} ; ", - ["{{ test }}"], - "Single string template with semi-colon and spaces", - ), - ( - "{@ test @} |", - ["{@ test @}"], - "Single native type template with pipe and spaces", - ), - ( - "{@ test @} ;", - ["{@ test @}"], - "Single native type template with semi-colon and spaces", - ), - ( - "{@ something @} | something | {{ blah }}", - ["{@ something @}", "something", "{{ blah }}"], - "Native type, string, string template, with pipes", - ), - ( - "{@ something @} ; something ; {{ blah }}", - ["{@ something @}", "something", "{{ blah }}"], - "Native type, string, string template, with semi-colons", - ), - ( - "{{3*(steps.values()|length -1)}};{{3*(steps.values()|length -1)+2}}", - [ - "{{3*(steps.values()|length -1)}}", - "{{3*(steps.values()|length -1)+2}}", - ], - "Real-life example", - ), - ("A | B | C", ["A", "B", "C"], "No templates"), - ] - - for string, expected, message in data: - self.assertEqual( - parser.split_into_lists(string), - expected, - message, - ) From 9d27fe94f5db4706bf7743e3b584dd8329aac5aa Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Fri, 31 Jan 2025 14:02:00 +0000 Subject: [PATCH 14/33] Tidy conversion to sheets; fix bugs --- src/rpft/converters.py | 8 ++++---- src/rpft/parsers/universal.py | 28 ++++++++++++++++++++-------- tests/test_universal.py | 35 +++++++++++++++++++++++++++++------ 3 files changed, 53 insertions(+), 18 deletions(-) diff --git a/src/rpft/converters.py b/src/rpft/converters.py index b5df6db..d32d383 100644 --- a/src/rpft/converters.py +++ b/src/rpft/converters.py @@ -6,7 +6,7 @@ import sys from pathlib import Path -from rpft.parsers.universal import create_workbook, parse_tables +from rpft.parsers.universal import bookify, parse_tables from rpft.parsers.creation.contentindexparser import ContentIndexParser from rpft.parsers.creation.tagmatcher import TagMatcher from rpft.parsers.sheets import ( @@ -57,11 +57,11 @@ def uni_to_sheets(infile) -> bytes: with open(infile, "r") as handle: data = json.load(handle) - sheets = create_workbook(data) + sheets = bookify(data) book = Databook( [ - Dataset(*sheet[1][1:], headers=sheet[1][0], title=sheet[0]) - for sheet in sheets + Dataset(*table[1:], headers=table[0], title=name) + for name, table in sheets ] ) diff --git 
a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py index ed53cec..52b5399 100644 --- a/src/rpft/parsers/universal.py +++ b/src/rpft/parsers/universal.py @@ -3,7 +3,7 @@ from collections import defaultdict from collections.abc import Sequence from functools import singledispatch -from typing import Any, List +from typing import Any, TypeAlias, Union from benedict import benedict from lark import Lark, Transformer @@ -48,15 +48,20 @@ %ignore WS """ PARSER = Lark(CELL_GRAMMAR) +Table: TypeAlias = list[list[str]] +Book: TypeAlias = list[tuple[str, Table]] -def create_workbook(data: dict) -> list: +def bookify(data: dict) -> Book: + """ + Convert a dict into a 'book' - a list of named tables. + """ meta = data.get(META_KEY, {}).get(TABULATE_KEY, {}) return [(k, tabulate(v, meta.get(k, {}))) for k, v in data.items() if k != META_KEY] -def tabulate(data, meta: dict = {}) -> List[List[str]]: +def tabulate(data, meta: dict = {}) -> Table: """ Convert a nested data structure to a tabular form """ @@ -74,12 +79,12 @@ def tabulate(data, meta: dict = {}) -> List[List[str]]: @singledispatch -def stringify(value) -> str: +def stringify(value, **_) -> str: return str(value) @stringify.register -def _(value: dict) -> str: +def _(value: dict, **_) -> str: s = f" {DELIM_LVL_1} ".join( f"{stringify(k)}{DELIM_LVL_2} {stringify(v)}" for k, v in value.items() @@ -92,12 +97,19 @@ def _(value: dict) -> str: @stringify.register -def _(value: list) -> str: - return f" {DELIM_LVL_1} ".join(stringify(i) for i in value) +def _(value: Union[list, tuple], delimiters=[DELIM_LVL_1, DELIM_LVL_2]) -> str: + delim, *delims = delimiters if delimiters else [None] + + if not delim: + raise Exception("Value is too deeply nested") + + s = f" {delim} ".join(stringify(i, delimiters=delims) for i in value) + + return f"{s} {delim}" if len(value) == 1 else s @stringify.register -def _(value: bool) -> str: +def _(value: bool, **_) -> str: return str(value).lower() diff --git a/tests/test_universal.py b/tests/test_universal.py index 64a98ef..64f63e6 100644 --- a/tests/test_universal.py +++ b/tests/test_universal.py @@ -2,8 +2,8 @@ from rpft.parsers.sheets import DatasetSheetReader from rpft.parsers.universal import ( + bookify, convert_cell, - create_workbook, parse_cell, parse_table, parse_tables, @@ -66,14 +66,37 @@ def test_columns_can_be_ordered_by_metadata(self): def test_arrays_use_single_cell_layout_by_default(self): data = [ - { - "choices": ["yes", "no", 1, False], - }, + {"h1": ["yes", "no", 1, False]}, + {"h1": ("yes", "no", 1, False)}, ] table = tabulate(data) self.assertEqual(table[1], ["yes | no | 1 | false"]) + self.assertEqual(table[2], ["yes | no | 1 | false"]) + + def test_single_item_array(self): + data = [{"k1": ["seq1v1"]}] + + table = tabulate(data) + + self.assertEqual(table[1][0], "seq1v1 |") + + def test_nested_arrays_within_a_single_cell(self): + data = [ + {"k1": ["seq1v1", ["seq2v1", "seq2v2"]]}, + ] + + table = tabulate(data) + + self.assertEqual(table[1][0], "seq1v1 | seq2v1 ; seq2v2") + + def test_raise_exception_if_too_much_nesting_for_a_single_cell(self): + data = [ + {"k1": ["seq1v1", ["seq2v1", ["seq3v1"]]]}, + ] + + self.assertRaises(Exception, tabulate, data) def test_arrays_use_wide_layout_if_indicated_by_metadata(self): meta = { @@ -160,7 +183,7 @@ def test_assembly(self): }, } - workbook = create_workbook(data) + workbook = bookify(data) self.assertEqual(len(workbook), 2) self.assertEqual(workbook[0][0], "group1") @@ -183,7 +206,7 @@ def test_assembly(self): }, }, }, - "Input data 
should not be mutated" + "Input data should not be mutated", ) From e8d5580271edffffba3a4e6e84060211de0c0441 Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Fri, 31 Jan 2025 14:48:48 +0000 Subject: [PATCH 15/33] Support Python v3.9 --- src/rpft/parsers/universal.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py index 52b5399..e1dbb7d 100644 --- a/src/rpft/parsers/universal.py +++ b/src/rpft/parsers/universal.py @@ -3,7 +3,7 @@ from collections import defaultdict from collections.abc import Sequence from functools import singledispatch -from typing import Any, TypeAlias, Union +from typing import Any from benedict import benedict from lark import Lark, Transformer @@ -48,8 +48,8 @@ %ignore WS """ PARSER = Lark(CELL_GRAMMAR) -Table: TypeAlias = list[list[str]] -Book: TypeAlias = list[tuple[str, Table]] +Table = list[list[str]] +Book = list[tuple[str, Table]] def bookify(data: dict) -> Book: @@ -97,7 +97,7 @@ def _(value: dict, **_) -> str: @stringify.register -def _(value: Union[list, tuple], delimiters=[DELIM_LVL_1, DELIM_LVL_2]) -> str: +def _(value: list, delimiters=[DELIM_LVL_1, DELIM_LVL_2]) -> str: delim, *delims = delimiters if delimiters else [None] if not delim: @@ -108,6 +108,11 @@ def _(value: Union[list, tuple], delimiters=[DELIM_LVL_1, DELIM_LVL_2]) -> str: return f"{s} {delim}" if len(value) == 1 else s +@stringify.register +def _(value: tuple, delimiters=[DELIM_LVL_1, DELIM_LVL_2]) -> str: + return stringify(list(value)) + + @stringify.register def _(value: bool, **_) -> str: return str(value).lower() From 1b372f0d39dc52ff4e36354bf60dcc6bd66cd325 Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Fri, 31 Jan 2025 15:13:53 +0000 Subject: [PATCH 16/33] Remove type hints from parse_table --- src/rpft/parsers/universal.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py index e1dbb7d..c4483e1 100644 --- a/src/rpft/parsers/universal.py +++ b/src/rpft/parsers/universal.py @@ -1,7 +1,6 @@ import logging import re from collections import defaultdict -from collections.abc import Sequence from functools import singledispatch from typing import Any @@ -85,7 +84,6 @@ def stringify(value, **_) -> str: @stringify.register def _(value: dict, **_) -> str: - s = f" {DELIM_LVL_1} ".join( f"{stringify(k)}{DELIM_LVL_2} {stringify(v)}" for k, v in value.items() ) @@ -130,11 +128,7 @@ def parse_tables(reader: AbstractSheetReader) -> dict: return obj -def parse_table( - title: str = None, - headers: Sequence[str] = tuple(), - rows: Sequence[Sequence[str]] = tuple(), -): +def parse_table(title: str = None, headers=tuple(), rows=tuple()): """ Parse data in tabular form into a nested structure """ @@ -146,11 +140,7 @@ def parse_table( return create_obj(stream(title, headers, rows)) -def stream( - title: str = None, - headers: Sequence[str] = tuple(), - rows: Sequence[Sequence[str]] = tuple(), -): +def stream(title: str = None, headers=tuple(), rows=tuple()): yield [META_KEY, TABULATE_KEY, title, HEADERS_KEY], headers for i, row in enumerate(rows): From ab6cde7e43a4295d461c70caaa0d1580e2ff073f Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Fri, 31 Jan 2025 15:20:28 +0000 Subject: [PATCH 17/33] Remove Lark-based cell parser --- src/rpft/parsers/universal.py | 70 ----------------------------------- tests/test_universal.py | 7 ---- 2 files changed, 77 deletions(-) diff --git a/src/rpft/parsers/universal.py 
b/src/rpft/parsers/universal.py index c4483e1..844c144 100644 --- a/src/rpft/parsers/universal.py +++ b/src/rpft/parsers/universal.py @@ -5,7 +5,6 @@ from typing import Any from benedict import benedict -from lark import Lark, Transformer from rpft.parsers.sheets import AbstractSheetReader @@ -17,36 +16,6 @@ META_KEY = "_idems" TABULATE_KEY = "tabulate" HEADERS_KEY = "headers" -CELL_GRAMMAR = r""" -?start : TEMPLATE -> template - | seq - | item - -seq : (item? "|" item?)+ - -?item : subseq - | value - -subseq : (value? ";" value?)+ - -?value : NUMBER -> number - | BOOLEAN -> boolean - | STRING -> string - | -> empty - -NUMBER : SIGNED_NUMBER - -BOOLEAN : "true" - | "false" - -TEMPLATE : /.*{[{@%].*/ - -STRING : /(\\[|;]|[^|;])+/ - -%import common (SIGNED_NUMBER, WS) -%ignore WS -""" -PARSER = Lark(CELL_GRAMMAR) Table = list[list[str]] Book = list[tuple[str, Table]] @@ -219,42 +188,3 @@ def convert_cell(s: str, delimiters=[DELIM_LVL_1, DELIM_LVL_2]) -> Any: def is_template(s: str) -> bool: return bool(re.match("{{.*?}}|{@.*?@}|{%.*?%}", s)) - - -class CellTransformer(Transformer): - - seq = subseq = list - - def boolean(self, tokens): - return (tokens[0]).strip() == "true" - - def empty(self, tokens): - return "" - - def number(self, tokens): - token = (tokens[0]).strip() - - try: - return int(token) - except Exception: - pass - - try: - return float(token) - except Exception: - pass - - raise Exception(f"Conversion to number failed, token={token}") - - def string(self, tokens): - return re.sub(r"\\(.{1})", r"\g<1>", tokens[0].strip()) - - def template(self, tokens): - return self.string(tokens) - - -def parse_cell(cell: str) -> Any: - if type(cell) is not str: - raise TypeError("Value to convert must be a string") - - return CellTransformer().transform(PARSER.parse(cell)) if cell else "" diff --git a/tests/test_universal.py b/tests/test_universal.py index 64f63e6..60a21dd 100644 --- a/tests/test_universal.py +++ b/tests/test_universal.py @@ -4,7 +4,6 @@ from rpft.parsers.universal import ( bookify, convert_cell, - parse_cell, parse_table, parse_tables, tabulate, @@ -360,9 +359,3 @@ def test_inline_templates_are_preserved(self): ), "{{3*(steps.values()|length -1)}}|{{3*(steps.values()|length -1)+2}}", ) - - -class TestLarkCellConversion(TestCellConversion): - - def setUp(self): - self.func = parse_cell From c2e1fb09459944de0dfb2538afa2e79eea2d87d8 Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Fri, 31 Jan 2025 15:27:11 +0000 Subject: [PATCH 18/33] Remove lark package --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index efde8d1..46b1df3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,6 @@ classifiers = [ "Topic :: Utilities", ] dependencies = [ - "lark", "Jinja2~=3.0.3", "google-api-python-client~=2.6.0", "google-auth-oauthlib~=0.4.4", From 70f8dc869283f78db00d0ca7a50861b03fbef34d Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Fri, 31 Jan 2025 16:11:54 +0000 Subject: [PATCH 19/33] Remove non-essential changes --- src/rpft/converters.py | 10 ------- src/rpft/parsers/common/rowparser.py | 14 +++------- src/rpft/parsers/common/sheetparser.py | 10 +++---- src/rpft/parsers/creation/flowrowmodel.py | 33 ----------------------- tests/test_universal.py | 4 --- 5 files changed, 8 insertions(+), 63 deletions(-) diff --git a/src/rpft/converters.py b/src/rpft/converters.py index d32d383..da6d29d 100644 --- a/src/rpft/converters.py +++ b/src/rpft/converters.py @@ -117,8 +117,6 @@ def flows_to_sheets( def 
create_sheet_reader(sheet_format, input_file): - sheet_format = sheet_format if sheet_format else detect_format(input_file) - if sheet_format == "csv": sheet_reader = CSVSheetReader(input_file) elif sheet_format == "xlsx": @@ -133,14 +131,6 @@ def create_sheet_reader(sheet_format, input_file): return sheet_reader -def detect_format(fp): - if bool(re.fullmatch(r"[a-z0-9_-]{44}", fp, re.IGNORECASE)): - return "google_sheets" - - if Path(fp).suffix.lower() == ".xlsx": - return "xlsx" - - def sheets_to_csv(path, sheet_ids): prepare_dir(path) diff --git a/src/rpft/parsers/common/rowparser.py b/src/rpft/parsers/common/rowparser.py index c5f8787..14f06dd 100644 --- a/src/rpft/parsers/common/rowparser.py +++ b/src/rpft/parsers/common/rowparser.py @@ -3,7 +3,7 @@ from collections.abc import Iterable from typing import List -from pydantic import BaseModel, Field +from pydantic import BaseModel from rpft.parsers.common.cellparser import CellParser @@ -34,14 +34,6 @@ def header_name_to_field_name_with_context(header, row): return header -def field_names(model: BaseModel) -> List[str]: - return [field.alias for field in model.model_fields.values()] - - -def get_field(model: BaseModel, name: str) -> Field: - return next(field for field in model.model_fields.values() if field.alias == name) - - def get_list_child_model(model): if is_basic_list_type(model): # If not specified, list elements may be anything. @@ -170,7 +162,7 @@ def assign_value(self, field, key, value, model): # Get the list of keys that are available for the target model # Note: The fields have a well defined ordering. # See https://pydantic-docs.helpmanual.io/usage/models/#field-ordering - model_fields = field_names(model) + model_fields = list(model.__fields__.keys()) if type(value) is not list: # It could be that an object is specified via a single element. 
@@ -287,7 +279,7 @@ def find_entry(self, model, output_field, field_path): else: assert is_parser_model_type(model) key = model.header_name_to_field_name(field_name) - if key not in field_names(model): + if key not in model.model_fields: raise ValueError(f"Field {key} doesn't exist in target type {model}.") child_model = model.model_fields[key].annotation diff --git a/src/rpft/parsers/common/sheetparser.py b/src/rpft/parsers/common/sheetparser.py index 351890d..a11f44c 100644 --- a/src/rpft/parsers/common/sheetparser.py +++ b/src/rpft/parsers/common/sheetparser.py @@ -34,12 +34,12 @@ def __init__(self, table, row_model=None, row_parser=None, context={}): raise ValueError("SheetParser: needs either row_parser or row_model") self.row_parser = row_parser or RowParser(row_model) self.bookmarks = {} - self.input_rows = [ - ({h: e for h, e in zip(table.headers, row)}, row_idx) - for row_idx, row in enumerate(table, start=2) - ] + self.input_rows = [] + for row_idx, row in enumerate(table): + row_dict = {h: e for h, e in zip(table.headers, row)} + self.input_rows.append((row_dict, row_idx + 2)) self.iterator = iter(self.input_rows) - self.context = context if context is None else copy.deepcopy(context) + self.context = copy.deepcopy(context) def add_to_context(self, key, value): self.context[key] = value diff --git a/src/rpft/parsers/creation/flowrowmodel.py b/src/rpft/parsers/creation/flowrowmodel.py index ae91ddf..baaaadc 100644 --- a/src/rpft/parsers/creation/flowrowmodel.py +++ b/src/rpft/parsers/creation/flowrowmodel.py @@ -1,7 +1,5 @@ from pydantic import ConfigDict -from pydantic.v1 import Field - from rpft.parsers.common.rowparser import ParserModel from rpft.parsers.creation.models import Condition @@ -177,34 +175,3 @@ def header_name_to_field_name_with_context(header, row): def is_starting_row(self): return len(self.edges) == 1 and self.edges[0].from_ == "start" - - -class FlowTemplateStatement(ParserModel): - attachments: List[str] = [] - audio: str = "" - choices: List[str] = [] - condition: List[str] = Field(default_factory=list) - condition_value: List[str] = Field(default_factory=list) - condition_name: List[str] = Field(default_factory=list) - condition_type: List[str] = Field(default_factory=list) - condition_var: List[str] = Field(default_factory=list) - condition_variable: List[str] = Field(default_factory=list) - data_row_id: str = "" - data_sheet: str = "" - from_: List[str] = Field(alias="from", default_factory=list) - image: str = "" - include_if: str = "true" - loop_variable: List[str] = Field(default_factory=list) - message_text: str = "" - no_response: str = "" - nodeId: str = Field(alias="_nodeId", default="") - node_name: str = "" - obj_id: str = "" - obj_name: str = "" - row_id: str = "" - save_name: str = "" - template_arguments: list = [] - type: str = "" - video: str = "" - wa_template: WhatsAppTemplating = WhatsAppTemplating() - webhook: Webhook = Webhook() diff --git a/tests/test_universal.py b/tests/test_universal.py index 60a21dd..7deebc4 100644 --- a/tests/test_universal.py +++ b/tests/test_universal.py @@ -164,10 +164,6 @@ def test_objects_use_wide_layout_if_indicated_by_metadata(self): ["obj1_k1_v", "obj1_k2_v", "seq1_k1_v", "seq1_k2_v"], ) - # TODO: test pointers/references - # TODO: add explicit type information - # TODO: integrate zero-knowledge type inference - class TestUniversalToWorkbook(TestCase): def test_assembly(self): From 286f1e4520565d649c13d430ee870d17d5d9e3a3 Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Fri, 31 Jan 2025 17:59:01 +0000 
Subject: [PATCH 20/33] Allow dicts to be stringified with variable delimiters --- src/rpft/parsers/universal.py | 40 +++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py index 844c144..66cb2f2 100644 --- a/src/rpft/parsers/universal.py +++ b/src/rpft/parsers/universal.py @@ -10,8 +10,7 @@ LOGGER = logging.getLogger(__name__) -DELIM_LVL_1 = "|" -DELIM_LVL_2 = ";" +DELIMS = "|;" PROP_ACCESSOR = "." META_KEY = "_idems" TABULATE_KEY = "tabulate" @@ -52,31 +51,36 @@ def stringify(value, **_) -> str: @stringify.register -def _(value: dict, **_) -> str: - s = f" {DELIM_LVL_1} ".join( - f"{stringify(k)}{DELIM_LVL_2} {stringify(v)}" for k, v in value.items() +def _(value: dict, delimiters=DELIMS) -> str: + if len(delimiters) > 1: + d1, d2 = delimiters + else: + raise ValueError("Too few delimiters to stringify dict") + + s = f" {d1} ".join( + f"{stringify(k)}{d2} {stringify(v, delimiters=[])}" for k, v in value.items() ) if len(value) == 1: - s += " " + DELIM_LVL_1 + s += " " + d1 return s @stringify.register -def _(value: list, delimiters=[DELIM_LVL_1, DELIM_LVL_2]) -> str: - delim, *delims = delimiters if delimiters else [None] +def _(value: list, delimiters=DELIMS) -> str: + d, *delims = delimiters if delimiters else [None] - if not delim: - raise Exception("Value is too deeply nested") + if not d: + raise ValueError("Too few delimiters to stringify list") - s = f" {delim} ".join(stringify(i, delimiters=delims) for i in value) + s = f" {d} ".join(stringify(item, delimiters=delims) for item in value) - return f"{s} {delim}" if len(value) == 1 else s + return f"{s} {d}" if len(value) == 1 else s @stringify.register -def _(value: tuple, delimiters=[DELIM_LVL_1, DELIM_LVL_2]) -> str: +def _(value: tuple, delimiters=DELIMS) -> str: return stringify(list(value)) @@ -151,7 +155,7 @@ def create_obj(pairs): return obj -def convert_cell(s: str, delimiters=[DELIM_LVL_1, DELIM_LVL_2]) -> Any: +def convert_cell(s: str, delimiters=DELIMS) -> Any: if type(s) is not str: raise TypeError("Value to convert is not a string") @@ -173,12 +177,12 @@ def convert_cell(s: str, delimiters=[DELIM_LVL_1, DELIM_LVL_2]) -> Any: if is_template(clean): return clean - delim, *delims = delimiters if delimiters else [None] + d, *delims = delimiters if delimiters else [None] - if delim and delim in clean: - seq = [convert_cell(item, delimiters=delims) for item in clean.split(delim)] + if d and d in clean: + seq = [convert_cell(item, delimiters=delims) for item in clean.split(d)] - return seq[:-1] if clean and clean[-1] == delim else seq + return seq[:-1] if clean and clean[-1] == d else seq if any(s in clean for s in delims): return convert_cell(clean, delimiters=delims) From cf308cd0baa5564b2829ac0996236df99017ca41 Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Sat, 1 Feb 2025 00:45:28 +0000 Subject: [PATCH 21/33] Create documentation for spreadsheet notation --- docs/notation.md | 144 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 docs/notation.md diff --git a/docs/notation.md b/docs/notation.md new file mode 100644 index 0000000..49bb8f7 --- /dev/null +++ b/docs/notation.md @@ -0,0 +1,144 @@ +# Spreadsheet notation + +Summary of spreadsheet notation used to convert sheets into a nested data structure (JSON). A series of data tables will be shown alongside the resultant JSON structure. + +# Tables + +Also known as a sheet in a spreadsheet (or workbook). 
+
+| a | b |
+|----|----|
+| v1 | v2 |
+sheet1
+
+```json
+{
+    "sheet1": [
+        {"a": "v1", "b": "v2"}
+    ]
+}
+```
+
+Additional tables (sheets) will be added as additional properties.
+
+```json
+{
+    "sheet1": [...],
+    "sheet2": [...],
+    ...
+}
+```
+
+# Basic types
+
+Refers to the following value types in JSON: `string`, `number`, `true` and `false`.
+
+| string | number | true | false |
+|--------|--------|------|-------|
+| hello | 123 | true | false |
+basic\_types
+
+```json
+{
+    "basic_types": [
+        {
+            "string": "hello",
+            "number": 123,
+            "true": true,
+            "false": false
+        }
+    ]
+}
+```
+
+The JSON type `null` is not represented because an empty cell is assumed to be equivalent to the empty string ("").
+
+# Sequences
+
+An ordered sequence of items. Also known as lists or arrays.
+
+| seq1 | seq1 | seq2.1 | seq2.2 | seq3 | seq4 |
+|------|------|--------|--------|----------|--------------------|
+| v1 | v2 | v1 | v2 | v1 \| v2 | v1 ; v2 \| v3 ; v4 |
+sequences
+
+```json
+{
+    "sequences": [
+        {
+            "seq1": ["v1", "v2"],
+            "seq2": ["v1", "v2"],
+            "seq3": ["v1", "v2"],
+            "seq4": [["v1", "v2"], ["v3", "v4"]]
+        }
+    ]
+}
+```
+
+`seq1`, `seq2` and `seq3` are equivalent. In all cases, the order of items is specified from left to right.
+
+`seq1` uses a 'wide' layout, where the column header is repeated and each column holds one item in the sequence. Values from columns with the same name are collected into a sequence in the resulting JSON object.
+
+`seq3` uses an 'inline' layout, where the sequence is defined as a delimited string within a single cell of the table. The default delimiter is a vertical bar or pipe character ('|').
+
+Two levels of nesting are possible within a cell, as shown by `seq4` - a list of lists. This could be used to model a list of key-value pairs, which could easily be converted to an object (map / dictionary).
+
+# Objects
+
+An unordered collection of key-value pairs (properties). Also known as maps, dictionaries or associative arrays.
+
+| obj1.key1 | obj1.key2 | obj2 |
+|-----------|-----------|------------------------|
+| v1 | v2 | key1 ; v1 \| key2 ; v2 |
+objects
+
+```json
+{
+    "objects": [
+        {
+            "obj1": {
+                "key1": "v1",
+                "key2": "v2"
+            },
+            "obj2": [
+                ["key1", "v1"],
+                ["key2", "v2"]
+            ]
+        }
+    ]
+}
+```
+
+`obj1` and `obj2` are not quite the same, but can be interpreted in the same way, as a list of key-value pairs.
+
+A wide layout is used for `obj1`, where one or more column headers use a dotted 'keypath' notation to identify a particular property key belonging to a particular object, and the corresponding cells in subsequent rows contain the values for that property. The dotted keypath notation can be used to access properties at deeper levels of nesting e.g. `obj.key.subkey.etc`.
+
+An inline layout is used for `obj2`, where properties are defined as a sequence of key-value pairs. The delimiter of properties is a vertical bar or pipe character - the same as for top-level arrays. The delimiter of keys and values is a semi-colon character - the same as for 2nd-level arrays.
+
+All the previous notation can be combined to create fairly complicated structures.
+
+| obj1.key1 | obj1.key1 |
+|------------------------|--------------------------------|
+| 1 ; 2 ; 3 \| one ; two | active ; true \| debug ; false |
+nesting
+
+```json
+{
+    "nesting": [
+        {
+            "obj1": {
+                "key1": [
+                    [
+                        [1, 2, 3],
+                        ["one", "two"]
+                    ],
+                    [
+                        ["active", true],
+                        ["debug", false]
+                    ]
+                ]
+            }
+        }
+    ]
+}
+```
From c6c6a2690323156ca0910dd0c76202579790ce9b Mon Sep 17 00:00:00 2001
From: Ian Stride
Date: Sat, 1 Feb 2025 00:51:40 +0000
Subject: [PATCH 22/33] Amend docs

---
 docs/notation.md | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/docs/notation.md b/docs/notation.md
index 49bb8f7..8889eef 100644
--- a/docs/notation.md
+++ b/docs/notation.md
@@ -9,11 +9,12 @@ Also known as a sheet in a spreadsheet (or workbook).
 | a | b |
 |----|----|
 | v1 | v2 |
-sheet1
+
+`data`
 
 ```json
 {
-    "sheet1": [
+    "data": [
         {"a": "v1", "b": "v2"}
     ]
 }
@@ -23,9 +24,8 @@ Additional tables (sheets) will be added as additional properties.
 
 ```json
 {
-    "sheet1": [...],
-    "sheet2": [...],
-    ...
+    "sheet1": [{}, {}],
+    "sheet2": [{}, {}]
 }
 ```
 
@@ -36,7 +36,8 @@ Refers to the following value types in JSON: `string`, `number`, `true` and `fal
 | string | number | true | false |
 |--------|--------|------|-------|
 | hello | 123 | true | false |
-basic\_types
+
+`basic_types`
 
 ```json
 {
@@ -60,7 +61,8 @@ An ordered sequence of items. Also known as lists or arrays.
 | seq1 | seq1 | seq2.1 | seq2.2 | seq3 | seq4 |
 |------|------|--------|--------|----------|--------------------|
 | v1 | v2 | v1 | v2 | v1 \| v2 | v1 ; v2 \| v3 ; v4 |
-sequences
+
+`sequences`
 
 ```json
 {
@@ -90,7 +92,8 @@ An unordered collection of key-value pairs (properties). Also known as maps, dic
 | obj1.key1 | obj1.key2 | obj2 |
 |-----------|-----------|------------------------|
 | v1 | v2 | key1 ; v1 \| key2 ; v2 |
-objects
+
+`objects`
 
 ```json
 {
@@ -120,7 +123,8 @@ All the previous notation can be combined to create fairly complicated structure
 | obj1.key1 | obj1.key1 |
 |------------------------|--------------------------------|
 | 1 ; 2 ; 3 \| one ; two | active ; true \| debug ; false |
-nesting
+
+`nesting`
 
 ```json
 {
From 9a1d86c949dad166c5ee6614ba6cb22801b718d6 Mon Sep 17 00:00:00 2001
From: Ian Stride
Date: Mon, 3 Feb 2025 00:26:41 +0000
Subject: [PATCH 23/33] Add support for ODS files

---
 src/rpft/converters.py     | 35 +++++++++++++++----------
 src/rpft/parsers/sheets.py | 52 +++++++++++++++++++++++++++-----------
 2 files changed, 59 insertions(+), 28 deletions(-)

diff --git a/src/rpft/converters.py b/src/rpft/converters.py
index da6d29d..e0e9754 100644
--- a/src/rpft/converters.py
+++ b/src/rpft/converters.py
@@ -15,6 +15,7 @@
     CSVSheetReader,
     GoogleSheetReader,
     JSONSheetReader,
+    ODSSheetReader,
     XLSXSheetReader,
 )
 from rpft.rapidpro.models.containers import RapidProContainer
@@ -59,10 +60,7 @@ def uni_to_sheets(infile) -> bytes:
     sheets = bookify(data)
 
     book = Databook(
-        [
-            Dataset(*table[1:], headers=table[0], title=name)
-            for name, table in sheets
-        ]
+        [Dataset(*table[1:], headers=table[0], title=name) for name, table in sheets]
    )
 
     return book.export("ods")
@@ -117,18 +115,29 @@ def flows_to_sheets(
 
 
 def create_sheet_reader(sheet_format, input_file):
-    if sheet_format == "csv":
-        sheet_reader = CSVSheetReader(input_file)
-    elif sheet_format == "xlsx":
-        sheet_reader = XLSXSheetReader(input_file)
-    elif sheet_format == "json":
-        sheet_reader = JSONSheetReader(input_file)
-    elif sheet_format == "google_sheets":
-        sheet_reader = GoogleSheetReader(input_file)
+    sheet_format = sheet_format if sheet_format
else detect_format(input_file) + cls = { + "csv": CSVSheetReader, + "google_sheets": GoogleSheetReader, + "json": JSONSheetReader, + "ods": ODSSheetReader, + "xlsx": XLSXSheetReader, + }.get(sheet_format) + + if cls: + return cls(input_file) else: raise Exception(f"Format {sheet_format} currently unsupported.") - return sheet_reader + +def detect_format(fp): + if bool(re.fullmatch(r"[a-z0-9_-]{44}", fp, re.IGNORECASE)): + return "google_sheets" + + ext = Path(fp).suffix.lower()[1:] + + if ext in ["xlsx", "ods"]: + return ext def sheets_to_csv(path, sheet_ids): diff --git a/src/rpft/parsers/sheets.py b/src/rpft/parsers/sheets.py index fa92308..51eb2e1 100644 --- a/src/rpft/parsers/sheets.py +++ b/src/rpft/parsers/sheets.py @@ -19,6 +19,9 @@ def __init__(self, reader, name, table): self.name = name self.table = table + def __repr__(self): + return f"Sheet(name: '{self.name}')" + class AbstractSheetReader(ABC): @property @@ -31,6 +34,9 @@ def get_sheet(self, name) -> Sheet: def get_sheets_by_name(self, name) -> list[Sheet]: return [sheet] if (sheet := self.get_sheet(name)) else [] + def __repr__(self): + return f"{type(self).__name__}(name: '{self.name}')" + class CSVSheetReader(AbstractSheetReader): def __init__(self, path): @@ -62,23 +68,9 @@ def __init__(self, filename): self.sheets[sheet.title] = Sheet( reader=self, name=sheet.title, - table=self._sanitize(sheet), + table=sanitize(sheet), ) - def _sanitize(self, sheet): - data = tablib.Dataset() - data.headers = sheet.headers - # remove trailing Nones - while data.headers[-1] is None: - data.headers.pop() - for row in sheet: - vals = tuple(str(e) if e is not None else "" for e in row) - new_row = vals[: len(data.headers)] - if any(new_row): - # omit empty rows - data.append(new_row) - return data - class GoogleSheetReader(AbstractSheetReader): @@ -159,6 +151,36 @@ def get_sheets_by_name(self, name): class DatasetSheetReader(AbstractSheetReader): def __init__(self, datasets): self._sheets = {d.title: Sheet(self, d.title, d) for d in datasets} + self.name = "[datasets]" + + +class ODSSheetReader(AbstractSheetReader): + def __init__(self, path): + book = tablib.Databook() + + with open(path, "rb") as f: + book.load(f, format="ods") + + self._sheets = { + sheet.title: Sheet(self, sheet.title, sanitize(sheet)) + for sheet in book.sheets() + } + self.name = str(path) + + +def sanitize(sheet): + data = tablib.Dataset() + data.headers = sheet.headers + # remove trailing Nones + while data.headers and data.headers[-1] is None: + data.headers.pop() + for row in sheet: + vals = tuple(str(e) if e is not None else "" for e in row) + new_row = vals[: len(data.headers)] + if any(new_row): + # omit empty rows + data.append(new_row) + return data def load_csv(path): From 79d76aa521ed7894f65da5dd32c7732b8616b661 Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Mon, 3 Feb 2025 00:27:49 +0000 Subject: [PATCH 24/33] Preserve templates; escape delimiters --- src/rpft/parsers/universal.py | 48 ++++++++++++++++++++--------------- tests/test_universal.py | 32 ++++++++++++++++++++++- 2 files changed, 59 insertions(+), 21 deletions(-) diff --git a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py index 66cb2f2..1ed7d7f 100644 --- a/src/rpft/parsers/universal.py +++ b/src/rpft/parsers/universal.py @@ -46,19 +46,19 @@ def tabulate(data, meta: dict = {}) -> Table: @singledispatch -def stringify(value, **_) -> str: - return str(value) +def stringify(value, delimiters=DELIMS, **_) -> str: + return re.sub(rf"([{delimiters}])", r"\\\1", str(value)) 
@stringify.register -def _(value: dict, delimiters=DELIMS) -> str: - if len(delimiters) > 1: - d1, d2 = delimiters +def _(value: dict, delimiters=DELIMS, depth=0) -> str: + if len(delimiters[depth:]) > 1: + d1, d2 = delimiters[depth : depth + 2] else: raise ValueError("Too few delimiters to stringify dict") s = f" {d1} ".join( - f"{stringify(k)}{d2} {stringify(v, delimiters=[])}" for k, v in value.items() + f"{stringify(k)}{d2} {stringify(v, depth=depth + 2)}" for k, v in value.items() ) if len(value) == 1: @@ -68,20 +68,25 @@ def _(value: dict, delimiters=DELIMS) -> str: @stringify.register -def _(value: list, delimiters=DELIMS) -> str: - d, *delims = delimiters if delimiters else [None] +def _(value: list, delimiters=DELIMS, depth=0) -> str: + d = delimiters[depth] if depth < len(delimiters) else None if not d: raise ValueError("Too few delimiters to stringify list") - s = f" {d} ".join(stringify(item, delimiters=delims) for item in value) + s = f" {d} ".join(stringify(item, depth=depth + 1) for item in value) - return f"{s} {d}" if len(value) == 1 else s + if len(value) == 1: + s += f" {d}" + elif value[-1] == "": + s += d + + return s @stringify.register -def _(value: tuple, delimiters=DELIMS) -> str: - return stringify(list(value)) +def _(value: tuple, delimiters=DELIMS, depth=0) -> str: + return stringify(list(value), depth=depth) @stringify.register @@ -155,7 +160,7 @@ def create_obj(pairs): return obj -def convert_cell(s: str, delimiters=DELIMS) -> Any: +def convert_cell(s: str, delimiters=DELIMS, depth=0) -> Any: if type(s) is not str: raise TypeError("Value to convert is not a string") @@ -177,18 +182,21 @@ def convert_cell(s: str, delimiters=DELIMS) -> Any: if is_template(clean): return clean - d, *delims = delimiters if delimiters else [None] + d = delimiters[depth] if depth < len(delimiters) else "" + pattern = rf"(?", clean) def is_template(s: str) -> bool: - return bool(re.match("{{.*?}}|{@.*?@}|{%.*?%}", s)) + return bool(re.search("{{.*?}}|{@.*?@}|{%.*?%}", s)) diff --git a/tests/test_universal.py b/tests/test_universal.py index 7deebc4..e04d1fb 100644 --- a/tests/test_universal.py +++ b/tests/test_universal.py @@ -74,6 +74,15 @@ def test_arrays_use_single_cell_layout_by_default(self): self.assertEqual(table[1], ["yes | no | 1 | false"]) self.assertEqual(table[2], ["yes | no | 1 | false"]) + def test_array_delimiters_are_escaped(self): + data = [ + {"h1": [[1, 2], "3 | 4", "5 ; 6"]}, + ] + + table = tabulate(data) + + self.assertEqual(table[1], ["1 ; 2 | 3 \| 4 | 5 \; 6"]) + def test_single_item_array(self): data = [{"k1": ["seq1v1"]}] @@ -81,6 +90,20 @@ def test_single_item_array(self): self.assertEqual(table[1][0], "seq1v1 |") + def test_arrays_with_empty_single_item(self): + data = [{"k1": [""]}] + + table = tabulate(data) + + self.assertEqual(table[1][0], " |") + + def test_arrays_with_empty_last_item(self): + data = [{"k1": ["v1", ""]}] + + table = tabulate(data) + + self.assertEqual(table[1][0], "v1 | |") + def test_nested_arrays_within_a_single_cell(self): data = [ {"k1": ["seq1v1", ["seq2v1", "seq2v2"]]}, @@ -131,7 +154,7 @@ def test_objects_use_single_cell_layout_by_default(self): self.assertEqual(table[1], ["prop1; val1 | prop2; val2"]) - def test_object_with_single_property_within_cell_has_trailing_separator(self): + def test_object_with_single_property_within_cell_has_trailing_delimiter(self): data = [{"obj": {"k": "v"}}] table = tabulate(data) @@ -355,3 +378,10 @@ def test_inline_templates_are_preserved(self): ), "{{3*(steps.values()|length 
-1)}}|{{3*(steps.values()|length -1)+2}}", ) + self.assertEqual( + self.func("6;0{%if skip_option != "" -%};skip{% endif %}"), + "6;0{%if skip_option != "" -%};skip{% endif %}", + ) + + def test_delimiters_can_be_escaped(self): + self.assertEqual(self.func(r"1 ; 2 | 3 \| 4 | 5 \; 6"), [[1, 2], "3 | 4", "5 ; 6"]) From 7beb27d084a364431b86ad4849cde3514a6c78e0 Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Mon, 3 Feb 2025 00:44:27 +0000 Subject: [PATCH 25/33] Rename convert_cell to parse_cell --- src/rpft/parsers/universal.py | 8 ++++---- tests/test_universal.py | 15 +++++++++------ 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py index 1ed7d7f..abbc166 100644 --- a/src/rpft/parsers/universal.py +++ b/src/rpft/parsers/universal.py @@ -123,7 +123,7 @@ def stream(title: str = None, headers=tuple(), rows=tuple()): for i, row in enumerate(rows): for h, v in zip(keypaths(headers), row): - yield [title, i] + h, convert_cell(v) + yield [title, i] + h, parse_cell(v) def keypaths(headers): @@ -160,7 +160,7 @@ def create_obj(pairs): return obj -def convert_cell(s: str, delimiters=DELIMS, depth=0) -> Any: +def parse_cell(s: str, delimiters=DELIMS, depth=0) -> Any: if type(s) is not str: raise TypeError("Value to convert is not a string") @@ -186,14 +186,14 @@ def convert_cell(s: str, delimiters=DELIMS, depth=0) -> Any: pattern = rf"(?", clean) diff --git a/tests/test_universal.py b/tests/test_universal.py index e04d1fb..e3cd69f 100644 --- a/tests/test_universal.py +++ b/tests/test_universal.py @@ -3,7 +3,7 @@ from rpft.parsers.sheets import DatasetSheetReader from rpft.parsers.universal import ( bookify, - convert_cell, + parse_cell, parse_table, parse_tables, tabulate, @@ -81,7 +81,7 @@ def test_array_delimiters_are_escaped(self): table = tabulate(data) - self.assertEqual(table[1], ["1 ; 2 | 3 \| 4 | 5 \; 6"]) + self.assertEqual(table[1], [r"1 ; 2 | 3 \| 4 | 5 \; 6"]) def test_single_item_array(self): data = [{"k1": ["seq1v1"]}] @@ -324,7 +324,7 @@ def test_nested_object_with_nested_object(self): class TestCellConversion(TestCase): def setUp(self): - self.func = convert_cell + self.func = parse_cell def test_convert_cell_string_to_number(self): self.assertEqual(self.func("123"), 123) @@ -379,9 +379,12 @@ def test_inline_templates_are_preserved(self): "{{3*(steps.values()|length -1)}}|{{3*(steps.values()|length -1)+2}}", ) self.assertEqual( - self.func("6;0{%if skip_option != "" -%};skip{% endif %}"), - "6;0{%if skip_option != "" -%};skip{% endif %}", + self.func("6;0{%if skip_option != " " -%};skip{% endif %}"), + "6;0{%if skip_option != " " -%};skip{% endif %}", ) def test_delimiters_can_be_escaped(self): - self.assertEqual(self.func(r"1 ; 2 | 3 \| 4 | 5 \; 6"), [[1, 2], "3 | 4", "5 ; 6"]) + self.assertEqual( + self.func(r"1 ; 2 | 3 \| 4 | 5 \; 6"), + [[1, 2], "3 | 4", "5 ; 6"], + ) From 4993ac3f19b88ff8e59d8c5565f21d415c632a04 Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Mon, 3 Feb 2025 14:19:50 +0000 Subject: [PATCH 26/33] Consider escaped delimiters at the end of a cell --- src/rpft/parsers/universal.py | 2 +- tests/test_universal.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py index abbc166..5e55fa8 100644 --- a/src/rpft/parsers/universal.py +++ b/src/rpft/parsers/universal.py @@ -188,7 +188,7 @@ def parse_cell(s: str, delimiters=DELIMS, depth=0) -> Any: if d and re.search(pattern, clean): seq = [parse_cell(item, depth=depth + 1) 
for item in re.split(pattern, clean)] - return seq[:-1] if clean and clean[-1] == d else seq + return seq[:-1] if re.search(rf"(? Date: Mon, 3 Feb 2025 23:33:10 +0000 Subject: [PATCH 27/33] Stop escaping delimiters in templates --- src/rpft/parsers/universal.py | 4 +++- tests/test_universal.py | 9 +++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py index 5e55fa8..4b06181 100644 --- a/src/rpft/parsers/universal.py +++ b/src/rpft/parsers/universal.py @@ -47,7 +47,9 @@ def tabulate(data, meta: dict = {}) -> Table: @singledispatch def stringify(value, delimiters=DELIMS, **_) -> str: - return re.sub(rf"([{delimiters}])", r"\\\1", str(value)) + s = str(value) + + return s if is_template(s) else re.sub(rf"([{delimiters}])", r"\\\1", s) @stringify.register diff --git a/tests/test_universal.py b/tests/test_universal.py index 4f6eecc..8b712bf 100644 --- a/tests/test_universal.py +++ b/tests/test_universal.py @@ -83,6 +83,15 @@ def test_array_delimiters_are_escaped(self): self.assertEqual(table[1], [r"1 ; 2 | 3 \| 4 | 5 \; 6"]) + def test_delimiters_in_templates_are_not_escaped(self): + data = [ + {"h1": '{@ values | map(attribute="ID") @}'}, + ] + + table = tabulate(data) + + self.assertEqual(table[1], ['{@ values | map(attribute="ID") @}']) + def test_single_item_array(self): data = [{"k1": ["seq1v1"]}] From 21c90981d39749907185dda6b8f378452aa3ea62 Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Tue, 4 Feb 2025 13:11:13 +0000 Subject: [PATCH 28/33] Update docs --- README.md | 1 - docs/notation.md | 44 +++++++++++++++++++++++++++++++++----------- 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 1550f7b..636bd9b 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,6 @@ The CLI supports the following subcommands: - `create_flows`: create RapidPro flows (in JSON format) from spreadsheets using content index - `flows_to_sheets`: convert RapidPro flows (in JSON format) into spreadsheets - `convert`: save input spreadsheets as JSON -- `save_data_sheets`: save input spreadsheets as nested JSON using content index - an experimental feature that is likely to change. Full details of the available options for each can be found via the help feature: diff --git a/docs/notation.md b/docs/notation.md index 8889eef..aef86cf 100644 --- a/docs/notation.md +++ b/docs/notation.md @@ -2,10 +2,25 @@ Summary of spreadsheet notation used to convert sheets into a nested data structure (JSON). A series of data tables will be shown alongside the resultant JSON structure. +# Books + +A container for multiple tables. Also known as a spreadsheet or workbook. A book is converted to an object containing a property for each table. The property key is the name of the sheet; the value is the converted contents of the sheet. + +For example, given an Excel workbook with two sheets ("table1" and "table2"), the resulting JSON will be: + +```json +{ + "table1": [], + "table2": [] +} +``` + # Tables Also known as a sheet in a spreadsheet (or workbook). +The contents of a table are converted to a sequence of objects - corresponding to rows in the sheet. Each object will have keys corresponding to the column headers of the sheet, and values corresponding to a particular row in the sheet. + | a | b | |----|----| | v1 | v2 | @@ -20,14 +35,7 @@ Also known as a sheet in a spreadsheet (or workbook). } ``` -Additional tables (sheets) will be added as additional properties. 
- -```json -{ - "sheet1": [{}, {}], - "sheet2": [{}, {}] -} -``` +This means that the first row of every table should be a header row that specifies the name of each column. # Basic types @@ -81,9 +89,13 @@ An ordered sequence of items. Also known as lists or arrays. `seq1` uses a 'wide' layout, where the column header is repeated and each column holds one item in the sequence. Values from columns with the same name are collected into a sequence in the resulting JSON object. +`seq2` is similar to `seq1`, but the index of each item is specified explicitly. + `seq3` uses an 'inline' layout, where the sequence is defined as a delimited string within a single cell of the table. The default delimiter is a vertical bar or pipe character ('|'). -Two levels of nesting are possible within a cell, as shown by `seq4` - a list of lists. This could be used to model a list of key-value pairs, which could easily be converted to an object (map / dictionary). +Two levels of nesting are possible within a cell, as shown by `seq4` - a list of lists. This could be used to model a list of key-value pairs, which could easily be converted to an object (map / dictionary). The default delimiter for second-level sequences is a semi-colon (';'). + +The interpretation of delimiter characters can be skipped by escaping the delimiter characters. An escape sequence begins with a backslash ('\\') and ends with the character to be escaped. For example, to escape a vertical bar, use: '\\|'. # Objects @@ -112,11 +124,11 @@ An unordered collection of key-value pairs (properties). Also known as maps, dic } ``` -`obj1` and `obj2` are not quite the same, but can be interpreted in the same way, as a list of key-value pairs. +`obj1` and `obj2` are slightly different, but can be interpreted in the same way, as a list of key-value pairs. A wide layout is used for `obj1`, where one or more column headers use a dotted 'keypath' notation to identify a particular property key belonging to a particular object, and the corresponding cells in subsequent rows contain the values for that property. The dotted keypath notation can be used to access properties at deeper levels of nesting e.g. `obj.key.subkey.etc`. -An inline layout is used for `obj2`, where properties are defined as a sequence of key-value pairs. The delimiter of properties is a vertical bar or pip character - the same as for top-level arrays. The delimiter of keys and values is a semi-colon character - the same as for 2nd-level arrays. +An inline layout is used for `obj2`, where properties are defined as a sequence of key-value pairs. The delimiter of properties is a vertical bar or pipe character - same as top-level sequences. The delimiter of keys and values is a semi-colon character - same as second-level sequences. All the previous notation can be combined to create fairly complicated structures. @@ -146,3 +158,13 @@ All the previous notation can be combined to create fairly complicated structure ] } ``` + +# Templates + +Table cells may contain Jinja templates. A cell is considered a template if it contains template placeholders anywhere within it. There are three types of template placeholders: + +- `{{ ... }}` +- `{% ... %}` +- `{@ ... @}` + +When converting between spreadsheets and JSON, templates will not be interpreted in any way, just copied verbatim. This means that sequence delimiters do not need to be escaped if they exist within a template. It is intended for templates to eventually be interpreted at a later stage, during further processing. 
From 7d113e3ac40f7ed19e9e63d638411ab8b421cd2e Mon Sep 17 00:00:00 2001
From: Ian Stride
Date: Tue, 4 Feb 2025 14:12:31 +0000
Subject: [PATCH 29/33] Add info about metadata

---
 docs/notation.md | 35 +++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/docs/notation.md b/docs/notation.md
index aef86cf..5e2438b 100644
--- a/docs/notation.md
+++ b/docs/notation.md
@@ -168,3 +168,38 @@ Table cells may contain Jinja templates. A cell is considered a template if it c
 - `{@ ... @}`
 
 When converting between spreadsheets and JSON, templates will not be interpreted in any way, just copied verbatim. This means that sequence delimiters do not need to be escaped if they exist within a template. It is intended for templates to eventually be interpreted at a later stage, during further processing.
+
+# Metadata
+
+Information that would otherwise be lost during the conversion from spreadsheets to JSON is stored as metadata - in a top-level property with key `_idems`. The metadata property is intended to be 'hidden', with a key that is unlikely to clash with any sheet name.
+
+The original header names for each sheet are held as metadata to direct the conversion process from JSON back to spreadsheet. The original headers preserve the order of columns and whether a wide or inline layout was used.
+
+
+| seq1 | seq1 | seq2 |
+|------|------|----------|
+| v1 | v2 | v1 \| v2 |
+
+`sequences`
+
+```json
+{
+    "_idems": {
+        "tabulate": {
+            "sequences": {
+                "headers": [
+                    "seq1",
+                    "seq1",
+                    "seq2"
+                ]
+            }
+        }
+    },
+    "sequences": [
+        {
+            "seq1": ["v1", "v2"],
+            "seq2": ["v1", "v2"]
+        }
+    ]
+}
+```
From e2170e0517d4b2381f913facc2fd7b35710580a9 Mon Sep 17 00:00:00 2001
From: Ian Stride
Date: Thu, 6 Feb 2025 22:09:28 +0000
Subject: [PATCH 30/33] Treat RapidPro expressions as templates

---
 src/rpft/parsers/universal.py | 2 +-
 tests/test_universal.py       | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py
index 4b06181..2ac88a3 100644
--- a/src/rpft/parsers/universal.py
+++ b/src/rpft/parsers/universal.py
@@ -201,4 +201,4 @@ def parse_cell(s: str, delimiters=DELIMS, depth=0) -> Any:
 
 
 def is_template(s: str) -> bool:
-    return bool(re.search("{{.*?}}|{@.*?@}|{%.*?%}", s))
+    return bool(re.search(r"{{.*?}}|{@.*?@}|{%.*?%}|@\(.*?\)", s))
diff --git a/tests/test_universal.py b/tests/test_universal.py
index 8b712bf..b83fb9b 100644
--- a/tests/test_universal.py
+++ b/tests/test_universal.py
@@ -391,6 +391,10 @@ def test_inline_templates_are_preserved(self):
             self.func("6;0{%if skip_option != " " -%};skip{% endif %}"),
             "6;0{%if skip_option != " " -%};skip{% endif %}",
         )
+        self.assertEqual(
+            self.func('@(fields.survey_behave & "no|")'),
+            '@(fields.survey_behave & "no|")',
+        )
 
     def test_delimiters_can_be_escaped(self):
         self.assertEqual(
From ab9d6b69ed61ac37f1a77503f46f0c7d72855e40 Mon Sep 17 00:00:00 2001
From: Ian Stride
Date: Mon, 10 Feb 2025 15:57:22 +0000
Subject: [PATCH 31/33] Allow sequence delimiters to be configured

---
 src/rpft/parsers/universal.py | 18 ++++++++++++------
 tests/test_universal.py       | 27 +++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/src/rpft/parsers/universal.py b/src/rpft/parsers/universal.py
index 2ac88a3..26433f8 100644
--- a/src/rpft/parsers/universal.py
+++ b/src/rpft/parsers/universal.py
@@ -60,7 +60,8 @@ def _(value: dict, delimiters=DELIMS, depth=0) -> str:
         raise ValueError("Too few delimiters to stringify dict")
 
     s = f" {d1} ".join(
-        f"{stringify(k)}{d2} 
{stringify(v, depth=depth + 2)}" for k, v in value.items() + f"{stringify(k)}{d2} {stringify(v, delimiters=delimiters, depth=depth + 2)}" + for k, v in value.items() ) if len(value) == 1: @@ -76,7 +77,9 @@ def _(value: list, delimiters=DELIMS, depth=0) -> str: if not d: raise ValueError("Too few delimiters to stringify list") - s = f" {d} ".join(stringify(item, depth=depth + 1) for item in value) + s = f" {d} ".join( + stringify(item, delimiters=delimiters, depth=depth + 1) for item in value + ) if len(value) == 1: s += f" {d}" @@ -88,7 +91,7 @@ def _(value: list, delimiters=DELIMS, depth=0) -> str: @stringify.register def _(value: tuple, delimiters=DELIMS, depth=0) -> str: - return stringify(list(value), depth=depth) + return stringify(list(value), delimiters=delimiters, depth=depth) @stringify.register @@ -188,16 +191,19 @@ def parse_cell(s: str, delimiters=DELIMS, depth=0) -> Any: pattern = rf"(?", clean) + return re.sub(rf"\\([{delimiters}])", r"\g<1>", clean) def is_template(s: str) -> bool: diff --git a/tests/test_universal.py b/tests/test_universal.py index b83fb9b..2a440e6 100644 --- a/tests/test_universal.py +++ b/tests/test_universal.py @@ -6,11 +6,26 @@ parse_cell, parse_table, parse_tables, + stringify, tabulate, ) from tablib import Dataset +class TestConvertDataToCell(TestCase): + def test_delimiters_can_be_configured(self): + self.assertEqual( + stringify( + [ + ["click", ["auth", "sign_in_google"]], + ["click", ["emit", "force_reprocess"]], + ], + delimiters=";|:", + ), + "click | auth : sign_in_google ; click | emit : force_reprocess", + ) + + class TestConvertUniversalToTable(TestCase): def test_headers_must_be_first_row(self): data = [ @@ -369,6 +384,18 @@ def test_convert_cell_string_to_list_of_lists(self): self.assertEqual(self.func("k1; k2; v2 |"), [["k1", "k2", "v2"]]) self.assertEqual(self.func("k1; 1 | k2; true"), [["k1", 1], ["k2", True]]) + def test_delimiters_can_be_configured(self): + self.assertEqual( + self.func( + "click | auth: sign_in_google; click | emit: force_reprocess", + delimiters=";|:", + ), + [ + ["click", ["auth", "sign_in_google"]], + ["click", ["emit", "force_reprocess"]], + ], + ) + def test_inline_templates_are_preserved(self): self.assertEqual(self.func("{{ template }}"), "{{ template }}") self.assertEqual(self.func("{@ template @}"), "{@ template @}") From 4c7bb7cddb52bc092e5b6ad712908d7644425a12 Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Wed, 19 Feb 2025 22:11:50 +0000 Subject: [PATCH 32/33] Require patched tablib A bug in ODS import has been fixed but not yet released. 
--- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 46b1df3..bcc0d65 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,10 +34,11 @@ dependencies = [ "google-api-python-client~=2.6.0", "google-auth-oauthlib~=0.4.4", "networkx~=2.5.1", + "odfpy", "openpyxl", "pydantic >= 2", "python-benedict", - "tablib[ods]>=3.1.0", + "tablib @ git+https://github.com/istride/tablib@v3.8.0-0", ] [project.urls] From b1f34d2c2a411be5bf2ca72cb36c43f81aa4f770 Mon Sep 17 00:00:00 2001 From: Ian Stride Date: Thu, 6 Mar 2025 18:32:21 +0000 Subject: [PATCH 33/33] Bug fix --- src/rpft/parsers/common/rowparser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rpft/parsers/common/rowparser.py b/src/rpft/parsers/common/rowparser.py index 14f06dd..3971a69 100644 --- a/src/rpft/parsers/common/rowparser.py +++ b/src/rpft/parsers/common/rowparser.py @@ -162,7 +162,7 @@ def assign_value(self, field, key, value, model): # Get the list of keys that are available for the target model # Note: The fields have a well defined ordering. # See https://pydantic-docs.helpmanual.io/usage/models/#field-ordering - model_fields = list(model.__fields__.keys()) + model_fields = list(model.model_fields.keys()) if type(value) is not list: # It could be that an object is specified via a single element.
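For context on this final fix: pydantic v2 exposes per-field metadata through the `model_fields` mapping, and `__fields__` survives only as a deprecated alias, so the change keeps `RowParser` on the supported API. A minimal sketch, assuming pydantic >= 2 (`Example` is a hypothetical model, not part of rpft):

```python
# Demonstrates the pydantic v2 accessor adopted in the fix above.
# `Example` is a stand-in model for illustration only.
from pydantic import BaseModel


class Example(BaseModel):
    name: str = ""
    count: int = 0


# model_fields maps field names to FieldInfo objects and preserves
# declaration order, the "well defined ordering" that RowParser relies on.
assert list(Example.model_fields.keys()) == ["name", "count"]
assert Example.model_fields["count"].annotation is int
```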