Merge pull request #2529 from IDEMSInternational/feat/data-list-metadata
Feat: data list metadata
chrismclarke authored Nov 22, 2024
2 parents 4eec129 + 2ada398 commit 0bbf468
Showing 5 changed files with 223 additions and 14 deletions.
17 changes: 17 additions & 0 deletions packages/data-models/flowTypes.ts
@@ -74,9 +74,26 @@ export namespace FlowTypes {
flow_type: "asset_pack";
rows: Data_listRow<IAssetEntry>[];
}

/**
 * Data types supported within data_lists.
 * These are a subset of the types defined by the JSON Schema standard:
 * https://json-schema.org/draft/2020-12/json-schema-core#name-instance-data-model
 */
export type Data_listColumnType = "boolean" | "number" | "object" | "string";

export interface Data_listColumnMetadata {
/** Column data type. Defaults to "string" when not defined */
type?: Data_listColumnType;
}

export interface Data_list extends FlowTypeWithData {
flow_type: "data_list";
rows: Data_listRow[];
/** Hashmap of any additional column metadata (omitted if all columns use the default string type) */
_metadata?: {
[column_name: string]: Data_listColumnMetadata;
};
}
export interface DataPipeFlow extends FlowTypeWithData {
flow_type: "data_pipe";
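For illustration, a parsed data_list carrying the new optional `_metadata` hashmap might look like the sketch below (hypothetical flow and column names, not taken from this PR); only non-string columns are listed, since "string" is the assumed default:

const exampleList: FlowTypes.Data_list = {
  flow_type: "data_list",
  flow_name: "example_list",
  rows: [
    { id: "row_1", label: "First", priority: 1, enabled: true },
    { id: "row_2", label: "Second", priority: 2, enabled: false },
  ],
  // "label" is omitted from metadata as it uses the default string type
  _metadata: {
    priority: { type: "number" },
    enabled: { type: "boolean" },
  },
};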
@@ -4,7 +4,7 @@ import { IConverterPaths, IFlowHashmapByType, IParsedWorkbookData } from "../../
import { arrayToHashmap, groupJsonByKey, IContentsEntry } from "../../utils";
import BaseProcessor from "../base";

const cacheVersion = 20241024.0;
const cacheVersion = 20241121.2;

export class FlowParserProcessor extends BaseProcessor<FlowTypes.FlowTypeWithData> {
public parsers: { [flowType in FlowTypes.FlowType]: Parsers.DefaultParser } = {
@@ -1,10 +1,98 @@
import { FlowTypes } from "data-models";
import { DataListParser } from ".";
import { TEST_DATA_PATHS } from "../../../../../../../test/helpers/utils";
import { FlowParserProcessor } from "../flowParser";

const testFlow = {
const MOCK_DATA_LIST = (): FlowTypes.Data_list => ({
flow_type: "data_list",
flow_name: "test_data_list",
flow_name: "mock_data_list",
rows: [
{
id: "id_1",
number: 1,
text: "hello",
boolean: true,
nested: { string: "hello" },
},
{
id: "id_1",
number: 1,
text: "hello",
boolean: true,
nested: { string: "hello" },
additional: false,
},
],
_xlsxPath: "/mock/data/path",
});

/***********************************************************************
* Basic
**********************************************************************/

/** yarn workspace scripts test -t data_list.parser.spec.ts **/
describe("data_list Parser Metadata", () => {
let parser: DataListParser;

beforeAll(() => {
parser = new DataListParser({ processedFlowHashmap: {} } as any);
});
it("Infers column metadata from data (for non-string columns)", () => {
const { _metadata } = parser.postProcessFlow(MOCK_DATA_LIST());
expect(_metadata).toEqual({
number: { type: "number" },
boolean: { type: "boolean" },
nested: { type: "object" },
// data missing or undefined in the first row is identified from subsequent rows
additional: { type: "boolean" },
});
});
it("Assigns column metadata from metadata row (and removes it)", () => {
const flowWithMetadataRow = {
...MOCK_DATA_LIST(),
rows: [
{ id: "@metadata", number: "type: number", text: "type: string" },
{ id: "id_1", number: undefined, text: undefined },
],
};
const { _metadata, rows } = parser.postProcessFlow(flowWithMetadataRow);
expect(_metadata).toEqual({
number: { type: "number" },
});
expect(rows.length).toEqual(1);
});
it("Omits metadata when all columns are strings", () => {
const flowWithStringColumns = {
...MOCK_DATA_LIST(),
rows: [{ id: "id_1", text: "hello" }],
};
const { _metadata } = parser.postProcessFlow(flowWithStringColumns);
expect(_metadata).toBeUndefined();
});

// eslint-disable-next-line jest/no-commented-out-tests
// it("QC metadata", () => {
// const loggerSpy = useMockLogger(false);
// const flowWithMetadataRow = {
// ...MOCK_DATA_LIST(),
// rows: [{ id: "id_1", number: 1, text: undefined }],
// };
// parser.postProcessFlow(flowWithMetadataRow);
// expect(loggerSpy.warning).toHaveBeenCalledTimes(1);
// expect(loggerSpy.warning).toHaveBeenCalledWith({
// msg1: "[mock_data_list] Data List validation has (1) warnings",
// msg2: "/mock/data/path",
// });
// });
});

/***********************************************************************
* Advanced use cases
**********************************************************************/

const MOCK_DATA_LIST_FLOW_ADVANCED = (): FlowTypes.Data_list => ({
flow_type: "data_list",
flow_name: "mock_data_list_advanced",
rows: [
{
id: "test_1",
@@ -14,18 +102,18 @@ const testFlow = {
"nested.test_2": "two",
},
],
};
});

/** yarn workspace scripts test -t data_list.parser.spec.ts **/
describe("data_list Parser (single)", () => {
let outputRows: any[];
let parser: DataListParser;

beforeAll(() => {
const parser = new DataListParser({ processedFlowHashmap: {} } as any);
const output = parser.run(testFlow as any);
outputRows = output.rows;
parser = new DataListParser({ processedFlowHashmap: {} } as any);
});

it("Extracts condition_list", async () => {
const { test_condition_list } = outputRows[0];
const { rows } = parser.run(MOCK_DATA_LIST_FLOW_ADVANCED() as any);
const { test_condition_list } = rows[0];
expect(test_condition_list).toEqual([
{
condition_type: "calc",
@@ -35,12 +123,14 @@
]);
});
it("Extracts nested properties", async () => {
const { nested } = outputRows[0];
const { rows } = parser.run(MOCK_DATA_LIST_FLOW_ADVANCED() as any);
const { nested } = rows[0];
expect(nested).toEqual({ test_1: 1, test_2: "two" });
});
// TODO - likely deprecated (no usage found in codebase)
it("Extracts notification_schedule", async () => {
const { test_notification_schedule } = outputRows[0];
const { rows } = parser.run(MOCK_DATA_LIST_FLOW_ADVANCED() as any);
const { test_notification_schedule } = rows[0];
expect(test_notification_schedule).toEqual({ key_1: "value_1", key_2: "value_2" });
});
});
@@ -6,6 +6,7 @@ import {
setNestedProperty,
} from "../../../utils";
import { DefaultParser } from "./default.parser";
import { isEmptyObjectDeep, isObjectLiteral, Logger } from "shared";

export class DataListParser extends DefaultParser {
postProcessRow(row: FlowTypes.Data_listRow) {
@@ -33,8 +34,109 @@
return row;
}

public postProcessFlows(flows: FlowTypes.FlowTypeWithData[]) {
public override postProcessFlow(flow: FlowTypes.Data_list) {
const flowWithMetadata = this.getFlowMetadata(flow);
return flowWithMetadata;
}

public postProcessFlows(flows: FlowTypes.Data_list[]) {
const flowsWithOverrides = assignFlowOverrides(flows);
return flowsWithOverrides;
}

/** Assign column metadata from @metadata row if provided, or infer from data if not */
private getFlowMetadata(flow: FlowTypes.Data_list) {
const [firstRow] = flow.rows;
let metadataInitial: FlowTypes.Data_list["_metadata"];
// assign from metadata row and remove
if (firstRow?.id === "@metadata") {
metadataInitial = this.assignMetadataFromRow(firstRow);
flow.rows = flow.rows.slice(1);
}
// infer from data when metadata row not available
else {
metadataInitial = this.inferMetadataFromData(flow.rows);
}
// quality check metadata and assign cleaned meta to flow if non-empty
const { warnings, metadata } = this.qualityCheckMetadata(metadataInitial);
if (!isEmptyObjectDeep(metadata)) {
flow._metadata = metadata;
}

if (warnings.length > 0) {
for (const warning of warnings) {
console.warn(warning);
}
Logger.warning({
msg1: `[${flow.flow_name}] Data List validation has (${warnings.length}) warnings`,
msg2: flow._xlsxPath,
});
}
return flow;
}

/** Assign data_list metadata from a leading `@metadata` row */
private assignMetadataFromRow(metadataRow: FlowTypes.Data_listRow) {
// do not extract metadata for id column
const { id, ...values } = metadataRow;
const metadata: FlowTypes.Data_list["_metadata"] = {};
for (const [field, metadataString] of Object.entries(values)) {
if (metadataString) {
metadata[field] = parseAppDataCollectionString(metadataString as string) as any;
}
}
return metadata;
}

/** Infer data_list metadata from data */
private inferMetadataFromData(rows: FlowTypes.Data_listRow[]) {
const metadata: FlowTypes.Data_list["_metadata"] = {};
for (const row of rows) {
// do not extract metadata for id column
const { id, ...values } = row;
for (const [field, value] of Object.entries(values)) {
// do not assign type to other metadata columns
if (!field.startsWith("_")) {
// only assign type if previously unassigned
if (!metadata[field]?.type) {
const inferredType = this.inferColumnDataType(value);
metadata[field] ??= {};
metadata[field].type = inferredType;
}
}
}
}
return metadata;
}

private inferColumnDataType(value: any): FlowTypes.Data_listColumnType | undefined {
if (value === undefined) return undefined;
if (typeof value === "string") return "string";
if (typeof value === "number") return "number";
if (typeof value === "boolean") return "boolean";
if (isObjectLiteral(value)) return "object";
return undefined;
}

private qualityCheckMetadata(metadata: FlowTypes.Data_list["_metadata"]) {
const warnings: string[] = [];
for (const [columnName, columnMetadata] of Object.entries(metadata)) {
// omit metadata for string or undefined types; the default "string" will be used at runtime
if (columnMetadata.type === "string" || columnMetadata.type === undefined) {
delete metadata[columnName];
}

/**
 * TODO - consider whether the check below is actually useful:
 * ensure all columns have a data type
 */
// if (columnMetadata.type === undefined) {
// metadata[columnName].type = "string";
// warnings.push(
// `"${columnName}" column cannot infer type (assume string)\nEither include an @metadata column or add example value to a row`
// );
// }
}
return { warnings, metadata };
}
}
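As a rough usage sketch of the two code paths above (inputs and expected outputs are assumptions based on the accompanying spec rather than additional documentation; the parser is constructed as in the spec file):

const parser = new DataListParser({ processedFlowHashmap: {} } as any);

// Hypothetical flow authored with a leading @metadata row: the row is consumed and removed,
// and each "type: ..." string is parsed into column metadata.
const declared = parser.postProcessFlow({
  flow_type: "data_list",
  flow_name: "example_declared",
  rows: [
    { id: "@metadata", score: "type: number" },
    { id: "row_1", score: 10 },
  ],
} as any);
// declared._metadata -> { score: { type: "number" } }; declared.rows.length -> 1

// Hypothetical flow without an @metadata row: types are inferred from the first row
// holding a defined value, and all-string columns yield no metadata at all.
const inferred = parser.postProcessFlow({
  flow_type: "data_list",
  flow_name: "example_inferred",
  rows: [{ id: "row_1", label: "hello", score: 10 }],
} as any);
// inferred._metadata -> { score: { type: "number" } }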
2 changes: 1 addition & 1 deletion packages/scripts/test/helpers/utils.ts
@@ -11,7 +11,7 @@ import { Logger } from "shared/src/utils/logging/console-logger";
* @param callOriginal specify whether to still call original Logger
* methods after spy intercept
*/
export function useMockLogger(callOriginal = true) {
export function useMockLogger(callOriginal = false) {
const error = jest.spyOn(Logger, "error");
const warning = jest.spyOn(Logger, "warning");
if (!callOriginal) {
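A minimal usage sketch of the helper after this change (assumed test body; imports and the message shape mirror the Logger calls shown earlier in this diff):

it("captures Logger warnings without console noise", () => {
  // callOriginal now defaults to false, so the spy no longer forwards to the real logger
  const loggerSpy = useMockLogger();
  Logger.warning({ msg1: "example warning", msg2: "example details" });
  expect(loggerSpy.warning).toHaveBeenCalledTimes(1);
});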
