From c614e22c098614ff64bcd1adc0e0db36da856e57 Mon Sep 17 00:00:00 2001 From: Lance Gliser Date: Wed, 25 Sep 2024 10:34:42 -0500 Subject: [PATCH] Refactored to use basic stats to generate more attributes --- src/features/infer.test.ts | 2 +- src/features/sources/Array.test.ts | 7 +- src/features/sources/Array.ts | 162 ++++++++++++++++++----------- src/features/sources/Base.ts | 81 +++++++++++---- src/features/utils.ts | 20 ++++ 5 files changed, 189 insertions(+), 83 deletions(-) diff --git a/src/features/infer.test.ts b/src/features/infer.test.ts index 106f471..6417b41 100644 --- a/src/features/infer.test.ts +++ b/src/features/infer.test.ts @@ -1,6 +1,6 @@ import { FeatureAttributes, FeatureAttributesIndex } from "../types"; -it.skip("TODO implement generic infer tests", () => {}); +it.todo("TODO implement generic infer tests"); export const expectFeatureAttributesIndex = (index: FeatureAttributesIndex | undefined) => { if (!index) { diff --git a/src/features/sources/Array.test.ts b/src/features/sources/Array.test.ts index 3637317..40b2237 100644 --- a/src/features/sources/Array.test.ts +++ b/src/features/sources/Array.test.ts @@ -6,13 +6,16 @@ import { InferFeatureAttributesFromArray } from "./Array"; describe("features/sources/Array", () => { const now = new Date(); + const yesterday = new Date(); + yesterday.setDate(yesterday.getDate() - 1); const columns = ["id", "number", "date", "boolean"]; const data: ArrayData = { columns, data: [ - ["0", 1.2, now.toISOString(), false], + ["0", 1.2, yesterday.toISOString(), false], ["1", 2.4, now.toISOString(), true], ["3", 2.4, null, true], + ["4", 5, now.toISOString(), true], ], }; @@ -37,7 +40,7 @@ describe("features/sources/Array", () => { expectFeatureAttributesIndex(features); // Id - expect(features["id"].type).toBe("continuous"); + expect(features["id"].type).toBe("nominal"); expect(features["id"].data_type).toBe("string"); // Number expect(features["number"].type).toBe("continuous"); diff --git a/src/features/sources/Array.ts b/src/features/sources/Array.ts index 626feae..798edf7 100644 --- a/src/features/sources/Array.ts +++ b/src/features/sources/Array.ts @@ -8,7 +8,7 @@ import { isArrayData, } from "../base"; import * as utils from "../utils"; -import { FeatureSerializerBase, InferFeatureAttributesBase } from "./Base"; +import { FeatureSerializerBase, InferFeatureAttributeFeatureStatistics, InferFeatureAttributesBase } from "./Base"; export class InferFeatureAttributesFromArray extends InferFeatureAttributesBase { public static sourceFormat: FeatureSourceFormat = "array"; @@ -30,21 +30,53 @@ export class InferFeatureAttributesFromArray extends InferFeatureAttributesBase } } - protected inferType(featureName: string): FeatureAttributes["type"] { + protected async getStatistics(featureName: string): Promise { + if (this.statistics[featureName]) { + return this.statistics[featureName]; + } + const index = this.dataset.columns.indexOf(featureName); - const values = this.dataset.data.reduce( - (values, data) => { - const value = data[index] || Infinity; - values[value] ||= 0; - values[value]++; - return values; + const { values, ...statistics } = this.dataset.data.reduce( + (statistics, data) => { + const value = data[index]; + const isNull = utils.isNull(value); + + // Unique value counts + const uniqueValue = value || Infinity; + statistics.values[uniqueValue] ||= 0; + statistics.values[uniqueValue]++; + + // Bounds + statistics.hasNulls = statistics.hasNulls || isNull; + statistics.minimum = + statistics.minimum === undefined ? value : statistics.minimum > value ? value : statistics.minimum; + statistics.maximum = + statistics.maximum === undefined ? value : statistics.maximum < value ? value : statistics.maximum; + + // Sample + statistics.samples ||= []; + if (statistics.samples.length <= 5 && !isNull) { + statistics.samples.push(value); + } + + return statistics; + }, + { + values: {}, + } as Omit & { + values: Record; }, - {} as Record, ); const totalValues = Object.values(values).reduce((sum, count) => sum + count, 0); const uniqueValues = Object.keys(values).length; - return InferFeatureAttributesBase.isNominal(uniqueValues, totalValues) ? "nominal" : "continuous"; + this.statistics[featureName] = { + ...statistics, + totalValues, + unique: uniqueValues === totalValues, + uniqueValues, + }; + return this.statistics[featureName]; } protected async inferInteger(featureName: string): Promise { @@ -52,6 +84,7 @@ export class InferFeatureAttributesFromArray extends InferFeatureAttributesBase } protected async inferFloat(featureName: string): Promise { + const { samples, totalValues, uniqueValues } = await this.getStatistics(featureName); let decimal_places = 0; let asNominal = false; @@ -60,7 +93,6 @@ export class InferFeatureAttributesFromArray extends InferFeatureAttributesBase decimal_places = Math.max(decimal_places, utils.precision(x[index])); return x[index]; }); - const numUnique = new Set(column).size; const intLike = decimal_places === 0; if (decimal_places >= 15) { @@ -69,11 +101,11 @@ export class InferFeatureAttributesFromArray extends InferFeatureAttributesBase } // Detect if column should be treated as nominal - if (this.getSample(featureName) !== undefined) { + if (samples.at(0) !== undefined) { if (intLike) { - asNominal = numUnique < Math.pow(column.length, 0.5); + asNominal = uniqueValues < Math.pow(column.length, 0.5); } else { - asNominal = InferFeatureAttributesFromArray.isNominal(numUnique, column.length); + asNominal = uniqueValues <= 2 && totalValues > 10; } } @@ -83,13 +115,12 @@ export class InferFeatureAttributesFromArray extends InferFeatureAttributesBase data_type: "number", decimal_places, }; - } else { - return { - type: "continuous", - data_type: "number", - decimal_places, - }; } + return { + type: "continuous", + data_type: "number", + decimal_places, + }; } public async inferBounds( @@ -97,27 +128,42 @@ export class InferFeatureAttributesFromArray extends InferFeatureAttributesBase featureName: string, options: InferFeatureBoundsOptions, ): Promise { - let hasNull = false; - let isDate = false; + const { minimum, maximum, hasNulls, samples, uniqueValues, totalValues } = await this.getStatistics(featureName); + + const sample = samples.at(0); + const isDate = sample instanceof Date; + const coercedDate = typeof sample !== "number" ? utils.coerceDate(sample) : undefined; const output: FeatureAttributes["bounds"] = {}; const index = this.dataset.columns.indexOf(featureName); const column = this.dataset.data.reduce((result, el) => { - if (utils.isNull(el[index])) { - // Exclude nulls - hasNull = true; - } else if (el[index] instanceof Date) { - result.push(el[index].getTime()); - isDate = true; + const value = el[index]; + if (!!coercedDate) { + const date = utils.coerceDate(value); + result.push(date?.getTime() ?? value); } else { - result.push(el[index]); + result.push(value); } return result; }, []); if (attributes.type === "continuous") { - column.sort((a, b) => a - b || Number(isNaN(a)) - Number(isNaN(b))); - let minValue = column[0]; - let maxValue = column[column.length - 1]; + const getNumericValue = (value: string | number | Date | undefined | null): number | undefined => { + if (typeof value === "number") { + return value; + } + if (!value) { + return undefined; + } + if (typeof value === "string" || value instanceof Date) { + const date = utils.coerceDate(value); + return date?.getTime(); + } + return undefined; + }; + + let minValue = getNumericValue(minimum); + let maxValue = getNumericValue(maximum); + // Save original value const actualMin = minValue; const actualMax = maxValue; @@ -133,8 +179,7 @@ export class InferFeatureAttributesFromArray extends InferFeatureAttributesBase const { modeBounds = true } = options; if (modeBounds || (Array.isArray(modeBounds) && modeBounds.indexOf(featureName) >= 0)) { // Check for mode bounds - const numUnique = new Set(column).size; - if (numUnique !== column.length) { + if (uniqueValues !== totalValues) { const [modes, modeCount] = utils.allModes(column); // If the mode for the feature is same as an original bound, set that appropriate bound to the mode value @@ -152,18 +197,32 @@ export class InferFeatureAttributesFromArray extends InferFeatureAttributesBase } } } - // Convert back to date object - if (isDate) { - minValue = new Date(minValue); - maxValue = new Date(maxValue); - } + + const getBoundValue = (value: number | undefined): number | string | Date | undefined => { + if (value === undefined) { + return undefined; + } + + if (isDate) { + return new Date(value); + } + + if (!!coercedDate) { + // TODO There's a small concern here that we may need to go back into the format we found + // For now we can't currently find anything that isn't ISO in the JS implementation + return new Date(value).toISOString(); + } + + return value; + }; + // Set bounds - output["min"] = minValue; - output["max"] = maxValue; + output["min"] = getBoundValue(minValue); + output["max"] = getBoundValue(maxValue); } } - output["allow_null"] = hasNull; + output["allow_null"] = hasNulls; if (Object.keys(output).length > 0) { return output; @@ -183,25 +242,6 @@ export class InferFeatureAttributesFromArray extends InferFeatureAttributesBase throw new Error("Method not implemented."); } - protected async inferUnique( - /* eslint-disable-next-line @typescript-eslint/no-unused-vars*/ - featureName: string, - ): Promise { - // Arrays don't support unique constraints - return false; - } - - protected getSample(featureName: string): any | undefined { - const index = this.dataset.columns.indexOf(featureName); - for (const row of this.dataset.data) { - if (utils.isNull(row[index])) { - continue; - } - return row[index]; - } - return undefined; - } - public async getFeatureNames(): Promise { return this.dataset.columns || []; } diff --git a/src/features/sources/Base.ts b/src/features/sources/Base.ts index 6d15140..029f106 100644 --- a/src/features/sources/Base.ts +++ b/src/features/sources/Base.ts @@ -6,6 +6,17 @@ import { InferFeatureBoundsOptions, InferFeatureTimeSeriesOptions, } from "../base"; +import { coerceDate } from "../utils"; + +export type InferFeatureAttributeFeatureStatistics = { + uniqueValues: number; + totalValues: number; + minimum: string | number | Date; + maximum: string | number | Date; + hasNulls: boolean; + unique: boolean; + samples: (string | number | Date)[]; +}; export abstract class InferFeatureAttributesBase { public static sourceFormat: FeatureSourceFormat; @@ -17,10 +28,9 @@ export abstract class InferFeatureAttributesBase { throw new Error("InferFeatureAttributesBase must be implemented in non-abstract classes"); } - public static isNominal(uniqueValues: number, totalValues: number): boolean { - return uniqueValues <= 2 && totalValues > 10; - } - protected abstract inferType(featureName: string): FeatureAttributes["type"]; + protected statistics: Record = {}; + /** Returns cached basic statistics for the feature */ + protected abstract getStatistics(featureName: string): Promise; /* Entrypoint */ public async infer(options: InferFeatureAttributesOptions = {}): Promise { @@ -28,6 +38,21 @@ export abstract class InferFeatureAttributesBase { const { ordinalFeatureValues = {}, dependentFeatures = {} } = options; const columns = await this.getFeatureNames(); + // Generate initial statistics + const statisticsItems = await Promise.all( + columns.map(async (featureName) => ({ + featureName, + statistics: await this.getStatistics(featureName), + })), + ); + this.statistics = statisticsItems.reduce( + (allStatistics, { featureName, statistics }) => { + allStatistics[featureName] = statistics; + return allStatistics; + }, + {} as Record, + ); + // Determine base feature attributes for (let i = 0; i < columns.length; i++) { const feature = columns[i]; @@ -117,7 +142,7 @@ export abstract class InferFeatureAttributesBase { } if (options.includeSample) { - attributes[feature].sample = this.getSample(feature); + attributes[feature].sample = await this.getSample(feature); } } @@ -126,7 +151,7 @@ export abstract class InferFeatureAttributesBase { /* Feature types */ protected async getOriginalFeatureType(featureName: string): Promise { - const value = this.getSample(featureName); + const value = await this.getSample(featureName); const dataType = typeof value; switch (dataType) { case "bigint": @@ -151,13 +176,13 @@ export abstract class InferFeatureAttributesBase { featureName: string, value?: string, ): Promise { - value ||= this.getSample(featureName); + value ||= await this.getSample(featureName); if (!value) { return undefined; } - const dateParsed = value.match(/^[\d]{4}-[\d]{2}-[\d]{2}/) ? Date.parse(value) : 0; - if (dateParsed > 0) { + const date = coerceDate(value); + if (date) { return { data_type: "datetime" }; } @@ -184,32 +209,44 @@ export abstract class InferFeatureAttributesBase { }; } - protected async inferDateTime(featureName: string): Promise { + protected async inferDateTime( + /* eslint-disable-next-line @typescript-eslint/no-unused-vars*/ + featureName: string, + ): Promise { return { - type: this.inferType(featureName), + type: "continuous", data_type: "formatted_date_time", date_time_format: "%Y-%m-%dT%H:%M:%SZ", }; } - protected async inferDate(featureName: string): Promise { + protected async inferDate( + /* eslint-disable-next-line @typescript-eslint/no-unused-vars*/ + featureName: string, + ): Promise { return { - type: this.inferType(featureName), + type: "continuous", data_type: "formatted_date_time", date_time_format: "%Y-%m-%d", }; } - protected async inferTime(featureName: string): Promise { + protected async inferTime( + /* eslint-disable-next-line @typescript-eslint/no-unused-vars*/ + featureName: string, + ): Promise { return { - type: this.inferType(featureName), + type: "continuous", data_type: "string", }; } - protected async inferString(featureName: string): Promise { + protected async inferString( + /* eslint-disable-next-line @typescript-eslint/no-unused-vars*/ + featureName: string, + ): Promise { return { - type: this.inferType(featureName), + type: "nominal", data_type: "string", }; } @@ -224,7 +261,11 @@ export abstract class InferFeatureAttributesBase { } /* Feature properties */ - protected abstract inferUnique(featureName: string): Promise; + + protected async inferUnique(featureName: string): Promise { + const { unique } = await this.getStatistics(featureName); + return unique; + } public abstract inferBounds( attributes: Readonly, featureName: string, @@ -236,7 +277,9 @@ export abstract class InferFeatureAttributesBase { options: InferFeatureTimeSeriesOptions, ): Promise>; - protected abstract getSample(featureName: string): any | undefined; + protected async getSample(featureName: string): Promise { + return this.statistics[featureName]?.samples.at(0); + } /* Descriptive operations */ public abstract getFeatureNames(): Promise; diff --git a/src/features/utils.ts b/src/features/utils.ts index bff55a8..29bf08a 100644 --- a/src/features/utils.ts +++ b/src/features/utils.ts @@ -88,3 +88,23 @@ export function allModes(values: T[]): [T[], number] { return [modeValues, mode]; } + +/** Attempts to get a Date object from a string or Date */ +export const coerceDate = (value: string | Date | null | undefined): Date | undefined => { + if (!value || isNull(value)) { + return undefined; + } + + if (value instanceof Date) { + return value; + } + + const dateParsed = + // "1", "2" etc as strings count as dates to Date.parse, we'll need a minimum standard to avoid that. + value.match(/^[\d]{4}-[\d]{2}-[\d]{2}/) ? Date.parse(value) : 0; + if (dateParsed === 0) { + return undefined; + } + + return new Date(dateParsed); +};