Skip to content

Commit

Permalink
Refactored to use basic stats to generate more attributes
Browse files Browse the repository at this point in the history
  • Loading branch information
lancegliser committed Sep 25, 2024
1 parent 047a5fa commit c614e22
Show file tree
Hide file tree
Showing 5 changed files with 189 additions and 83 deletions.
2 changes: 1 addition & 1 deletion src/features/infer.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { FeatureAttributes, FeatureAttributesIndex } from "../types";

it.skip("TODO implement generic infer tests", () => {});
it.todo("TODO implement generic infer tests");

export const expectFeatureAttributesIndex = (index: FeatureAttributesIndex | undefined) => {
if (!index) {
Expand Down
7 changes: 5 additions & 2 deletions src/features/sources/Array.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,16 @@ import { InferFeatureAttributesFromArray } from "./Array";

describe("features/sources/Array", () => {
const now = new Date();
const yesterday = new Date();
yesterday.setDate(yesterday.getDate() - 1);
const columns = ["id", "number", "date", "boolean"];
const data: ArrayData = {
columns,
data: [
["0", 1.2, now.toISOString(), false],
["0", 1.2, yesterday.toISOString(), false],
["1", 2.4, now.toISOString(), true],
["3", 2.4, null, true],
["4", 5, now.toISOString(), true],
],
};

Expand All @@ -37,7 +40,7 @@ describe("features/sources/Array", () => {
expectFeatureAttributesIndex(features);

// Id
expect(features["id"].type).toBe("continuous");
expect(features["id"].type).toBe("nominal");
expect(features["id"].data_type).toBe("string");
// Number
expect(features["number"].type).toBe("continuous");
Expand Down
162 changes: 101 additions & 61 deletions src/features/sources/Array.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import {
isArrayData,
} from "../base";
import * as utils from "../utils";
import { FeatureSerializerBase, InferFeatureAttributesBase } from "./Base";
import { FeatureSerializerBase, InferFeatureAttributeFeatureStatistics, InferFeatureAttributesBase } from "./Base";

export class InferFeatureAttributesFromArray extends InferFeatureAttributesBase {
public static sourceFormat: FeatureSourceFormat = "array";
Expand All @@ -30,28 +30,61 @@ export class InferFeatureAttributesFromArray extends InferFeatureAttributesBase
}
}

protected inferType(featureName: string): FeatureAttributes["type"] {
protected async getStatistics(featureName: string): Promise<InferFeatureAttributeFeatureStatistics> {
if (this.statistics[featureName]) {
return this.statistics[featureName];
}

const index = this.dataset.columns.indexOf(featureName);
const values = this.dataset.data.reduce(
(values, data) => {
const value = data[index] || Infinity;
values[value] ||= 0;
values[value]++;
return values;
const { values, ...statistics } = this.dataset.data.reduce(
(statistics, data) => {
const value = data[index];
const isNull = utils.isNull(value);

// Unique value counts
const uniqueValue = value || Infinity;
statistics.values[uniqueValue] ||= 0;
statistics.values[uniqueValue]++;

// Bounds
statistics.hasNulls = statistics.hasNulls || isNull;
statistics.minimum =
statistics.minimum === undefined ? value : statistics.minimum > value ? value : statistics.minimum;
statistics.maximum =
statistics.maximum === undefined ? value : statistics.maximum < value ? value : statistics.maximum;

// Sample
statistics.samples ||= [];
if (statistics.samples.length <= 5 && !isNull) {
statistics.samples.push(value);
}

return statistics;
},
{
values: {},
} as Omit<InferFeatureAttributeFeatureStatistics, "uniqueValues" | "totalValues"> & {
values: Record<string | number, number>;
},
{} as Record<string | number, number>,
);
const totalValues = Object.values(values).reduce((sum, count) => sum + count, 0);
const uniqueValues = Object.keys(values).length;

return InferFeatureAttributesBase.isNominal(uniqueValues, totalValues) ? "nominal" : "continuous";
this.statistics[featureName] = {
...statistics,
totalValues,
unique: uniqueValues === totalValues,
uniqueValues,
};
return this.statistics[featureName];
}

/**
 * Infers the attributes of an integer feature.
 *
 * Integers get no dedicated handling here: the work is handed to
 * `inferFloat` and its result is returned unchanged.
 *
 * @param featureName Name of the feature (column) to inspect.
 * @returns The attributes produced by `inferFloat` for the same feature.
 */
protected async inferInteger(featureName: string): Promise<FeatureAttributes> {
  const attributes = await this.inferFloat(featureName);
  return attributes;
}

protected async inferFloat(featureName: string): Promise<FeatureAttributes> {
const { samples, totalValues, uniqueValues } = await this.getStatistics(featureName);
let decimal_places = 0;
let asNominal = false;

Expand All @@ -60,7 +93,6 @@ export class InferFeatureAttributesFromArray extends InferFeatureAttributesBase
decimal_places = Math.max(decimal_places, utils.precision(x[index]));
return x[index];
});
const numUnique = new Set(column).size;

const intLike = decimal_places === 0;
if (decimal_places >= 15) {
Expand All @@ -69,11 +101,11 @@ export class InferFeatureAttributesFromArray extends InferFeatureAttributesBase
}

// Detect if column should be treated as nominal
if (this.getSample(featureName) !== undefined) {
if (samples.at(0) !== undefined) {
if (intLike) {
asNominal = numUnique < Math.pow(column.length, 0.5);
asNominal = uniqueValues < Math.pow(column.length, 0.5);
} else {
asNominal = InferFeatureAttributesFromArray.isNominal(numUnique, column.length);
asNominal = uniqueValues <= 2 && totalValues > 10;
}
}

Expand All @@ -83,41 +115,55 @@ export class InferFeatureAttributesFromArray extends InferFeatureAttributesBase
data_type: "number",
decimal_places,
};
} else {
return {
type: "continuous",
data_type: "number",
decimal_places,
};
}
return {
type: "continuous",
data_type: "number",
decimal_places,
};
}

public async inferBounds(
attributes: Readonly<FeatureAttributes>,
featureName: string,
options: InferFeatureBoundsOptions,
): Promise<FeatureAttributes["bounds"]> {
let hasNull = false;
let isDate = false;
const { minimum, maximum, hasNulls, samples, uniqueValues, totalValues } = await this.getStatistics(featureName);

const sample = samples.at(0);
const isDate = sample instanceof Date;
const coercedDate = typeof sample !== "number" ? utils.coerceDate(sample) : undefined;
const output: FeatureAttributes["bounds"] = {};
const index = this.dataset.columns.indexOf(featureName);
const column = this.dataset.data.reduce((result, el) => {
if (utils.isNull(el[index])) {
// Exclude nulls
hasNull = true;
} else if (el[index] instanceof Date) {
result.push(el[index].getTime());
isDate = true;
const value = el[index];
if (!!coercedDate) {
const date = utils.coerceDate(value);
result.push(date?.getTime() ?? value);
} else {
result.push(el[index]);
result.push(value);
}
return result;
}, []);

if (attributes.type === "continuous") {
column.sort((a, b) => a - b || Number(isNaN(a)) - Number(isNaN(b)));
let minValue = column[0];
let maxValue = column[column.length - 1];
/**
 * Reduces a raw statistic (minimum / maximum) to a plain number.
 *
 * - numbers are returned untouched;
 * - non-empty strings and Date instances go through `utils.coerceDate`
 *   and yield that date's `getTime()` (or `undefined` when coercion
 *   produces nothing);
 * - everything else (null, undefined, empty string, ...) yields `undefined`.
 */
const getNumericValue = (value: string | number | Date | undefined | null): number | undefined => {
  switch (typeof value) {
    case "number":
      return value;
    case "string":
      // An empty string is treated as "no value" rather than coerced.
      return value ? utils.coerceDate(value)?.getTime() : undefined;
    default:
      return value instanceof Date ? utils.coerceDate(value)?.getTime() : undefined;
  }
};

let minValue = getNumericValue(minimum);
let maxValue = getNumericValue(maximum);

// Save original value
const actualMin = minValue;
const actualMax = maxValue;
Expand All @@ -133,8 +179,7 @@ export class InferFeatureAttributesFromArray extends InferFeatureAttributesBase
const { modeBounds = true } = options;
if (modeBounds || (Array.isArray(modeBounds) && modeBounds.indexOf(featureName) >= 0)) {
// Check for mode bounds
const numUnique = new Set(column).size;
if (numUnique !== column.length) {
if (uniqueValues !== totalValues) {
const [modes, modeCount] = utils.allModes(column);

// If the mode for the feature is same as an original bound, set that appropriate bound to the mode value
Expand All @@ -152,18 +197,32 @@ export class InferFeatureAttributesFromArray extends InferFeatureAttributesBase
}
}
}
// Convert back to date object
if (isDate) {
minValue = new Date(minValue);
maxValue = new Date(maxValue);
}

/**
 * Converts a numeric bound back into the representation of the sampled value.
 *
 * - `undefined` stays `undefined`;
 * - when the sample was a Date instance (`isDate`), a Date is returned;
 * - when the sample could be coerced to a date (`coercedDate`), an ISO 8601
 *   string is returned;
 * - otherwise the number is returned as is.
 */
const getBoundValue = (value: number | undefined): number | string | Date | undefined => {
  if (value === undefined) {
    return undefined;
  }
  if (!isDate && !coercedDate) {
    return value;
  }

  const asDate = new Date(value);
  // TODO There's a small concern here that we may need to go back into the format we found
  // For now we can't currently find anything that isn't ISO in the JS implementation
  return isDate ? asDate : asDate.toISOString();
};

// Set bounds
output["min"] = minValue;
output["max"] = maxValue;
output["min"] = getBoundValue(minValue);
output["max"] = getBoundValue(maxValue);
}
}

output["allow_null"] = hasNull;
output["allow_null"] = hasNulls;

if (Object.keys(output).length > 0) {
return output;
Expand All @@ -183,25 +242,6 @@ export class InferFeatureAttributesFromArray extends InferFeatureAttributesBase
throw new Error("Method not implemented.");
}

protected async inferUnique(
/* eslint-disable-next-line @typescript-eslint/no-unused-vars*/
featureName: string,
): Promise<boolean> {
// Arrays don't support unique constraints
return false;
}

protected getSample(featureName: string): any | undefined {
const index = this.dataset.columns.indexOf(featureName);
for (const row of this.dataset.data) {
if (utils.isNull(row[index])) {
continue;
}
return row[index];
}
return undefined;
}

/**
 * Lists the feature names available in the dataset.
 *
 * @returns The dataset's `columns`, or an empty array when `columns` is falsy.
 */
public async getFeatureNames(): Promise<string[]> {
  const { columns } = this.dataset;
  if (columns) {
    return columns;
  }
  return [];
}
Expand Down
Loading

0 comments on commit c614e22

Please sign in to comment.