Skip to content

Commit

Permalink
feat(core): Collect and aggregate metrics from the evaluation workflo…
Browse files Browse the repository at this point in the history
…w execution (no-changelog) (#11945)

Co-authored-by: Tomi Turtiainen <[email protected]>
  • Loading branch information
burivuhster and tomi committed Dec 2, 2024
1 parent be69f5c commit b5b95ff
Show file tree
Hide file tree
Showing 5 changed files with 183 additions and 11 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import { EvaluationMetrics } from '../evaluation-metrics.ee';

describe('EvaluationMetrics', () => {
test('should aggregate metrics correctly', () => {
const testMetricNames = new Set(['metric1', 'metric2']);
const metrics = new EvaluationMetrics(testMetricNames);

metrics.addResults({ metric1: 1, metric2: 0 });
metrics.addResults({ metric1: 0.5, metric2: 0.2 });

const aggregatedMetrics = metrics.getAggregatedMetrics();

expect(aggregatedMetrics).toEqual({ metric1: 0.75, metric2: 0.1 });
});

test('should aggregate only numbers', () => {
const testMetricNames = new Set(['metric1', 'metric2']);
const metrics = new EvaluationMetrics(testMetricNames);

metrics.addResults({ metric1: 1, metric2: 0 });
metrics.addResults({ metric1: '0.5', metric2: 0.2 });
metrics.addResults({ metric1: 'not a number', metric2: [1, 2, 3] });

const aggregatedUpMetrics = metrics.getAggregatedMetrics();

expect(aggregatedUpMetrics).toEqual({ metric1: 1, metric2: 0.1 });
});

test('should handle missing values', () => {
const testMetricNames = new Set(['metric1', 'metric2']);
const metrics = new EvaluationMetrics(testMetricNames);

metrics.addResults({ metric1: 1 });
metrics.addResults({ metric2: 0.2 });

const aggregatedMetrics = metrics.getAggregatedMetrics();

expect(aggregatedMetrics).toEqual({ metric1: 1, metric2: 0.2 });
});

test('should handle empty metrics', () => {
const testMetricNames = new Set(['metric1', 'metric2']);
const metrics = new EvaluationMetrics(testMetricNames);

const aggregatedMetrics = metrics.getAggregatedMetrics();

expect(aggregatedMetrics).toEqual({});
});

test('should handle empty testMetrics', () => {
const metrics = new EvaluationMetrics(new Set());

metrics.addResults({ metric1: 1, metric2: 0 });
metrics.addResults({ metric1: 0.5, metric2: 0.2 });

const aggregatedMetrics = metrics.getAggregatedMetrics();

expect(aggregatedMetrics).toEqual({});
});

test('should ignore non-relevant values', () => {
const testMetricNames = new Set(['metric1']);
const metrics = new EvaluationMetrics(testMetricNames);

metrics.addResults({ metric1: 1, notRelevant: 0 });
metrics.addResults({ metric1: 0.5, notRelevant2: { foo: 'bar' } });

const aggregatedMetrics = metrics.getAggregatedMetrics();

expect(aggregatedMetrics).toEqual({ metric1: 0.75 });
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,12 @@
"name": "success",
"value": true,
"type": "boolean"
},
{
"id": "877d1bf8-31a7-4571-9293-a6837b51d22b",
"name": "metric1",
"value": 0.1,
"type": "number"
}
]
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,17 @@ import type { SelectQueryBuilder } from '@n8n/typeorm';
import { stringify } from 'flatted';
import { readFileSync } from 'fs';
import { mock, mockDeep } from 'jest-mock-extended';
import type { IRun } from 'n8n-workflow';
import type { GenericValue, IRun } from 'n8n-workflow';
import path from 'path';

import type { ActiveExecutions } from '@/active-executions';
import type { ExecutionEntity } from '@/databases/entities/execution-entity';
import type { TestDefinition } from '@/databases/entities/test-definition.ee';
import type { TestMetric } from '@/databases/entities/test-metric.ee';
import type { TestRun } from '@/databases/entities/test-run.ee';
import type { User } from '@/databases/entities/user';
import type { ExecutionRepository } from '@/databases/repositories/execution.repository';
import type { TestMetricRepository } from '@/databases/repositories/test-metric.repository.ee';
import type { TestRunRepository } from '@/databases/repositories/test-run.repository.ee';
import type { WorkflowRepository } from '@/databases/repositories/workflow.repository';
import type { WorkflowRunner } from '@/workflow-runner';
Expand Down Expand Up @@ -58,12 +60,38 @@ function mockExecutionData() {
});
}

function mockEvaluationExecutionData(metrics: Record<string, GenericValue>) {
return mock<IRun>({
data: {
resultData: {
lastNodeExecuted: 'lastNode',
runData: {
lastNode: [
{
data: {
main: [
[
{
json: metrics,
},
],
],
},
},
],
},
},
},
});
}

describe('TestRunnerService', () => {
const executionRepository = mock<ExecutionRepository>();
const workflowRepository = mock<WorkflowRepository>();
const workflowRunner = mock<WorkflowRunner>();
const activeExecutions = mock<ActiveExecutions>();
const testRunRepository = mock<TestRunRepository>();
const testMetricRepository = mock<TestMetricRepository>();

beforeEach(() => {
const executionsQbMock = mockDeep<SelectQueryBuilder<ExecutionEntity>>({
Expand All @@ -80,6 +108,11 @@ describe('TestRunnerService', () => {
.mockResolvedValueOnce(executionMocks[1]);

testRunRepository.createTestRun.mockResolvedValue(mock<TestRun>({ id: 'test-run-id' }));

testMetricRepository.find.mockResolvedValue([
mock<TestMetric>({ name: 'metric1' }),
mock<TestMetric>({ name: 'metric2' }),
]);
});

afterEach(() => {
Expand All @@ -97,6 +130,7 @@ describe('TestRunnerService', () => {
executionRepository,
activeExecutions,
testRunRepository,
testMetricRepository,
);

expect(testRunnerService).toBeInstanceOf(TestRunnerService);
Expand All @@ -109,6 +143,7 @@ describe('TestRunnerService', () => {
executionRepository,
activeExecutions,
testRunRepository,
testMetricRepository,
);

workflowRepository.findById.calledWith('workflow-under-test-id').mockResolvedValueOnce({
Expand Down Expand Up @@ -143,6 +178,7 @@ describe('TestRunnerService', () => {
executionRepository,
activeExecutions,
testRunRepository,
testMetricRepository,
);

workflowRepository.findById.calledWith('workflow-under-test-id').mockResolvedValueOnce({
Expand All @@ -166,17 +202,17 @@ describe('TestRunnerService', () => {
.mockResolvedValue(mockExecutionData());

activeExecutions.getPostExecutePromise
.calledWith('some-execution-id-2')
.calledWith('some-execution-id-3')
.mockResolvedValue(mockExecutionData());

// Mock executions of evaluation workflow
activeExecutions.getPostExecutePromise
.calledWith('some-execution-id-3')
.mockResolvedValue(mockExecutionData());
.calledWith('some-execution-id-2')
.mockResolvedValue(mockEvaluationExecutionData({ metric1: 1, metric2: 0 }));

activeExecutions.getPostExecutePromise
.calledWith('some-execution-id-4')
.mockResolvedValue(mockExecutionData());
.mockResolvedValue(mockEvaluationExecutionData({ metric1: 0.5 }));

await testRunnerService.runTest(
mock<User>(),
Expand Down Expand Up @@ -225,7 +261,8 @@ describe('TestRunnerService', () => {
expect(testRunRepository.markAsRunning).toHaveBeenCalledWith('test-run-id');
expect(testRunRepository.markAsCompleted).toHaveBeenCalledTimes(1);
expect(testRunRepository.markAsCompleted).toHaveBeenCalledWith('test-run-id', {
success: false,
metric1: 0.75,
metric2: 0,
});
});
});
32 changes: 32 additions & 0 deletions packages/cli/src/evaluation/test-runner/evaluation-metrics.ee.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import type { IDataObject } from 'n8n-workflow';

export class EvaluationMetrics {
private readonly rawMetricsByName = new Map<string, number[]>();

constructor(private readonly metricNames: Set<string>) {
for (const metricName of metricNames) {
this.rawMetricsByName.set(metricName, []);
}
}

addResults(result: IDataObject) {
for (const [metricName, metricValue] of Object.entries(result)) {
if (typeof metricValue === 'number' && this.metricNames.has(metricName)) {
this.rawMetricsByName.get(metricName)!.push(metricValue);
}
}
}

getAggregatedMetrics() {
const aggregatedMetrics: Record<string, number> = {};

for (const [metricName, metricValues] of this.rawMetricsByName.entries()) {
if (metricValues.length > 0) {
const metricSum = metricValues.reduce((acc, val) => acc + val, 0);
aggregatedMetrics[metricName] = metricSum / metricValues.length;
}
}

return aggregatedMetrics;
}
}
35 changes: 30 additions & 5 deletions packages/cli/src/evaluation/test-runner/test-runner.service.ee.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@ import type { TestDefinition } from '@/databases/entities/test-definition.ee';
import type { User } from '@/databases/entities/user';
import type { WorkflowEntity } from '@/databases/entities/workflow-entity';
import { ExecutionRepository } from '@/databases/repositories/execution.repository';
import { TestMetricRepository } from '@/databases/repositories/test-metric.repository.ee';
import { TestRunRepository } from '@/databases/repositories/test-run.repository.ee';
import { WorkflowRepository } from '@/databases/repositories/workflow.repository';
import { getRunData } from '@/workflow-execute-additional-data';
import { WorkflowRunner } from '@/workflow-runner';

import { EvaluationMetrics } from './evaluation-metrics.ee';
import { createPinData, getPastExecutionStartNode } from './utils.ee';

/**
Expand All @@ -40,6 +42,7 @@ export class TestRunnerService {
private readonly executionRepository: ExecutionRepository,
private readonly activeExecutions: ActiveExecutions,
private readonly testRunRepository: TestRunRepository,
private readonly testMetricRepository: TestMetricRepository,
) {}

/**
Expand Down Expand Up @@ -113,6 +116,11 @@ export class TestRunnerService {
return await executePromise;
}

/**
* Evaluation result is the first item in the output of the last node
* executed in the evaluation workflow. Defaults to an empty object
* in case the node doesn't produce any output items.
*/
private extractEvaluationResult(execution: IRun): IDataObject {
const lastNodeExecuted = execution.data.resultData.lastNodeExecuted;
assert(lastNodeExecuted, 'Could not find the last node executed in evaluation workflow');
Expand All @@ -124,6 +132,21 @@ export class TestRunnerService {
return mainConnectionData?.[0]?.json ?? {};
}

/**
* Get the metrics to collect from the evaluation workflow execution results.
*/
private async getTestMetricNames(testDefinitionId: string) {
const metrics = await this.testMetricRepository.find({
where: {
testDefinition: {
id: testDefinitionId,
},
},
});

return new Set(metrics.map((m) => m.name));
}

/**
* Creates a new test run for the given test definition.
*/
Expand Down Expand Up @@ -152,11 +175,15 @@ export class TestRunnerService {
.andWhere('execution.workflowId = :workflowId', { workflowId: test.workflowId })
.getMany();

// Get the metrics to collect from the evaluation workflow
const testMetricNames = await this.getTestMetricNames(test.id);

// 2. Run over all the test cases

await this.testRunRepository.markAsRunning(testRun.id);

const metrics = [];
// Object to collect the results of the evaluation workflow executions
const metrics = new EvaluationMetrics(testMetricNames);

for (const { id: pastExecutionId } of pastExecutions) {
// Fetch past execution with data
Expand Down Expand Up @@ -192,12 +219,10 @@ export class TestRunnerService {
assert(evalExecution);

// Extract the output of the last node executed in the evaluation workflow
metrics.push(this.extractEvaluationResult(evalExecution));
metrics.addResults(this.extractEvaluationResult(evalExecution));
}

// TODO: 3. Aggregate the results
// Now we just set success to true if all the test cases passed
const aggregatedMetrics = { success: metrics.every((metric) => metric.success) };
const aggregatedMetrics = metrics.getAggregatedMetrics();

await this.testRunRepository.markAsCompleted(testRun.id, aggregatedMetrics);
}
Expand Down

0 comments on commit b5b95ff

Please sign in to comment.