Skip to content

Commit

Permalink
Distinguish error type/sources in alerting (elastic#173932)
Browse files Browse the repository at this point in the history
Resolves: elastic#168631

This PR intends to distinguish the error type/source (user or framework)
that are returned by the alerting plugin.
The type/source is used by the TaskManager to generate TaskRun metrics
(user_errors or framework_errors).


|Error type|Error description|
|---|---|
|USER| Disabled rule type error|
|USER| Validation errors|
|USER| Rule type executor errors|
|FRAMEWORK| Error getting decrypted rule|
|FRAMEWORK| Disabled rule errors|
|FRAMEWORK| Fetching RawRule errors|
|FRAMEWORK| Enqueue actions errors|
|FRAMEWORK| Fetching summarised alerts errors|
|FRAMEWORK| All the other rule run errors|

## To Verify:

Throw an error in one of the try-catch blocks (or anywhere in the run
method of the TaskTunner) and expect to see the metric counter to
increment the number of respective errors on the below page.

http://localhost:5601/api/task_manager/metrics?reset=false
  • Loading branch information
ersin-erdal authored Dec 29, 2023
1 parent 8ab2fab commit cf2bf53
Show file tree
Hide file tree
Showing 7 changed files with 175 additions and 37 deletions.
25 changes: 20 additions & 5 deletions x-pack/plugins/alerting/server/task_runner/execution_handler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@ import type { PublicMethodsOf } from '@kbn/utility-types';
import { Logger } from '@kbn/core/server';
import { ALERT_UUID, getRuleDetailsRoute, triggersActionsRoute } from '@kbn/rule-data-utils';
import { asSavedObjectExecutionSource } from '@kbn/actions-plugin/server';
import { isEphemeralTaskRejectedDueToCapacityError } from '@kbn/task-manager-plugin/server';
import {
createTaskRunError,
isEphemeralTaskRejectedDueToCapacityError,
TaskErrorSource,
} from '@kbn/task-manager-plugin/server';
import {
ExecuteOptions as EnqueueExecutionOptions,
ExecutionResponseItem,
Expand Down Expand Up @@ -360,10 +364,15 @@ export class ExecutionHandler<

if (!!bulkActions.length) {
for (const c of chunk(bulkActions, CHUNK_SIZE)) {
const response = await this.actionsClient!.bulkEnqueueExecution(c);
if (response.errors) {
let enqueueResponse;
try {
enqueueResponse = await this.actionsClient!.bulkEnqueueExecution(c);
} catch (e) {
throw createTaskRunError(e, TaskErrorSource.FRAMEWORK);
}
if (enqueueResponse.errors) {
bulkActionsResponse = bulkActionsResponse.concat(
response.items.filter(
enqueueResponse.items.filter(
(i) => i.response === ExecutionResponseType.QUEUED_ACTIONS_LIMIT_ERROR
)
);
Expand Down Expand Up @@ -730,7 +739,13 @@ export class ExecutionHandler<
executionUuid: this.executionId,
};
}
const alerts = await this.alertsClient.getSummarizedAlerts!(options);

let alerts;
try {
alerts = await this.alertsClient.getSummarizedAlerts!(options);
} catch (e) {
throw createTaskRunError(e, TaskErrorSource.FRAMEWORK);
}

/**
* We need to remove all new alerts with maintenance windows retrieved from
Expand Down
12 changes: 10 additions & 2 deletions x-pack/plugins/alerting/server/task_runner/rule_loader.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import { ErrorWithReason, getReasonFromError } from '../lib/error_with_reason';
import { alertingEventLoggerMock } from '../lib/alerting_event_logger/alerting_event_logger.mock';
import { mockedRawRuleSO, mockedRule } from './fixtures';
import { RULE_SAVED_OBJECT_TYPE } from '../saved_objects';
import { getErrorSource, TaskErrorSource } from '@kbn/task-manager-plugin/server/task_running';

// create mocks
const rulesClient = rulesClientMock.create();
Expand Down Expand Up @@ -142,6 +143,7 @@ describe('rule_loader', () => {
} catch (err) {
outcome = 'failure';
expect(getReasonFromError(err)).toBe(RuleExecutionStatusErrorReasons.Disabled);
expect(getErrorSource(err)).toBe(TaskErrorSource.FRAMEWORK);
}
expect(outcome).toBe('failure');
});
Expand All @@ -162,6 +164,7 @@ describe('rule_loader', () => {
outcome = 'failure';
expect(err.message).toBe('rule-type-not-enabled: 2112');
expect(getReasonFromError(err)).toBe(RuleExecutionStatusErrorReasons.License);
expect(getErrorSource(err)).toBe(TaskErrorSource.USER);
}
expect(outcome).toBe('failure');
});
Expand All @@ -178,6 +181,7 @@ describe('rule_loader', () => {
outcome = 'failure';
expect(err.message).toMatch('[bar]: expected value of type [boolean] but got [string]');
expect(getReasonFromError(err)).toBe(RuleExecutionStatusErrorReasons.Validate);
expect(getErrorSource(err)).toBe(TaskErrorSource.USER);
}
expect(outcome).toBe('failure');
});
Expand Down Expand Up @@ -229,8 +233,12 @@ describe('rule_loader', () => {
}
);

const promise = getRuleAttributes(context, ruleId, spaceId);
await expect(promise).rejects.toThrow('wops');
try {
await getRuleAttributes(context, ruleId, spaceId);
} catch (e) {
expect(e.message).toMatch('wops');
expect(getErrorSource(e)).toBe(TaskErrorSource.FRAMEWORK);
}
});
});

Expand Down
39 changes: 28 additions & 11 deletions x-pack/plugins/alerting/server/task_runner/rule_loader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@
*/

import { addSpaceIdToPath } from '@kbn/spaces-plugin/server';
import { CoreKibanaRequest, FakeRawRequest, Headers } from '@kbn/core/server';
import { CoreKibanaRequest, FakeRawRequest, Headers, SavedObject } from '@kbn/core/server';
import { PublicMethodsOf } from '@kbn/utility-types';
import {
LoadedIndirectParams,
LoadIndirectParamsResult,
} from '@kbn/task-manager-plugin/server/task';
import { createTaskRunError, TaskErrorSource } from '@kbn/task-manager-plugin/server';
import { TaskRunnerContext } from './task_runner_factory';
import { ErrorWithReason, validateRuleTypeParams } from '../lib';
import {
Expand Down Expand Up @@ -70,23 +71,33 @@ export function validateRule<Params extends RuleTypeParams>(
const { enabled, apiKey } = indirectParams;

if (!enabled) {
throw new ErrorWithReason(
RuleExecutionStatusErrorReasons.Disabled,
new Error(`Rule failed to execute because rule ran after it was disabled.`)
throw createTaskRunError(
new ErrorWithReason(
RuleExecutionStatusErrorReasons.Disabled,
new Error(`Rule failed to execute because rule ran after it was disabled.`)
),
TaskErrorSource.FRAMEWORK
);
}

alertingEventLogger.setRuleName(rule.name);
try {
ruleTypeRegistry.ensureRuleTypeEnabled(rule.alertTypeId);
} catch (err) {
throw new ErrorWithReason(RuleExecutionStatusErrorReasons.License, err);
throw createTaskRunError(
new ErrorWithReason(RuleExecutionStatusErrorReasons.License, err),
TaskErrorSource.USER
);
}

let validatedParams: Params;
try {
validatedParams = validateRuleTypeParams<Params>(rule.params, paramValidator);
} catch (err) {
throw new ErrorWithReason(RuleExecutionStatusErrorReasons.Validate, err);
throw createTaskRunError(
new ErrorWithReason(RuleExecutionStatusErrorReasons.Validate, err),
TaskErrorSource.USER
);
}

if (rule.monitoring) {
Expand Down Expand Up @@ -114,11 +125,17 @@ export async function getRuleAttributes<Params extends RuleTypeParams>(
): Promise<RuleData<Params>> {
const namespace = context.spaceIdToNamespace(spaceId);

const rawRule = await context.encryptedSavedObjectsClient.getDecryptedAsInternalUser<RawRule>(
RULE_SAVED_OBJECT_TYPE,
ruleId,
{ namespace }
);
let rawRule: SavedObject<RawRule>;

try {
rawRule = await context.encryptedSavedObjectsClient.getDecryptedAsInternalUser<RawRule>(
RULE_SAVED_OBJECT_TYPE,
ruleId,
{ namespace }
);
} catch (e) {
throw createTaskRunError(e, TaskErrorSource.FRAMEWORK);
}

const fakeRequest = getFakeKibanaRequest(context, spaceId, rawRule.attributes.apiKey);
const rulesClient = context.getRulesClientWithRequest(fakeRequest);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,11 @@ import {
RuleAction,
RuleAlertData,
} from '../types';
import { ConcreteTaskInstance, isUnrecoverableError } from '@kbn/task-manager-plugin/server';
import {
ConcreteTaskInstance,
isUnrecoverableError,
TaskErrorSource,
} from '@kbn/task-manager-plugin/server';
import { TaskRunnerContext } from './task_runner_factory';
import { TaskRunner } from './task_runner';
import { encryptedSavedObjectsMock } from '@kbn/encrypted-saved-objects-plugin/server/mocks';
Expand Down Expand Up @@ -83,6 +87,7 @@ import { getMockMaintenanceWindow } from '../data/maintenance_window/test_helper
import { alertsClientMock } from '../alerts_client/alerts_client.mock';
import { MaintenanceWindow } from '../application/maintenance_window/types';
import { RULE_SAVED_OBJECT_TYPE } from '../saved_objects';
import { getErrorSource } from '@kbn/task-manager-plugin/server/task_running';

jest.mock('uuid', () => ({
v4: () => '5f6aa57d-3e22-484e-bae8-cbed868f4d28',
Expand Down Expand Up @@ -1942,6 +1947,7 @@ describe('Task Runner', () => {
expect(loggerMeta?.tags).toEqual(['test', '1', 'rule-run-failed']);
expect(loggerMeta?.error?.stack_trace).toBeDefined();
expect(logger.error).toBeCalledTimes(1);
expect(getErrorSource(runnerResult.taskRunError as Error)).toBe(TaskErrorSource.USER);
});

test('recovers gracefully when the Rule Task Runner throws an exception when loading rule to prepare for run', async () => {
Expand Down Expand Up @@ -3326,6 +3332,7 @@ describe('Task Runner', () => {

expect(encryptedSavedObjectsClient.getDecryptedAsInternalUser).toHaveBeenCalledTimes(1);
expect(result).toEqual({ error });
expect(getErrorSource(result.error as Error)).toBe(TaskErrorSource.FRAMEWORK);
});

function testAlertingEventLogCalls({
Expand Down
45 changes: 29 additions & 16 deletions x-pack/plugins/alerting/server/task_runner/task_runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,33 +12,34 @@ import { v4 as uuidv4 } from 'uuid';
import { Logger } from '@kbn/core/server';
import {
ConcreteTaskInstance,
throwUnrecoverableError,
createTaskRunError,
TaskErrorSource,
throwUnrecoverableError,
} from '@kbn/task-manager-plugin/server';
import { nanosToMillis } from '@kbn/event-log-plugin/server';
import { DEFAULT_NAMESPACE_STRING } from '@kbn/core-saved-objects-utils-server';
import { getErrorSource } from '@kbn/task-manager-plugin/server/task_running';
import { ExecutionHandler, RunResult } from './execution_handler';
import { TaskRunnerContext } from './task_runner_factory';
import {
ElasticsearchError,
ErrorWithReason,
executionStatusFromError,
executionStatusFromState,
ruleExecutionStatusToRaw,
getNextRun,
isRuleSnoozed,
lastRunFromError,
getNextRun,
ruleExecutionStatusToRaw,
} from '../lib';
import {
RuleExecutionStatus,
RuleExecutionStatusErrorReasons,
IntervalSchedule,
RawRuleExecutionStatus,
RawRuleLastRun,
RawRuleMonitoring,
RuleExecutionStatus,
RuleExecutionStatusErrorReasons,
RuleTaskState,
RuleTypeRegistry,
RawRuleLastRun,
} from '../types';
import { asErr, asOk, isErr, isOk, map, resolveErr, Result } from '../lib/result_type';
import { taskInstanceToAlertTaskInstance } from './alert_task_instance';
Expand All @@ -47,18 +48,18 @@ import { partiallyUpdateRule, RULE_SAVED_OBJECT_TYPE } from '../saved_objects';
import {
AlertInstanceContext,
AlertInstanceState,
RuleTypeParams,
RuleTypeState,
parseDuration,
RawAlertInstance,
RuleLastRunOutcomeOrderMap,
RuleAlertData,
SanitizedRule,
RuleLastRunOutcomeOrderMap,
RuleNotifyWhen,
RuleTypeParams,
RuleTypeState,
SanitizedRule,
} from '../../common';
import { NormalizedRuleType, UntypedNormalizedRuleType } from '../rule_type_registry';
import { getEsErrorMessage } from '../lib/errors';
import { InMemoryMetrics, IN_MEMORY_METRICS } from '../monitoring';
import { IN_MEMORY_METRICS, InMemoryMetrics } from '../monitoring';
import {
RuleTaskInstance,
RuleTaskRunResult,
Expand Down Expand Up @@ -552,7 +553,10 @@ export class TaskRunner<
message: err,
stackTrace: err.stack,
};
throw new ErrorWithReason(RuleExecutionStatusErrorReasons.Execute, err);
throw createTaskRunError(
new ErrorWithReason(RuleExecutionStatusErrorReasons.Execute, err),
TaskErrorSource.USER
);
}
}

Expand Down Expand Up @@ -838,7 +842,10 @@ export class TaskRunner<
const data = await getRuleAttributes<Params>(this.context, ruleId, spaceId);
this.ruleData = { data };
} catch (err) {
const error = new ErrorWithReason(RuleExecutionStatusErrorReasons.Decrypt, err);
const error = createTaskRunError(
new ErrorWithReason(RuleExecutionStatusErrorReasons.Decrypt, err),
getErrorSource(err)
);
this.ruleData = { error };
}
return this.ruleData;
Expand Down Expand Up @@ -930,6 +937,14 @@ export class TaskRunner<
timings: this.timer.toJson(),
});

const getTaskRunError = (state: Result<RuleTaskStateAndMetrics, Error>) => {
return isErr(state)
? {
taskRunError: createTaskRunError(state.error, getErrorSource(state.error)),
}
: {};
};

return {
state: map<RuleTaskStateAndMetrics, ElasticsearchError, RuleTaskState>(
stateWithMetrics,
Expand Down Expand Up @@ -977,9 +992,7 @@ export class TaskRunner<
return { interval: retryInterval };
}),
monitoring: this.ruleMonitoring.getMonitoring(),
...(isErr(schedule)
? { taskRunError: createTaskRunError(schedule.error, TaskErrorSource.FRAMEWORK) }
: {}),
...getTaskRunError(stateWithMetrics),
};
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ interface TaskRunCounts extends JsonObject {
[TaskRunKeys.SUCCESS]: number;
[TaskRunKeys.NOT_TIMED_OUT]: number;
[TaskRunKeys.TOTAL]: number;
[TaskRunKeys.USER_ERRORS]: number;
[TaskRunKeys.FRAMEWORK_ERRORS]: number;
}

export interface TaskRunMetrics extends JsonObject {
Expand Down
Loading

0 comments on commit cf2bf53

Please sign in to comment.