Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

basic alerting, for internal use primarily #778

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docker-compose-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ services:
dockerfile: stacks/flow.Dockerfile
target: flow-snapshot-worker
environment:
<<: [*flow-worker-env]
<<: [*catalog-config, *flow-worker-env]
depends_on:
temporal-admin-tools:
condition: service_healthy
Expand Down
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ services:
container_name: flow-snapshot-worker
image: ghcr.io/peerdb-io/flow-snapshot-worker:latest-dev
environment:
<<: [*flow-worker-env]
<<: [*catalog-config, *flow-worker-env]
depends_on:
temporal-admin-tools:
condition: service_healthy
Expand Down
43 changes: 43 additions & 0 deletions flow/activities/flowable.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ import (
"database/sql"
"errors"
"fmt"
"os"
"runtime"
"strconv"
"strings"
"sync"
"time"

Expand All @@ -17,6 +21,7 @@ import (
"github.com/PeerDB-io/peer-flow/generated/protos"
"github.com/PeerDB-io/peer-flow/model"
"github.com/PeerDB-io/peer-flow/shared"
"github.com/PeerDB-io/peer-flow/utils/evervigil"
"github.com/jackc/pglogrepl"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgtype"
Expand All @@ -41,6 +46,25 @@ type SlotSnapshotSignal struct {

// FlowableActivity carries the dependencies shared by the flow activities
// that are registered on the worker.
type FlowableActivity struct {
// CatalogPool is the connection pool to the PeerDB catalog.
CatalogPool *pgxpool.Pool
// Vigil is the alerting service used to report activity failures and
// replication slot lag.
Vigil *evervigil.EverVigil
}

// currentFunction returns the unqualified name of the function two frames
// above it on the call stack, i.e. the caller of whichever helper invoked
// currentFunction. It returns "unknown" when that frame cannot be resolved.
func currentFunction() string {
	// Skip two frames: currentFunction itself and the helper that called it.
	pc, _, _, ok := runtime.Caller(2)
	if !ok {
		return "unknown"
	}

	// FuncForPC returns nil when the PC does not map to a known function;
	// guard against dereferencing it.
	fn := runtime.FuncForPC(pc)
	if fn == nil {
		return "unknown"
	}

	// Names look like "pkg/path.(*Type).Method"; keep what follows the last
	// dot (or the whole name when there is no dot).
	name := fn.Name()
	return name[strings.LastIndex(name, ".")+1:]
}

// vigilForActivityFailures raises an alert through a.Vigil when err is
// non-nil. The alert key is "<flowJobName>-<calling function>-failed" and the
// message is the error text wrapped in a code fence. A nil err is a no-op.
func (a *FlowableActivity) vigilForActivityFailures(flowJobName string, err error) {
	if err == nil {
		return
	}

	// currentFunction has to be called directly from this method: it walks a
	// fixed number of stack frames to find the function that reported err.
	alertKey := fmt.Sprintf("%s-%s-failed", flowJobName, currentFunction())
	alertMessage := fmt.Sprintf("```%s```", err.Error())
	a.Vigil.AlertIf(alertKey, alertMessage)
}

// CheckConnection implements CheckConnection.
Expand Down Expand Up @@ -173,6 +197,21 @@ func (a *FlowableActivity) handleSlotInfo(
return err
}

slotLagInMBThresholdStr, ok := os.LookupEnv("PEERDB_SLOT_LAG_MB_ALERT_THRESHOLD")
if ok {
slotLagInMBThreshold, err := strconv.ParseInt(slotLagInMBThresholdStr, 10, 64)
if err != nil {
log.Warnf("failed to parse PEERDB_SLOT_LAG_MB_ALERT_THRESHOLD as integer!")
return nil
}

if int64(slotInfo[0].LagInMb) >= slotLagInMBThreshold {
a.Vigil.AlertIf(fmt.Sprintf("%s-slot-size-exceeded", peerName),
fmt.Sprintf("Slot %s on peer %s has exceeded threshold size of %dMB, currently at %fMB!",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
fmt.Sprintf("Slot %s on peer %s has exceeded threshold size of %dMB, currently at %fMB!",
fmt.Sprintf("Slot %s on peer %s has exceeded threshold size of %dMB, currently at %.2fMB!",

slotName, peerName, slotLagInMBThreshold, slotInfo[0].LagInMb))
}
}

if len(slotInfo) != 0 {
return monitoring.AppendSlotSizeInfo(ctx, a.CatalogPool, peerName, slotInfo[0])
}
Expand All @@ -185,6 +224,10 @@ func (a *FlowableActivity) recordSlotSizePeriodically(
slotName string,
peerName string,
) {
err := a.handleSlotInfo(ctx, srcConn, slotName, peerName)
if err != nil {
return
}
timeout := 10 * time.Minute
ticker := time.NewTicker(timeout)

Expand Down
11 changes: 9 additions & 2 deletions flow/cmd/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/PeerDB-io/peer-flow/activities"
utils "github.com/PeerDB-io/peer-flow/connectors/utils/catalog"
"github.com/PeerDB-io/peer-flow/shared"
"github.com/PeerDB-io/peer-flow/utils/evervigil"
peerflow "github.com/PeerDB-io/peer-flow/workflows"

"github.com/grafana/pyroscope-go"
Expand Down Expand Up @@ -103,10 +104,15 @@ func WorkerMain(opts *WorkerOptions) error {
clientOptions.ConnectionOptions = connOptions
}

conn, err := utils.GetCatalogConnectionPoolFromEnv()
catalogPool, err := utils.GetCatalogConnectionPoolFromEnv()
if err != nil {
return fmt.Errorf("unable to create catalog connection pool: %w", err)
}
vigil, err := evervigil.NewVigil(catalogPool)
if err != nil {
return fmt.Errorf("unable to create Vigil: %w", err)
}
defer vigil.Close()

c, err := client.Dial(clientOptions)
if err != nil {
Expand All @@ -131,7 +137,8 @@ func WorkerMain(opts *WorkerOptions) error {
w.RegisterWorkflow(peerflow.DropFlowWorkflow)
w.RegisterWorkflow(peerflow.HeartbeatFlowWorkflow)
w.RegisterActivity(&activities.FlowableActivity{
CatalogPool: conn,
CatalogPool: catalogPool,
Vigil: vigil,
})

err = w.Run(worker.InterruptCh())
Expand Down
3 changes: 0 additions & 3 deletions flow/connectors/core.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,6 @@ type CDCPullConnector interface {
// PullFlowCleanup drops both the Postgres publication and replication slot, as a part of DROP MIRROR
PullFlowCleanup(jobName string) error

// SendWALHeartbeat allows for activity to progress restart_lsn on postgres.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this being removed because it's unused? Should be split out to another PR

SendWALHeartbeat() error

// GetSlotInfo returns the WAL (or equivalent) info of a slot for the connector.
GetSlotInfo(slotName string) ([]*protos.SlotInfo, error)
}
Expand Down
16 changes: 0 additions & 16 deletions flow/connectors/postgres/postgres.go
Original file line number Diff line number Diff line change
Expand Up @@ -892,19 +892,3 @@ func (c *PostgresConnector) SyncFlowCleanup(jobName string) error {
}
return nil
}

// SendWALHeartbeat executes a single transaction that creates and then drops
// an ephemeral aggregate on the connector's pool. Any failure is returned
// wrapped as an "error bumping wal position" error.
func (c *PostgresConnector) SendWALHeartbeat() error {
	const heartbeatSQL = `
BEGIN;
DROP aggregate IF EXISTS PEERDB_EPHEMERAL_HEARTBEAT(float4);
CREATE AGGREGATE PEERDB_EPHEMERAL_HEARTBEAT(float4) (SFUNC = float4pl, STYPE = float4);
DROP aggregate PEERDB_EPHEMERAL_HEARTBEAT(float4);
END;
`
	if _, err := c.pool.Exec(c.ctx, heartbeatSQL); err != nil {
		return fmt.Errorf("error bumping wal position: %w", err)
	}
	return nil
}
5 changes: 4 additions & 1 deletion flow/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ require (
github.com/lib/pq v1.10.9
github.com/linkedin/goavro/v2 v2.12.0
github.com/microsoft/go-mssqldb v1.6.0
github.com/nikoksr/notify v0.41.0
github.com/orcaman/concurrent-map/v2 v2.0.1
github.com/sirupsen/logrus v1.9.3
github.com/snowflakedb/gosnowflake v1.7.1
Expand Down Expand Up @@ -54,6 +55,7 @@ require (
github.com/getsentry/sentry-go v0.25.0 // indirect
github.com/go-logr/logr v1.3.0 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/gorilla/websocket v1.5.0 // indirect
github.com/kr/pretty v0.3.1 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect
Expand All @@ -63,6 +65,7 @@ require (
github.com/prometheus/common v0.45.0 // indirect
github.com/prometheus/procfs v0.12.0 // indirect
github.com/rogpeppe/go-internal v1.11.0 // indirect
github.com/slack-go/slack v0.12.2 // indirect
github.com/ysmood/gop v0.2.0 // indirect
go.opentelemetry.io/otel v1.21.0 // indirect
go.opentelemetry.io/otel/metric v1.21.0 // indirect
Expand Down Expand Up @@ -146,7 +149,7 @@ require (
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
github.com/zeebo/xxh3 v1.0.2 // indirect
go.opencensus.io v0.24.0 // indirect
golang.org/x/crypto v0.16.0 // indirect
golang.org/x/crypto v0.16.0
golang.org/x/exp v0.0.0-20231206192017-f3f8817b8deb
golang.org/x/mod v0.14.0 // indirect
golang.org/x/net v0.19.0 // indirect
Expand Down
12 changes: 12 additions & 0 deletions flow/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre
github.com/go-sql-driver/mysql v1.6.0 h1:BCTh4TKNUYmOmMUcQ3IipzF5prigylS7XXjEkfCHuOE=
github.com/go-sql-driver/mysql v1.6.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg=
github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
github.com/go-test/deep v1.0.4 h1:u2CU3YKy9I2pmu9pX0eq50wCgjfGIt539SqR7FbHiho=
github.com/go-test/deep v1.0.4/go.mod h1:wGDj63lr65AM2AQyKZd/NYHGb0R+1RLqB8NKt3aSFNA=
github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
github.com/godbus/dbus v0.0.0-20190726142602-4481cbc300e2 h1:ZpnhV/YsD2/4cESfV5+Hoeu/iUR3ruzNvZ+yQfO03a0=
Expand Down Expand Up @@ -215,6 +217,7 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.7/go.mod h1:n+brtR0CgQNWTVd5ZUFpTBC8YFBDLK/h/bpaJ8/DtOE=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/martian/v3 v3.3.2 h1:IqNFLAmvJOgVlpdEBiQbDc2EwKW77amAycfTuWKdfvw=
Expand All @@ -229,6 +232,9 @@ github.com/googleapis/enterprise-certificate-proxy v0.3.2 h1:Vie5ybvEvT75RniqhfF
github.com/googleapis/enterprise-certificate-proxy v0.3.2/go.mod h1:VLSiSSBs/ksPL8kq3OBOQ6WRI2QnaFynd1DCjZ62+V0=
github.com/googleapis/gax-go/v2 v2.12.0 h1:A+gCJKdRfqXkr+BIRGtZLibNXf0m1f9E4HG56etFpas=
github.com/googleapis/gax-go/v2 v2.12.0/go.mod h1:y+aIqrI5eb1YGMVJfuV3185Ts/D7qKpsEkdD5+I6QGU=
github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc=
github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/grafana/pyroscope-go v1.0.4 h1:oyQX0BOkL+iARXzHuCdIF5TQ7/sRSel1YFViMHC7Bm0=
github.com/grafana/pyroscope-go v1.0.4/go.mod h1:0d7ftwSMBV/Awm7CCiYmHQEG8Y44Ma3YSjt+nWcWztY=
github.com/grafana/pyroscope-go/godeltaprof v0.1.5 h1:gkFVqihFRL1Nro2FCC0u6mW47jclef96Zu8I/ykq+4E=
Expand Down Expand Up @@ -271,6 +277,8 @@ github.com/jmoiron/sqlx v1.3.5 h1:vFFPA71p1o5gAeqtEAwLU4dnX2napprKtHr7PYIcN3g=
github.com/jmoiron/sqlx v1.3.5/go.mod h1:nRVWtLre0KfCLJvgxzCsLVMogSvQ1zNJtpYr2Ccp0mQ=
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
github.com/jordan-wright/email v4.0.1-0.20210109023952-943e75fe5223+incompatible h1:jdpOPRN1zP63Td1hDQbZW73xKmzDvZHzVdNYxhnTMDA=
github.com/jordan-wright/email v4.0.1-0.20210109023952-943e75fe5223+incompatible/go.mod h1:1c7szIrayyPPB/987hsnvNzLushdWf4o/79s3P08L8A=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/asmfmt v1.3.2 h1:4Ri7ox3EwapiOjCki+hw14RyKk201CN4rzyCJRFLpK4=
Expand Down Expand Up @@ -309,6 +317,8 @@ github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3/go.mod h1:RagcQ7I8Ie
github.com/mtibben/percent v0.2.1 h1:5gssi8Nqo8QU/r2pynCm+hBQHpkB/uNK7BJCFogWdzs=
github.com/mtibben/percent v0.2.1/go.mod h1:KG9uO+SZkUp+VkRHsCdYQV3XSZrrSpR3O9ibNBTZrns=
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
github.com/nikoksr/notify v0.41.0 h1:4LGE41GpWdHX5M3Xo6DlWRwS2WLDbOq1Rk7IzY4vjmQ=
github.com/nikoksr/notify v0.41.0/go.mod h1:FoE0UVPeopz1Vy5nm9vQZ+JVmYjEIjQgbFstbkw+cRE=
github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o=
github.com/orcaman/concurrent-map/v2 v2.0.1 h1:jOJ5Pg2w1oeB6PeDurIYf6k9PQ+aTITr/6lP/L/zp6c=
github.com/orcaman/concurrent-map/v2 v2.0.1/go.mod h1:9Eq3TG2oBe5FirmYWQfYO5iH1q0Jv47PLaNK++uCdOM=
Expand Down Expand Up @@ -349,6 +359,8 @@ github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQD
github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE=
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
github.com/slack-go/slack v0.12.2 h1:x3OppyMyGIbbiyFhsBmpf9pwkUzMhthJMRNmNlA4LaQ=
github.com/slack-go/slack v0.12.2/go.mod h1:hlGi5oXA+Gt+yWTPP0plCdRKmjsDxecdHxYQdlMQKOw=
github.com/snowflakedb/gosnowflake v1.7.1 h1:c9JjyjjDlvxex9ud71TwKL+Wu54Vfx+39h4DAwbIdqU=
github.com/snowflakedb/gosnowflake v1.7.1/go.mod h1:JI3eRZL8CpimPek6CJO0aTbDQjDGOt7Rxv9A/ti4f5c=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
Expand Down
120 changes: 120 additions & 0 deletions flow/utils/evervigil/ever_vigil.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
package evervigil

import (
"context"
"encoding/json"
"fmt"
"time"

"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgxpool"
"github.com/nikoksr/notify"
serprex marked this conversation as resolved.
Show resolved Hide resolved
"github.com/nikoksr/notify/service/slack"
"github.com/sirupsen/logrus"
heavycrystal marked this conversation as resolved.
Show resolved Hide resolved
)

// EverVigil is the alerting service. It sends alerts through notifier and
// uses the catalog pool to read the alerting config and to record alerts.
type EverVigil struct {
// notifier delivers alerts to the registered services.
notifier *notify.Notify
// catalogPool is the catalog connection pool handed to NewVigil.
catalogPool *pgxpool.Pool
}

// slackServiceConfig is the JSON shape of service_config for rows of
// peerdb_stats.alerting_config whose service_type is "slack".
type slackServiceConfig struct {
// AuthToken is the token passed to slack.New.
AuthToken string `json:"auth_token"`
// ChannelIDs are the channels added as receivers of every alert.
ChannelIDs []string `json:"channel_ids"`
}

// registerServicesForNotifier builds a notifier from the rows of
// peerdb_stats.alerting_config. Every "slack" row is registered as a Slack
// service with its configured channels as receivers.
//
// When no service is registered the returned notifier is marked Disabled.
// An error is returned when the config cannot be read, when a row holds a
// malformed service config, or when a row names an unknown service type; the
// returned notifier is nil in that case.
func registerServicesForNotifier(catalogPool *pgxpool.Pool) (*notify.Notify, error) {
	notifier := notify.New()

	rows, err := catalogPool.Query(context.Background(),
		"SELECT service_type,service_config FROM peerdb_stats.alerting_config")
	if err != nil {
		return nil, fmt.Errorf("failed to read everVigil config from catalog: %w", err)
	}

	registeredAtleastOneService := false
	var serviceType, serviceConfig string
	_, err = pgx.ForEachRow(rows, []any{&serviceType, &serviceConfig}, func() error {
		switch serviceType {
		case "slack":
			var cfg slackServiceConfig
			// Keep the error local to the callback instead of writing to the
			// enclosing function's err.
			if err := json.Unmarshal([]byte(serviceConfig), &cfg); err != nil {
				return fmt.Errorf("failed to unmarshal Slack service config: %w", err)
			}

			slackService := slack.New(cfg.AuthToken)
			slackService.AddReceivers(cfg.ChannelIDs...)
			notifier.UseServices(slackService)
			registeredAtleastOneService = true
		default:
			return fmt.Errorf("unknown service type: %s", serviceType)
		}
		return nil
	})
	// Row iteration and callback errors used to be dropped here, which hid
	// broken alerting config; surface them to the caller instead.
	if err != nil {
		return nil, fmt.Errorf("failed to register everVigil services: %w", err)
	}

	// vigil is currently useless, marking it as such
	if !registeredAtleastOneService {
		notifier.Disabled = true
	}
	return notifier, nil
}

func NewVigil(catalogPool *pgxpool.Pool) (*EverVigil, error) {
notifier, err := registerServicesForNotifier(catalogPool)
if err != nil {
return nil, err
}

return &EverVigil{
notifier: notifier,
catalogPool: catalogPool,
serprex marked this conversation as resolved.
Show resolved Hide resolved
}, nil
}

func (ev *EverVigil) Close() {
if ev.catalogPool != nil {
ev.catalogPool.Close()
}
Comment on lines +77 to +79
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Catalog pool should never be closed

}

// AlertIf sends alertMessage under alertKey to the registered services, but
// only if no alert with the same key has been recorded in
// peerdb_stats.alerts_v1 in the past 15 minutes. Every alert that is sent is
// recorded in that table. Failures are logged as warnings and never returned.
// It does nothing when the vigil has no catalog pool or no notifier.
func (ev *EverVigil) AlertIf(alertKey string, alertMessage string) {
	const alertCooldown = 15 * time.Minute

	if ev.catalogPool == nil || ev.notifier == nil {
		return
	}

	// try to make the vigil not useless if possible
	if ev.notifier.Disabled {
		notifier, err := registerServicesForNotifier(ev.catalogPool)
		if err != nil {
			logrus.Warnf("failed to register services for vigil: %v", err)
			return
		}
		// Only replace the notifier on success: overwriting it with the nil
		// returned alongside an error would disable alerting for good after a
		// single transient catalog failure.
		ev.notifier = notifier
	}

	var createdTimestamp time.Time
	err := ev.catalogPool.QueryRow(context.Background(),
		`SELECT created_timestamp FROM peerdb_stats.alerts_v1 WHERE alert_key=$1
		ORDER BY created_timestamp DESC LIMIT 1`,
		alertKey).Scan(&createdTimestamp)
	if err != nil && err != pgx.ErrNoRows {
		logrus.Warnf("failed to send alert: %v", err)
		return
	}

	// With no previous alert createdTimestamp stays at the zero time, which
	// is always outside the cooldown window.
	if time.Since(createdTimestamp) < alertCooldown {
		return
	}

	err = ev.notifier.Send(context.Background(),
		fmt.Sprintf(":rotating_light: *Alert Alert* :rotating_light:: %s since %s", alertKey,
			time.Now().Format("2006-01-02 15:04:05.999999")), alertMessage)
	if err != nil {
		logrus.Warnf("failed to send alert: %v", err)
		return
	}

	// The alert has already gone out, so a failed insert is not fatal, but it
	// does mean the next call will not be de-duplicated; make that visible.
	_, err = ev.catalogPool.Exec(context.Background(),
		"INSERT INTO peerdb_stats.alerts_v1(alert_key,alert_message) VALUES($1,$2)",
		alertKey, alertMessage)
	if err != nil {
		logrus.Warnf("failed to record alert %s in catalog: %v", alertKey, err)
	}
}
13 changes: 13 additions & 0 deletions nexus/catalog/migrations/V13__alerting_config_init.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
-- Alerting destinations; one row per configured notification service.
CREATE TABLE IF NOT EXISTS peerdb_stats.alerting_config (
id BIGINT PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY,
-- Kind of service; only 'slack' is accepted for now.
service_type TEXT NOT NULL CHECK (service_type IN ('slack')),
-- Service-specific settings; for 'slack' a JSON object with the keys
-- "auth_token" and "channel_ids".
service_config JSONB NOT NULL
);

-- Record of alerts that were sent. The newest row for an alert_key is used to
-- suppress repeats of the same alert.
CREATE TABLE IF NOT EXISTS peerdb_stats.alerts_v1 (
id BIGINT PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY,
-- Key identifying the alert condition.
alert_key TEXT NOT NULL,
-- Severity; only 'critical' exists for now and it is the default.
alert_level TEXT NOT NULL CHECK (alert_level IN ('critical')) DEFAULT 'critical',
alert_message TEXT NOT NULL,
-- Insertion time, stored without a time zone.
created_timestamp TIMESTAMP DEFAULT now()
);
Loading