Skip to content

Commit

Permalink
Merge pull request #49 from fly-apps/fence
Browse files Browse the repository at this point in the history
Fencing a failed primary
  • Loading branch information
davissp14 authored Jan 30, 2023
2 parents a3a90ee + fdc2d32 commit 1ab7d9d
Show file tree
Hide file tree
Showing 9 changed files with 768 additions and 63 deletions.
81 changes: 71 additions & 10 deletions cmd/event_handler/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ import (
"context"
"flag"
"fmt"
"os"
"strconv"
"time"

"github.com/fly-apps/postgres-flex/pkg/flypg"
)
Expand All @@ -21,27 +23,87 @@ func main() {
details := flag.String("details", "", "details")
flag.Parse()

fmt.Printf("Event: %s\n Node: %d\n Success: %s\n Details: %s\n",
*event, *nodeID, *success, *details)
eventDetails := fmt.Sprintf("%s - Event: %s\n Node: %d\n Success: %s\n Details: %s\n", time.Now().String(), *event, *nodeID, *success, *details)

// TODO - Use an actual logging framework instead of just writing strings to a file.
logFile, err := os.OpenFile("/data/event.log", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
if err != nil {
fmt.Printf("failed to open event log: %s", err)
}
defer logFile.Close()

logFile.WriteString(eventDetails)

switch *event {

case "repmgrd_failover_promote", "standby_promote":
// TODO - Need to figure out what to do when success == 0.
if err := reconfigurePGBouncer(*nodeID); err != nil {
fmt.Println(err.Error())
return

retry := 0
maxRetries := 5
success := false

for retry < maxRetries {
if err := reconfigurePGBouncer(*nodeID); err != nil {
errMsg := fmt.Sprintf("%s [%s] attempt: %d - failed to reconfigure pgbouncer: %s\n", *event, time.Now().String(), retry, err)
logFile.WriteString(errMsg)

retry++
time.Sleep(1 * time.Second)
continue
}

success = true
break
}

if success {
msg := fmt.Sprintf("%s [%s] Successfully reconfigured pgBouncer to %d\n", *event, time.Now().String(), *nodeID)
logFile.WriteString(msg)
os.Exit(0)
} else {
msg := fmt.Sprintf(" %s [%s] Failed ot reconfigured pgBouncer to %d\n", *event, time.Now().String(), *nodeID)
logFile.WriteString(msg)
os.Exit(1)
}

case "standby_follow":

newMemberID, err := strconv.Atoi(*newPrimary)
if err != nil {
fmt.Printf("failed to parse new member id: %s", err)
errMsg := fmt.Sprintf("failed to parse newMemberID %s: %s\n", *newPrimary, err)
logFile.WriteString(errMsg)
os.Exit(1)
}

if err := reconfigurePGBouncer(newMemberID); err != nil {
fmt.Println(err.Error())
return
retry := 0
maxRetries := 5
success := false

for retry < maxRetries {
if err := reconfigurePGBouncer(*&newMemberID); err != nil {
errMsg := fmt.Sprintf("%s [%s] attempt: %d - failed to reconfigure pgbouncer: %s\n", *event, time.Now().String(), retry, err)
logFile.WriteString(errMsg)

retry++
time.Sleep(1 * time.Second)
continue
}

success = true
break
}

if success {
msg := fmt.Sprintf("%s [%s] Successfully reconfigured pgBouncer to %d\n", *event, time.Now().String(), newMemberID)
logFile.WriteString(msg)
os.Exit(0)
} else {
msg := fmt.Sprintf(" %s [%s] Failed ot reconfigured pgBouncer to %d\n", *event, time.Now().String(), newMemberID)
logFile.WriteString(msg)
os.Exit(1)
}

default:
// noop
}
Expand All @@ -63,7 +125,6 @@ func reconfigurePGBouncer(id int) error {
return err
}

fmt.Println("Reconfiguring pgbouncer primary")
if err := node.PGBouncer.ConfigurePrimary(context.TODO(), member.Hostname, true); err != nil {
return fmt.Errorf("failed to reconfigure pgbouncer primary %s", err)
}
Expand Down
6 changes: 4 additions & 2 deletions cmd/start/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,20 @@ func main() {
node, err := flypg.NewNode()
if err != nil {
panicHandler(err)
return
}

ctx := context.Background()

if err = node.Init(ctx); err != nil {
panicHandler(err)
return
}

go func() {
t := time.NewTicker(1 * time.Second)
defer t.Stop()
for range t.C {

if err := node.PostInit(ctx); err != nil {
fmt.Printf("failed post-init: %s. Retrying...\n", err)
continue
Expand All @@ -44,7 +45,8 @@ func main() {
}()

svisor := supervisor.New("flypg", 5*time.Minute)
svisor.AddProcess("flypg", fmt.Sprintf("gosu postgres postgres -D %s -p %d", node.DataDir, node.Port))

svisor.AddProcess("postgres", fmt.Sprintf("gosu postgres postgres -D %s -p %d", node.DataDir, node.Port))

svisor.AddProcess("pgbouncer", fmt.Sprintf("pgbouncer %s/pgbouncer.ini", node.PGBouncer.ConfigPath),
supervisor.WithRestart(0, 1*time.Second),
Expand Down
12 changes: 11 additions & 1 deletion pkg/flycheck/role.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package flycheck

import (
"context"
"fmt"

"github.com/fly-apps/postgres-flex/pkg/flypg"
"github.com/pkg/errors"
Expand All @@ -26,14 +27,23 @@ func PostgreSQLRole(ctx context.Context, checks *check.CheckSuite) (*check.Check
}

checks.AddCheck("role", func() (string, error) {
if flypg.ZombieLockExists() {
return "zombie", fmt.Errorf("member is in a zombie state. see logs for more details")
}

member, err := node.RepMgr.Member(ctx, conn)
if err != nil {
return "failed", errors.Wrap(err, "failed to check role")
}

switch member.Role {
case flypg.PrimaryRoleName:
return "primary", nil
if member.Active {
return "primary", nil
} else {
return "zombie", nil
}

case flypg.StandbyRoleName:
return "replica", nil
default:
Expand Down
42 changes: 42 additions & 0 deletions pkg/flypg/admin/admin.go
Original file line number Diff line number Diff line change
Expand Up @@ -330,3 +330,45 @@ func GetSetting(ctx context.Context, pg *pgx.Conn, setting string) (*PGSetting,
}
return &out, nil
}

func SetReadOnly(ctx context.Context, conn *pgx.Conn) error {
databases, err := ListDatabases(ctx, conn)
if err != nil {
return err
}

for _, db := range databases {
if db.Name == "repmgr" || db.Name == "postgres" {
continue
}

sql := fmt.Sprintf("ALTER DATABASE %s set default_transaction_read_only = true;", db.Name)
_, err := conn.Exec(ctx, sql)
if err != nil {
return err
}
}

return nil
}

func UnsetReadOnly(ctx context.Context, conn *pgx.Conn) error {
databases, err := ListDatabases(ctx, conn)
if err != nil {
return err
}

for _, db := range databases {
if db.Name == "repmgr" || db.Name == "postgres" {
continue
}

sql := fmt.Sprintf("ALTER DATABASE %s set default_transaction_read_only = false;", db.Name)
_, err := conn.Exec(ctx, sql)
if err != nil {
return err
}
}

return nil
}
Loading

0 comments on commit 1ab7d9d

Please sign in to comment.