Normalize concurrently with sync flows (#893)
Previously, after each sync we'd pause reading the replication slot to process table schema deltas & normalize.
This had two problems:
1. we want to always be reading the slot, but we weren't reading it during normalize
2. merging multiple batches at once can be less expensive than merging each batch separately

Now NormalizeFlow is created as a child workflow at the start of the cdc flow,
and a signal carrying schema updates is sent to it after each sync flow.
NormalizeFlow consumes all signals received since it last checked,
merging their processing in parallel with sync flows (sketched below).
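
A rough sketch of this pattern with the Temporal Go SDK (not the actual PeerDB code: the workflow bodies, signal name, stop sentinel, and payload type are all illustrative):

```go
package cdcsketch

import "go.temporal.io/sdk/workflow"

// NormalizeSignal is an illustrative payload; the real signal also carries
// table schema deltas alongside the batch id.
type NormalizeSignal struct {
	SyncBatchID int64
}

// CDCFlow starts NormalizeFlow as a child workflow, then signals it after
// each sync so normalization runs concurrently with reading the slot.
func CDCFlow(ctx workflow.Context) error {
	childCtx := workflow.WithChildOptions(ctx, workflow.ChildWorkflowOptions{
		WorkflowID: "normalize-flow", // illustrative id
	})
	normFlow := workflow.ExecuteChildWorkflow(childCtx, NormalizeFlow)

	for batchID := int64(1); batchID <= 3; batchID++ {
		// ... sync flow for batchID runs here ...
		normFlow.SignalChildWorkflow(ctx, "normalize", NormalizeSignal{SyncBatchID: batchID})
	}
	// Illustrative shutdown: a negative batch id tells the child to finish.
	normFlow.SignalChildWorkflow(ctx, "normalize", NormalizeSignal{SyncBatchID: -1})
	return normFlow.Get(ctx, nil)
}

// NormalizeFlow blocks for one signal, then drains any that arrived while it
// was busy, so several synced batches are merged in a single normalize pass.
func NormalizeFlow(ctx workflow.Context) error {
	ch := workflow.GetSignalChannel(ctx, "normalize")
	for {
		var sig NormalizeSignal
		ch.Receive(ctx, &sig)
		stop := sig.SyncBatchID < 0
		maxBatch := sig.SyncBatchID
		var extra NormalizeSignal
		for ch.ReceiveAsync(&extra) {
			if extra.SyncBatchID < 0 {
				stop = true
			} else if extra.SyncBatchID > maxBatch {
				maxBatch = extra.SyncBatchID
			}
		}
		if maxBatch > 0 {
			// ... normalize everything up to maxBatch here ...
		}
		if stop {
			return nil
		}
	}
}
```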

NormalizeFlow only reads up to the signal's batch id, to avoid potentially normalizing a batch without its schema.
This creates a range `(normid..syncid]` in which normid is always catching up to syncid as we
normalize batches `normid+1` through `syncid` (see the toy example below).
The normalize logic already handled this range, so it goes untouched in this change.
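
A toy, runnable illustration in plain Go of how `normid` trails and catches up to `syncid` (the batch ids here are made up):

```go
package main

import "fmt"

func main() {
	normID := int64(0)
	// Batch ids as sync-flow signals might deliver them; normalize may
	// observe several at once if it was busy during earlier syncs.
	for _, syncID := range []int64{1, 3, 7} {
		if syncID > normID {
			// One normalize pass covers the half-open range (normID, syncID].
			fmt.Printf("normalize batches %d..%d\n", normID+1, syncID)
			normID = syncID // normID catches up to syncID
		}
	}
}
```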

`PEERDB_ENABLE_PARALLEL_SYNC_NORMALIZE` needs to be set to true:
for now this change is kept behind a feature flag to avoid potentially increasing data warehouse costs.
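
For illustration only, a process might gate the new behavior on the environment variable like this (the helper PeerDB actually uses to read settings may differ):

```go
package main

import (
	"fmt"
	"os"
	"strconv"
)

// parallelSyncNormalizeEnabled reports whether the feature flag is set to a
// truthy value; unset or unparsable values leave the old behavior in place.
func parallelSyncNormalizeEnabled() bool {
	v, err := strconv.ParseBool(os.Getenv("PEERDB_ENABLE_PARALLEL_SYNC_NORMALIZE"))
	return err == nil && v
}

func main() {
	fmt.Println("parallel sync/normalize:", parallelSyncNormalizeEnabled())
}
```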
serprex authored Jan 25, 2024
1 parent 251eaad commit 8adab3f
Showing 13 changed files with 290 additions and 222 deletions.
flow/activities/flowable.go (16 changes: 5 additions & 11 deletions)
@@ -198,7 +198,6 @@ func (a *FlowableActivity) StartFlow(ctx context.Context,
}
defer connectors.CloseConnector(dstConn)

activity.RecordHeartbeat(ctx, "initialized table schema")
slog.InfoContext(ctx, "pulling records...")
tblNameMapping := make(map[string]model.NameAndExclude)
for _, v := range input.FlowConnectionConfigs.TableMappings {
@@ -268,6 +267,7 @@ func (a *FlowableActivity) StartFlow(ctx context.Context,
}

return &model.SyncResponse{
CurrentSyncBatchID: -1,
TableSchemaDeltas: recordBatch.SchemaDeltas,
RelationMessageMapping: input.RelationMessageMapping,
}, nil
@@ -379,13 +379,8 @@ func (a *FlowableActivity) StartNormalize(
}
defer connectors.CloseConnector(dstConn)

lastSyncBatchID, err := dstConn.GetLastSyncBatchID(input.FlowConnectionConfigs.FlowJobName)
if err != nil {
return nil, fmt.Errorf("failed to get last sync batch ID: %w", err)
}

err = monitoring.UpdateEndTimeForCDCBatch(ctx, a.CatalogPool, input.FlowConnectionConfigs.FlowJobName,
lastSyncBatchID)
input.SyncBatchID)
return nil, err
} else if err != nil {
return nil, err
@@ -399,6 +394,7 @@

res, err := dstConn.NormalizeRecords(&model.NormalizeRecordsRequest{
FlowJobName: input.FlowConnectionConfigs.FlowJobName,
SyncBatchID: input.SyncBatchID,
SoftDelete: input.FlowConnectionConfigs.SoftDelete,
SoftDeleteColName: input.FlowConnectionConfigs.SoftDeleteColName,
SyncedAtColName: input.FlowConnectionConfigs.SyncedAtColName,
@@ -423,10 +419,8 @@
}

// log the number of batches normalized
if res != nil {
slog.InfoContext(ctx, fmt.Sprintf("normalized records from batch %d to batch %d",
res.StartBatchID, res.EndBatchID))
}
slog.InfoContext(ctx, fmt.Sprintf("normalized records from batch %d to batch %d",
res.StartBatchID, res.EndBatchID))

return res, nil
}
flow/connectors/bigquery/bigquery.go (48 changes: 21 additions & 27 deletions)
@@ -359,6 +359,7 @@ func (c *BigQueryConnector) GetLastSyncBatchID(jobName string) (int64, error) {
query := fmt.Sprintf("SELECT sync_batch_id FROM %s WHERE mirror_job_name = '%s'",
MirrorJobsTable, jobName)
q := c.client.Query(query)
q.DisableQueryCache = true
q.DefaultProjectID = c.projectID
q.DefaultDatasetID = c.datasetID
it, err := q.Read(c.ctx)
@@ -382,37 +383,30 @@ func (c *BigQueryConnector) GetLastSyncBatchID(jobName string) (int64, error) {
}
}

func (c *BigQueryConnector) GetLastSyncAndNormalizeBatchID(jobName string) (model.SyncAndNormalizeBatchID, error) {
query := fmt.Sprintf("SELECT sync_batch_id, normalize_batch_id FROM %s WHERE mirror_job_name = '%s'",
func (c *BigQueryConnector) GetLastNormalizeBatchID(jobName string) (int64, error) {
query := fmt.Sprintf("SELECT normalize_batch_id FROM %s WHERE mirror_job_name = '%s'",
MirrorJobsTable, jobName)
q := c.client.Query(query)
q.DisableQueryCache = true
q.DefaultProjectID = c.projectID
q.DefaultDatasetID = c.datasetID
it, err := q.Read(c.ctx)
if err != nil {
err = fmt.Errorf("failed to run query %s on BigQuery:\n %w", query, err)
return model.SyncAndNormalizeBatchID{}, err
return 0, err
}

var row []bigquery.Value
err = it.Next(&row)
if err != nil {
c.logger.Info("no row found for job")
return model.SyncAndNormalizeBatchID{}, nil
return 0, nil
}

syncBatchID := int64(0)
normBatchID := int64(0)
if row[0] != nil {
syncBatchID = row[0].(int64)
}
if row[1] != nil {
normBatchID = row[1].(int64)
return row[0].(int64), nil
}
return model.SyncAndNormalizeBatchID{
SyncBatchID: syncBatchID,
NormalizeBatchID: normBatchID,
}, nil
return 0, nil
}

func (c *BigQueryConnector) getDistinctTableNamesInBatch(flowJobName string, syncBatchID int64,
@@ -546,7 +540,7 @@ func (c *BigQueryConnector) syncRecordsViaAvro(
func (c *BigQueryConnector) NormalizeRecords(req *model.NormalizeRecordsRequest) (*model.NormalizeResponse, error) {
rawTableName := c.getRawTableName(req.FlowJobName)

batchIDs, err := c.GetLastSyncAndNormalizeBatchID(req.FlowJobName)
normBatchID, err := c.GetLastNormalizeBatchID(req.FlowJobName)
if err != nil {
return nil, fmt.Errorf("failed to get batch for the current mirror: %v", err)
}
@@ -557,27 +551,27 @@ func (c *BigQueryConnector) NormalizeRecords(req *model.NormalizeRecordsRequest)
}
// if job is not yet found in the peerdb_mirror_jobs_table
// OR sync is lagging end normalize
if !hasJob || batchIDs.NormalizeBatchID >= batchIDs.SyncBatchID {
if !hasJob || normBatchID >= req.SyncBatchID {
c.logger.Info("waiting for sync to catch up, so finishing")
return &model.NormalizeResponse{
Done: false,
StartBatchID: batchIDs.NormalizeBatchID,
EndBatchID: batchIDs.SyncBatchID,
StartBatchID: normBatchID,
EndBatchID: req.SyncBatchID,
}, nil
}
distinctTableNames, err := c.getDistinctTableNamesInBatch(
req.FlowJobName,
batchIDs.SyncBatchID,
batchIDs.NormalizeBatchID,
req.SyncBatchID,
normBatchID,
)
if err != nil {
return nil, fmt.Errorf("couldn't get distinct table names to normalize: %w", err)
}

tableNametoUnchangedToastCols, err := c.getTableNametoUnchangedCols(
req.FlowJobName,
batchIDs.SyncBatchID,
batchIDs.NormalizeBatchID,
req.SyncBatchID,
normBatchID,
)
if err != nil {
return nil, fmt.Errorf("couldn't get tablename to unchanged cols mapping: %w", err)
@@ -599,8 +593,8 @@ func (c *BigQueryConnector) NormalizeRecords(req *model.NormalizeRecordsRequest)
dstTableName: tableName,
dstDatasetTable: dstDatasetTable,
normalizedTableSchema: req.TableNameSchemaMapping[tableName],
syncBatchID: batchIDs.SyncBatchID,
normalizeBatchID: batchIDs.NormalizeBatchID,
syncBatchID: req.SyncBatchID,
normalizeBatchID: normBatchID,
peerdbCols: &protos.PeerDBColumns{
SoftDeleteColName: req.SoftDeleteColName,
SyncedAtColName: req.SyncedAtColName,
@@ -625,7 +619,7 @@ func (c *BigQueryConnector) NormalizeRecords(req *model.NormalizeRecordsRequest)
// update metadata to make the last normalized batch id to the recent last sync batch id.
updateMetadataStmt := fmt.Sprintf(
"UPDATE %s SET normalize_batch_id=%d WHERE mirror_job_name='%s';",
MirrorJobsTable, batchIDs.SyncBatchID, req.FlowJobName)
MirrorJobsTable, req.SyncBatchID, req.FlowJobName)

query := c.client.Query(updateMetadataStmt)
query.DefaultProjectID = c.projectID
@@ -637,8 +631,8 @@

return &model.NormalizeResponse{
Done: true,
StartBatchID: batchIDs.NormalizeBatchID + 1,
EndBatchID: batchIDs.SyncBatchID,
StartBatchID: normBatchID + 1,
EndBatchID: req.SyncBatchID,
}, nil
}

flow/connectors/postgres/client.go (27 changes: 12 additions & 15 deletions)
@@ -38,11 +38,11 @@ const (
createRawTableBatchIDIndexSQL = "CREATE INDEX IF NOT EXISTS %s_batchid_idx ON %s.%s(_peerdb_batch_id)"
createRawTableDstTableIndexSQL = "CREATE INDEX IF NOT EXISTS %s_dst_table_idx ON %s.%s(_peerdb_destination_table_name)"

getLastOffsetSQL = "SELECT lsn_offset FROM %s.%s WHERE mirror_job_name=$1"
setLastOffsetSQL = "UPDATE %s.%s SET lsn_offset=GREATEST(lsn_offset, $1) WHERE mirror_job_name=$2"
getLastSyncBatchID_SQL = "SELECT sync_batch_id FROM %s.%s WHERE mirror_job_name=$1"
getLastSyncAndNormalizeBatchID_SQL = "SELECT sync_batch_id,normalize_batch_id FROM %s.%s WHERE mirror_job_name=$1"
createNormalizedTableSQL = "CREATE TABLE IF NOT EXISTS %s(%s)"
getLastOffsetSQL = "SELECT lsn_offset FROM %s.%s WHERE mirror_job_name=$1"
setLastOffsetSQL = "UPDATE %s.%s SET lsn_offset=GREATEST(lsn_offset, $1) WHERE mirror_job_name=$2"
getLastSyncBatchID_SQL = "SELECT sync_batch_id FROM %s.%s WHERE mirror_job_name=$1"
getLastNormalizeBatchID_SQL = "SELECT normalize_batch_id FROM %s.%s WHERE mirror_job_name=$1"
createNormalizedTableSQL = "CREATE TABLE IF NOT EXISTS %s(%s)"

upsertJobMetadataForSyncSQL = `INSERT INTO %s.%s AS j VALUES ($1,$2,$3,$4)
ON CONFLICT(mirror_job_name) DO UPDATE SET lsn_offset=GREATEST(j.lsn_offset, EXCLUDED.lsn_offset), sync_batch_id=EXCLUDED.sync_batch_id`
@@ -471,24 +471,21 @@ func (c *PostgresConnector) GetLastSyncBatchID(jobName string) (int64, error) {
return result.Int64, nil
}

func (c *PostgresConnector) GetLastSyncAndNormalizeBatchID(jobName string) (*model.SyncAndNormalizeBatchID, error) {
var syncResult, normalizeResult pgtype.Int8
func (c *PostgresConnector) GetLastNormalizeBatchID(jobName string) (int64, error) {
var result pgtype.Int8
err := c.pool.QueryRow(c.ctx, fmt.Sprintf(
getLastSyncAndNormalizeBatchID_SQL,
getLastNormalizeBatchID_SQL,
c.metadataSchema,
mirrorJobsTableIdentifier,
), jobName).Scan(&syncResult, &normalizeResult)
), jobName).Scan(&result)
if err != nil {
if err == pgx.ErrNoRows {
c.logger.Info("No row found, returning 0")
return &model.SyncAndNormalizeBatchID{}, nil
return 0, nil
}
return nil, fmt.Errorf("error while reading result row: %w", err)
return 0, fmt.Errorf("error while reading result row: %w", err)
}
return &model.SyncAndNormalizeBatchID{
SyncBatchID: syncResult.Int64,
NormalizeBatchID: normalizeResult.Int64,
}, nil
return result.Int64, nil
}

func (c *PostgresConnector) jobMetadataExists(jobName string) (bool, error) {
flow/connectors/postgres/postgres.go (26 changes: 14 additions & 12 deletions)
@@ -431,28 +431,29 @@ func (c *PostgresConnector) NormalizeRecords(req *model.NormalizeRecordsRequest)
}, nil
}

batchIDs, err := c.GetLastSyncAndNormalizeBatchID(req.FlowJobName)
normBatchID, err := c.GetLastNormalizeBatchID(req.FlowJobName)
if err != nil {
return nil, fmt.Errorf("failed to get batch for the current mirror: %v", err)
}

// normalize has caught up with sync, chill until more records are loaded.
if batchIDs.NormalizeBatchID >= batchIDs.SyncBatchID {
if normBatchID >= req.SyncBatchID {
c.logger.Info(fmt.Sprintf("no records to normalize: syncBatchID %d, normalizeBatchID %d",
batchIDs.SyncBatchID, batchIDs.NormalizeBatchID))
req.SyncBatchID, normBatchID))
return &model.NormalizeResponse{
Done: false,
StartBatchID: batchIDs.NormalizeBatchID,
EndBatchID: batchIDs.SyncBatchID,
StartBatchID: normBatchID,
EndBatchID: req.SyncBatchID,
}, nil
}

destinationTableNames, err := c.getDistinctTableNamesInBatch(
req.FlowJobName, batchIDs.SyncBatchID, batchIDs.NormalizeBatchID)
req.FlowJobName, req.SyncBatchID, normBatchID)
if err != nil {
return nil, err
}
unchangedToastColsMap, err := c.getTableNametoUnchangedCols(req.FlowJobName,
batchIDs.SyncBatchID, batchIDs.NormalizeBatchID)
req.SyncBatchID, normBatchID)
if err != nil {
return nil, err
}
@@ -491,7 +492,7 @@ func (c *PostgresConnector) NormalizeRecords(req *model.NormalizeRecordsRequest)
}
normalizeStatements := normalizeStmtGen.generateNormalizeStatements()
for _, normalizeStatement := range normalizeStatements {
mergeStatementsBatch.Queue(normalizeStatement, batchIDs.NormalizeBatchID, batchIDs.SyncBatchID, destinationTableName).Exec(
mergeStatementsBatch.Queue(normalizeStatement, normBatchID, req.SyncBatchID, destinationTableName).Exec(
func(ct pgconn.CommandTag) error {
totalRowsAffected += int(ct.RowsAffected())
return nil
@@ -508,7 +509,7 @@ func (c *PostgresConnector) NormalizeRecords(req *model.NormalizeRecordsRequest)
c.logger.Info(fmt.Sprintf("normalized %d records", totalRowsAffected))

// updating metadata with new normalizeBatchID
err = c.updateNormalizeMetadata(req.FlowJobName, batchIDs.SyncBatchID, normalizeRecordsTx)
err = c.updateNormalizeMetadata(req.FlowJobName, req.SyncBatchID, normalizeRecordsTx)
if err != nil {
return nil, err
}
@@ -520,8 +521,8 @@ func (c *PostgresConnector) NormalizeRecords(req *model.NormalizeRecordsRequest)

return &model.NormalizeResponse{
Done: true,
StartBatchID: batchIDs.NormalizeBatchID + 1,
EndBatchID: batchIDs.SyncBatchID,
StartBatchID: normBatchID + 1,
EndBatchID: req.SyncBatchID,
}, nil
}

@@ -711,7 +712,8 @@ func (c *PostgresConnector) SetupNormalizedTables(req *protos.SetupNormalizedTab

// ReplayTableSchemaDelta changes a destination table to match the schema at source
// This could involve adding or dropping multiple columns.
func (c *PostgresConnector) ReplayTableSchemaDeltas(flowJobName string,
func (c *PostgresConnector) ReplayTableSchemaDeltas(
flowJobName string,
schemaDeltas []*protos.TableSchemaDelta,
) error {
if len(schemaDeltas) == 0 {