From e112156830a6f1ffd4ac156b34e026b54e9e685b Mon Sep 17 00:00:00 2001 From: Pankaj B Date: Thu, 18 Jan 2024 19:47:57 +0530 Subject: [PATCH 01/36] initial cdc code --- flow/connectors/clickhouse/cdc.go | 560 +++++++++++++++++++++++ flow/connectors/clickhouse/clickhouse.go | 2 + 2 files changed, 562 insertions(+) create mode 100644 flow/connectors/clickhouse/cdc.go diff --git a/flow/connectors/clickhouse/cdc.go b/flow/connectors/clickhouse/cdc.go new file mode 100644 index 0000000000..6c286e01a5 --- /dev/null +++ b/flow/connectors/clickhouse/cdc.go @@ -0,0 +1,560 @@ +package connclickhouse + +import ( + "database/sql" + "fmt" + "log/slog" + "regexp" + "strings" + "sync/atomic" + "time" + + _ "github.com/ClickHouse/clickhouse-go/v2" + _ "github.com/ClickHouse/clickhouse-go/v2/lib/driver" + "github.com/PeerDB-io/peer-flow/generated/protos" + "github.com/PeerDB-io/peer-flow/model/qvalue" +) + +const ( + checkIfTableExistsSQL = `SELECT exists(SELECT 1 FROM system.tables WHERE database = ? AND name = ?) AS table_exists;` + mirrorJobsTableIdentifier = "PEERDB_MIRROR_JOBS" +) + +// getRawTableName returns the raw table name for the given table identifier. +func (c *ClickhouseConnector) getRawTableName(flowJobName string) string { + // replace all non-alphanumeric characters with _ + flowJobName = regexp.MustCompile("[^a-zA-Z0-9]+").ReplaceAllString(flowJobName, "_") + return fmt.Sprintf("_peerdb_raw_%s", flowJobName) +} + +func (c *ClickhouseConnector) checkIfTableExists(databaseName string, tableIdentifier string) (bool, error) { + var result pgtype.Bool + err := c.database.QueryRowContext(c.ctx, checkIfTableExistsSQL, databaseName, tableIdentifier).Scan(&result) + if err != nil { + return false, fmt.Errorf("error while reading result row: %w", err) + } + fmt.Printf("result: %+v\n", result) + return result.Bool, nil +} + +type MirrorJobRow struct { + MirrorJobName string + Offset int + SyncBatchID int + NormalizeBatchID int +} + +func (c *ClickhouseConnector) getMirrorRowByJobNAme(jobName string) (*MirrorJobRow, error) { + getLastOffsetSQL := "SELECT mirror_job_name, offset, sync_batch_id, normalize_batch_id FROM %s WHERE MIRROR_JOB_NAME=? 
Limit 1" + + row := c.database.QueryRowContext(c.ctx, fmt.Sprintf(getLastOffsetSQL, mirrorJobsTableIdentifier), jobName) + + var result MirrorJobRow + + err := row.Scan( + &result.MirrorJobName, + &result.Offset, + &result.SyncBatchID, + &result.NormalizeBatchID, + ) + + if err != nil { + return nil, err + } + + return &result, nil +} + +func (c *ClickhouseConnector) NeedsSetupMetadataTables() bool { + result, err := c.checkIfTableExists(c.config.Database, mirrorJobsTableIdentifier) + if err != nil { + return true + } + return !result +} + +func (c *ClickhouseConnector) SetupMetadataTables() error { + + createMirrorJobsTableSQL := `CREATE TABLE IF NOT EXISTS %s ( + MIRROR_JOB_NAME String NOT NULL, + OFFSET Int32 NOT NULL, + SYNC_BATCH_ID Int32 NOT NULL, + NORMALIZE_BATCH_ID Int32 NOT NULL + ) ENGINE = MergeTree() + ORDER BY MIRROR_JOB_NAME;` + + // NOTE that Clickhouse does not support transactional DDL + //createMetadataTablesTx, err := c.database.BeginTx(c.ctx, nil) + // if err != nil { + // return fmt.Errorf("unable to begin transaction for creating metadata tables: %w", err) + // } + // in case we return after error, ensure transaction is rolled back + // defer func() { + // deferErr := createMetadataTablesTx.Rollback() + // if deferErr != sql.ErrTxDone && deferErr != nil { + // c.logger.Error("error while rolling back transaction for creating metadata tables", + // slog.Any("error", deferErr)) + // } + // }() + + // Not needed as we dont have schema + // err = c.createPeerDBInternalSchema(createMetadataTablesTx) + // if err != nil { + // return err + // } + _, err := c.database.ExecContext(c.ctx, fmt.Sprintf(createMirrorJobsTableSQL, mirrorJobsTableIdentifier)) + if err != nil { + return fmt.Errorf("error while setting up mirror jobs table: %w", err) + } + // err = createMetadataTablesTx.Commit() + // if err != nil { + // return fmt.Errorf("unable to commit transaction for creating metadata tables: %w", err) + // } + + return nil +} + +func (c *ClickhouseConnector) GetLastOffset(jobName string) (int64, error) { + getLastOffsetSQL := "SELECT OFFSET FROM %s WHERE MIRROR_JOB_NAME=?" + + rows, err := c.database.QueryContext(c.ctx, fmt.Sprintf(getLastOffsetSQL, + mirrorJobsTableIdentifier), jobName) + if err != nil { + return 0, fmt.Errorf("error querying Clickhouse peer for last syncedID: %w", err) + } + defer func() { + err = rows.Close() + if err != nil { + c.logger.Error("error while closing rows for reading last offset", slog.Any("error", err)) + } + }() + + if !rows.Next() { + c.logger.Warn("No row found, returning 0") + return 0, nil + } + var result pgtype.Int8 + err = rows.Scan(&result) + if err != nil { + return 0, fmt.Errorf("error while reading result row: %w", err) + } + if result.Int64 == 0 { + c.logger.Warn("Assuming zero offset means no sync has happened") + return 0, nil + } + return result.Int64, nil +} + +func (c *ClickhouseConnector) SetLastOffset(jobName string, lastOffset int64) error { + currentRow, err := c.getMirrorRowByJobNAme(jobName) + + if err != nil { + return err + } + + //setLastOffsetSQL = "UPDATE %s.%s SET OFFSET=GREATEST(OFFSET, ?) WHERE MIRROR_JOB_NAME=?" 
+ setLastOffsetSQL := `INSERT INTO %s + (mirror_job_name, offset, sync_batch_id, normalize_batch_id) + VALUES (?, ?, ?, ?);` + _, err = c.database.ExecContext(c.ctx, fmt.Sprintf(setLastOffsetSQL, + mirrorJobsTableIdentifier), currentRow.MirrorJobName, lastOffset, currentRow.SyncBatchID, currentRow.NormalizeBatchID) + if err != nil { + return fmt.Errorf("error querying Snowflake peer for last syncedID: %w", err) + } + return nil +} + +func (c *ClickhouseConnector) GetLastSyncBatchID(jobName string) (int64, error) { + getLastSyncBatchID_SQL := "SELECT SYNC_BATCH_ID FROM %s WHERE MIRROR_JOB_NAME=?" + + rows, err := c.database.QueryContext(c.ctx, fmt.Sprintf(getLastSyncBatchID_SQL, + mirrorJobsTableIdentifier), jobName) + if err != nil { + return 0, fmt.Errorf("error querying Clickhouse peer for last syncBatchId: %w", err) + } + defer rows.Close() + + var result pgtype.Int8 + if !rows.Next() { + c.logger.Warn("No row found, returning 0") + return 0, nil + } + err = rows.Scan(&result) + if err != nil { + return 0, fmt.Errorf("error while reading result row: %w", err) + } + return result.Int64, nil +} + +func (c *ClickhouseConnector) CreateRawTable(req *protos.CreateRawTableInput) (*protos.CreateRawTableOutput, error) { + rawTableName := c.getRawTableName(req.FlowJobName) + + // createRawTableTx, err := c.database.BeginTx(c.ctx, nil) + // if err != nil { + // return nil, fmt.Errorf("unable to begin transaction for creation of raw table: %w", err) + // } + + createRawTableSQL := `CREATE TABLE IF NOT EXISTS %s ( + _PEERDB_UID STRING NOT NULL, + _PEERDB_TIMESTAMP INT NOT NULL, + _PEERDB_DESTINATION_TABLE_NAME STRING NOT NULL, + _PEERDB_DATA STRING NOT NULL, + _PEERDB_RECORD_TYPE INTEGER NOT NULL, + _PEERDB_MATCH_DATA STRING, + _PEERDB_BATCH_ID INT, + _PEERDB_UNCHANGED_TOAST_COLUMNS STRING + ) ENGINE = ReplacingMergeTree ORDER BY _PEERDB_UID;` + + _, err := c.database.ExecContext(c.ctx, + fmt.Sprintf(createRawTableSQL, rawTableName)) + if err != nil { + return nil, fmt.Errorf("unable to create raw table: %w", err) + } + // err = createRawTableTx.Commit() + // if err != nil { + // return nil, fmt.Errorf("unable to commit transaction for creation of raw table: %w", err) + // } + + stage := c.getStageNameForJob(req.FlowJobName) + err = c.createStage(stage, &protos.QRepConfig{}) + if err != nil { + return nil, err + } + + return &protos.CreateRawTableOutput{ + TableIdentifier: rawTableName, + }, nil +} + +func (c *ClickhouseConnector) syncRecordsViaAvro( + req *model.SyncRecordsRequest, + rawTableIdentifier string, + syncBatchID int64, +) (*model.SyncResponse, error) { + tableNameRowsMapping := make(map[string]uint32) + streamReq := model.NewRecordsToStreamRequest(req.Records.GetRecords(), tableNameRowsMapping, syncBatchID) + streamRes, err := utils.RecordsToRawTableStream(streamReq) + if err != nil { + return nil, fmt.Errorf("failed to convert records to raw table stream: %w", err) + } + + qrepConfig := &protos.QRepConfig{ + StagingPath: "", + FlowJobName: req.FlowJobName, + DestinationTableIdentifier: strings.ToLower(fmt.Sprintf("%s", + rawTableIdentifier)), + } + avroSyncer := NewSnowflakeAvroSyncMethod(qrepConfig, c) + destinationTableSchema, err := c.getTableSchema(qrepConfig.DestinationTableIdentifier) + if err != nil { + return nil, err + } + + numRecords, err := avroSyncer.SyncRecords(destinationTableSchema, streamRes.Stream, req.FlowJobName) + if err != nil { + return nil, err + } + + tableSchemaDeltas := req.Records.WaitForSchemaDeltas(req.TableMappings) + err = 
c.ReplayTableSchemaDeltas(req.FlowJobName, tableSchemaDeltas) + if err != nil { + return nil, fmt.Errorf("failed to sync schema changes: %w", err) + } + + lastCheckpoint, err := req.Records.GetLastCheckpoint() + if err != nil { + return nil, err + } + + return &model.SyncResponse{ + LastSyncedCheckPointID: lastCheckpoint, + NumRecordsSynced: int64(numRecords), + CurrentSyncBatchID: syncBatchID, + TableNameRowsMapping: tableNameRowsMapping, + TableSchemaDeltas: tableSchemaDeltas, + RelationMessageMapping: <-req.Records.RelationMessageMapping, + }, nil +} + +func (c *ClickhouseConnector) SyncRecords(req *model.SyncRecordsRequest) (*model.SyncResponse, error) { + rawTableName := getRawTableName(req.FlowJobName) + c.logger.Info(fmt.Sprintf("pushing records to Snowflake table %s", rawTableName)) + + syncBatchID, err := c.GetLastSyncBatchID(req.FlowJobName) + if err != nil { + return nil, fmt.Errorf("failed to get previous syncBatchID: %w", err) + } + syncBatchID += 1 + + res, err := c.syncRecordsViaAvro(req, rawTableName, syncBatchID) + if err != nil { + return nil, err + } + + // transaction for SyncRecords + syncRecordsTx, err := c.database.BeginTx(c.ctx, nil) + if err != nil { + return nil, err + } + // in case we return after error, ensure transaction is rolled back + defer func() { + deferErr := syncRecordsTx.Rollback() + if deferErr != sql.ErrTxDone && deferErr != nil { + c.logger.Error("error while rolling back transaction for SyncRecords: %v", + slog.Any("error", deferErr), slog.Int64("syncBatchID", syncBatchID)) + } + }() + + // updating metadata with new offset and syncBatchID + err = c.updateSyncMetadata(req.FlowJobName, res.LastSyncedCheckPointID, syncBatchID, syncRecordsTx) + if err != nil { + return nil, err + } + // transaction commits + err = syncRecordsTx.Commit() + if err != nil { + return nil, err + } + + return res, nil +} + +func (c *ClickhouseConnector) SyncFlowCleanup(jobName string) error { + syncFlowCleanupTx, err := c.database.BeginTx(c.ctx, nil) + if err != nil { + return fmt.Errorf("unable to begin transaction for sync flow cleanup: %w", err) + } + defer func() { + deferErr := syncFlowCleanupTx.Rollback() + if deferErr != sql.ErrTxDone && deferErr != nil { + c.logger.Error("error while rolling back transaction for flow cleanup", slog.Any("error", deferErr)) + } + }() + + row := syncFlowCleanupTx.QueryRowContext(c.ctx, checkSchemaExistsSQL, c.metadataSchema) + var schemaExists pgtype.Bool + err = row.Scan(&schemaExists) + if err != nil { + return fmt.Errorf("unable to check if internal schema exists: %w", err) + } + + if schemaExists.Bool { + _, err = syncFlowCleanupTx.ExecContext(c.ctx, fmt.Sprintf(dropTableIfExistsSQL, c.metadataSchema, + getRawTableIdentifier(jobName))) + if err != nil { + return fmt.Errorf("unable to drop raw table: %w", err) + } + _, err = syncFlowCleanupTx.ExecContext(c.ctx, + fmt.Sprintf(deleteJobMetadataSQL, c.metadataSchema, mirrorJobsTableIdentifier), jobName) + if err != nil { + return fmt.Errorf("unable to delete job metadata: %w", err) + } + } + + err = syncFlowCleanupTx.Commit() + if err != nil { + return fmt.Errorf("unable to commit transaction for sync flow cleanup: %w", err) + } + + err = c.dropStage("", jobName) + if err != nil { + return err + } + + return nil +} + +func (c *ClickhouseConnector) SetupNormalizedTables( + req *protos.SetupNormalizedTableBatchInput, +) (*protos.SetupNormalizedTableBatchOutput, error) { + tableExistsMapping := make(map[string]bool) + for tableIdentifier, tableSchema := range req.TableNameSchemaMapping { 
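+		// ClickHouse DDL is not transactional (see the note in
+		// SetupMetadataTables above), so each table is checked and created
+		// independently; a failure partway through leaves the tables created
+		// so far in place rather than rolling them back.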
+ normalizedSchemaTable, err := utils.ParseSchemaTable(tableIdentifier) + if err != nil { + return nil, fmt.Errorf("error while parsing table schema and name: %w", err) + } + tableAlreadyExists, err := c.checkIfTableExists(normalizedSchemaTable.Schema, normalizedSchemaTable.Table) + if err != nil { + return nil, fmt.Errorf("error occurred while checking if normalized table exists: %w", err) + } + if tableAlreadyExists { + tableExistsMapping[tableIdentifier] = true + continue + } + + normalizedTableCreateSQL := generateCreateTableSQLForNormalizedTable( + normalizedSchemaTable, tableSchema, req.SoftDeleteColName, req.SyncedAtColName) + _, err = c.database.ExecContext(c.ctx, normalizedTableCreateSQL) + if err != nil { + return nil, fmt.Errorf("[sf] error while creating normalized table: %w", err) + } + tableExistsMapping[tableIdentifier] = false + } + + return &protos.SetupNormalizedTableBatchOutput{ + TableExistsMapping: tableExistsMapping, + }, nil +} + +// ReplayTableSchemaDeltas changes a destination table to match the schema at source +// This could involve adding or dropping multiple columns. +func (c *ClickhouseConnector) ReplayTableSchemaDeltas(flowJobName string, + schemaDeltas []*protos.TableSchemaDelta, +) error { + if len(schemaDeltas) == 0 { + return nil + } + + tableSchemaModifyTx, err := c.database.Begin() + if err != nil { + return fmt.Errorf("error starting transaction for schema modification: %w", + err) + } + defer func() { + deferErr := tableSchemaModifyTx.Rollback() + if deferErr != sql.ErrTxDone && deferErr != nil { + c.logger.Error("error rolling back transaction for table schema modification", slog.Any("error", deferErr)) + } + }() + + for _, schemaDelta := range schemaDeltas { + if schemaDelta == nil || len(schemaDelta.AddedColumns) == 0 { + continue + } + + for _, addedColumn := range schemaDelta.AddedColumns { + sfColtype, err := qValueKindToSnowflakeType(qvalue.QValueKind(addedColumn.ColumnType)) + if err != nil { + return fmt.Errorf("failed to convert column type %s to snowflake type: %w", + addedColumn.ColumnType, err) + } + _, err = tableSchemaModifyTx.ExecContext(c.ctx, + fmt.Sprintf("ALTER TABLE %s ADD COLUMN IF NOT EXISTS \"%s\" %s", + schemaDelta.DstTableName, strings.ToUpper(addedColumn.ColumnName), sfColtype)) + if err != nil { + return fmt.Errorf("failed to add column %s for table %s: %w", addedColumn.ColumnName, + schemaDelta.DstTableName, err) + } + c.logger.Info(fmt.Sprintf("[schema delta replay] added column %s with data type %s", addedColumn.ColumnName, + addedColumn.ColumnType), + slog.String("destination table name", schemaDelta.DstTableName), + slog.String("source table name", schemaDelta.SrcTableName)) + } + } + + err = tableSchemaModifyTx.Commit() + if err != nil { + return fmt.Errorf("failed to commit transaction for table schema modification: %w", + err) + } + + return nil +} + +func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsRequest) (*model.NormalizeResponse, error) { + batchIDs, err := c.GetLastSyncAndNormalizeBatchID(req.FlowJobName) + if err != nil { + return nil, err + } + // normalize has caught up with sync, chill until more records are loaded. + if batchIDs.NormalizeBatchID >= batchIDs.SyncBatchID { + return &model.NormalizeResponse{ + Done: false, + StartBatchID: batchIDs.NormalizeBatchID, + EndBatchID: batchIDs.SyncBatchID, + }, nil + } + + jobMetadataExists, err := c.jobMetadataExists(req.FlowJobName) + if err != nil { + return nil, err + } + // sync hasn't created job metadata yet, chill. 
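+	// (Done=false with a nil error just means "nothing to normalize yet", not
+	// a failure; normalize runs again once sync has written the job metadata.)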
+ if !jobMetadataExists { + return &model.NormalizeResponse{ + Done: false, + }, nil + } + destinationTableNames, err := c.getDistinctTableNamesInBatch( + req.FlowJobName, + batchIDs.SyncBatchID, + batchIDs.NormalizeBatchID, + ) + if err != nil { + return nil, err + } + + tableNametoUnchangedToastCols, err := c.getTableNametoUnchangedCols(req.FlowJobName, batchIDs.SyncBatchID, batchIDs.NormalizeBatchID) + if err != nil { + return nil, fmt.Errorf("couldn't tablename to unchanged cols mapping: %w", err) + } + + var totalRowsAffected int64 = 0 + g, gCtx := errgroup.WithContext(c.ctx) + g.SetLimit(8) // limit parallel merges to 8 + + for _, destinationTableName := range destinationTableNames { + tableName := destinationTableName // local variable for the closure + + g.Go(func() error { + mergeGen := &mergeStmtGenerator{ + rawTableName: getRawTableIdentifier(req.FlowJobName), + dstTableName: tableName, + syncBatchID: batchIDs.SyncBatchID, + normalizeBatchID: batchIDs.NormalizeBatchID, + normalizedTableSchema: req.TableNameSchemaMapping[tableName], + unchangedToastColumns: tableNametoUnchangedToastCols[tableName], + peerdbCols: &protos.PeerDBColumns{ + SoftDelete: req.SoftDelete, + SoftDeleteColName: req.SoftDeleteColName, + SyncedAtColName: req.SyncedAtColName, + }, + } + mergeStatement, err := mergeGen.generateMergeStmt() + if err != nil { + return err + } + + startTime := time.Now() + c.logger.Info("[merge] merging records...", slog.String("destTable", tableName)) + + result, err := c.database.ExecContext(gCtx, mergeStatement, tableName) + if err != nil { + return fmt.Errorf("failed to merge records into %s (statement: %s): %w", + tableName, mergeStatement, err) + } + + endTime := time.Now() + c.logger.Info(fmt.Sprintf("[merge] merged records into %s, took: %d seconds", + tableName, endTime.Sub(startTime)/time.Second)) + if err != nil { + c.logger.Error("[merge] error while normalizing records", slog.Any("error", err)) + return err + } + + rowsAffected, err := result.RowsAffected() + if err != nil { + return fmt.Errorf("failed to get rows affected by merge statement for table %s: %w", tableName, err) + } + + atomic.AddInt64(&totalRowsAffected, rowsAffected) + return nil + }) + } + + if err := g.Wait(); err != nil { + return nil, fmt.Errorf("error while normalizing records: %w", err) + } + + // updating metadata with new normalizeBatchID + err = c.updateNormalizeMetadata(req.FlowJobName, batchIDs.SyncBatchID) + if err != nil { + return nil, err + } + + return &model.NormalizeResponse{ + Done: true, + StartBatchID: batchIDs.NormalizeBatchID + 1, + EndBatchID: batchIDs.SyncBatchID, + }, nil +} diff --git a/flow/connectors/clickhouse/clickhouse.go b/flow/connectors/clickhouse/clickhouse.go index 6b3740935a..834f10ffc5 100644 --- a/flow/connectors/clickhouse/clickhouse.go +++ b/flow/connectors/clickhouse/clickhouse.go @@ -18,6 +18,7 @@ type ClickhouseConnector struct { database *sql.DB tableSchemaMapping map[string]*protos.TableSchema logger slog.Logger + config *protos.ClickhouseConfig } func NewClickhouseConnector(ctx context.Context, @@ -34,6 +35,7 @@ func NewClickhouseConnector(ctx context.Context, database: database, tableSchemaMapping: nil, logger: *slog.With(slog.String(string(shared.FlowNameKey), flowName)), + config: clickhouseProtoConfig, }, nil } From 9a0e5a05969383c514cae1f61878f83bf1414556 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Thu, 18 Jan 2024 10:13:44 -0500 Subject: [PATCH 02/36] add generation of normalize schema (#1097) --- flow/connectors/clickhouse/cdc.go | 145 
+----------- flow/connectors/clickhouse/normalize.go | 219 +++++++++++++++++++ flow/connectors/clickhouse/qvalue_convert.go | 9 + 3 files changed, 231 insertions(+), 142 deletions(-) create mode 100644 flow/connectors/clickhouse/normalize.go diff --git a/flow/connectors/clickhouse/cdc.go b/flow/connectors/clickhouse/cdc.go index 6c286e01a5..8a35caa8a7 100644 --- a/flow/connectors/clickhouse/cdc.go +++ b/flow/connectors/clickhouse/cdc.go @@ -6,13 +6,14 @@ import ( "log/slog" "regexp" "strings" - "sync/atomic" - "time" _ "github.com/ClickHouse/clickhouse-go/v2" _ "github.com/ClickHouse/clickhouse-go/v2/lib/driver" + "github.com/PeerDB-io/peer-flow/connectors/utils" "github.com/PeerDB-io/peer-flow/generated/protos" + "github.com/PeerDB-io/peer-flow/model" "github.com/PeerDB-io/peer-flow/model/qvalue" + "github.com/jackc/pgx/v5/pgtype" ) const ( @@ -364,38 +365,6 @@ func (c *ClickhouseConnector) SyncFlowCleanup(jobName string) error { return nil } -func (c *ClickhouseConnector) SetupNormalizedTables( - req *protos.SetupNormalizedTableBatchInput, -) (*protos.SetupNormalizedTableBatchOutput, error) { - tableExistsMapping := make(map[string]bool) - for tableIdentifier, tableSchema := range req.TableNameSchemaMapping { - normalizedSchemaTable, err := utils.ParseSchemaTable(tableIdentifier) - if err != nil { - return nil, fmt.Errorf("error while parsing table schema and name: %w", err) - } - tableAlreadyExists, err := c.checkIfTableExists(normalizedSchemaTable.Schema, normalizedSchemaTable.Table) - if err != nil { - return nil, fmt.Errorf("error occurred while checking if normalized table exists: %w", err) - } - if tableAlreadyExists { - tableExistsMapping[tableIdentifier] = true - continue - } - - normalizedTableCreateSQL := generateCreateTableSQLForNormalizedTable( - normalizedSchemaTable, tableSchema, req.SoftDeleteColName, req.SyncedAtColName) - _, err = c.database.ExecContext(c.ctx, normalizedTableCreateSQL) - if err != nil { - return nil, fmt.Errorf("[sf] error while creating normalized table: %w", err) - } - tableExistsMapping[tableIdentifier] = false - } - - return &protos.SetupNormalizedTableBatchOutput{ - TableExistsMapping: tableExistsMapping, - }, nil -} - // ReplayTableSchemaDeltas changes a destination table to match the schema at source // This could involve adding or dropping multiple columns. func (c *ClickhouseConnector) ReplayTableSchemaDeltas(flowJobName string, @@ -450,111 +419,3 @@ func (c *ClickhouseConnector) ReplayTableSchemaDeltas(flowJobName string, return nil } - -func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsRequest) (*model.NormalizeResponse, error) { - batchIDs, err := c.GetLastSyncAndNormalizeBatchID(req.FlowJobName) - if err != nil { - return nil, err - } - // normalize has caught up with sync, chill until more records are loaded. - if batchIDs.NormalizeBatchID >= batchIDs.SyncBatchID { - return &model.NormalizeResponse{ - Done: false, - StartBatchID: batchIDs.NormalizeBatchID, - EndBatchID: batchIDs.SyncBatchID, - }, nil - } - - jobMetadataExists, err := c.jobMetadataExists(req.FlowJobName) - if err != nil { - return nil, err - } - // sync hasn't created job metadata yet, chill. 
- if !jobMetadataExists { - return &model.NormalizeResponse{ - Done: false, - }, nil - } - destinationTableNames, err := c.getDistinctTableNamesInBatch( - req.FlowJobName, - batchIDs.SyncBatchID, - batchIDs.NormalizeBatchID, - ) - if err != nil { - return nil, err - } - - tableNametoUnchangedToastCols, err := c.getTableNametoUnchangedCols(req.FlowJobName, batchIDs.SyncBatchID, batchIDs.NormalizeBatchID) - if err != nil { - return nil, fmt.Errorf("couldn't tablename to unchanged cols mapping: %w", err) - } - - var totalRowsAffected int64 = 0 - g, gCtx := errgroup.WithContext(c.ctx) - g.SetLimit(8) // limit parallel merges to 8 - - for _, destinationTableName := range destinationTableNames { - tableName := destinationTableName // local variable for the closure - - g.Go(func() error { - mergeGen := &mergeStmtGenerator{ - rawTableName: getRawTableIdentifier(req.FlowJobName), - dstTableName: tableName, - syncBatchID: batchIDs.SyncBatchID, - normalizeBatchID: batchIDs.NormalizeBatchID, - normalizedTableSchema: req.TableNameSchemaMapping[tableName], - unchangedToastColumns: tableNametoUnchangedToastCols[tableName], - peerdbCols: &protos.PeerDBColumns{ - SoftDelete: req.SoftDelete, - SoftDeleteColName: req.SoftDeleteColName, - SyncedAtColName: req.SyncedAtColName, - }, - } - mergeStatement, err := mergeGen.generateMergeStmt() - if err != nil { - return err - } - - startTime := time.Now() - c.logger.Info("[merge] merging records...", slog.String("destTable", tableName)) - - result, err := c.database.ExecContext(gCtx, mergeStatement, tableName) - if err != nil { - return fmt.Errorf("failed to merge records into %s (statement: %s): %w", - tableName, mergeStatement, err) - } - - endTime := time.Now() - c.logger.Info(fmt.Sprintf("[merge] merged records into %s, took: %d seconds", - tableName, endTime.Sub(startTime)/time.Second)) - if err != nil { - c.logger.Error("[merge] error while normalizing records", slog.Any("error", err)) - return err - } - - rowsAffected, err := result.RowsAffected() - if err != nil { - return fmt.Errorf("failed to get rows affected by merge statement for table %s: %w", tableName, err) - } - - atomic.AddInt64(&totalRowsAffected, rowsAffected) - return nil - }) - } - - if err := g.Wait(); err != nil { - return nil, fmt.Errorf("error while normalizing records: %w", err) - } - - // updating metadata with new normalizeBatchID - err = c.updateNormalizeMetadata(req.FlowJobName, batchIDs.SyncBatchID) - if err != nil { - return nil, err - } - - return &model.NormalizeResponse{ - Done: true, - StartBatchID: batchIDs.NormalizeBatchID + 1, - EndBatchID: batchIDs.SyncBatchID, - }, nil -} diff --git a/flow/connectors/clickhouse/normalize.go b/flow/connectors/clickhouse/normalize.go new file mode 100644 index 0000000000..fdf467d0f4 --- /dev/null +++ b/flow/connectors/clickhouse/normalize.go @@ -0,0 +1,219 @@ +package connclickhouse + +import ( + "fmt" + "log/slog" + "strings" + "sync/atomic" + "time" + + "github.com/PeerDB-io/peer-flow/connectors/utils" + "github.com/PeerDB-io/peer-flow/generated/protos" + "github.com/PeerDB-io/peer-flow/model" + "github.com/PeerDB-io/peer-flow/model/qvalue" + "golang.org/x/sync/errgroup" +) + +const ( + signColName = "_peerdb_sign" + signColType = "Int8" + versionColName = "_peerdb_version" + versionColType = "Int8" +) + +func (c *ClickhouseConnector) SetupNormalizedTables( + req *protos.SetupNormalizedTableBatchInput, +) (*protos.SetupNormalizedTableBatchOutput, error) { + tableExistsMapping := make(map[string]bool) + for tableIdentifier, tableSchema := 
range req.TableNameSchemaMapping { + normalizedSchemaTable, err := utils.ParseSchemaTable(tableIdentifier) + if err != nil { + return nil, fmt.Errorf("error while parsing table schema and name: %w", err) + } + tableAlreadyExists, err := c.checkIfTableExists(normalizedSchemaTable.Schema, normalizedSchemaTable.Table) + if err != nil { + return nil, fmt.Errorf("error occurred while checking if normalized table exists: %w", err) + } + if tableAlreadyExists { + tableExistsMapping[tableIdentifier] = true + continue + } + + normalizedTableCreateSQL, err := generateCreateTableSQLForNormalizedTable( + normalizedSchemaTable, + tableSchema, + req.SoftDeleteColName, + req.SyncedAtColName, + ) + if err != nil { + return nil, fmt.Errorf("error while generating create table sql for normalized table: %w", err) + } + + _, err = c.database.ExecContext(c.ctx, normalizedTableCreateSQL) + if err != nil { + return nil, fmt.Errorf("[sf] error while creating normalized table: %w", err) + } + tableExistsMapping[tableIdentifier] = false + } + + return &protos.SetupNormalizedTableBatchOutput{ + TableExistsMapping: tableExistsMapping, + }, nil +} + +func generateCreateTableSQLForNormalizedTable( + normalizedSchemaTable *utils.SchemaTable, + tableSchema *protos.TableSchema, + softDeleteColName string, + syncedAtColName string, +) (string, error) { + var stmtBuilder strings.Builder + stmtBuilder.WriteString(fmt.Sprintf("CREATE TABLE `%s`.`%s` (", normalizedSchemaTable.Schema, normalizedSchemaTable.Table)) + + nc := len(tableSchema.ColumnNames) + for i := 0; i < nc; i++ { + colName := tableSchema.ColumnNames[i] + colType := qvalue.QValueKind(tableSchema.ColumnTypes[i]) + clickhouseType, err := qValueKindToClickhouseType(colType) + if err != nil { + return "", fmt.Errorf("error while converting column type to clickhouse type: %w", err) + } + stmtBuilder.WriteString(fmt.Sprintf("`%s` %s, ", colName, clickhouseType)) + } + + // TODO support soft delete + + // synced at column will be added to all normalized tables + if syncedAtColName != "" { + stmtBuilder.WriteString(fmt.Sprintf("`%s` %s, ", syncedAtColName, "DateTime64(9)")) + } + + // add sign and version columns + stmtBuilder.WriteString(fmt.Sprintf("`%s` %s, ", signColName, signColType)) + stmtBuilder.WriteString(fmt.Sprintf("`%s` %s", versionColName, versionColType)) + + stmtBuilder.WriteString(fmt.Sprintf(") ENGINE = ReplacingMergeTree(`%s`) ", versionColName)) + + pkeys := tableSchema.PrimaryKeyColumns + if len(pkeys) > 0 { + pkeyStr := strings.Join(pkeys, ",") + + stmtBuilder.WriteString("PRIMARY KEY (") + stmtBuilder.WriteString(pkeyStr) + stmtBuilder.WriteString(") ") + + stmtBuilder.WriteString("ORDER BY (") + stmtBuilder.WriteString(pkeyStr) + stmtBuilder.WriteString(")") + } + + return stmtBuilder.String(), nil +} + +func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsRequest) (*model.NormalizeResponse, error) { + batchIDs, err := c.GetLastSyncAndNormalizeBatchID(req.FlowJobName) + if err != nil { + return nil, err + } + // normalize has caught up with sync, chill until more records are loaded. + if batchIDs.NormalizeBatchID >= batchIDs.SyncBatchID { + return &model.NormalizeResponse{ + Done: false, + StartBatchID: batchIDs.NormalizeBatchID, + EndBatchID: batchIDs.SyncBatchID, + }, nil + } + + jobMetadataExists, err := c.jobMetadataExists(req.FlowJobName) + if err != nil { + return nil, err + } + // sync hasn't created job metadata yet, chill. 
+ if !jobMetadataExists { + return &model.NormalizeResponse{ + Done: false, + }, nil + } + destinationTableNames, err := c.getDistinctTableNamesInBatch( + req.FlowJobName, + batchIDs.SyncBatchID, + batchIDs.NormalizeBatchID, + ) + if err != nil { + return nil, err + } + + tableNametoUnchangedToastCols, err := c.getTableNametoUnchangedCols(req.FlowJobName, batchIDs.SyncBatchID, batchIDs.NormalizeBatchID) + if err != nil { + return nil, fmt.Errorf("couldn't tablename to unchanged cols mapping: %w", err) + } + + var totalRowsAffected int64 = 0 + g, gCtx := errgroup.WithContext(c.ctx) + g.SetLimit(8) // limit parallel merges to 8 + + for _, destinationTableName := range destinationTableNames { + tableName := destinationTableName // local variable for the closure + + g.Go(func() error { + mergeGen := &mergeStmtGenerator{ + rawTableName: getRawTableIdentifier(req.FlowJobName), + dstTableName: tableName, + syncBatchID: batchIDs.SyncBatchID, + normalizeBatchID: batchIDs.NormalizeBatchID, + normalizedTableSchema: req.TableNameSchemaMapping[tableName], + unchangedToastColumns: tableNametoUnchangedToastCols[tableName], + peerdbCols: &protos.PeerDBColumns{ + SoftDelete: req.SoftDelete, + SoftDeleteColName: req.SoftDeleteColName, + SyncedAtColName: req.SyncedAtColName, + }, + } + mergeStatement, err := mergeGen.generateMergeStmt() + if err != nil { + return err + } + + startTime := time.Now() + c.logger.Info("[merge] merging records...", slog.String("destTable", tableName)) + + result, err := c.database.ExecContext(gCtx, mergeStatement, tableName) + if err != nil { + return fmt.Errorf("failed to merge records into %s (statement: %s): %w", + tableName, mergeStatement, err) + } + + endTime := time.Now() + c.logger.Info(fmt.Sprintf("[merge] merged records into %s, took: %d seconds", + tableName, endTime.Sub(startTime)/time.Second)) + if err != nil { + c.logger.Error("[merge] error while normalizing records", slog.Any("error", err)) + return err + } + + rowsAffected, err := result.RowsAffected() + if err != nil { + return fmt.Errorf("failed to get rows affected by merge statement for table %s: %w", tableName, err) + } + + atomic.AddInt64(&totalRowsAffected, rowsAffected) + return nil + }) + } + + if err := g.Wait(); err != nil { + return nil, fmt.Errorf("error while normalizing records: %w", err) + } + + // updating metadata with new normalizeBatchID + err = c.updateNormalizeMetadata(req.FlowJobName, batchIDs.SyncBatchID) + if err != nil { + return nil, err + } + + return &model.NormalizeResponse{ + Done: true, + StartBatchID: batchIDs.NormalizeBatchID + 1, + EndBatchID: batchIDs.SyncBatchID, + }, nil +} diff --git a/flow/connectors/clickhouse/qvalue_convert.go b/flow/connectors/clickhouse/qvalue_convert.go index 30249db70a..d09eeced65 100644 --- a/flow/connectors/clickhouse/qvalue_convert.go +++ b/flow/connectors/clickhouse/qvalue_convert.go @@ -39,3 +39,12 @@ var clickhouseTypeToQValueKindMap = map[string]qvalue.QValueKind{ "Array(Int64)": qvalue.QValueKindArrayInt64, "Array(Float64)": qvalue.QValueKindArrayFloat64, } + +func qValueKindToClickhouseType(colType qvalue.QValueKind) (string, error) { + val, err := colType.ToDWHColumnType(qvalue.QDWHTypeClickhouse) + if err != nil { + return "", err + } + + return val, err +} From db0e0d53f89ce14544bd4d4447fefbf5d7f41ca0 Mon Sep 17 00:00:00 2001 From: Pankaj B Date: Thu, 18 Jan 2024 23:12:26 +0530 Subject: [PATCH 03/36] use catalog for metadata --- flow/connectors/clickhouse/cdc.go | 157 +++++------------------ flow/connectors/clickhouse/clickhouse.go | 11 +- 
2 files changed, 45 insertions(+), 123 deletions(-) diff --git a/flow/connectors/clickhouse/cdc.go b/flow/connectors/clickhouse/cdc.go index 8a35caa8a7..49916a6aee 100644 --- a/flow/connectors/clickhouse/cdc.go +++ b/flow/connectors/clickhouse/cdc.go @@ -66,127 +66,6 @@ func (c *ClickhouseConnector) getMirrorRowByJobNAme(jobName string) (*MirrorJobR return &result, nil } -func (c *ClickhouseConnector) NeedsSetupMetadataTables() bool { - result, err := c.checkIfTableExists(c.config.Database, mirrorJobsTableIdentifier) - if err != nil { - return true - } - return !result -} - -func (c *ClickhouseConnector) SetupMetadataTables() error { - - createMirrorJobsTableSQL := `CREATE TABLE IF NOT EXISTS %s ( - MIRROR_JOB_NAME String NOT NULL, - OFFSET Int32 NOT NULL, - SYNC_BATCH_ID Int32 NOT NULL, - NORMALIZE_BATCH_ID Int32 NOT NULL - ) ENGINE = MergeTree() - ORDER BY MIRROR_JOB_NAME;` - - // NOTE that Clickhouse does not support transactional DDL - //createMetadataTablesTx, err := c.database.BeginTx(c.ctx, nil) - // if err != nil { - // return fmt.Errorf("unable to begin transaction for creating metadata tables: %w", err) - // } - // in case we return after error, ensure transaction is rolled back - // defer func() { - // deferErr := createMetadataTablesTx.Rollback() - // if deferErr != sql.ErrTxDone && deferErr != nil { - // c.logger.Error("error while rolling back transaction for creating metadata tables", - // slog.Any("error", deferErr)) - // } - // }() - - // Not needed as we dont have schema - // err = c.createPeerDBInternalSchema(createMetadataTablesTx) - // if err != nil { - // return err - // } - _, err := c.database.ExecContext(c.ctx, fmt.Sprintf(createMirrorJobsTableSQL, mirrorJobsTableIdentifier)) - if err != nil { - return fmt.Errorf("error while setting up mirror jobs table: %w", err) - } - // err = createMetadataTablesTx.Commit() - // if err != nil { - // return fmt.Errorf("unable to commit transaction for creating metadata tables: %w", err) - // } - - return nil -} - -func (c *ClickhouseConnector) GetLastOffset(jobName string) (int64, error) { - getLastOffsetSQL := "SELECT OFFSET FROM %s WHERE MIRROR_JOB_NAME=?" - - rows, err := c.database.QueryContext(c.ctx, fmt.Sprintf(getLastOffsetSQL, - mirrorJobsTableIdentifier), jobName) - if err != nil { - return 0, fmt.Errorf("error querying Clickhouse peer for last syncedID: %w", err) - } - defer func() { - err = rows.Close() - if err != nil { - c.logger.Error("error while closing rows for reading last offset", slog.Any("error", err)) - } - }() - - if !rows.Next() { - c.logger.Warn("No row found, returning 0") - return 0, nil - } - var result pgtype.Int8 - err = rows.Scan(&result) - if err != nil { - return 0, fmt.Errorf("error while reading result row: %w", err) - } - if result.Int64 == 0 { - c.logger.Warn("Assuming zero offset means no sync has happened") - return 0, nil - } - return result.Int64, nil -} - -func (c *ClickhouseConnector) SetLastOffset(jobName string, lastOffset int64) error { - currentRow, err := c.getMirrorRowByJobNAme(jobName) - - if err != nil { - return err - } - - //setLastOffsetSQL = "UPDATE %s.%s SET OFFSET=GREATEST(OFFSET, ?) WHERE MIRROR_JOB_NAME=?" 
- setLastOffsetSQL := `INSERT INTO %s - (mirror_job_name, offset, sync_batch_id, normalize_batch_id) - VALUES (?, ?, ?, ?);` - _, err = c.database.ExecContext(c.ctx, fmt.Sprintf(setLastOffsetSQL, - mirrorJobsTableIdentifier), currentRow.MirrorJobName, lastOffset, currentRow.SyncBatchID, currentRow.NormalizeBatchID) - if err != nil { - return fmt.Errorf("error querying Snowflake peer for last syncedID: %w", err) - } - return nil -} - -func (c *ClickhouseConnector) GetLastSyncBatchID(jobName string) (int64, error) { - getLastSyncBatchID_SQL := "SELECT SYNC_BATCH_ID FROM %s WHERE MIRROR_JOB_NAME=?" - - rows, err := c.database.QueryContext(c.ctx, fmt.Sprintf(getLastSyncBatchID_SQL, - mirrorJobsTableIdentifier), jobName) - if err != nil { - return 0, fmt.Errorf("error querying Clickhouse peer for last syncBatchId: %w", err) - } - defer rows.Close() - - var result pgtype.Int8 - if !rows.Next() { - c.logger.Warn("No row found, returning 0") - return 0, nil - } - err = rows.Scan(&result) - if err != nil { - return 0, fmt.Errorf("error while reading result row: %w", err) - } - return result.Int64, nil -} - func (c *ClickhouseConnector) CreateRawTable(req *protos.CreateRawTableInput) (*protos.CreateRawTableOutput, error) { rawTableName := c.getRawTableName(req.FlowJobName) @@ -245,7 +124,7 @@ func (c *ClickhouseConnector) syncRecordsViaAvro( DestinationTableIdentifier: strings.ToLower(fmt.Sprintf("%s", rawTableIdentifier)), } - avroSyncer := NewSnowflakeAvroSyncMethod(qrepConfig, c) + avroSyncer := NewClickhouseAvroSyncMethod(qrepConfig, c) destinationTableSchema, err := c.getTableSchema(qrepConfig.DestinationTableIdentifier) if err != nil { return nil, err @@ -419,3 +298,37 @@ func (c *ClickhouseConnector) ReplayTableSchemaDeltas(flowJobName string, return nil } + +// external +func (c *ClickhouseConnector) NeedsSetupMetadataTables() bool { + return c.pgMetadata.NeedsSetupMetadata() +} + +func (c *ClickhouseConnector) SetupMetadataTables() error { + err := c.pgMetadata.SetupMetadata() + if err != nil { + c.logger.Error("failed to setup metadata tables", slog.Any("error", err)) + return err + } + + return nil +} + +func (c *ClickhouseConnector) GetLastSyncBatchID(jobName string) (int64, error) { + return c.pgMetadata.GetLastBatchID(jobName) +} + +func (c *ClickhouseConnector) GetLastOffset(jobName string) (int64, error) { + return c.pgMetadata.FetchLastOffset(jobName) +} + +// update offset for a job +func (c *ClickhouseConnector) SetLastOffset(jobName string, offset int64) error { + err := c.pgMetadata.UpdateLastOffset(jobName, offset) + if err != nil { + c.logger.Error("failed to update last offset: ", slog.Any("error", err)) + return err + } + + return nil +} diff --git a/flow/connectors/clickhouse/clickhouse.go b/flow/connectors/clickhouse/clickhouse.go index 834f10ffc5..974d56db48 100644 --- a/flow/connectors/clickhouse/clickhouse.go +++ b/flow/connectors/clickhouse/clickhouse.go @@ -8,7 +8,7 @@ import ( _ "github.com/ClickHouse/clickhouse-go/v2" _ "github.com/ClickHouse/clickhouse-go/v2/lib/driver" - + metadataStore "github.com/PeerDB-io/peer-flow/connectors/external_metadata" "github.com/PeerDB-io/peer-flow/generated/protos" "github.com/PeerDB-io/peer-flow/shared" ) @@ -16,6 +16,7 @@ import ( type ClickhouseConnector struct { ctx context.Context database *sql.DB + pgMetadata *metadataStore.PostgresMetadataStore tableSchemaMapping map[string]*protos.TableSchema logger slog.Logger config *protos.ClickhouseConfig @@ -29,10 +30,18 @@ func NewClickhouseConnector(ctx context.Context, return nil, 
fmt.Errorf("failed to open connection to Clickhouse peer: %w", err) } + pgMetadata, err := metadataStore.NewPostgresMetadataStore(ctx, + config.GetMetadataDb(), metadataSchemaName) + if err != nil { + slog.ErrorContext(ctx, "failed to create postgres metadata store", slog.Any("error", err)) + return nil, err + } + flowName, _ := ctx.Value(shared.FlowNameKey).(string) return &ClickhouseConnector{ ctx: ctx, database: database, + pgMetadata: pgMetadata, tableSchemaMapping: nil, logger: *slog.With(slog.String(string(shared.FlowNameKey), flowName)), config: clickhouseProtoConfig, From 4718337cfe72337665a43f613f947d2feae18eb2 Mon Sep 17 00:00:00 2001 From: Pankaj B Date: Fri, 19 Jan 2024 11:37:53 +0530 Subject: [PATCH 04/36] more changes --- flow/connectors/clickhouse/cdc.go | 61 ++++++++++++--- flow/connectors/clickhouse/qrep_avro_sync.go | 78 ++++++++++++++++++++ 2 files changed, 128 insertions(+), 11 deletions(-) diff --git a/flow/connectors/clickhouse/cdc.go b/flow/connectors/clickhouse/cdc.go index 49916a6aee..0e9b6a3d5f 100644 --- a/flow/connectors/clickhouse/cdc.go +++ b/flow/connectors/clickhouse/cdc.go @@ -172,18 +172,18 @@ func (c *ClickhouseConnector) SyncRecords(req *model.SyncRecordsRequest) (*model } // transaction for SyncRecords - syncRecordsTx, err := c.database.BeginTx(c.ctx, nil) - if err != nil { - return nil, err - } + // syncRecordsTx, err := c.database.BeginTx(c.ctx, nil) + // if err != nil { + // return nil, err + // } // in case we return after error, ensure transaction is rolled back - defer func() { - deferErr := syncRecordsTx.Rollback() - if deferErr != sql.ErrTxDone && deferErr != nil { - c.logger.Error("error while rolling back transaction for SyncRecords: %v", - slog.Any("error", deferErr), slog.Int64("syncBatchID", syncBatchID)) - } - }() + // defer func() { + // deferErr := syncRecordsTx.Rollback() + // if deferErr != sql.ErrTxDone && deferErr != nil { + // c.logger.Error("error while rolling back transaction for SyncRecords: %v", + // slog.Any("error", deferErr), slog.Int64("syncBatchID", syncBatchID)) + // } + // }() // updating metadata with new offset and syncBatchID err = c.updateSyncMetadata(req.FlowJobName, res.LastSyncedCheckPointID, syncBatchID, syncRecordsTx) @@ -199,6 +199,45 @@ func (c *ClickhouseConnector) SyncRecords(req *model.SyncRecordsRequest) (*model return res, nil } +func (c *SnowflakeConnector) jobMetadataExistsTx(tx *sql.Tx, jobName string) (bool, error) { + checkIfJobMetadataExistsSQL := "SELECT TO_BOOLEAN(COUNT(1)) FROM %s WHERE MIRROR_JOB_NAME=?" 
+ + var result pgtype.Bool + err := tx.QueryRowContext(c.ctx, + fmt.Sprintf(checkIfJobMetadataExistsSQL, mirrorJobsTableIdentifier), jobName).Scan(&result) + if err != nil { + return false, fmt.Errorf("error reading result row: %w", err) + } + return result.Bool, nil +} + +func (c *ClickhouseConnector) updateSyncMetadata(flowJobName string, lastCP int64, + syncBatchID int64, syncRecordsTx *sql.Tx, +) error { + jobMetadataExists, err := c.jobMetadataExistsTx(syncRecordsTx, flowJobName) + if err != nil { + return fmt.Errorf("failed to get sync status for flow job: %w", err) + } + + if !jobMetadataExists { + _, err := syncRecordsTx.ExecContext(c.ctx, + fmt.Sprintf(insertJobMetadataSQL, c.metadataSchema, mirrorJobsTableIdentifier), + flowJobName, lastCP, syncBatchID, 0) + if err != nil { + return fmt.Errorf("failed to insert flow job status: %w", err) + } + } else { + _, err := syncRecordsTx.ExecContext(c.ctx, + fmt.Sprintf(updateMetadataForSyncRecordsSQL, c.metadataSchema, mirrorJobsTableIdentifier), + lastCP, syncBatchID, flowJobName) + if err != nil { + return fmt.Errorf("failed to update flow job status: %w", err) + } + } + + return nil +} + func (c *ClickhouseConnector) SyncFlowCleanup(jobName string) error { syncFlowCleanupTx, err := c.database.BeginTx(c.ctx, nil) if err != nil { diff --git a/flow/connectors/clickhouse/qrep_avro_sync.go b/flow/connectors/clickhouse/qrep_avro_sync.go index 84c5d2eb89..29f0e0bc6d 100644 --- a/flow/connectors/clickhouse/qrep_avro_sync.go +++ b/flow/connectors/clickhouse/qrep_avro_sync.go @@ -31,6 +31,84 @@ func NewClickhouseAvroSyncMethod( } } +func (s *ClickhouseAvroSyncMethod) putFileToStage(avroFile *avro.AvroFile, stage string) error { + if avroFile.StorageLocation != avro.AvroLocalStorage { + s.connector.logger.Info("no file to put to stage") + return nil + } + + activity.RecordHeartbeat(s.connector.ctx, "putting file to stage") + putCmd := fmt.Sprintf("PUT file://%s @%s", avroFile.FilePath, stage) + + shutdown := utils.HeartbeatRoutine(s.connector.ctx, 10*time.Second, func() string { + return fmt.Sprintf("putting file to stage %s", stage) + }) + defer shutdown() + + if _, err := s.connector.database.ExecContext(s.connector.ctx, putCmd); err != nil { + return fmt.Errorf("failed to put file to stage: %w", err) + } + + s.connector.logger.Info(fmt.Sprintf("put file %s to stage %s", avroFile.FilePath, stage)) + return nil +} + +func (s *ClickhouseAvroSyncMethod) SyncRecords( + dstTableSchema []*sql.ColumnType, + stream *model.QRecordStream, + flowJobName string, +) (int, error) { + tableLog := slog.String("destinationTable", s.config.DestinationTableIdentifier) + dstTableName := s.config.DestinationTableIdentifier + + schema, err := stream.Schema() + if err != nil { + return -1, fmt.Errorf("failed to get schema from stream: %w", err) + } + + s.connector.logger.Info("sync function called and schema acquired", tableLog) + + avroSchema, err := s.getAvroSchema(dstTableName, schema) + if err != nil { + return 0, err + } + + partitionID := shared.RandomString(16) + avroFile, err := s.writeToAvroFile(stream, avroSchema, partitionID, flowJobName) + if err != nil { + return 0, err + } + defer avroFile.Cleanup() + s.connector.logger.Info(fmt.Sprintf("written %d records to Avro file", avroFile.NumRecords), tableLog) + + stage := s.connector.getStageNameForJob(s.config.FlowJobName) + err = s.connector.createStage(stage, s.config) + if err != nil { + return 0, err + } + s.connector.logger.Info(fmt.Sprintf("Created stage %s", stage)) + + colNames, _, err := 
s.connector.getColsFromTable(s.config.DestinationTableIdentifier)
+	if err != nil {
+		return 0, err
+	}
+
+	err = s.putFileToStage(avroFile, stage)
+	if err != nil {
+		return 0, err
+	}
+	s.connector.logger.Info("pushed avro file to stage", tableLog)
+
+	err = CopyStageToDestination(s.connector, s.config, s.config.DestinationTableIdentifier, stage, colNames)
+	if err != nil {
+		return 0, err
+	}
+	s.connector.logger.Info(fmt.Sprintf("copying records into %s from stage %s",
+		s.config.DestinationTableIdentifier, stage))
+
+	return avroFile.NumRecords, nil
+}
+
 func (s *ClickhouseAvroSyncMethod) SyncQRepRecords(
 	config *protos.QRepConfig,
 	partition *protos.QRepPartition,

From d9cd595d00c95a8e2bdcd338525c04f10ab12377 Mon Sep 17 00:00:00 2001
From: Pankaj B
Date: Mon, 22 Jan 2024 18:49:22 +0530
Subject: [PATCH 05/36] data going in raw table

---
 docker-compose-dev.yml                       |  75 +++---
 flow/activities/flowable.go                  |   1 +
 flow/cmd/handler.go                          |   2 +
 flow/connectors/clickhouse/cdc.go            | 251 +++++++++----------
 flow/connectors/clickhouse/clickhouse.go     |   3 +-
 flow/connectors/clickhouse/qrep.go           |   1 +
 flow/connectors/clickhouse/qrep_avro_sync.go | 127 +++++++---
 flow/connectors/core.go                      |   6 +
 flow/connectors/external_metadata/store.go   |   1 +
 flow/workflows/qrep_flow.go                  |  12 +
 protos/peers.proto                           |   1 +
 ui/app/api/peers/getTruePeer.ts              |   8 +-
 12 files changed, 280 insertions(+), 208 deletions(-)

diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml
index 8a8181ba61..cbd2c8afa5 100644
--- a/docker-compose-dev.yml
+++ b/docker-compose-dev.yml
@@ -19,10 +19,10 @@ x-flow-worker-env: &flow-worker-env
  # For GCS, these will be your HMAC keys instead
  # For more information:
  # https://cloud.google.com/storage/docs/authentication/managing-hmackeys
  AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-REDACTED}
  AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY:-REDACTED}
  # For GCS, set this to "auto" without the quotes
  AWS_REGION: ${AWS_REGION:-us-east-2}
  # For GCS, set this as: https://storage.googleapis.com
  AWS_ENDPOINT: ${AWS_ENDPOINT:-}
  # enables worker profiling using Grafana Pyroscope
@@ -178,41 +178,40 @@ services:
   temporal-admin-tools:
     condition: service_healthy

  peerdb:
    container_name: peerdb-server
    stop_signal: SIGINT
    build:
      context: .
      dockerfile: stacks/peerdb-server.Dockerfile
    environment:
      <<: *catalog-config
      PEERDB_LOG_DIR: /var/log/peerdb
      PEERDB_PASSWORD: peerdb
      PEERDB_FLOW_SERVER_ADDRESS: grpc://flow_api:8112
      RUST_LOG: info
      RUST_BACKTRACE: 1
    ports:
      - 9900:9900
    depends_on:
      catalog:
        condition: service_healthy

  peerdb-ui:
    container_name: peerdb-ui
    build:
      context: .
      dockerfile: stacks/peerdb-ui.Dockerfile
    ports:
      - 3000:3000
    environment:
      <<: *catalog-config
      DATABASE_URL: postgres://postgres:postgres@catalog:5432/postgres
      PEERDB_FLOW_SERVER_HTTP: http://flow_api:8113
      PEERDB_PASSWORD:
      NEXTAUTH_SECRET: __changeme__
      NEXTAUTH_URL: http://localhost:3000
    depends_on:
      - flow-api

  # peerdb:
  #   container_name: peerdb-server
  #   stop_signal: SIGINT
  #   build:
  #     context: .
+ # dockerfile: stacks/peerdb-server.Dockerfile + # environment: + # <<: *catalog-config + # PEERDB_LOG_DIR: /var/log/peerdb + # PEERDB_PASSWORD: peerdb + # PEERDB_FLOW_SERVER_ADDRESS: grpc://flow_api:8112 + # RUST_LOG: info + # RUST_BACKTRACE: 1 + # ports: + # - 9900:9900 + # depends_on: + # catalog: + # condition: service_healthy + # peerdb-ui: + # container_name: peerdb-ui + # build: + # context: . + # dockerfile: stacks/peerdb-ui.Dockerfile + # ports: + # - 3000:3000 + # environment: + # <<: *catalog-config + # DATABASE_URL: postgres://postgres:postgres@catalog:5432/postgres + # PEERDB_FLOW_SERVER_HTTP: http://flow_api:8113 + # PEERDB_PASSWORD: + # NEXTAUTH_SECRET: __changeme__ + # NEXTAUTH_URL: http://localhost:3000 + # depends_on: + # - flow-api volumes: pgdata: diff --git a/flow/activities/flowable.go b/flow/activities/flowable.go index de3f2bcfeb..6754820e1c 100644 --- a/flow/activities/flowable.go +++ b/flow/activities/flowable.go @@ -371,6 +371,7 @@ func (a *FlowableActivity) StartNormalize( ) (*model.NormalizeResponse, error) { conn := input.FlowConnectionConfigs ctx = context.WithValue(ctx, shared.FlowNameKey, conn.FlowJobName) + fmt.Printf("\n*********************** in StartNormalize %+v\n", conn) dstConn, err := connectors.GetCDCNormalizeConnector(ctx, conn.Destination) if errors.Is(err, connectors.ErrUnsupportedFunctionality) { dstConn, err := connectors.GetCDCSyncConnector(ctx, conn.Destination) diff --git a/flow/cmd/handler.go b/flow/cmd/handler.go index 1d96e5a84f..9d27c829ee 100644 --- a/flow/cmd/handler.go +++ b/flow/cmd/handler.go @@ -120,6 +120,7 @@ func (h *FlowRequestHandler) createQrepJobEntry(ctx context.Context, func (h *FlowRequestHandler) CreateCDCFlow( ctx context.Context, req *protos.CreateCDCFlowRequest, ) (*protos.CreateCDCFlowResponse, error) { + fmt.Printf("\n******************************** CreateCDCFlow") cfg := req.ConnectionConfigs _, validateErr := h.ValidateCDCMirror(ctx, req) if validateErr != nil { @@ -227,6 +228,7 @@ func (h *FlowRequestHandler) removeFlowEntryInCatalog( func (h *FlowRequestHandler) CreateQRepFlow( ctx context.Context, req *protos.CreateQRepFlowRequest, ) (*protos.CreateQRepFlowResponse, error) { + fmt.Printf("\n******************************** CreateQRepFlow") cfg := req.QrepConfig workflowID := fmt.Sprintf("%s-qrepflow-%s", cfg.FlowJobName, uuid.New()) workflowOptions := client.StartWorkflowOptions{ diff --git a/flow/connectors/clickhouse/cdc.go b/flow/connectors/clickhouse/cdc.go index 0e9b6a3d5f..28dfe100c2 100644 --- a/flow/connectors/clickhouse/cdc.go +++ b/flow/connectors/clickhouse/cdc.go @@ -12,7 +12,6 @@ import ( "github.com/PeerDB-io/peer-flow/connectors/utils" "github.com/PeerDB-io/peer-flow/generated/protos" "github.com/PeerDB-io/peer-flow/model" - "github.com/PeerDB-io/peer-flow/model/qvalue" "github.com/jackc/pgx/v5/pgtype" ) @@ -75,15 +74,15 @@ func (c *ClickhouseConnector) CreateRawTable(req *protos.CreateRawTableInput) (* // } createRawTableSQL := `CREATE TABLE IF NOT EXISTS %s ( - _PEERDB_UID STRING NOT NULL, - _PEERDB_TIMESTAMP INT NOT NULL, - _PEERDB_DESTINATION_TABLE_NAME STRING NOT NULL, - _PEERDB_DATA STRING NOT NULL, - _PEERDB_RECORD_TYPE INTEGER NOT NULL, - _PEERDB_MATCH_DATA STRING, - _PEERDB_BATCH_ID INT, - _PEERDB_UNCHANGED_TOAST_COLUMNS STRING - ) ENGINE = ReplacingMergeTree ORDER BY _PEERDB_UID;` + _peerdb_uid String NOT NULL, + _peerdb_timestamp Int64 NOT NULL, + _peerdb_destination_table_name String NOT NULL, + _peerdb_data String NOT NULL, + _peerdb_record_type Int NOT NULL, + _peerdb_match_data 
String, + _peerdb_batch_id Int, + _peerdb_unchanged_toast_columns String + ) ENGINE = ReplacingMergeTree ORDER BY _peerdb_uid;` _, err := c.database.ExecContext(c.ctx, fmt.Sprintf(createRawTableSQL, rawTableName)) @@ -95,11 +94,11 @@ func (c *ClickhouseConnector) CreateRawTable(req *protos.CreateRawTableInput) (* // return nil, fmt.Errorf("unable to commit transaction for creation of raw table: %w", err) // } - stage := c.getStageNameForJob(req.FlowJobName) - err = c.createStage(stage, &protos.QRepConfig{}) - if err != nil { - return nil, err - } + // stage := c.getStageNameForJob(req.FlowJobName) + // err = c.createStage(stage, &protos.QRepConfig{}) + // if err != nil { + // return nil, err + // } return &protos.CreateRawTableOutput{ TableIdentifier: rawTableName, @@ -114,12 +113,15 @@ func (c *ClickhouseConnector) syncRecordsViaAvro( tableNameRowsMapping := make(map[string]uint32) streamReq := model.NewRecordsToStreamRequest(req.Records.GetRecords(), tableNameRowsMapping, syncBatchID) streamRes, err := utils.RecordsToRawTableStream(streamReq) + //x := *&streamRes.Stream + //y := (*x).Records + fmt.Printf("\n*******************############################## cdc.go in syncRecordsViaAvro streamRes: %+v", streamRes) if err != nil { return nil, fmt.Errorf("failed to convert records to raw table stream: %w", err) } qrepConfig := &protos.QRepConfig{ - StagingPath: "", + StagingPath: c.config.S3Integration, FlowJobName: req.FlowJobName, DestinationTableIdentifier: strings.ToLower(fmt.Sprintf("%s", rawTableIdentifier)), @@ -157,8 +159,11 @@ func (c *ClickhouseConnector) syncRecordsViaAvro( } func (c *ClickhouseConnector) SyncRecords(req *model.SyncRecordsRequest) (*model.SyncResponse, error) { - rawTableName := getRawTableName(req.FlowJobName) - c.logger.Info(fmt.Sprintf("pushing records to Snowflake table %s", rawTableName)) + fmt.Printf("\n ******************************************** !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
in ClickhouseConnector.SyncRecords") + fmt.Printf("\n ******************************* in cdc.go in SyncRecords config: %+v", c.config.S3Integration) + //c.config.S3Integration = "s3://avro-clickhouse" + rawTableName := c.getRawTableName(req.FlowJobName) + c.logger.Info(fmt.Sprintf("pushing records to Clickhouse table %s", rawTableName)) syncBatchID, err := c.GetLastSyncBatchID(req.FlowJobName) if err != nil { @@ -186,20 +191,36 @@ func (c *ClickhouseConnector) SyncRecords(req *model.SyncRecordsRequest) (*model // }() // updating metadata with new offset and syncBatchID - err = c.updateSyncMetadata(req.FlowJobName, res.LastSyncedCheckPointID, syncBatchID, syncRecordsTx) + // err = c.updateSyncMetadata(req.FlowJobName, res.LastSyncedCheckPointID, syncBatchID, syncRecordsTx) + // if err != nil { + // return nil, err + // } + // transaction commits + // err = syncRecordsTx.Commit() + // if err != nil { + // return nil, err + // } + + lastCheckpoint, err := req.Records.GetLastCheckpoint() if err != nil { + return nil, fmt.Errorf("failed to get last checkpoint: %w", err) + } + + err = c.SetLastOffset(req.FlowJobName, lastCheckpoint) + if err != nil { + c.logger.Error("failed to update last offset for s3 cdc", slog.Any("error", err)) return nil, err } - // transaction commits - err = syncRecordsTx.Commit() + err = c.pgMetadata.IncrementID(req.FlowJobName) if err != nil { + c.logger.Error("failed to increment id", slog.Any("error", err)) return nil, err } return res, nil } -func (c *SnowflakeConnector) jobMetadataExistsTx(tx *sql.Tx, jobName string) (bool, error) { +func (c *ClickhouseConnector) jobMetadataExistsTx(tx *sql.Tx, jobName string) (bool, error) { checkIfJobMetadataExistsSQL := "SELECT TO_BOOLEAN(COUNT(1)) FROM %s WHERE MIRROR_JOB_NAME=?" var result pgtype.Bool @@ -211,75 +232,78 @@ func (c *SnowflakeConnector) jobMetadataExistsTx(tx *sql.Tx, jobName string) (bo return result.Bool, nil } -func (c *ClickhouseConnector) updateSyncMetadata(flowJobName string, lastCP int64, - syncBatchID int64, syncRecordsTx *sql.Tx, -) error { - jobMetadataExists, err := c.jobMetadataExistsTx(syncRecordsTx, flowJobName) - if err != nil { - return fmt.Errorf("failed to get sync status for flow job: %w", err) - } - - if !jobMetadataExists { - _, err := syncRecordsTx.ExecContext(c.ctx, - fmt.Sprintf(insertJobMetadataSQL, c.metadataSchema, mirrorJobsTableIdentifier), - flowJobName, lastCP, syncBatchID, 0) - if err != nil { - return fmt.Errorf("failed to insert flow job status: %w", err) - } - } else { - _, err := syncRecordsTx.ExecContext(c.ctx, - fmt.Sprintf(updateMetadataForSyncRecordsSQL, c.metadataSchema, mirrorJobsTableIdentifier), - lastCP, syncBatchID, flowJobName) - if err != nil { - return fmt.Errorf("failed to update flow job status: %w", err) - } - } - - return nil -} +// func (c *ClickhouseConnector) updateSyncMetadata(flowJobName string, lastCP int64, +// syncBatchID int64, syncRecordsTx *sql.Tx, +// ) error { +// jobMetadataExists, err := c.jobMetadataExistsTx(syncRecordsTx, flowJobName) +// if err != nil { +// return fmt.Errorf("failed to get sync status for flow job: %w", err) +// } + +// if !jobMetadataExists { +// _, err := syncRecordsTx.ExecContext(c.ctx, +// fmt.Sprintf(insertJobMetadataSQL, c.metadataSchema, mirrorJobsTableIdentifier), +// flowJobName, lastCP, syncBatchID, 0) +// if err != nil { +// return fmt.Errorf("failed to insert flow job status: %w", err) +// } +// } else { +// _, err := syncRecordsTx.ExecContext(c.ctx, +// fmt.Sprintf(updateMetadataForSyncRecordsSQL, 
c.metadataSchema, mirrorJobsTableIdentifier), +// lastCP, syncBatchID, flowJobName) +// if err != nil { +// return fmt.Errorf("failed to update flow job status: %w", err) +// } +// } + +// return nil +// } func (c *ClickhouseConnector) SyncFlowCleanup(jobName string) error { - syncFlowCleanupTx, err := c.database.BeginTx(c.ctx, nil) - if err != nil { - return fmt.Errorf("unable to begin transaction for sync flow cleanup: %w", err) - } - defer func() { - deferErr := syncFlowCleanupTx.Rollback() - if deferErr != sql.ErrTxDone && deferErr != nil { - c.logger.Error("error while rolling back transaction for flow cleanup", slog.Any("error", deferErr)) - } - }() - - row := syncFlowCleanupTx.QueryRowContext(c.ctx, checkSchemaExistsSQL, c.metadataSchema) - var schemaExists pgtype.Bool - err = row.Scan(&schemaExists) - if err != nil { - return fmt.Errorf("unable to check if internal schema exists: %w", err) - } + // syncFlowCleanupTx, err := c.database.BeginTx(c.ctx, nil) + // if err != nil { + // return fmt.Errorf("unable to begin transaction for sync flow cleanup: %w", err) + // } + // defer func() { + // deferErr := syncFlowCleanupTx.Rollback() + // if deferErr != sql.ErrTxDone && deferErr != nil { + // c.logger.Error("error while rolling back transaction for flow cleanup", slog.Any("error", deferErr)) + // } + // }() - if schemaExists.Bool { - _, err = syncFlowCleanupTx.ExecContext(c.ctx, fmt.Sprintf(dropTableIfExistsSQL, c.metadataSchema, - getRawTableIdentifier(jobName))) - if err != nil { - return fmt.Errorf("unable to drop raw table: %w", err) - } - _, err = syncFlowCleanupTx.ExecContext(c.ctx, - fmt.Sprintf(deleteJobMetadataSQL, c.metadataSchema, mirrorJobsTableIdentifier), jobName) - if err != nil { - return fmt.Errorf("unable to delete job metadata: %w", err) - } - } + // row := syncFlowCleanupTx.QueryRowContext(c.ctx, checkSchemaExistsSQL, c.metadataSchema) + // var schemaExists pgtype.Bool + // err = row.Scan(&schemaExists) + // if err != nil { + // return fmt.Errorf("unable to check if internal schema exists: %w", err) + // } - err = syncFlowCleanupTx.Commit() - if err != nil { - return fmt.Errorf("unable to commit transaction for sync flow cleanup: %w", err) - } + // if schemaExists.Bool { + // _, err = syncFlowCleanupTx.ExecContext(c.ctx, fmt.Sprintf(dropTableIfExistsSQL, c.metadataSchema, + // getRawTableIdentifier(jobName))) + // if err != nil { + // return fmt.Errorf("unable to drop raw table: %w", err) + // } + // _, err = syncFlowCleanupTx.ExecContext(c.ctx, + // fmt.Sprintf(deleteJobMetadataSQL, c.metadataSchema, mirrorJobsTableIdentifier), jobName) + // if err != nil { + // return fmt.Errorf("unable to delete job metadata: %w", err) + // } + // } - err = c.dropStage("", jobName) + // err = syncFlowCleanupTx.Commit() + // if err != nil { + // return fmt.Errorf("unable to commit transaction for sync flow cleanup: %w", err) + // } + + // err = c.dropStage("", jobName) + // if err != nil { + // return err + // } + err := c.pgMetadata.DropMetadata(jobName) if err != nil { return err } - return nil } @@ -288,53 +312,6 @@ func (c *ClickhouseConnector) SyncFlowCleanup(jobName string) error { func (c *ClickhouseConnector) ReplayTableSchemaDeltas(flowJobName string, schemaDeltas []*protos.TableSchemaDelta, ) error { - if len(schemaDeltas) == 0 { - return nil - } - - tableSchemaModifyTx, err := c.database.Begin() - if err != nil { - return fmt.Errorf("error starting transaction for schema modification: %w", - err) - } - defer func() { - deferErr := tableSchemaModifyTx.Rollback() - if 
deferErr != sql.ErrTxDone && deferErr != nil { - c.logger.Error("error rolling back transaction for table schema modification", slog.Any("error", deferErr)) - } - }() - - for _, schemaDelta := range schemaDeltas { - if schemaDelta == nil || len(schemaDelta.AddedColumns) == 0 { - continue - } - - for _, addedColumn := range schemaDelta.AddedColumns { - sfColtype, err := qValueKindToSnowflakeType(qvalue.QValueKind(addedColumn.ColumnType)) - if err != nil { - return fmt.Errorf("failed to convert column type %s to snowflake type: %w", - addedColumn.ColumnType, err) - } - _, err = tableSchemaModifyTx.ExecContext(c.ctx, - fmt.Sprintf("ALTER TABLE %s ADD COLUMN IF NOT EXISTS \"%s\" %s", - schemaDelta.DstTableName, strings.ToUpper(addedColumn.ColumnName), sfColtype)) - if err != nil { - return fmt.Errorf("failed to add column %s for table %s: %w", addedColumn.ColumnName, - schemaDelta.DstTableName, err) - } - c.logger.Info(fmt.Sprintf("[schema delta replay] added column %s with data type %s", addedColumn.ColumnName, - addedColumn.ColumnType), - slog.String("destination table name", schemaDelta.DstTableName), - slog.String("source table name", schemaDelta.SrcTableName)) - } - } - - err = tableSchemaModifyTx.Commit() - if err != nil { - return fmt.Errorf("failed to commit transaction for table schema modification: %w", - err) - } - return nil } @@ -353,6 +330,12 @@ func (c *ClickhouseConnector) SetupMetadataTables() error { return nil } +// func (c *ClickhouseConnector) SetupNormalizedTables( +// req *protos.SetupNormalizedTableBatchInput, +// ) (*protos.SetupNormalizedTableBatchOutput, error) { +// return nil, nil +// } + func (c *ClickhouseConnector) GetLastSyncBatchID(jobName string) (int64, error) { return c.pgMetadata.GetLastBatchID(jobName) } @@ -371,3 +354,11 @@ func (c *ClickhouseConnector) SetLastOffset(jobName string, offset int64) error return nil } + +// func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsRequest) (*model.NormalizeResponse, error) { +// return &model.NormalizeResponse{ +// Done: true, +// StartBatchID: 1, +// EndBatchID: 1, +// }, nil +// } diff --git a/flow/connectors/clickhouse/clickhouse.go b/flow/connectors/clickhouse/clickhouse.go index 974d56db48..71a7605d17 100644 --- a/flow/connectors/clickhouse/clickhouse.go +++ b/flow/connectors/clickhouse/clickhouse.go @@ -30,8 +30,9 @@ func NewClickhouseConnector(ctx context.Context, return nil, fmt.Errorf("failed to open connection to Clickhouse peer: %w", err) } + metadataSchemaName := "peerdb_s3_metadata" // #nosec G101 pgMetadata, err := metadataStore.NewPostgresMetadataStore(ctx, - config.GetMetadataDb(), metadataSchemaName) + clickhouseProtoConfig.GetMetadataDb(), metadataSchemaName) if err != nil { slog.ErrorContext(ctx, "failed to create postgres metadata store", slog.Any("error", err)) return nil, err diff --git a/flow/connectors/clickhouse/qrep.go b/flow/connectors/clickhouse/qrep.go index b6a8b59cc5..d994dda78c 100644 --- a/flow/connectors/clickhouse/qrep.go +++ b/flow/connectors/clickhouse/qrep.go @@ -24,6 +24,7 @@ func (c *ClickhouseConnector) SyncQRepRecords( partition *protos.QRepPartition, stream *model.QRecordStream, ) (int, error) { + fmt.Printf("\n******************* in ClickhouseConnector.SyncQRepRecords") // Ensure the destination table is available. 
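 	// The ClickHouse qrep sync path stages records as Avro files on S3 and
 	// then loads them with an INSERT INTO ... SELECT FROM s3(...) query, so
 	// the destination table must already exist with a matching column layout.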
destTable := config.DestinationTableIdentifier flowLog := slog.Group("sync_metadata", diff --git a/flow/connectors/clickhouse/qrep_avro_sync.go b/flow/connectors/clickhouse/qrep_avro_sync.go index 29f0e0bc6d..9f5133f29e 100644 --- a/flow/connectors/clickhouse/qrep_avro_sync.go +++ b/flow/connectors/clickhouse/qrep_avro_sync.go @@ -31,26 +31,50 @@ func NewClickhouseAvroSyncMethod( } } -func (s *ClickhouseAvroSyncMethod) putFileToStage(avroFile *avro.AvroFile, stage string) error { - if avroFile.StorageLocation != avro.AvroLocalStorage { - s.connector.logger.Info("no file to put to stage") - return nil +// func (s *ClickhouseAvroSyncMethod) putFileToStage(avroFile *avro.AvroFile, stage string) error { +// if avroFile.StorageLocation != avro.AvroLocalStorage { +// s.connector.logger.Info("no file to put to stage") +// return nil +// } + +// activity.RecordHeartbeat(s.connector.ctx, "putting file to stage") +// putCmd := fmt.Sprintf("PUT file://%s @%s", avroFile.FilePath, stage) + +// shutdown := utils.HeartbeatRoutine(s.connector.ctx, 10*time.Second, func() string { +// return fmt.Sprintf("putting file to stage %s", stage) +// }) +// defer shutdown() + +// if _, err := s.connector.database.ExecContext(s.connector.ctx, putCmd); err != nil { +// return fmt.Errorf("failed to put file to stage: %w", err) +// } + +// s.connector.logger.Info(fmt.Sprintf("put file %s to stage %s", avroFile.FilePath, stage)) +// return nil +// } + +func (s *ClickhouseAvroSyncMethod) CopyStageToDestination(avroFile *avro.AvroFile) error { + fmt.Printf("\n************************* in CopyStageToDesti stagingPath: %+v", s.config.StagingPath) + stagingPath := s.config.StagingPath //"s3://avro-clickhouse" + s3o, err := utils.NewS3BucketAndPrefix(stagingPath) + if err != nil { + return err } + awsCreds, err := utils.GetAWSSecrets(utils.S3PeerCredentials{}) + avroFileUrl := fmt.Sprintf("https://%s.s3.%s.amazonaws.com%s", s3o.Bucket, awsCreds.Region, avroFile.FilePath) - activity.RecordHeartbeat(s.connector.ctx, "putting file to stage") - putCmd := fmt.Sprintf("PUT file://%s @%s", avroFile.FilePath, stage) + if err != nil { + return err + } + //nolint:gosec + query := fmt.Sprintf("INSERT INTO %s SELECT * FROM s3('%s','%s','%s', 'Avro')", + s.config.DestinationTableIdentifier, avroFileUrl, awsCreds.AccessKeyID, awsCreds.SecretAccessKey) - shutdown := utils.HeartbeatRoutine(s.connector.ctx, 10*time.Second, func() string { - return fmt.Sprintf("putting file to stage %s", stage) - }) - defer shutdown() + fmt.Printf("\n************************ CopyStagingToDestination query: %s\n", query) - if _, err := s.connector.database.ExecContext(s.connector.ctx, putCmd); err != nil { - return fmt.Errorf("failed to put file to stage: %w", err) - } + _, err = s.connector.database.Exec(query) - s.connector.logger.Info(fmt.Sprintf("put file %s to stage %s", avroFile.FilePath, stage)) - return nil + return err } func (s *ClickhouseAvroSyncMethod) SyncRecords( @@ -58,6 +82,9 @@ func (s *ClickhouseAvroSyncMethod) SyncRecords( stream *model.QRecordStream, flowJobName string, ) (int, error) { + fmt.Printf("\n************************* in qrep_avro_sync: SyncRecords1 dstTableSchema %+v", dstTableSchema) + fmt.Printf("\n************************ in qrep_avro_sync: SyncRecords2 config %+v", s.config) + //s.config.StagingPath = "s3://avro-clickhouse" tableLog := slog.String("destinationTable", s.config.DestinationTableIdentifier) dstTableName := s.config.DestinationTableIdentifier @@ -66,6 +93,8 @@ func (s *ClickhouseAvroSyncMethod) SyncRecords( return 
-1, fmt.Errorf("failed to get schema from stream: %w", err) } + fmt.Printf("\n******************************* in qrep_avro_sync: SyncRecords3 stream schema %+v", schema) + s.connector.logger.Info("sync function called and schema acquired", tableLog) avroSchema, err := s.getAvroSchema(dstTableName, schema) @@ -73,38 +102,49 @@ func (s *ClickhouseAvroSyncMethod) SyncRecords( return 0, err } + fmt.Printf("\n******************************* in qrep_avro_sync: SyncRecords5 avro schema %+v", avroSchema) + partitionID := shared.RandomString(16) + fmt.Printf("\n******************* calling writeToAvroFile partitionId: %+v", partitionID) avroFile, err := s.writeToAvroFile(stream, avroSchema, partitionID, flowJobName) + fmt.Printf("\n******************* records written to avrofile %+v", avroFile) if err != nil { return 0, err } defer avroFile.Cleanup() s.connector.logger.Info(fmt.Sprintf("written %d records to Avro file", avroFile.NumRecords), tableLog) - stage := s.connector.getStageNameForJob(s.config.FlowJobName) - err = s.connector.createStage(stage, s.config) + // stage := s.connector.getStageNameForJob(s.config.FlowJobName) + // err = s.connector.createStage(stage, s.config) + // if err != nil { + // return 0, err + // } + // s.connector.logger.Info(fmt.Sprintf("Created stage %s", stage)) + + // colNames, _, err := s.connector.getColsFromTable(s.config.DestinationTableIdentifier) + // if err != nil { + // return 0, err + // } + + // err = s.putFileToStage(avroFile, "stage") + // if err != nil { + // return 0, err + // } + // s.connector.logger.Info("pushed avro file to stage", tableLog) + + // err = CopyStageToDestination(s.connector, s.config, s.config.DestinationTableIdentifier, stage, colNames) + // if err != nil { + // return 0, err + // } + // s.connector.logger.Info(fmt.Sprintf("copying records into %s from stage %s", + // s.config.DestinationTableIdentifier, stage)) + + //Copy stage/avro to destination + err = s.CopyStageToDestination(avroFile) + fmt.Printf("\n ***************** in qrep_avro_sync: SyncRecords after CopyStageToDestination err: %+v", err) if err != nil { return 0, err } - s.connector.logger.Info(fmt.Sprintf("Created stage %s", stage)) - - colNames, _, err := s.connector.getColsFromTable(s.config.DestinationTableIdentifier) - if err != nil { - return 0, err - } - - err = s.putFileToStage(avroFile, stage) - if err != nil { - return 0, err - } - s.connector.logger.Info("pushed avro file to stage", tableLog) - - err = CopyStageToDestination(s.connector, s.config, s.config.DestinationTableIdentifier, stage, colNames) - if err != nil { - return 0, err - } - s.connector.logger.Info(fmt.Sprintf("copying records into %s from stage %s", - s.config.DestinationTableIdentifier, stage)) return avroFile.NumRecords, nil } @@ -115,9 +155,10 @@ func (s *ClickhouseAvroSyncMethod) SyncQRepRecords( dstTableSchema []*sql.ColumnType, stream *model.QRecordStream, ) (int, error) { + fmt.Printf("\n************************* in SyncQRepRecords 1") startTime := time.Now() dstTableName := config.DestinationTableIdentifier - // s.config.StagingPath = "s3://avro-clickhouse" + //s.config.StagingPath = "s3://avro-clickhouse" schema, err := stream.Schema() if err != nil { @@ -141,6 +182,8 @@ func (s *ClickhouseAvroSyncMethod) SyncQRepRecords( awsCreds, err := utils.GetAWSSecrets(utils.S3PeerCredentials{}) avroFileUrl := fmt.Sprintf("https://%s.s3.%s.amazonaws.com%s", s3o.Bucket, awsCreds.Region, avroFile.FilePath) + fmt.Printf("\n*********************** in qrep_avro_sync SyncQRepRecords 4 avroFileUrl: %+v", 
avroFileUrl) + if err != nil { return 0, err } @@ -148,7 +191,12 @@ func (s *ClickhouseAvroSyncMethod) SyncQRepRecords( query := fmt.Sprintf("INSERT INTO %s SELECT * FROM s3('%s','%s','%s', 'Avro')", config.DestinationTableIdentifier, avroFileUrl, awsCreds.AccessKeyID, awsCreds.SecretAccessKey) + fmt.Printf("\n************************************ in qrep_avro_sync SyncQRepRecords 5 query: %s\n", query) + _, err = s.connector.database.Exec(query) + + fmt.Printf("\n************************************ in qrep_avro_sync SyncQRepRecords 6 err: %+v\n", err) + if err != nil { return 0, err } @@ -180,15 +228,18 @@ func (s *ClickhouseAvroSyncMethod) writeToAvroFile( partitionID string, flowJobName string, ) (*avro.AvroFile, error) { + stagingPath := s.config.StagingPath //"s3://avro-clickhouse" + fmt.Printf("\n****************************************** StagingPath: %+v*****\n", s.config.StagingPath) ocfWriter := avro.NewPeerDBOCFWriter(s.connector.ctx, stream, avroSchema, avro.CompressZstd, qvalue.QDWHTypeClickhouse) - s3o, err := utils.NewS3BucketAndPrefix(s.config.StagingPath) + s3o, err := utils.NewS3BucketAndPrefix(stagingPath) if err != nil { return nil, fmt.Errorf("failed to parse staging path: %w", err) } s3AvroFileKey := fmt.Sprintf("%s/%s/%s.avro.zst", s3o.Prefix, flowJobName, partitionID) // s.config.FlowJobName avroFile, err := ocfWriter.WriteRecordsToS3(s3o.Bucket, s3AvroFileKey, utils.S3PeerCredentials{}) ///utils.S3PeerCredentials{}) + fmt.Printf("\n************************* writeToAvroFile 2 avroFile %+v, err: %+v", avroFile, err) if err != nil { return nil, fmt.Errorf("failed to write records to S3: %w", err) } diff --git a/flow/connectors/core.go b/flow/connectors/core.go index 1e28822181..f0914fba9f 100644 --- a/flow/connectors/core.go +++ b/flow/connectors/core.go @@ -158,7 +158,10 @@ func GetCDCSyncConnector(ctx context.Context, config *protos.Peer) (CDCSyncConne return conneventhub.NewEventHubConnector(ctx, config.GetEventhubGroupConfig()) case *protos.Peer_S3Config: return conns3.NewS3Connector(ctx, config.GetS3Config()) + case *protos.Peer_ClickhouseConfig: + return connclickhouse.NewClickhouseConnector(ctx, config.GetClickhouseConfig()) default: + fmt.Printf("\n*********************** in GetCDCSyncConnector not found %+v %T\n", inner, inner) return nil, ErrUnsupportedFunctionality } } @@ -174,7 +177,10 @@ func GetCDCNormalizeConnector(ctx context.Context, return connbigquery.NewBigQueryConnector(ctx, config.GetBigqueryConfig()) case *protos.Peer_SnowflakeConfig: return connsnowflake.NewSnowflakeConnector(ctx, config.GetSnowflakeConfig()) + case *protos.Peer_ClickhouseConfig: + return connclickhouse.NewClickhouseConnector(ctx, config.GetClickhouseConfig()) default: + fmt.Printf("\n*********************** in GetCDCNormalizeConnector not found %+v %T\n", inner, inner) return nil, ErrUnsupportedFunctionality } } diff --git a/flow/connectors/external_metadata/store.go b/flow/connectors/external_metadata/store.go index 73557ad300..5c81235286 100644 --- a/flow/connectors/external_metadata/store.go +++ b/flow/connectors/external_metadata/store.go @@ -112,6 +112,7 @@ func (p *PostgresMetadataStore) SetupMetadata() error { // create the schema _, err := p.conn.Exec(p.ctx, "CREATE SCHEMA IF NOT EXISTS "+p.schemaName) if err != nil && !utils.IsUniqueError(err) { + fmt.Printf("********** error in SetupMetadata %+v", err) p.logger.Error("failed to create schema", slog.Any("error", err)) return err } diff --git a/flow/workflows/qrep_flow.go b/flow/workflows/qrep_flow.go index 
6b4db6482b..d20db8966c 100644 --- a/flow/workflows/qrep_flow.go +++ b/flow/workflows/qrep_flow.go @@ -36,6 +36,8 @@ type QRepPartitionFlowExecution struct { // returns a new empty QRepFlowState func NewQRepFlowState() *protos.QRepFlowState { + fmt.Printf("\n*****************************NewQRepFlowState") + return &protos.QRepFlowState{ LastPartition: &protos.QRepPartition{ PartitionId: "not-applicable-partition", @@ -49,6 +51,8 @@ func NewQRepFlowState() *protos.QRepFlowState { // returns a new empty QRepFlowState func NewQRepFlowStateForTesting() *protos.QRepFlowState { + fmt.Printf("\n*****************************NewQRepFlowStateForTesting") + return &protos.QRepFlowState{ LastPartition: &protos.QRepPartition{ PartitionId: "not-applicable-partition", @@ -62,6 +66,8 @@ func NewQRepFlowStateForTesting() *protos.QRepFlowState { // NewQRepFlowExecution creates a new instance of QRepFlowExecution. func NewQRepFlowExecution(ctx workflow.Context, config *protos.QRepConfig, runUUID string) *QRepFlowExecution { + fmt.Printf("\n*****************************NewQRepFlowExecution") + return &QRepFlowExecution{ config: config, flowExecutionID: workflow.GetInfo(ctx).WorkflowExecution.ID, @@ -76,6 +82,8 @@ func NewQRepFlowExecution(ctx workflow.Context, config *protos.QRepConfig, runUU func NewQRepPartitionFlowExecution(ctx workflow.Context, config *protos.QRepConfig, runUUID string, ) *QRepPartitionFlowExecution { + fmt.Printf("\n*****************************NewQRepPartitionFlowExecution") + return &QRepPartitionFlowExecution{ config: config, flowExecutionID: workflow.GetInfo(ctx).WorkflowExecution.ID, @@ -403,6 +411,8 @@ func QRepFlowWorkflow( config *protos.QRepConfig, state *protos.QRepFlowState, ) error { + fmt.Printf("\n*****************************QRepFlowWorkflow") + // The structure of this workflow is as follows: // 1. Start the loop to continuously run the replication flow. // 2. In the loop, query the source database to get the partitions to replicate. 
@@ -537,6 +547,8 @@ func QRepPartitionWorkflow( partitions *protos.QRepPartitionBatch, runUUID string, ) error { + fmt.Printf("\n*****************************QRepPartitionWorkflow") + ctx = workflow.WithValue(ctx, shared.FlowNameKey, config.FlowJobName) q := NewQRepPartitionFlowExecution(ctx, config, runUUID) return q.ReplicatePartitions(ctx, partitions) diff --git a/protos/peers.proto b/protos/peers.proto index 372c02936b..351b11b8ce 100644 --- a/protos/peers.proto +++ b/protos/peers.proto @@ -97,6 +97,7 @@ message ClickhouseConfig{ string password = 4; string database = 5; string s3_integration = 6; // staging to store avro files + PostgresConfig metadata_db = 7; } message SqlServerConfig { diff --git a/ui/app/api/peers/getTruePeer.ts b/ui/app/api/peers/getTruePeer.ts index 1af4155dec..75aa53902b 100644 --- a/ui/app/api/peers/getTruePeer.ts +++ b/ui/app/api/peers/getTruePeer.ts @@ -8,6 +8,7 @@ import { S3Config, SnowflakeConfig, SqlServerConfig, + ClickhouseConfig, } from '@/grpc_generated/peers'; export const getTruePeer = (peer: CatalogPeer) => { @@ -23,7 +24,8 @@ export const getTruePeer = (peer: CatalogPeer) => { | EventHubConfig | S3Config | SqlServerConfig - | EventHubGroupConfig; + | EventHubGroupConfig + | ClickhouseConfig; switch (peer.type) { case 0: config = BigqueryConfig.decode(options); @@ -53,6 +55,10 @@ export const getTruePeer = (peer: CatalogPeer) => { config = EventHubGroupConfig.decode(options); newPeer.eventhubGroupConfig = config; break; + case 8: + config = ClickhouseConfig.decode(options); + newPeer.clickhouseConfig = config; + break; default: return newPeer; } From f4620ba5d104d91e00541a52223fb28918f5b44f Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Mon, 22 Jan 2024 06:25:30 -0700 Subject: [PATCH 06/36] fix normalize errors --- flow/connectors/clickhouse/normalize.go | 107 ++++-------------------- 1 file changed, 16 insertions(+), 91 deletions(-) diff --git a/flow/connectors/clickhouse/normalize.go b/flow/connectors/clickhouse/normalize.go index fdf467d0f4..33c2dd9d13 100644 --- a/flow/connectors/clickhouse/normalize.go +++ b/flow/connectors/clickhouse/normalize.go @@ -2,16 +2,12 @@ package connclickhouse import ( "fmt" - "log/slog" "strings" - "sync/atomic" - "time" "github.com/PeerDB-io/peer-flow/connectors/utils" "github.com/PeerDB-io/peer-flow/generated/protos" "github.com/PeerDB-io/peer-flow/model" "github.com/PeerDB-io/peer-flow/model/qvalue" - "golang.org/x/sync/errgroup" ) const ( @@ -113,8 +109,10 @@ func generateCreateTableSQLForNormalizedTable( func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsRequest) (*model.NormalizeResponse, error) { batchIDs, err := c.GetLastSyncAndNormalizeBatchID(req.FlowJobName) if err != nil { + c.logger.ErrorContext(c.ctx, "[sf] error while getting last sync and normalize batch id", err) return nil, err } + // normalize has caught up with sync, chill until more records are loaded. if batchIDs.NormalizeBatchID >= batchIDs.SyncBatchID { return &model.NormalizeResponse{ @@ -124,96 +122,23 @@ func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsReques }, nil } - jobMetadataExists, err := c.jobMetadataExists(req.FlowJobName) - if err != nil { - return nil, err - } - // sync hasn't created job metadata yet, chill. 
- if !jobMetadataExists { - return &model.NormalizeResponse{ - Done: false, - }, nil - } - destinationTableNames, err := c.getDistinctTableNamesInBatch( - req.FlowJobName, - batchIDs.SyncBatchID, - batchIDs.NormalizeBatchID, - ) - if err != nil { - return nil, err - } - - tableNametoUnchangedToastCols, err := c.getTableNametoUnchangedCols(req.FlowJobName, batchIDs.SyncBatchID, batchIDs.NormalizeBatchID) - if err != nil { - return nil, fmt.Errorf("couldn't tablename to unchanged cols mapping: %w", err) - } - - var totalRowsAffected int64 = 0 - g, gCtx := errgroup.WithContext(c.ctx) - g.SetLimit(8) // limit parallel merges to 8 - - for _, destinationTableName := range destinationTableNames { - tableName := destinationTableName // local variable for the closure - - g.Go(func() error { - mergeGen := &mergeStmtGenerator{ - rawTableName: getRawTableIdentifier(req.FlowJobName), - dstTableName: tableName, - syncBatchID: batchIDs.SyncBatchID, - normalizeBatchID: batchIDs.NormalizeBatchID, - normalizedTableSchema: req.TableNameSchemaMapping[tableName], - unchangedToastColumns: tableNametoUnchangedToastCols[tableName], - peerdbCols: &protos.PeerDBColumns{ - SoftDelete: req.SoftDelete, - SoftDeleteColName: req.SoftDeleteColName, - SyncedAtColName: req.SyncedAtColName, - }, - } - mergeStatement, err := mergeGen.generateMergeStmt() - if err != nil { - return err - } - - startTime := time.Now() - c.logger.Info("[merge] merging records...", slog.String("destTable", tableName)) - - result, err := c.database.ExecContext(gCtx, mergeStatement, tableName) - if err != nil { - return fmt.Errorf("failed to merge records into %s (statement: %s): %w", - tableName, mergeStatement, err) - } - - endTime := time.Now() - c.logger.Info(fmt.Sprintf("[merge] merged records into %s, took: %d seconds", - tableName, endTime.Sub(startTime)/time.Second)) - if err != nil { - c.logger.Error("[merge] error while normalizing records", slog.Any("error", err)) - return err - } - - rowsAffected, err := result.RowsAffected() - if err != nil { - return fmt.Errorf("failed to get rows affected by merge statement for table %s: %w", tableName, err) - } - - atomic.AddInt64(&totalRowsAffected, rowsAffected) - return nil - }) - } - - if err := g.Wait(); err != nil { - return nil, fmt.Errorf("error while normalizing records: %w", err) - } - - // updating metadata with new normalizeBatchID - err = c.updateNormalizeMetadata(req.FlowJobName, batchIDs.SyncBatchID) - if err != nil { - return nil, err - } - + // This will never happen. return &model.NormalizeResponse{ Done: true, StartBatchID: batchIDs.NormalizeBatchID + 1, EndBatchID: batchIDs.SyncBatchID, }, nil } + +func (c *ClickhouseConnector) GetLastSyncAndNormalizeBatchID(flowJobName string) (model.SyncAndNormalizeBatchID, error) { + // return sync batch id as the normalize batch id as well as this is a no-op. 
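+	// (Normalization is stubbed out at this stage, so the sync batch id
+	//  doubles as the normalize watermark and NormalizeRecords always sees
+	//  itself as caught up.)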
+ batchId, err := c.pgMetadata.GetLastBatchID(flowJobName) + if err != nil { + return model.SyncAndNormalizeBatchID{}, err + } + + return model.SyncAndNormalizeBatchID{ + SyncBatchID: batchId, + NormalizeBatchID: batchId, + }, nil +} From f620e37e84a55e90e0765d1337cdacd47f5dc954 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Mon, 22 Jan 2024 06:44:28 -0700 Subject: [PATCH 07/36] more normalize code --- flow/connectors/clickhouse/cdc.go | 7 +-- flow/connectors/clickhouse/normalize.go | 68 +++++++++++++++++++--- flow/connectors/external_metadata/store.go | 44 +++++++++++++- 3 files changed, 104 insertions(+), 15 deletions(-) diff --git a/flow/connectors/clickhouse/cdc.go b/flow/connectors/clickhouse/cdc.go index 28dfe100c2..66280fccd6 100644 --- a/flow/connectors/clickhouse/cdc.go +++ b/flow/connectors/clickhouse/cdc.go @@ -121,10 +121,9 @@ func (c *ClickhouseConnector) syncRecordsViaAvro( } qrepConfig := &protos.QRepConfig{ - StagingPath: c.config.S3Integration, - FlowJobName: req.FlowJobName, - DestinationTableIdentifier: strings.ToLower(fmt.Sprintf("%s", - rawTableIdentifier)), + StagingPath: c.config.S3Integration, + FlowJobName: req.FlowJobName, + DestinationTableIdentifier: strings.ToLower(fmt.Sprintf("%s", rawTableIdentifier)), } avroSyncer := NewClickhouseAvroSyncMethod(qrepConfig, c) destinationTableSchema, err := c.getTableSchema(qrepConfig.DestinationTableIdentifier) diff --git a/flow/connectors/clickhouse/normalize.go b/flow/connectors/clickhouse/normalize.go index 33c2dd9d13..b67fe17587 100644 --- a/flow/connectors/clickhouse/normalize.go +++ b/flow/connectors/clickhouse/normalize.go @@ -1,6 +1,7 @@ package connclickhouse import ( + "database/sql" "fmt" "strings" @@ -109,7 +110,7 @@ func generateCreateTableSQLForNormalizedTable( func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsRequest) (*model.NormalizeResponse, error) { batchIDs, err := c.GetLastSyncAndNormalizeBatchID(req.FlowJobName) if err != nil { - c.logger.ErrorContext(c.ctx, "[sf] error while getting last sync and normalize batch id", err) + c.logger.ErrorContext(c.ctx, "[clickhouse] error while getting last sync and normalize batch id", err) return nil, err } @@ -122,23 +123,74 @@ func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsReques }, nil } - // This will never happen. + destinationTableNames, err := c.getDistinctTableNamesInBatch( + req.FlowJobName, + batchIDs.SyncBatchID, + batchIDs.NormalizeBatchID, + ) + if err != nil { + c.logger.ErrorContext(c.ctx, "[clickhouse] error while getting distinct table names in batch", err) + return nil, err + } + + // model the raw table data as inserts. 
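+	// For each destination table this will take roughly the shape below
+	// (a sketch; the real projection is built from the table schema mapping):
+	//
+	//   INSERT INTO <dst_table> (<cols>, <sign>, <version>)
+	//   SELECT JSONExtract*(_peerdb_data, '<col>') ...,
+	//          intDiv(_peerdb_record_type, 2), _peerdb_timestamp
+	//   FROM <raw_table>
+	//   WHERE _peerdb_batch_id > <normalize_batch_id>
+	//     AND _peerdb_batch_id <= <sync_batch_id>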
+ + endNormalizeBatchId := batchIDs.NormalizeBatchID + 1 + c.pgMetadata.UpdateNormalizeBatchID(req.FlowJobName, endNormalizeBatchId) return &model.NormalizeResponse{ Done: true, - StartBatchID: batchIDs.NormalizeBatchID + 1, + StartBatchID: endNormalizeBatchId, EndBatchID: batchIDs.SyncBatchID, }, nil } +func (c *ClickhouseConnector) getDistinctTableNamesInBatch( + flowJobName string, + syncBatchID int64, + normalizeBatchID int64, +) ([]string, error) { + rawTbl := c.getRawTableName(flowJobName) + + q := fmt.Sprintf( + `SELECT DISTINCT _peerdb_destination_table_name FROM %s WHERE _peerdb_batch_id > %d AND _peerdb_batch_id <= %d`, + rawTbl, normalizeBatchID, syncBatchID) + + rows, err := c.database.QueryContext(c.ctx, q) + if err != nil { + return nil, fmt.Errorf("error while querying raw table for distinct table names in batch: %w", err) + } + + var tableNames []string + for rows.Next() { + var tableName sql.NullString + err = rows.Scan(&tableName) + if err != nil { + return nil, fmt.Errorf("error while scanning table name: %w", err) + } + + if !tableName.Valid { + return nil, fmt.Errorf("table name is not valid") + } + + tableNames = append(tableNames, tableName.String) + } + + return tableNames, nil +} + func (c *ClickhouseConnector) GetLastSyncAndNormalizeBatchID(flowJobName string) (model.SyncAndNormalizeBatchID, error) { - // return sync batch id as the normalize batch id as well as this is a no-op. - batchId, err := c.pgMetadata.GetLastBatchID(flowJobName) + syncBatchID, err := c.pgMetadata.GetLastBatchID(flowJobName) + if err != nil { + return model.SyncAndNormalizeBatchID{}, fmt.Errorf("error while getting last sync batch id: %w", err) + } + + normalizeBatchID, err := c.pgMetadata.GetLastNormalizeBatchID(flowJobName) if err != nil { - return model.SyncAndNormalizeBatchID{}, err + return model.SyncAndNormalizeBatchID{}, fmt.Errorf("error while getting last normalize batch id: %w", err) } return model.SyncAndNormalizeBatchID{ - SyncBatchID: batchId, - NormalizeBatchID: batchId, + SyncBatchID: syncBatchID, + NormalizeBatchID: normalizeBatchID, }, nil } diff --git a/flow/connectors/external_metadata/store.go b/flow/connectors/external_metadata/store.go index 5c81235286..00f4cb9282 100644 --- a/flow/connectors/external_metadata/store.go +++ b/flow/connectors/external_metadata/store.go @@ -123,7 +123,8 @@ func (p *PostgresMetadataStore) SetupMetadata() error { job_name TEXT PRIMARY KEY NOT NULL, last_offset BIGINT NOT NULL, updated_at TIMESTAMP NOT NULL DEFAULT NOW(), - sync_batch_id BIGINT NOT NULL + sync_batch_id BIGINT NOT NULL, + normalize_batch_id BIGINT NOT NULL ) `) if err != nil && !utils.IsUniqueError(err) { @@ -172,7 +173,7 @@ func (p *PostgresMetadataStore) GetLastBatchID(jobName string) (int64, error) { return 0, nil } - slog.Error("failed to get last offset", slog.Any("error", err)) + p.logger.Error("failed to get last sync batch id", slog.Any("error", err)) return 0, err } p.logger.Info("got last batch id for job", slog.Int64("batch id", syncBatchID.Int64)) @@ -180,6 +181,29 @@ func (p *PostgresMetadataStore) GetLastBatchID(jobName string) (int64, error) { return syncBatchID.Int64, nil } +func (p *PostgresMetadataStore) GetLastNormalizeBatchID(jobName string) (int64, error) { + rows := p.pool.QueryRow(p.ctx, ` + SELECT normalize_batch_id + FROM `+p.schemaName+`.`+lastSyncStateTableName+` + WHERE job_name = $1 + `, jobName) + + var normalizeBatchID pgtype.Int8 + err := rows.Scan(&normalizeBatchID) + if err != nil { + // if the job doesn't exist, return 0 + if err.Error() == 
"no rows in result set" { + return 0, nil + } + + p.logger.Error("failed to get last normalize", slog.Any("error", err)) + return 0, err + } + p.logger.Info("got last normalize batch normalize id for job", slog.Int64("batch id", normalizeBatchID.Int64)) + + return normalizeBatchID.Int64, nil +} + // update offset for a job func (p *PostgresMetadataStore) UpdateLastOffset(jobName string, offset int64) error { // start a transaction @@ -214,7 +238,7 @@ func (p *PostgresMetadataStore) UpdateLastOffset(jobName string, offset int64) e return nil } -// update offset for a job +// update the sync batch id for a job. func (p *PostgresMetadataStore) IncrementID(jobName string) error { p.logger.Info("incrementing sync batch id for job") _, err := p.conn.Exec(p.ctx, ` @@ -229,6 +253,20 @@ func (p *PostgresMetadataStore) IncrementID(jobName string) error { return nil } +func (p *PostgresMetadataStore) UpdateNormalizeBatchID(jobName string, batchID int64) error { + p.logger.Info("updating normalize batch id for job") + _, err := p.pool.Exec(p.ctx, ` + UPDATE `+p.schemaName+`.`+lastSyncStateTableName+` + SET normalize_batch_id=$2 WHERE job_name=$1 + `, jobName, batchID) + if err != nil { + p.logger.Error("failed to update normalize batch id", slog.Any("error", err)) + return err + } + + return nil +} + func (p *PostgresMetadataStore) DropMetadata(jobName string) error { _, err := p.conn.Exec(p.ctx, ` DELETE FROM `+p.QualifyTable(lastSyncStateTableName)+` From 0f2db0720f072547a7d5df28a916249001b635ff Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Mon, 22 Jan 2024 07:10:06 -0700 Subject: [PATCH 08/36] basic sketch of normalize --- flow/connectors/clickhouse/normalize.go | 66 ++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/flow/connectors/clickhouse/normalize.go b/flow/connectors/clickhouse/normalize.go index b67fe17587..ffa479a5df 100644 --- a/flow/connectors/clickhouse/normalize.go +++ b/flow/connectors/clickhouse/normalize.go @@ -15,7 +15,7 @@ const ( signColName = "_peerdb_sign" signColType = "Int8" versionColName = "_peerdb_version" - versionColType = "Int8" + versionColType = "Int64" ) func (c *ClickhouseConnector) SetupNormalizedTables( @@ -133,7 +133,71 @@ func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsReques return nil, err } + rawTbl := c.getRawTableName(req.FlowJobName) + // model the raw table data as inserts. + for _, tbl := range destinationTableNames { + // SELECT projection FROM raw_table WHERE _peerdb_batch_id > normalize_batch_id AND _peerdb_batch_id <= sync_batch_id + selectQuery := strings.Builder{} + selectQuery.WriteString("SELECT ") + + colSelector := strings.Builder{} + colSelector.WriteString("(") + + schema := c.tableSchemaMapping[tbl] + numCols := len(schema.ColumnNames) + + projection := strings.Builder{} + + for i := 0; i < numCols; i++ { + cn := schema.ColumnNames[i] + ct := schema.ColumnTypes[i] + + colSelector.WriteString(fmt.Sprintf("%s", cn)) + if i < numCols-1 { + colSelector.WriteString(",") + } + + extractionFuction := "JSONExtractRaw" + switch qvalue.QValueKind(ct) { + case qvalue.QValueKindString: + extractionFuction = "JSONExtractString" + case qvalue.QValueKindInt64: + // TODO check if int64 is supported. 
+ extractionFuction = "JSONExtractInt" + } + projection.WriteString(fmt.Sprintf("%s(_peerdb_data, '%s') AS %s, ", extractionFuction, cn, cn)) + } + + colSelector.WriteString(") ") + + selectQuery.WriteString(projection.String()) + selectQuery.WriteString(" FROM ") + selectQuery.WriteString(rawTbl) + selectQuery.WriteString(" WHERE _peerdb_batch_id > ") + selectQuery.WriteString(fmt.Sprintf("%d", batchIDs.NormalizeBatchID)) + selectQuery.WriteString(" AND _peerdb_batch_id <= ") + selectQuery.WriteString(fmt.Sprintf("%d", batchIDs.SyncBatchID)) + selectQuery.WriteString(" AND _peerdb_destination_table_name = '") + selectQuery.WriteString(tbl) + selectQuery.WriteString("'") + + selectQuery.WriteString(" ORDER BY _peerdb_timestamp") + + insertIntoSelectQuery := strings.Builder{} + insertIntoSelectQuery.WriteString("INSERT INTO ") + insertIntoSelectQuery.WriteString(tbl) + insertIntoSelectQuery.WriteString(colSelector.String()) + insertIntoSelectQuery.WriteString(selectQuery.String()) + + q := insertIntoSelectQuery.String() + c.logger.InfoContext(c.ctx, "[clickhouse] insert into select query", q) + + _, err = c.database.ExecContext(c.ctx, q) + if err != nil { + return nil, fmt.Errorf("error while inserting into normalized table: %w", err) + } + } endNormalizeBatchId := batchIDs.NormalizeBatchID + 1 c.pgMetadata.UpdateNormalizeBatchID(req.FlowJobName, endNormalizeBatchId) From f3ab0348cd34c37fae91e3119b292af77484a353 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Mon, 22 Jan 2024 07:23:17 -0700 Subject: [PATCH 09/36] fix schemaless ness --- flow/connectors/clickhouse/normalize.go | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/flow/connectors/clickhouse/normalize.go b/flow/connectors/clickhouse/normalize.go index ffa479a5df..c611c0c19e 100644 --- a/flow/connectors/clickhouse/normalize.go +++ b/flow/connectors/clickhouse/normalize.go @@ -5,7 +5,6 @@ import ( "fmt" "strings" - "github.com/PeerDB-io/peer-flow/connectors/utils" "github.com/PeerDB-io/peer-flow/generated/protos" "github.com/PeerDB-io/peer-flow/model" "github.com/PeerDB-io/peer-flow/model/qvalue" @@ -23,11 +22,7 @@ func (c *ClickhouseConnector) SetupNormalizedTables( ) (*protos.SetupNormalizedTableBatchOutput, error) { tableExistsMapping := make(map[string]bool) for tableIdentifier, tableSchema := range req.TableNameSchemaMapping { - normalizedSchemaTable, err := utils.ParseSchemaTable(tableIdentifier) - if err != nil { - return nil, fmt.Errorf("error while parsing table schema and name: %w", err) - } - tableAlreadyExists, err := c.checkIfTableExists(normalizedSchemaTable.Schema, normalizedSchemaTable.Table) + tableAlreadyExists, err := c.checkIfTableExists(c.config.Database, tableIdentifier) if err != nil { return nil, fmt.Errorf("error occurred while checking if normalized table exists: %w", err) } @@ -37,7 +32,7 @@ func (c *ClickhouseConnector) SetupNormalizedTables( } normalizedTableCreateSQL, err := generateCreateTableSQLForNormalizedTable( - normalizedSchemaTable, + tableIdentifier, tableSchema, req.SoftDeleteColName, req.SyncedAtColName, @@ -59,13 +54,13 @@ func (c *ClickhouseConnector) SetupNormalizedTables( } func generateCreateTableSQLForNormalizedTable( - normalizedSchemaTable *utils.SchemaTable, + normalizedTable string, tableSchema *protos.TableSchema, softDeleteColName string, syncedAtColName string, ) (string, error) { var stmtBuilder strings.Builder - stmtBuilder.WriteString(fmt.Sprintf("CREATE TABLE `%s`.`%s` (", normalizedSchemaTable.Schema, normalizedSchemaTable.Table)) + 
stmtBuilder.WriteString(fmt.Sprintf("CREATE TABLE `%s` (", normalizedTable)) nc := len(tableSchema.ColumnNames) for i := 0; i < nc; i++ { From 9c9a0ff2e29e6324227dbc3ade85e3522baf8db7 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Mon, 22 Jan 2024 07:32:42 -0700 Subject: [PATCH 10/36] fix table exists check --- flow/connectors/clickhouse/cdc.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/flow/connectors/clickhouse/cdc.go b/flow/connectors/clickhouse/cdc.go index 66280fccd6..5581ec03a1 100644 --- a/flow/connectors/clickhouse/cdc.go +++ b/flow/connectors/clickhouse/cdc.go @@ -28,13 +28,17 @@ func (c *ClickhouseConnector) getRawTableName(flowJobName string) string { } func (c *ClickhouseConnector) checkIfTableExists(databaseName string, tableIdentifier string) (bool, error) { - var result pgtype.Bool + var result sql.NullInt32 err := c.database.QueryRowContext(c.ctx, checkIfTableExistsSQL, databaseName, tableIdentifier).Scan(&result) if err != nil { return false, fmt.Errorf("error while reading result row: %w", err) } - fmt.Printf("result: %+v\n", result) - return result.Bool, nil + + if !result.Valid { + return false, fmt.Errorf("[clickhouse] checkIfTableExists: result is not valid") + } + + return result.Int32 == 1, nil } type MirrorJobRow struct { From 1f8d06f5219f5769bced34e080cb6d8e44686ca6 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Mon, 22 Jan 2024 07:36:49 -0700 Subject: [PATCH 11/36] more fixes --- flow/model/qvalue/kind.go | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/flow/model/qvalue/kind.go b/flow/model/qvalue/kind.go index 5c6b069639..d67feea471 100644 --- a/flow/model/qvalue/kind.go +++ b/flow/model/qvalue/kind.go @@ -127,13 +127,20 @@ var QValueKindToClickhouseTypeMap = map[QValueKind]string{ } func (kind QValueKind) ToDWHColumnType(dwhType QDWHType) (string, error) { - if dwhType != QDWHTypeSnowflake { - return "", fmt.Errorf("unsupported DWH type: %v", dwhType) - } - - if val, ok := QValueKindToSnowflakeTypeMap[kind]; ok { - return val, nil - } else { - return "STRING", nil + switch dwhType { + case QDWHTypeSnowflake: + if val, ok := QValueKindToSnowflakeTypeMap[kind]; ok { + return val, nil + } else { + return "STRING", nil + } + case QDWHTypeClickhouse: + if val, ok := QValueKindToClickhouseTypeMap[kind]; ok { + return val, nil + } else { + return "String", nil + } + default: + return "", fmt.Errorf("unknown dwh type: %s", dwhType) } } From a697232295a7cba6e25457e7406a96789e9076c8 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Mon, 22 Jan 2024 07:44:54 -0700 Subject: [PATCH 12/36] sign and version columns --- flow/connectors/clickhouse/normalize.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/flow/connectors/clickhouse/normalize.go b/flow/connectors/clickhouse/normalize.go index c611c0c19e..1706442800 100644 --- a/flow/connectors/clickhouse/normalize.go +++ b/flow/connectors/clickhouse/normalize.go @@ -164,6 +164,12 @@ func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsReques projection.WriteString(fmt.Sprintf("%s(_peerdb_data, '%s') AS %s, ", extractionFuction, cn, cn)) } + // add _peerdb_sign as _peerdb_record_type / 2 + projection.WriteString(fmt.Sprintf("intDiv(_peerdb_record_type, 2) AS %s, ", signColName)) + + // add _peerdb_timestamp as _peerdb_version + projection.WriteString(fmt.Sprintf("_peerdb_timestamp AS %s", versionColName)) + colSelector.WriteString(") ") selectQuery.WriteString(projection.String()) From 
8124e58195a3c410abdabdfd5689f0a95c07af10 Mon Sep 17 00:00:00 2001
From: Kaushik Iska
Date: Mon, 22 Jan 2024 07:49:27 -0700
Subject: [PATCH 13/36] add other col values

---
 flow/connectors/clickhouse/normalize.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flow/connectors/clickhouse/normalize.go b/flow/connectors/clickhouse/normalize.go
index 1706442800..5b3e8bddc3 100644
--- a/flow/connectors/clickhouse/normalize.go
+++ b/flow/connectors/clickhouse/normalize.go
@@ -11,9 +11,9 @@ import (
 )

 const (
-	signColName    = "_peerdb_sign"
+	signColName    = "_PEERDB_IS_DELETED"
 	signColType    = "Int8"
-	versionColName = "_peerdb_version"
+	versionColName = "_PEERDB_VERSION"
 	versionColType = "Int64"
 )

@@ -77,7 +77,7 @@ func generateCreateTableSQLForNormalizedTable(

 	// synced at column will be added to all normalized tables
 	if syncedAtColName != "" {
-		stmtBuilder.WriteString(fmt.Sprintf("`%s` %s, ", syncedAtColName, "DateTime64(9)"))
+		stmtBuilder.WriteString(fmt.Sprintf("`%s` %s, ", syncedAtColName, "DateTime64(9) DEFAULT now()"))
 	}

 	// add sign and version columns

From bb8ca2234ce5a5f66a92f7c5b106397e5bfae119 Mon Sep 17 00:00:00 2001
From: Kaushik Iska
Date: Mon, 22 Jan 2024 08:10:17 -0700
Subject: [PATCH 14/36] lower case

---
 flow/connectors/clickhouse/normalize.go | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/flow/connectors/clickhouse/normalize.go b/flow/connectors/clickhouse/normalize.go
index 5b3e8bddc3..fc7e2f9aeb 100644
--- a/flow/connectors/clickhouse/normalize.go
+++ b/flow/connectors/clickhouse/normalize.go
@@ -11,9 +11,9 @@ import (
 )

 const (
-	signColName    = "_PEERDB_IS_DELETED"
+	signColName    = "_peerdb_is_deleted"
 	signColType    = "Int8"
-	versionColName = "_PEERDB_VERSION"
+	versionColName = "_peerdb_version"
 	versionColType = "Int64"
 )

@@ -77,7 +77,8 @@ func generateCreateTableSQLForNormalizedTable(

 	// synced at column will be added to all normalized tables
 	if syncedAtColName != "" {
-		stmtBuilder.WriteString(fmt.Sprintf("`%s` %s, ", syncedAtColName, "DateTime64(9) DEFAULT now()"))
+		colName := strings.ToLower(syncedAtColName)
+		stmtBuilder.WriteString(fmt.Sprintf("`%s` %s, ", colName, "DateTime64(9) DEFAULT now()"))
 	}

 	// add sign and version columns

From b0a7265709fc6cce3ccb22ca5a065f4e1c2fdbc8 Mon Sep 17 00:00:00 2001
From: Pankaj B
Date: Mon, 22 Jan 2024 20:40:51 +0530
Subject: [PATCH 15/36] hardcode staging

---
 flow/connectors/clickhouse/qrep_avro_sync.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flow/connectors/clickhouse/qrep_avro_sync.go b/flow/connectors/clickhouse/qrep_avro_sync.go
index 9f5133f29e..2e4325dc08 100644
--- a/flow/connectors/clickhouse/qrep_avro_sync.go
+++ b/flow/connectors/clickhouse/qrep_avro_sync.go
@@ -55,7 +55,7 @@ func NewClickhouseAvroSyncMethod(

 func (s *ClickhouseAvroSyncMethod) CopyStageToDestination(avroFile *avro.AvroFile) error {
 	fmt.Printf("\n************************* in CopyStageToDesti stagingPath: %+v", s.config.StagingPath)
-	stagingPath := s.config.StagingPath //"s3://avro-clickhouse"
+	stagingPath := "s3://avro-clickhouse" //s.config.StagingPath
 	s3o, err := utils.NewS3BucketAndPrefix(stagingPath)
 	if err != nil {
 		return err
 	}
@@ -158,7 +158,7 @@ func (s *ClickhouseAvroSyncMethod) SyncQRepRecords(
 	fmt.Printf("\n************************* in SyncQRepRecords 1")
 	startTime := time.Now()
 	dstTableName := config.DestinationTableIdentifier
-	//s.config.StagingPath = "s3://avro-clickhouse"
+	s.config.StagingPath = "s3://avro-clickhouse"

 	schema, err := stream.Schema()
 	if err != nil {
@@ -228,7 +228,7 @@ func (s *ClickhouseAvroSyncMethod) writeToAvroFile( partitionID string, flowJobName string, ) (*avro.AvroFile, error) { - stagingPath := s.config.StagingPath //"s3://avro-clickhouse" + stagingPath := "s3://avro-clickhouse" //s.config.StagingPath // fmt.Printf("\n****************************************** StagingPath: %+v*****\n", s.config.StagingPath) ocfWriter := avro.NewPeerDBOCFWriter(s.connector.ctx, stream, avroSchema, avro.CompressZstd, qvalue.QDWHTypeClickhouse) From b1b2ab1f987ee9ecd208da987279b2da8ed0e319 Mon Sep 17 00:00:00 2001 From: Pankaj B Date: Thu, 25 Jan 2024 20:12:01 +0530 Subject: [PATCH 16/36] fix stagingPath --- flow/activities/flowable.go | 2 +- flow/cmd/handler.go | 6 +- flow/connectors/clickhouse/cdc.go | 6 +- flow/connectors/clickhouse/normalize.go | 18 ++++-- flow/connectors/clickhouse/qrep.go | 2 +- flow/connectors/clickhouse/qrep_avro_sync.go | 58 ++++++++++++-------- flow/connectors/core.go | 4 +- flow/connectors/external_metadata/store.go | 2 +- flow/workflows/qrep_flow.go | 12 ++-- ui/components/PeerComponent.tsx | 1 + 10 files changed, 67 insertions(+), 44 deletions(-) diff --git a/flow/activities/flowable.go b/flow/activities/flowable.go index 6754820e1c..a89493f1f1 100644 --- a/flow/activities/flowable.go +++ b/flow/activities/flowable.go @@ -371,7 +371,7 @@ func (a *FlowableActivity) StartNormalize( ) (*model.NormalizeResponse, error) { conn := input.FlowConnectionConfigs ctx = context.WithValue(ctx, shared.FlowNameKey, conn.FlowJobName) - fmt.Printf("\n*********************** in StartNormalize %+v\n", conn) + //fmt.Printf("\n*********************** in StartNormalize %+v\n", conn) dstConn, err := connectors.GetCDCNormalizeConnector(ctx, conn.Destination) if errors.Is(err, connectors.ErrUnsupportedFunctionality) { dstConn, err := connectors.GetCDCSyncConnector(ctx, conn.Destination) diff --git a/flow/cmd/handler.go b/flow/cmd/handler.go index 9d27c829ee..ef402c3e36 100644 --- a/flow/cmd/handler.go +++ b/flow/cmd/handler.go @@ -120,8 +120,9 @@ func (h *FlowRequestHandler) createQrepJobEntry(ctx context.Context, func (h *FlowRequestHandler) CreateCDCFlow( ctx context.Context, req *protos.CreateCDCFlowRequest, ) (*protos.CreateCDCFlowResponse, error) { - fmt.Printf("\n******************************** CreateCDCFlow") cfg := req.ConnectionConfigs + fmt.Printf("\n******************************** CreateCDCFlow %+v", cfg) + _, validateErr := h.ValidateCDCMirror(ctx, req) if validateErr != nil { slog.Error("validate mirror error", slog.Any("error", validateErr)) @@ -228,8 +229,9 @@ func (h *FlowRequestHandler) removeFlowEntryInCatalog( func (h *FlowRequestHandler) CreateQRepFlow( ctx context.Context, req *protos.CreateQRepFlowRequest, ) (*protos.CreateQRepFlowResponse, error) { - fmt.Printf("\n******************************** CreateQRepFlow") cfg := req.QrepConfig + fmt.Printf("\n******************************** CreateQRepFlow config: %+v", cfg) + workflowID := fmt.Sprintf("%s-qrepflow-%s", cfg.FlowJobName, uuid.New()) workflowOptions := client.StartWorkflowOptions{ ID: workflowID, diff --git a/flow/connectors/clickhouse/cdc.go b/flow/connectors/clickhouse/cdc.go index 5581ec03a1..2e97695746 100644 --- a/flow/connectors/clickhouse/cdc.go +++ b/flow/connectors/clickhouse/cdc.go @@ -119,7 +119,7 @@ func (c *ClickhouseConnector) syncRecordsViaAvro( streamRes, err := utils.RecordsToRawTableStream(streamReq) //x := *&streamRes.Stream //y := (*x).Records - fmt.Printf("\n*******************############################## cdc.go in syncRecordsViaAvro 
streamRes: %+v", streamRes) + //fmt.Printf("\n*******************############################## cdc.go in syncRecordsViaAvro streamRes: %+v", streamRes) if err != nil { return nil, fmt.Errorf("failed to convert records to raw table stream: %w", err) } @@ -162,8 +162,8 @@ func (c *ClickhouseConnector) syncRecordsViaAvro( } func (c *ClickhouseConnector) SyncRecords(req *model.SyncRecordsRequest) (*model.SyncResponse, error) { - fmt.Printf("\n ******************************************** !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! in ClickhouseConnector.SyncRecords") - fmt.Printf("\n ******************************* in cdc.go in SyncRecords config: %+v", c.config.S3Integration) + //fmt.Printf("\n ******************************************** !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! in ClickhouseConnector.SyncRecords") + //fmt.Printf("\n ******************************* in cdc.go in SyncRecords config: %+v", c.config.S3Integration) //c.config.S3Integration = "s3://avro-clickhouse" rawTableName := c.getRawTableName(req.FlowJobName) c.logger.Info(fmt.Sprintf("pushing records to Clickhouse table %s", rawTableName)) diff --git a/flow/connectors/clickhouse/normalize.go b/flow/connectors/clickhouse/normalize.go index fc7e2f9aeb..54a7af2385 100644 --- a/flow/connectors/clickhouse/normalize.go +++ b/flow/connectors/clickhouse/normalize.go @@ -134,13 +134,16 @@ func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsReques // model the raw table data as inserts. for _, tbl := range destinationTableNames { // SELECT projection FROM raw_table WHERE _peerdb_batch_id > normalize_batch_id AND _peerdb_batch_id <= sync_batch_id + //fmt.Printf("\n************************* in normalize_records1: tbl %s", tbl) selectQuery := strings.Builder{} selectQuery.WriteString("SELECT ") colSelector := strings.Builder{} colSelector.WriteString("(") - schema := c.tableSchemaMapping[tbl] + //schema := c.tableSchemaMapping[tbl] + schema := req.TableNameSchemaMapping[tbl] + //fmt.Printf("\n************************* in normalize_records2: schema %+v", schema) numCols := len(schema.ColumnNames) projection := strings.Builder{} @@ -149,10 +152,10 @@ func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsReques cn := schema.ColumnNames[i] ct := schema.ColumnTypes[i] - colSelector.WriteString(fmt.Sprintf("%s", cn)) - if i < numCols-1 { - colSelector.WriteString(",") - } + colSelector.WriteString(fmt.Sprintf("%s,", cn)) + // if i < numCols-1 { + // colSelector.WriteString(",") + // } extractionFuction := "JSONExtractRaw" switch qvalue.QValueKind(ct) { @@ -167,10 +170,11 @@ func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsReques // add _peerdb_sign as _peerdb_record_type / 2 projection.WriteString(fmt.Sprintf("intDiv(_peerdb_record_type, 2) AS %s, ", signColName)) + colSelector.WriteString(fmt.Sprintf("%s,", signColName)) // add _peerdb_timestamp as _peerdb_version projection.WriteString(fmt.Sprintf("_peerdb_timestamp AS %s", versionColName)) - + colSelector.WriteString(versionColName) colSelector.WriteString(") ") selectQuery.WriteString(projection.String()) @@ -241,6 +245,8 @@ func (c *ClickhouseConnector) getDistinctTableNamesInBatch( tableNames = append(tableNames, tableName.String) } + fmt.Printf("\n****************************** getDistinctTableNamesInBatch tableNames %+v", tableNames) + return tableNames, nil } diff --git a/flow/connectors/clickhouse/qrep.go b/flow/connectors/clickhouse/qrep.go index d994dda78c..f3c6f7b763 100644 --- 
a/flow/connectors/clickhouse/qrep.go +++ b/flow/connectors/clickhouse/qrep.go @@ -24,7 +24,7 @@ func (c *ClickhouseConnector) SyncQRepRecords( partition *protos.QRepPartition, stream *model.QRecordStream, ) (int, error) { - fmt.Printf("\n******************* in ClickhouseConnector.SyncQRepRecords") + //fmt.Printf("\n******************* in ClickhouseConnector.SyncQRepRecords") // Ensure the destination table is available. destTable := config.DestinationTableIdentifier flowLog := slog.Group("sync_metadata", diff --git a/flow/connectors/clickhouse/qrep_avro_sync.go b/flow/connectors/clickhouse/qrep_avro_sync.go index 2e4325dc08..4fed02a355 100644 --- a/flow/connectors/clickhouse/qrep_avro_sync.go +++ b/flow/connectors/clickhouse/qrep_avro_sync.go @@ -4,6 +4,7 @@ import ( "database/sql" "fmt" "log/slog" + "strings" "time" "go.temporal.io/sdk/activity" @@ -54,8 +55,11 @@ func NewClickhouseAvroSyncMethod( // } func (s *ClickhouseAvroSyncMethod) CopyStageToDestination(avroFile *avro.AvroFile) error { - fmt.Printf("\n************************* in CopyStageToDesti stagingPath: %+v", s.config.StagingPath) - stagingPath := "s3://avro-clickhouse" //s.config.StagingPath + //fmt.Printf("\n************************* in CopyStageToDesti stagingPath: %+v", s.config.StagingPath) + stagingPath := s.config.StagingPath + if stagingPath == "" { + stagingPath = s.config.DestinationPeer.GetClickhouseConfig().S3Integration //"s3://avro-clickhouse" + } s3o, err := utils.NewS3BucketAndPrefix(stagingPath) if err != nil { return err @@ -70,7 +74,7 @@ func (s *ClickhouseAvroSyncMethod) CopyStageToDestination(avroFile *avro.AvroFil query := fmt.Sprintf("INSERT INTO %s SELECT * FROM s3('%s','%s','%s', 'Avro')", s.config.DestinationTableIdentifier, avroFileUrl, awsCreds.AccessKeyID, awsCreds.SecretAccessKey) - fmt.Printf("\n************************ CopyStagingToDestination query: %s\n", query) + //fmt.Printf("\n************************ CopyStagingToDestination query: %s\n", query) _, err = s.connector.database.Exec(query) @@ -82,8 +86,8 @@ func (s *ClickhouseAvroSyncMethod) SyncRecords( stream *model.QRecordStream, flowJobName string, ) (int, error) { - fmt.Printf("\n************************* in qrep_avro_sync: SyncRecords1 dstTableSchema %+v", dstTableSchema) - fmt.Printf("\n************************ in qrep_avro_sync: SyncRecords2 config %+v", s.config) + //fmt.Printf("\n************************* in qrep_avro_sync: SyncRecords1 dstTableSchema %+v", dstTableSchema) + //fmt.Printf("\n************************ in qrep_avro_sync: SyncRecords2 config %+v", s.config) //s.config.StagingPath = "s3://avro-clickhouse" tableLog := slog.String("destinationTable", s.config.DestinationTableIdentifier) dstTableName := s.config.DestinationTableIdentifier @@ -93,7 +97,7 @@ func (s *ClickhouseAvroSyncMethod) SyncRecords( return -1, fmt.Errorf("failed to get schema from stream: %w", err) } - fmt.Printf("\n******************************* in qrep_avro_sync: SyncRecords3 stream schema %+v", schema) + //fmt.Printf("\n******************************* in qrep_avro_sync: SyncRecords3 stream schema %+v", schema) s.connector.logger.Info("sync function called and schema acquired", tableLog) @@ -102,12 +106,12 @@ func (s *ClickhouseAvroSyncMethod) SyncRecords( return 0, err } - fmt.Printf("\n******************************* in qrep_avro_sync: SyncRecords5 avro schema %+v", avroSchema) + //fmt.Printf("\n******************************* in qrep_avro_sync: SyncRecords5 avro schema %+v", avroSchema) partitionID := shared.RandomString(16) - 
fmt.Printf("\n******************* calling writeToAvroFile partitionId: %+v", partitionID) + //fmt.Printf("\n******************* calling writeToAvroFile partitionId: %+v", partitionID) avroFile, err := s.writeToAvroFile(stream, avroSchema, partitionID, flowJobName) - fmt.Printf("\n******************* records written to avrofile %+v", avroFile) + //fmt.Printf("\n******************* records written to avrofile %+v", avroFile) if err != nil { return 0, err } @@ -141,7 +145,7 @@ func (s *ClickhouseAvroSyncMethod) SyncRecords( //Copy stage/avro to destination err = s.CopyStageToDestination(avroFile) - fmt.Printf("\n ***************** in qrep_avro_sync: SyncRecords after CopyStageToDestination err: %+v", err) + //fmt.Printf("\n ***************** in qrep_avro_sync: SyncRecords after CopyStageToDestination err: %+v", err) if err != nil { return 0, err } @@ -155,47 +159,53 @@ func (s *ClickhouseAvroSyncMethod) SyncQRepRecords( dstTableSchema []*sql.ColumnType, stream *model.QRecordStream, ) (int, error) { - fmt.Printf("\n************************* in SyncQRepRecords 1") + fmt.Printf("\n******************* in qrep_avro_sync: SyncQRepRecords config %+v", s.config.DestinationPeer) + //fmt.Printf("\n************************* in SyncQRepRecords 1") startTime := time.Now() dstTableName := config.DestinationTableIdentifier - s.config.StagingPath = "s3://avro-clickhouse" + //s.config.StagingPath = "s3://avro-clickhouse" + stagingPath := s.config.DestinationPeer.GetClickhouseConfig().S3Integration schema, err := stream.Schema() if err != nil { return -1, fmt.Errorf("failed to get schema from stream: %w", err) } - + //fmt.Printf("\n******************************* in qrep_avro_sync: SyncQRepRecords 2 avro schema %+v", schema) avroSchema, err := s.getAvroSchema(dstTableName, schema) if err != nil { return 0, err } + //fmt.Printf("\n******************************* in qrep_avro_sync: SyncQRepRecords 3 avro schema %+v", avroSchema) + //fmt.Printf("\n******************************* in qrep_avro_sync: SyncQRepRecords 4 avro schema %+v",) + avroFile, err := s.writeToAvroFile(stream, avroSchema, partition.PartitionId, config.FlowJobName) if err != nil { return 0, err } - s3o, err := utils.NewS3BucketAndPrefix(s.config.StagingPath) + s3o, err := utils.NewS3BucketAndPrefix(stagingPath) if err != nil { return 0, err } awsCreds, err := utils.GetAWSSecrets(utils.S3PeerCredentials{}) avroFileUrl := fmt.Sprintf("https://%s.s3.%s.amazonaws.com%s", s3o.Bucket, awsCreds.Region, avroFile.FilePath) - fmt.Printf("\n*********************** in qrep_avro_sync SyncQRepRecords 4 avroFileUrl: %+v", avroFileUrl) + //fmt.Printf("\n*********************** in qrep_avro_sync SyncQRepRecords 4 avroFileUrl: %+v", avroFileUrl) if err != nil { return 0, err } + selector := strings.Join(schema.GetColumnNames(), ",") //nolint:gosec - query := fmt.Sprintf("INSERT INTO %s SELECT * FROM s3('%s','%s','%s', 'Avro')", - config.DestinationTableIdentifier, avroFileUrl, awsCreds.AccessKeyID, awsCreds.SecretAccessKey) + query := fmt.Sprintf("INSERT INTO %s (%s) SELECT * FROM s3('%s','%s','%s', 'Avro')", + config.DestinationTableIdentifier, selector, avroFileUrl, awsCreds.AccessKeyID, awsCreds.SecretAccessKey) - fmt.Printf("\n************************************ in qrep_avro_sync SyncQRepRecords 5 query: %s\n", query) + //fmt.Printf("\n************************************ in qrep_avro_sync SyncQRepRecords 5 query: %s\n", query) _, err = s.connector.database.Exec(query) - fmt.Printf("\n************************************ in qrep_avro_sync SyncQRepRecords 6 
err: %+v\n", err) + //fmt.Printf("\n************************************ in qrep_avro_sync SyncQRepRecords 6 err: %+v\n", err) if err != nil { return 0, err @@ -228,8 +238,12 @@ func (s *ClickhouseAvroSyncMethod) writeToAvroFile( partitionID string, flowJobName string, ) (*avro.AvroFile, error) { - stagingPath := "s3://avro-clickhouse" //s.config.StagingPath // - fmt.Printf("\n****************************************** StagingPath: %+v*****\n", s.config.StagingPath) + fmt.Printf("\n************************* in writeToAvroFile 1 21 %+v", s.config) + stagingPath := s.config.StagingPath //"s3://avro-clickhouse" + if stagingPath == "" { + stagingPath = s.config.DestinationPeer.GetClickhouseConfig().S3Integration //"s3://avro-clickhouse" + } + fmt.Printf("\n****************************************** StagingPath: %+v*****\n", stagingPath) ocfWriter := avro.NewPeerDBOCFWriter(s.connector.ctx, stream, avroSchema, avro.CompressZstd, qvalue.QDWHTypeClickhouse) s3o, err := utils.NewS3BucketAndPrefix(stagingPath) @@ -239,7 +253,7 @@ func (s *ClickhouseAvroSyncMethod) writeToAvroFile( s3AvroFileKey := fmt.Sprintf("%s/%s/%s.avro.zst", s3o.Prefix, flowJobName, partitionID) // s.config.FlowJobName avroFile, err := ocfWriter.WriteRecordsToS3(s3o.Bucket, s3AvroFileKey, utils.S3PeerCredentials{}) ///utils.S3PeerCredentials{}) - fmt.Printf("\n************************* writeToAvroFile 2 avroFile %+v, err: %+v", avroFile, err) + //fmt.Printf("\n************************* writeToAvroFile 2 avroFile %+v, err: %+v", avroFile, err) if err != nil { return nil, fmt.Errorf("failed to write records to S3: %w", err) } diff --git a/flow/connectors/core.go b/flow/connectors/core.go index f0914fba9f..09f68282b2 100644 --- a/flow/connectors/core.go +++ b/flow/connectors/core.go @@ -161,7 +161,7 @@ func GetCDCSyncConnector(ctx context.Context, config *protos.Peer) (CDCSyncConne case *protos.Peer_ClickhouseConfig: return connclickhouse.NewClickhouseConnector(ctx, config.GetClickhouseConfig()) default: - fmt.Printf("\n*********************** in GetCDCSyncConnector not found %+v %T\n", inner, inner) + //fmt.Printf("\n*********************** in GetCDCSyncConnector not found %+v %T\n", inner, inner) return nil, ErrUnsupportedFunctionality } } @@ -180,7 +180,7 @@ func GetCDCNormalizeConnector(ctx context.Context, case *protos.Peer_ClickhouseConfig: return connclickhouse.NewClickhouseConnector(ctx, config.GetClickhouseConfig()) default: - fmt.Printf("\n*********************** in GetCDCNormalizeConnector not found %+v %T\n", inner, inner) + //fmt.Printf("\n*********************** in GetCDCNormalizeConnector not found %+v %T\n", inner, inner) return nil, ErrUnsupportedFunctionality } } diff --git a/flow/connectors/external_metadata/store.go b/flow/connectors/external_metadata/store.go index 00f4cb9282..309fe9ecd0 100644 --- a/flow/connectors/external_metadata/store.go +++ b/flow/connectors/external_metadata/store.go @@ -124,7 +124,7 @@ func (p *PostgresMetadataStore) SetupMetadata() error { last_offset BIGINT NOT NULL, updated_at TIMESTAMP NOT NULL DEFAULT NOW(), sync_batch_id BIGINT NOT NULL, - normalize_batch_id BIGINT NOT NULL + normalize_batch_id BIGINT ) `) if err != nil && !utils.IsUniqueError(err) { diff --git a/flow/workflows/qrep_flow.go b/flow/workflows/qrep_flow.go index d20db8966c..056c571bba 100644 --- a/flow/workflows/qrep_flow.go +++ b/flow/workflows/qrep_flow.go @@ -36,7 +36,7 @@ type QRepPartitionFlowExecution struct { // returns a new empty QRepFlowState func NewQRepFlowState() *protos.QRepFlowState { - 
fmt.Printf("\n*****************************NewQRepFlowState") + //fmt.Printf("\n*****************************NewQRepFlowState") return &protos.QRepFlowState{ LastPartition: &protos.QRepPartition{ @@ -51,7 +51,7 @@ func NewQRepFlowState() *protos.QRepFlowState { // returns a new empty QRepFlowState func NewQRepFlowStateForTesting() *protos.QRepFlowState { - fmt.Printf("\n*****************************NewQRepFlowStateForTesting") + //fmt.Printf("\n*****************************NewQRepFlowStateForTesting") return &protos.QRepFlowState{ LastPartition: &protos.QRepPartition{ @@ -66,7 +66,7 @@ func NewQRepFlowStateForTesting() *protos.QRepFlowState { // NewQRepFlowExecution creates a new instance of QRepFlowExecution. func NewQRepFlowExecution(ctx workflow.Context, config *protos.QRepConfig, runUUID string) *QRepFlowExecution { - fmt.Printf("\n*****************************NewQRepFlowExecution") + //fmt.Printf("\n*****************************NewQRepFlowExecution") return &QRepFlowExecution{ config: config, @@ -82,7 +82,7 @@ func NewQRepFlowExecution(ctx workflow.Context, config *protos.QRepConfig, runUU func NewQRepPartitionFlowExecution(ctx workflow.Context, config *protos.QRepConfig, runUUID string, ) *QRepPartitionFlowExecution { - fmt.Printf("\n*****************************NewQRepPartitionFlowExecution") + //fmt.Printf("\n*****************************NewQRepPartitionFlowExecution") return &QRepPartitionFlowExecution{ config: config, @@ -411,7 +411,7 @@ func QRepFlowWorkflow( config *protos.QRepConfig, state *protos.QRepFlowState, ) error { - fmt.Printf("\n*****************************QRepFlowWorkflow") + //fmt.Printf("\n*****************************QRepFlowWorkflow") // The structure of this workflow is as follows: // 1. Start the loop to continuously run the replication flow. 
@@ -547,7 +547,7 @@ func QRepPartitionWorkflow( partitions *protos.QRepPartitionBatch, runUUID string, ) error { - fmt.Printf("\n*****************************QRepPartitionWorkflow") + //fmt.Printf("\n*****************************QRepPartitionWorkflow") ctx = workflow.WithValue(ctx, shared.FlowNameKey, config.FlowJobName) q := NewQRepPartitionFlowExecution(ctx, config, runUUID) diff --git a/ui/components/PeerComponent.tsx b/ui/components/PeerComponent.tsx index 318aff7529..81782f46e0 100644 --- a/ui/components/PeerComponent.tsx +++ b/ui/components/PeerComponent.tsx @@ -19,6 +19,7 @@ export const DBTypeToImageMapping = (peerType: DBType | string) => { case 'S3': return '/svgs/aws.svg'; case 'CLICKHOUSE': + case DBType.CLICKHOUSE: return '/svgs/ch.svg'; case DBType.EVENTHUB_GROUP: case DBType.EVENTHUB: From 3c05b8e8c208060e6c4658109cb288c7226ed181 Mon Sep 17 00:00:00 2001 From: Pankaj B Date: Thu, 25 Jan 2024 20:28:47 +0530 Subject: [PATCH 17/36] cleanup --- flow/activities/flowable.go | 1 - flow/cmd/handler.go | 2 -- flow/connectors/clickhouse/cdc.go | 4 +-- flow/connectors/clickhouse/normalize.go | 4 --- flow/connectors/clickhouse/qrep.go | 1 - flow/connectors/clickhouse/qrep_avro_sync.go | 27 -------------------- flow/connectors/core.go | 2 -- flow/connectors/external_metadata/store.go | 1 - flow/workflows/qrep_flow.go | 6 ----- 9 files changed, 1 insertion(+), 47 deletions(-) diff --git a/flow/activities/flowable.go b/flow/activities/flowable.go index a89493f1f1..de3f2bcfeb 100644 --- a/flow/activities/flowable.go +++ b/flow/activities/flowable.go @@ -371,7 +371,6 @@ func (a *FlowableActivity) StartNormalize( ) (*model.NormalizeResponse, error) { conn := input.FlowConnectionConfigs ctx = context.WithValue(ctx, shared.FlowNameKey, conn.FlowJobName) - //fmt.Printf("\n*********************** in StartNormalize %+v\n", conn) dstConn, err := connectors.GetCDCNormalizeConnector(ctx, conn.Destination) if errors.Is(err, connectors.ErrUnsupportedFunctionality) { dstConn, err := connectors.GetCDCSyncConnector(ctx, conn.Destination) diff --git a/flow/cmd/handler.go b/flow/cmd/handler.go index ef402c3e36..1e3e8d48e6 100644 --- a/flow/cmd/handler.go +++ b/flow/cmd/handler.go @@ -121,7 +121,6 @@ func (h *FlowRequestHandler) CreateCDCFlow( ctx context.Context, req *protos.CreateCDCFlowRequest, ) (*protos.CreateCDCFlowResponse, error) { cfg := req.ConnectionConfigs - fmt.Printf("\n******************************** CreateCDCFlow %+v", cfg) _, validateErr := h.ValidateCDCMirror(ctx, req) if validateErr != nil { @@ -230,7 +229,6 @@ func (h *FlowRequestHandler) CreateQRepFlow( ctx context.Context, req *protos.CreateQRepFlowRequest, ) (*protos.CreateQRepFlowResponse, error) { cfg := req.QrepConfig - fmt.Printf("\n******************************** CreateQRepFlow config: %+v", cfg) workflowID := fmt.Sprintf("%s-qrepflow-%s", cfg.FlowJobName, uuid.New()) workflowOptions := client.StartWorkflowOptions{ diff --git a/flow/connectors/clickhouse/cdc.go b/flow/connectors/clickhouse/cdc.go index 2e97695746..8103c05b64 100644 --- a/flow/connectors/clickhouse/cdc.go +++ b/flow/connectors/clickhouse/cdc.go @@ -119,12 +119,12 @@ func (c *ClickhouseConnector) syncRecordsViaAvro( streamRes, err := utils.RecordsToRawTableStream(streamReq) //x := *&streamRes.Stream //y := (*x).Records - //fmt.Printf("\n*******************############################## cdc.go in syncRecordsViaAvro streamRes: %+v", streamRes) if err != nil { return nil, fmt.Errorf("failed to convert records to raw table stream: %w", err) } qrepConfig := 
&protos.QRepConfig{ + DestinationPeer: c.config.DestinationPeer, StagingPath: c.config.S3Integration, FlowJobName: req.FlowJobName, DestinationTableIdentifier: strings.ToLower(fmt.Sprintf("%s", rawTableIdentifier)), @@ -162,8 +162,6 @@ func (c *ClickhouseConnector) syncRecordsViaAvro( } func (c *ClickhouseConnector) SyncRecords(req *model.SyncRecordsRequest) (*model.SyncResponse, error) { - //fmt.Printf("\n ******************************************** !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! in ClickhouseConnector.SyncRecords") - //fmt.Printf("\n ******************************* in cdc.go in SyncRecords config: %+v", c.config.S3Integration) //c.config.S3Integration = "s3://avro-clickhouse" rawTableName := c.getRawTableName(req.FlowJobName) c.logger.Info(fmt.Sprintf("pushing records to Clickhouse table %s", rawTableName)) diff --git a/flow/connectors/clickhouse/normalize.go b/flow/connectors/clickhouse/normalize.go index 54a7af2385..2432809192 100644 --- a/flow/connectors/clickhouse/normalize.go +++ b/flow/connectors/clickhouse/normalize.go @@ -134,7 +134,6 @@ func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsReques // model the raw table data as inserts. for _, tbl := range destinationTableNames { // SELECT projection FROM raw_table WHERE _peerdb_batch_id > normalize_batch_id AND _peerdb_batch_id <= sync_batch_id - //fmt.Printf("\n************************* in normalize_records1: tbl %s", tbl) selectQuery := strings.Builder{} selectQuery.WriteString("SELECT ") @@ -143,7 +142,6 @@ func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsReques //schema := c.tableSchemaMapping[tbl] schema := req.TableNameSchemaMapping[tbl] - //fmt.Printf("\n************************* in normalize_records2: schema %+v", schema) numCols := len(schema.ColumnNames) projection := strings.Builder{} @@ -245,8 +243,6 @@ func (c *ClickhouseConnector) getDistinctTableNamesInBatch( tableNames = append(tableNames, tableName.String) } - fmt.Printf("\n****************************** getDistinctTableNamesInBatch tableNames %+v", tableNames) - return tableNames, nil } diff --git a/flow/connectors/clickhouse/qrep.go b/flow/connectors/clickhouse/qrep.go index f3c6f7b763..b6a8b59cc5 100644 --- a/flow/connectors/clickhouse/qrep.go +++ b/flow/connectors/clickhouse/qrep.go @@ -24,7 +24,6 @@ func (c *ClickhouseConnector) SyncQRepRecords( partition *protos.QRepPartition, stream *model.QRecordStream, ) (int, error) { - //fmt.Printf("\n******************* in ClickhouseConnector.SyncQRepRecords") // Ensure the destination table is available. 
destTable := config.DestinationTableIdentifier flowLog := slog.Group("sync_metadata", diff --git a/flow/connectors/clickhouse/qrep_avro_sync.go b/flow/connectors/clickhouse/qrep_avro_sync.go index 4fed02a355..61e6b4d600 100644 --- a/flow/connectors/clickhouse/qrep_avro_sync.go +++ b/flow/connectors/clickhouse/qrep_avro_sync.go @@ -55,7 +55,6 @@ func NewClickhouseAvroSyncMethod( // } func (s *ClickhouseAvroSyncMethod) CopyStageToDestination(avroFile *avro.AvroFile) error { - //fmt.Printf("\n************************* in CopyStageToDesti stagingPath: %+v", s.config.StagingPath) stagingPath := s.config.StagingPath if stagingPath == "" { stagingPath = s.config.DestinationPeer.GetClickhouseConfig().S3Integration //"s3://avro-clickhouse" @@ -74,8 +73,6 @@ func (s *ClickhouseAvroSyncMethod) CopyStageToDestination(avroFile *avro.AvroFil query := fmt.Sprintf("INSERT INTO %s SELECT * FROM s3('%s','%s','%s', 'Avro')", s.config.DestinationTableIdentifier, avroFileUrl, awsCreds.AccessKeyID, awsCreds.SecretAccessKey) - //fmt.Printf("\n************************ CopyStagingToDestination query: %s\n", query) - _, err = s.connector.database.Exec(query) return err @@ -86,8 +83,6 @@ func (s *ClickhouseAvroSyncMethod) SyncRecords( stream *model.QRecordStream, flowJobName string, ) (int, error) { - //fmt.Printf("\n************************* in qrep_avro_sync: SyncRecords1 dstTableSchema %+v", dstTableSchema) - //fmt.Printf("\n************************ in qrep_avro_sync: SyncRecords2 config %+v", s.config) //s.config.StagingPath = "s3://avro-clickhouse" tableLog := slog.String("destinationTable", s.config.DestinationTableIdentifier) dstTableName := s.config.DestinationTableIdentifier @@ -97,8 +92,6 @@ func (s *ClickhouseAvroSyncMethod) SyncRecords( return -1, fmt.Errorf("failed to get schema from stream: %w", err) } - //fmt.Printf("\n******************************* in qrep_avro_sync: SyncRecords3 stream schema %+v", schema) - s.connector.logger.Info("sync function called and schema acquired", tableLog) avroSchema, err := s.getAvroSchema(dstTableName, schema) @@ -106,12 +99,8 @@ func (s *ClickhouseAvroSyncMethod) SyncRecords( return 0, err } - //fmt.Printf("\n******************************* in qrep_avro_sync: SyncRecords5 avro schema %+v", avroSchema) - partitionID := shared.RandomString(16) - //fmt.Printf("\n******************* calling writeToAvroFile partitionId: %+v", partitionID) avroFile, err := s.writeToAvroFile(stream, avroSchema, partitionID, flowJobName) - //fmt.Printf("\n******************* records written to avrofile %+v", avroFile) if err != nil { return 0, err } @@ -145,7 +134,6 @@ func (s *ClickhouseAvroSyncMethod) SyncRecords( //Copy stage/avro to destination err = s.CopyStageToDestination(avroFile) - //fmt.Printf("\n ***************** in qrep_avro_sync: SyncRecords after CopyStageToDestination err: %+v", err) if err != nil { return 0, err } @@ -159,8 +147,6 @@ func (s *ClickhouseAvroSyncMethod) SyncQRepRecords( dstTableSchema []*sql.ColumnType, stream *model.QRecordStream, ) (int, error) { - fmt.Printf("\n******************* in qrep_avro_sync: SyncQRepRecords config %+v", s.config.DestinationPeer) - //fmt.Printf("\n************************* in SyncQRepRecords 1") startTime := time.Now() dstTableName := config.DestinationTableIdentifier //s.config.StagingPath = "s3://avro-clickhouse" @@ -170,15 +156,11 @@ func (s *ClickhouseAvroSyncMethod) SyncQRepRecords( if err != nil { return -1, fmt.Errorf("failed to get schema from stream: %w", err) } - //fmt.Printf("\n******************************* in 
qrep_avro_sync: SyncQRepRecords 2 avro schema %+v", schema) avroSchema, err := s.getAvroSchema(dstTableName, schema) if err != nil { return 0, err } - //fmt.Printf("\n******************************* in qrep_avro_sync: SyncQRepRecords 3 avro schema %+v", avroSchema) - //fmt.Printf("\n******************************* in qrep_avro_sync: SyncQRepRecords 4 avro schema %+v",) - avroFile, err := s.writeToAvroFile(stream, avroSchema, partition.PartitionId, config.FlowJobName) if err != nil { return 0, err @@ -191,8 +173,6 @@ func (s *ClickhouseAvroSyncMethod) SyncQRepRecords( awsCreds, err := utils.GetAWSSecrets(utils.S3PeerCredentials{}) avroFileUrl := fmt.Sprintf("https://%s.s3.%s.amazonaws.com%s", s3o.Bucket, awsCreds.Region, avroFile.FilePath) - //fmt.Printf("\n*********************** in qrep_avro_sync SyncQRepRecords 4 avroFileUrl: %+v", avroFileUrl) - if err != nil { return 0, err } @@ -201,12 +181,8 @@ func (s *ClickhouseAvroSyncMethod) SyncQRepRecords( query := fmt.Sprintf("INSERT INTO %s (%s) SELECT * FROM s3('%s','%s','%s', 'Avro')", config.DestinationTableIdentifier, selector, avroFileUrl, awsCreds.AccessKeyID, awsCreds.SecretAccessKey) - //fmt.Printf("\n************************************ in qrep_avro_sync SyncQRepRecords 5 query: %s\n", query) - _, err = s.connector.database.Exec(query) - //fmt.Printf("\n************************************ in qrep_avro_sync SyncQRepRecords 6 err: %+v\n", err) - if err != nil { return 0, err } @@ -238,12 +214,10 @@ func (s *ClickhouseAvroSyncMethod) writeToAvroFile( partitionID string, flowJobName string, ) (*avro.AvroFile, error) { - fmt.Printf("\n************************* in writeToAvroFile 1 21 %+v", s.config) stagingPath := s.config.StagingPath //"s3://avro-clickhouse" if stagingPath == "" { stagingPath = s.config.DestinationPeer.GetClickhouseConfig().S3Integration //"s3://avro-clickhouse" } - fmt.Printf("\n****************************************** StagingPath: %+v*****\n", stagingPath) ocfWriter := avro.NewPeerDBOCFWriter(s.connector.ctx, stream, avroSchema, avro.CompressZstd, qvalue.QDWHTypeClickhouse) s3o, err := utils.NewS3BucketAndPrefix(stagingPath) @@ -253,7 +227,6 @@ func (s *ClickhouseAvroSyncMethod) writeToAvroFile( s3AvroFileKey := fmt.Sprintf("%s/%s/%s.avro.zst", s3o.Prefix, flowJobName, partitionID) // s.config.FlowJobName avroFile, err := ocfWriter.WriteRecordsToS3(s3o.Bucket, s3AvroFileKey, utils.S3PeerCredentials{}) ///utils.S3PeerCredentials{}) - //fmt.Printf("\n************************* writeToAvroFile 2 avroFile %+v, err: %+v", avroFile, err) if err != nil { return nil, fmt.Errorf("failed to write records to S3: %w", err) } diff --git a/flow/connectors/core.go b/flow/connectors/core.go index 09f68282b2..b3a93f9b90 100644 --- a/flow/connectors/core.go +++ b/flow/connectors/core.go @@ -161,7 +161,6 @@ func GetCDCSyncConnector(ctx context.Context, config *protos.Peer) (CDCSyncConne case *protos.Peer_ClickhouseConfig: return connclickhouse.NewClickhouseConnector(ctx, config.GetClickhouseConfig()) default: - //fmt.Printf("\n*********************** in GetCDCSyncConnector not found %+v %T\n", inner, inner) return nil, ErrUnsupportedFunctionality } } @@ -180,7 +179,6 @@ func GetCDCNormalizeConnector(ctx context.Context, case *protos.Peer_ClickhouseConfig: return connclickhouse.NewClickhouseConnector(ctx, config.GetClickhouseConfig()) default: - //fmt.Printf("\n*********************** in GetCDCNormalizeConnector not found %+v %T\n", inner, inner) return nil, ErrUnsupportedFunctionality } } diff --git 
a/flow/connectors/external_metadata/store.go b/flow/connectors/external_metadata/store.go index 309fe9ecd0..08f2938e73 100644 --- a/flow/connectors/external_metadata/store.go +++ b/flow/connectors/external_metadata/store.go @@ -112,7 +112,6 @@ func (p *PostgresMetadataStore) SetupMetadata() error { // create the schema _, err := p.conn.Exec(p.ctx, "CREATE SCHEMA IF NOT EXISTS "+p.schemaName) if err != nil && !utils.IsUniqueError(err) { - fmt.Printf("********** error in SetupMetadata %+v", err) p.logger.Error("failed to create schema", slog.Any("error", err)) return err } diff --git a/flow/workflows/qrep_flow.go b/flow/workflows/qrep_flow.go index 056c571bba..d434d5ea3b 100644 --- a/flow/workflows/qrep_flow.go +++ b/flow/workflows/qrep_flow.go @@ -36,7 +36,6 @@ type QRepPartitionFlowExecution struct { // returns a new empty QRepFlowState func NewQRepFlowState() *protos.QRepFlowState { - //fmt.Printf("\n*****************************NewQRepFlowState") return &protos.QRepFlowState{ LastPartition: &protos.QRepPartition{ @@ -51,7 +50,6 @@ func NewQRepFlowState() *protos.QRepFlowState { // returns a new empty QRepFlowState func NewQRepFlowStateForTesting() *protos.QRepFlowState { - //fmt.Printf("\n*****************************NewQRepFlowStateForTesting") return &protos.QRepFlowState{ LastPartition: &protos.QRepPartition{ @@ -66,7 +64,6 @@ func NewQRepFlowStateForTesting() *protos.QRepFlowState { // NewQRepFlowExecution creates a new instance of QRepFlowExecution. func NewQRepFlowExecution(ctx workflow.Context, config *protos.QRepConfig, runUUID string) *QRepFlowExecution { - //fmt.Printf("\n*****************************NewQRepFlowExecution") return &QRepFlowExecution{ config: config, @@ -82,7 +79,6 @@ func NewQRepFlowExecution(ctx workflow.Context, config *protos.QRepConfig, runUU func NewQRepPartitionFlowExecution(ctx workflow.Context, config *protos.QRepConfig, runUUID string, ) *QRepPartitionFlowExecution { - //fmt.Printf("\n*****************************NewQRepPartitionFlowExecution") return &QRepPartitionFlowExecution{ config: config, @@ -411,7 +407,6 @@ func QRepFlowWorkflow( config *protos.QRepConfig, state *protos.QRepFlowState, ) error { - //fmt.Printf("\n*****************************QRepFlowWorkflow") // The structure of this workflow is as follows: // 1. Start the loop to continuously run the replication flow. 
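For orientation, the normalize pass touched by this cleanup reads the raw table one batch window at a time. A standalone sketch of the query shape assembled with strings.Builder in normalize.go; the projection and table names here are placeholders, not the repo's own values:

    package main

    import "fmt"

    // buildNormalizeSelect reproduces the shape of the normalize query: select
    // rows staged after the last normalized batch and up to the last synced
    // batch, for a single destination table.
    func buildNormalizeSelect(projection, rawTable, destTable string, normBatchID, syncBatchID int64) string {
    	return fmt.Sprintf(
    		"SELECT %s FROM %s WHERE _peerdb_batch_id > %d AND _peerdb_batch_id <= %d AND _peerdb_destination_table_name = '%s'",
    		projection, rawTable, normBatchID, syncBatchID, destTable)
    }

    func main() {
    	fmt.Println(buildNormalizeSelect("id,val", "raw_table", "public_items", 3, 5))
    }
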
@@ -547,7 +542,6 @@ func QRepPartitionWorkflow( partitions *protos.QRepPartitionBatch, runUUID string, ) error { - //fmt.Printf("\n*****************************QRepPartitionWorkflow") ctx = workflow.WithValue(ctx, shared.FlowNameKey, config.FlowJobName) q := NewQRepPartitionFlowExecution(ctx, config, runUUID) From eaaf63d8bbc4f4afa183ee6bd757b966eff3ce82 Mon Sep 17 00:00:00 2001 From: Pankaj B Date: Fri, 26 Jan 2024 23:32:35 +0530 Subject: [PATCH 18/36] cleanup --- flow/cmd/handler.go | 2 - flow/connectors/clickhouse/cdc.go | 30 ++++++------- flow/connectors/clickhouse/qrep_avro_sync.go | 47 -------------------- flow/workflows/qrep_flow.go | 6 --- 4 files changed, 15 insertions(+), 70 deletions(-) diff --git a/flow/cmd/handler.go b/flow/cmd/handler.go index 1e3e8d48e6..1d96e5a84f 100644 --- a/flow/cmd/handler.go +++ b/flow/cmd/handler.go @@ -121,7 +121,6 @@ func (h *FlowRequestHandler) CreateCDCFlow( ctx context.Context, req *protos.CreateCDCFlowRequest, ) (*protos.CreateCDCFlowResponse, error) { cfg := req.ConnectionConfigs - _, validateErr := h.ValidateCDCMirror(ctx, req) if validateErr != nil { slog.Error("validate mirror error", slog.Any("error", validateErr)) @@ -229,7 +228,6 @@ func (h *FlowRequestHandler) CreateQRepFlow( ctx context.Context, req *protos.CreateQRepFlowRequest, ) (*protos.CreateQRepFlowResponse, error) { cfg := req.QrepConfig - workflowID := fmt.Sprintf("%s-qrepflow-%s", cfg.FlowJobName, uuid.New()) workflowOptions := client.StartWorkflowOptions{ ID: workflowID, diff --git a/flow/connectors/clickhouse/cdc.go b/flow/connectors/clickhouse/cdc.go index 8103c05b64..7b49ca4dfb 100644 --- a/flow/connectors/clickhouse/cdc.go +++ b/flow/connectors/clickhouse/cdc.go @@ -48,26 +48,26 @@ type MirrorJobRow struct { NormalizeBatchID int } -func (c *ClickhouseConnector) getMirrorRowByJobNAme(jobName string) (*MirrorJobRow, error) { - getLastOffsetSQL := "SELECT mirror_job_name, offset, sync_batch_id, normalize_batch_id FROM %s WHERE MIRROR_JOB_NAME=? Limit 1" +// func (c *ClickhouseConnector) getMirrorRowByJobNAme(jobName string) (*MirrorJobRow, error) { +// getLastOffsetSQL := "SELECT mirror_job_name, offset, sync_batch_id, normalize_batch_id FROM %s WHERE MIRROR_JOB_NAME=? 
Limit 1" - row := c.database.QueryRowContext(c.ctx, fmt.Sprintf(getLastOffsetSQL, mirrorJobsTableIdentifier), jobName) +// row := c.database.QueryRowContext(c.ctx, fmt.Sprintf(getLastOffsetSQL, mirrorJobsTableIdentifier), jobName) - var result MirrorJobRow +// var result MirrorJobRow - err := row.Scan( - &result.MirrorJobName, - &result.Offset, - &result.SyncBatchID, - &result.NormalizeBatchID, - ) +// err := row.Scan( +// &result.MirrorJobName, +// &result.Offset, +// &result.SyncBatchID, +// &result.NormalizeBatchID, +// ) - if err != nil { - return nil, err - } +// if err != nil { +// return nil, err +// } - return &result, nil -} +// return &result, nil +// } func (c *ClickhouseConnector) CreateRawTable(req *protos.CreateRawTableInput) (*protos.CreateRawTableOutput, error) { rawTableName := c.getRawTableName(req.FlowJobName) diff --git a/flow/connectors/clickhouse/qrep_avro_sync.go b/flow/connectors/clickhouse/qrep_avro_sync.go index 61e6b4d600..0bf5c0c8c7 100644 --- a/flow/connectors/clickhouse/qrep_avro_sync.go +++ b/flow/connectors/clickhouse/qrep_avro_sync.go @@ -32,28 +32,6 @@ func NewClickhouseAvroSyncMethod( } } -// func (s *ClickhouseAvroSyncMethod) putFileToStage(avroFile *avro.AvroFile, stage string) error { -// if avroFile.StorageLocation != avro.AvroLocalStorage { -// s.connector.logger.Info("no file to put to stage") -// return nil -// } - -// activity.RecordHeartbeat(s.connector.ctx, "putting file to stage") -// putCmd := fmt.Sprintf("PUT file://%s @%s", avroFile.FilePath, stage) - -// shutdown := utils.HeartbeatRoutine(s.connector.ctx, 10*time.Second, func() string { -// return fmt.Sprintf("putting file to stage %s", stage) -// }) -// defer shutdown() - -// if _, err := s.connector.database.ExecContext(s.connector.ctx, putCmd); err != nil { -// return fmt.Errorf("failed to put file to stage: %w", err) -// } - -// s.connector.logger.Info(fmt.Sprintf("put file %s to stage %s", avroFile.FilePath, stage)) -// return nil -// } - func (s *ClickhouseAvroSyncMethod) CopyStageToDestination(avroFile *avro.AvroFile) error { stagingPath := s.config.StagingPath if stagingPath == "" { @@ -107,31 +85,6 @@ func (s *ClickhouseAvroSyncMethod) SyncRecords( defer avroFile.Cleanup() s.connector.logger.Info(fmt.Sprintf("written %d records to Avro file", avroFile.NumRecords), tableLog) - // stage := s.connector.getStageNameForJob(s.config.FlowJobName) - // err = s.connector.createStage(stage, s.config) - // if err != nil { - // return 0, err - // } - // s.connector.logger.Info(fmt.Sprintf("Created stage %s", stage)) - - // colNames, _, err := s.connector.getColsFromTable(s.config.DestinationTableIdentifier) - // if err != nil { - // return 0, err - // } - - // err = s.putFileToStage(avroFile, "stage") - // if err != nil { - // return 0, err - // } - // s.connector.logger.Info("pushed avro file to stage", tableLog) - - // err = CopyStageToDestination(s.connector, s.config, s.config.DestinationTableIdentifier, stage, colNames) - // if err != nil { - // return 0, err - // } - // s.connector.logger.Info(fmt.Sprintf("copying records into %s from stage %s", - // s.config.DestinationTableIdentifier, stage)) - //Copy stage/avro to destination err = s.CopyStageToDestination(avroFile) if err != nil { diff --git a/flow/workflows/qrep_flow.go b/flow/workflows/qrep_flow.go index d434d5ea3b..6b4db6482b 100644 --- a/flow/workflows/qrep_flow.go +++ b/flow/workflows/qrep_flow.go @@ -36,7 +36,6 @@ type QRepPartitionFlowExecution struct { // returns a new empty QRepFlowState func NewQRepFlowState() 
*protos.QRepFlowState { - return &protos.QRepFlowState{ LastPartition: &protos.QRepPartition{ PartitionId: "not-applicable-partition", @@ -50,7 +49,6 @@ func NewQRepFlowState() *protos.QRepFlowState { // returns a new empty QRepFlowState func NewQRepFlowStateForTesting() *protos.QRepFlowState { - return &protos.QRepFlowState{ LastPartition: &protos.QRepPartition{ PartitionId: "not-applicable-partition", @@ -64,7 +62,6 @@ func NewQRepFlowStateForTesting() *protos.QRepFlowState { // NewQRepFlowExecution creates a new instance of QRepFlowExecution. func NewQRepFlowExecution(ctx workflow.Context, config *protos.QRepConfig, runUUID string) *QRepFlowExecution { - return &QRepFlowExecution{ config: config, flowExecutionID: workflow.GetInfo(ctx).WorkflowExecution.ID, @@ -79,7 +76,6 @@ func NewQRepFlowExecution(ctx workflow.Context, config *protos.QRepConfig, runUU func NewQRepPartitionFlowExecution(ctx workflow.Context, config *protos.QRepConfig, runUUID string, ) *QRepPartitionFlowExecution { - return &QRepPartitionFlowExecution{ config: config, flowExecutionID: workflow.GetInfo(ctx).WorkflowExecution.ID, @@ -407,7 +403,6 @@ func QRepFlowWorkflow( config *protos.QRepConfig, state *protos.QRepFlowState, ) error { - // The structure of this workflow is as follows: // 1. Start the loop to continuously run the replication flow. // 2. In the loop, query the source database to get the partitions to replicate. @@ -542,7 +537,6 @@ func QRepPartitionWorkflow( partitions *protos.QRepPartitionBatch, runUUID string, ) error { - ctx = workflow.WithValue(ctx, shared.FlowNameKey, config.FlowJobName) q := NewQRepPartitionFlowExecution(ctx, config, runUUID) return q.ReplicatePartitions(ctx, partitions) From 4abf02cd101f97d81f0ef7794c1c8920496ff50d Mon Sep 17 00:00:00 2001 From: Pankaj B Date: Sun, 28 Jan 2024 07:05:11 +0530 Subject: [PATCH 19/36] fix after cleanup --- flow/connectors/clickhouse/cdc.go | 1 - ui/app/peers/create/[peerType]/helpers/ch.ts | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/connectors/clickhouse/cdc.go b/flow/connectors/clickhouse/cdc.go index 7b49ca4dfb..e6e5850e26 100644 --- a/flow/connectors/clickhouse/cdc.go +++ b/flow/connectors/clickhouse/cdc.go @@ -124,7 +124,6 @@ func (c *ClickhouseConnector) syncRecordsViaAvro( } qrepConfig := &protos.QRepConfig{ - DestinationPeer: c.config.DestinationPeer, StagingPath: c.config.S3Integration, FlowJobName: req.FlowJobName, DestinationTableIdentifier: strings.ToLower(fmt.Sprintf("%s", rawTableIdentifier)), diff --git a/ui/app/peers/create/[peerType]/helpers/ch.ts b/ui/app/peers/create/[peerType]/helpers/ch.ts index 57eee3c39a..025d64aa36 100644 --- a/ui/app/peers/create/[peerType]/helpers/ch.ts +++ b/ui/app/peers/create/[peerType]/helpers/ch.ts @@ -57,4 +57,5 @@ export const blankClickhouseSetting: ClickhouseConfig = { password: '', database: '', s3Integration: '', + metadataDb: undefined }; From 3b21db0c6062cde075cd4cb13692909221309a7d Mon Sep 17 00:00:00 2001 From: Pankaj B Date: Sun, 28 Jan 2024 07:30:55 +0530 Subject: [PATCH 20/36] fix after rebase --- flow/connectors/clickhouse/cdc.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/flow/connectors/clickhouse/cdc.go b/flow/connectors/clickhouse/cdc.go index e6e5850e26..fad2474fb4 100--- a/flow/connectors/clickhouse/cdc.go +++ b/flow/connectors/clickhouse/cdc.go @@ -151,7 +151,7 @@ func (c *ClickhouseConnector) syncRecordsViaAvro( } return &model.SyncResponse{ - LastSyncedCheckPointID: lastCheckpoint, + LastSyncedCheckpointID:
lastCheckpoint, NumRecordsSynced: int64(numRecords), CurrentSyncBatchID: syncBatchID, TableNameRowsMapping: tableNameRowsMapping, @@ -165,13 +165,13 @@ func (c *ClickhouseConnector) SyncRecords(req *model.SyncRecordsRequest) (*model rawTableName := c.getRawTableName(req.FlowJobName) c.logger.Info(fmt.Sprintf("pushing records to Clickhouse table %s", rawTableName)) - syncBatchID, err := c.GetLastSyncBatchID(req.FlowJobName) - if err != nil { - return nil, fmt.Errorf("failed to get previous syncBatchID: %w", err) - } - syncBatchID += 1 + // syncBatchID, err := c.GetLastSyncBatchID(req.FlowJobName) + // if err != nil { + // return nil, fmt.Errorf("failed to get previous syncBatchID: %w", err) + // } + // syncBatchID += 1 - res, err := c.syncRecordsViaAvro(req, rawTableName, syncBatchID) + res, err := c.syncRecordsViaAvro(req, rawTableName, req.SyncBatchID) if err != nil { return nil, err } From 2de2c4eecd5aa915b164cd4175c4e4048920d94a Mon Sep 17 00:00:00 2001 From: Pankaj B Date: Sun, 28 Jan 2024 14:19:37 +0530 Subject: [PATCH 21/36] fix after rebase --- flow/activities/flowable.go | 3 +- flow/connectors/clickhouse/cdc.go | 8 +++--- flow/connectors/clickhouse/normalize.go | 37 ++++++++++++------------- 3 files changed, 23 insertions(+), 25 deletions(-) diff --git a/flow/activities/flowable.go b/flow/activities/flowable.go index de3f2bcfeb..8031d0c328 100644 --- a/flow/activities/flowable.go +++ b/flow/activities/flowable.go @@ -299,12 +299,13 @@ func (a *FlowableActivity) StartFlow(ctx context.Context, TableMappings: input.FlowConnectionConfigs.TableMappings, StagingPath: input.FlowConnectionConfigs.CdcStagingPath, }) - res.RelationMessageMapping = input.RelationMessageMapping + if err != nil { slog.Warn("failed to push records", slog.Any("error", err)) a.Alerter.LogFlowError(ctx, flowName, err) return nil, fmt.Errorf("failed to push records: %w", err) } + res.RelationMessageMapping = input.RelationMessageMapping err = errGroup.Wait() if err != nil { diff --git a/flow/connectors/clickhouse/cdc.go b/flow/connectors/clickhouse/cdc.go index fad2474fb4..5a5ad6326a 100644 --- a/flow/connectors/clickhouse/cdc.go +++ b/flow/connectors/clickhouse/cdc.go @@ -139,8 +139,8 @@ func (c *ClickhouseConnector) syncRecordsViaAvro( return nil, err } - tableSchemaDeltas := req.Records.WaitForSchemaDeltas(req.TableMappings) - err = c.ReplayTableSchemaDeltas(req.FlowJobName, tableSchemaDeltas) + //tableSchemaDeltas := req.Records.WaitForSchemaDeltas(req.TableMappings) + err = c.ReplayTableSchemaDeltas(req.FlowJobName, req.Records.SchemaDeltas) if err != nil { return nil, fmt.Errorf("failed to sync schema changes: %w", err) } @@ -155,8 +155,8 @@ func (c *ClickhouseConnector) syncRecordsViaAvro( NumRecordsSynced: int64(numRecords), CurrentSyncBatchID: syncBatchID, TableNameRowsMapping: tableNameRowsMapping, - TableSchemaDeltas: tableSchemaDeltas, - RelationMessageMapping: <-req.Records.RelationMessageMapping, + TableSchemaDeltas: req.Records.SchemaDeltas, + //RelationMessageMapping: <-req.Records.RelationMessageMapping, }, nil } diff --git a/flow/connectors/clickhouse/normalize.go b/flow/connectors/clickhouse/normalize.go index 2432809192..c82602c232 100644 --- a/flow/connectors/clickhouse/normalize.go +++ b/flow/connectors/clickhouse/normalize.go @@ -104,25 +104,25 @@ func generateCreateTableSQLForNormalizedTable( } func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsRequest) (*model.NormalizeResponse, error) { - batchIDs, err := c.GetLastSyncAndNormalizeBatchID(req.FlowJobName) + 
normBatchID, err := c.GetLastNormalizeBatchID(req.FlowJobName) if err != nil { c.logger.ErrorContext(c.ctx, "[clickhouse] error while getting last sync and normalize batch id", err) return nil, err } // normalize has caught up with sync, chill until more records are loaded. - if batchIDs.NormalizeBatchID >= batchIDs.SyncBatchID { + if normBatchID >= req.SyncBatchID { return &model.NormalizeResponse{ Done: false, - StartBatchID: batchIDs.NormalizeBatchID, - EndBatchID: batchIDs.SyncBatchID, + StartBatchID: normBatchID, + EndBatchID: req.SyncBatchID, }, nil } destinationTableNames, err := c.getDistinctTableNamesInBatch( req.FlowJobName, - batchIDs.SyncBatchID, - batchIDs.NormalizeBatchID, + req.SyncBatchID, + normBatchID, ) if err != nil { c.logger.ErrorContext(c.ctx, "[clickhouse] error while getting distinct table names in batch", err) @@ -179,9 +179,9 @@ func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsReques selectQuery.WriteString(" FROM ") selectQuery.WriteString(rawTbl) selectQuery.WriteString(" WHERE _peerdb_batch_id > ") - selectQuery.WriteString(fmt.Sprintf("%d", batchIDs.NormalizeBatchID)) + selectQuery.WriteString(fmt.Sprintf("%d", normBatchID)) selectQuery.WriteString(" AND _peerdb_batch_id <= ") - selectQuery.WriteString(fmt.Sprintf("%d", batchIDs.SyncBatchID)) + selectQuery.WriteString(fmt.Sprintf("%d", req.SyncBatchID)) selectQuery.WriteString(" AND _peerdb_destination_table_name = '") selectQuery.WriteString(tbl) selectQuery.WriteString("'") @@ -203,12 +203,12 @@ func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsReques } } - endNormalizeBatchId := batchIDs.NormalizeBatchID + 1 + endNormalizeBatchId := normBatchID + 1 c.pgMetadata.UpdateNormalizeBatchID(req.FlowJobName, endNormalizeBatchId) return &model.NormalizeResponse{ Done: true, StartBatchID: endNormalizeBatchId, - EndBatchID: batchIDs.SyncBatchID, + EndBatchID: req.SyncBatchID, }, nil } @@ -246,19 +246,16 @@ func (c *ClickhouseConnector) getDistinctTableNamesInBatch( return tableNames, nil } -func (c *ClickhouseConnector) GetLastSyncAndNormalizeBatchID(flowJobName string) (model.SyncAndNormalizeBatchID, error) { - syncBatchID, err := c.pgMetadata.GetLastBatchID(flowJobName) - if err != nil { - return model.SyncAndNormalizeBatchID{}, fmt.Errorf("error while getting last sync batch id: %w", err) - } +func (c *ClickhouseConnector) GetLastNormalizeBatchID(flowJobName string) (int64, error) { + // syncBatchID, err := c.pgMetadata.GetLastBatchID(flowJobName) + // if err != nil { + // return 0, fmt.Errorf("error while getting last sync batch id: %w", err) + // } normalizeBatchID, err := c.pgMetadata.GetLastNormalizeBatchID(flowJobName) if err != nil { - return model.SyncAndNormalizeBatchID{}, fmt.Errorf("error while getting last normalize batch id: %w", err) + return 0, fmt.Errorf("error while getting last normalize batch id: %w", err) } - return model.SyncAndNormalizeBatchID{ - SyncBatchID: syncBatchID, - NormalizeBatchID: normalizeBatchID, - }, nil + return normalizeBatchID, nil } From fb598ac9320b314183465acc56e592beb56b1aae Mon Sep 17 00:00:00 2001 From: Pankaj B Date: Mon, 29 Jan 2024 17:10:03 +0530 Subject: [PATCH 22/36] fix s3 url formation issue --- flow/connectors/clickhouse/qrep_avro_sync.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/flow/connectors/clickhouse/qrep_avro_sync.go b/flow/connectors/clickhouse/qrep_avro_sync.go index 0bf5c0c8c7..6fbe68f1cf 100644 --- a/flow/connectors/clickhouse/qrep_avro_sync.go +++ 
b/flow/connectors/clickhouse/qrep_avro_sync.go @@ -42,7 +42,7 @@ func (s *ClickhouseAvroSyncMethod) CopyStageToDestination(avroFile *avro.AvroFil return err } awsCreds, err := utils.GetAWSSecrets(utils.S3PeerCredentials{}) - avroFileUrl := fmt.Sprintf("https://%s.s3.%s.amazonaws.com%s", s3o.Bucket, awsCreds.Region, avroFile.FilePath) + avroFileUrl := fmt.Sprintf("https://%s.s3.%s.amazonaws.com/%s", s3o.Bucket, awsCreds.Region, avroFile.FilePath) if err != nil { return err @@ -124,7 +124,7 @@ func (s *ClickhouseAvroSyncMethod) SyncQRepRecords( return 0, err } awsCreds, err := utils.GetAWSSecrets(utils.S3PeerCredentials{}) - avroFileUrl := fmt.Sprintf("https://%s.s3.%s.amazonaws.com%s", s3o.Bucket, awsCreds.Region, avroFile.FilePath) + avroFileUrl := fmt.Sprintf("https://%s.s3.%s.amazonaws.com/%s", s3o.Bucket, awsCreds.Region, avroFile.FilePath) if err != nil { return 0, err @@ -178,7 +178,9 @@ func (s *ClickhouseAvroSyncMethod) writeToAvroFile( return nil, fmt.Errorf("failed to parse staging path: %w", err) } - s3AvroFileKey := fmt.Sprintf("%s/%s/%s.avro.zst", s3o.Prefix, flowJobName, partitionID) // s.config.FlowJobName + s3AvroFileKey := fmt.Sprintf("%s/%s/%s.avro.zst", s3o.Prefix, flowJobName, partitionID) // s.config.FlowJobName + s3AvroFileKey = strings.Trim(s3AvroFileKey, "/") + avroFile, err := ocfWriter.WriteRecordsToS3(s3o.Bucket, s3AvroFileKey, utils.S3PeerCredentials{}) ///utils.S3PeerCredentials{}) if err != nil { return nil, fmt.Errorf("failed to write records to S3: %w", err) From ad208cee0936b262067147ec95417dee54888a51 Mon Sep 17 00:00:00 2001 From: Pankaj B Date: Mon, 29 Jan 2024 18:29:56 +0530 Subject: [PATCH 23/36] restore docker --- docker-compose-dev.yml | 77 +++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml index cbd2c8afa5..dfd74c0554 100644 --- a/docker-compose-dev.yml +++ b/docker-compose-dev.yml @@ -19,10 +19,10 @@ x-flow-worker-env: &flow-worker-env # For GCS, these will be your HMAC keys instead # For more information: # https://cloud.google.com/storage/docs/authentication/managing-hmackeys - AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-AKIASB7EBZDCEVIMB4XH} - AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY:-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} + AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-} + AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY:-} # For GCS, set this to "auto" without the quotes - AWS_REGION: ${AWS_REGION:-us-east-2} + AWS_REGION: ${AWS_REGION:-} # For GCS, set this as: https://storage.googleapis.com AWS_ENDPOINT: ${AWS_ENDPOINT:-} # enables worker profiling using Grafana Pyroscope @@ -178,40 +178,41 @@ services: temporal-admin-tools: condition: service_healthy - # peerdb: - # container_name: peerdb-server - # stop_signal: SIGINT - # build: - # context: . - # dockerfile: stacks/peerdb-server.Dockerfile - # environment: - # <<: *catalog-config - # PEERDB_LOG_DIR: /var/log/peerdb - # PEERDB_PASSWORD: peerdb - # PEERDB_FLOW_SERVER_ADDRESS: grpc://flow_api:8112 - # RUST_LOG: info - # RUST_BACKTRACE: 1 - # ports: - # - 9900:9900 - # depends_on: - # catalog: - # condition: service_healthy + peerdb: + container_name: peerdb-server + stop_signal: SIGINT + build: + context: . 
+ dockerfile: stacks/peerdb-server.Dockerfile + environment: + <<: *catalog-config + PEERDB_LOG_DIR: /var/log/peerdb + PEERDB_PASSWORD: peerdb + PEERDB_FLOW_SERVER_ADDRESS: grpc://flow_api:8112 + RUST_LOG: info + RUST_BACKTRACE: 1 + ports: + - 9900:9900 + depends_on: + catalog: + condition: service_healthy + + peerdb-ui: + container_name: peerdb-ui + build: + context: . + dockerfile: stacks/peerdb-ui.Dockerfile + ports: + - 3000:3000 + environment: + <<: *catalog-config + DATABASE_URL: postgres://postgres:postgres@catalog:5432/postgres + PEERDB_FLOW_SERVER_HTTP: http://flow_api:8113 + PEERDB_PASSWORD: + NEXTAUTH_SECRET: __changeme__ + NEXTAUTH_URL: http://localhost:3000 + depends_on: + - flow-api - # peerdb-ui: - # container_name: peerdb-ui - # build: - # context: . - # dockerfile: stacks/peerdb-ui.Dockerfile - # ports: - # - 3000:3000 - # environment: - # <<: *catalog-config - # DATABASE_URL: postgres://postgres:postgres@catalog:5432/postgres - # PEERDB_FLOW_SERVER_HTTP: http://flow_api:8113 - # PEERDB_PASSWORD: - # NEXTAUTH_SECRET: __changeme__ - # NEXTAUTH_URL: http://localhost:3000 - # depends_on: - # - flow-api volumes: - pgdata: + pgdata: \ No newline at end of file From a51dbf360a828984ef1aa3f6ac3208502e7a73a8 Mon Sep 17 00:00:00 2001 From: Pankaj B Date: Mon, 29 Jan 2024 18:31:42 +0530 Subject: [PATCH 24/36] lint --- flow/activities/flowable.go | 1 - flow/connectors/clickhouse/cdc.go | 11 ++++++----- flow/connectors/clickhouse/clickhouse.go | 1 + flow/connectors/clickhouse/normalize.go | 2 +- flow/connectors/clickhouse/qrep_avro_sync.go | 12 ++++++------ 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/flow/activities/flowable.go b/flow/activities/flowable.go index 8031d0c328..3c19feec0c 100644 --- a/flow/activities/flowable.go +++ b/flow/activities/flowable.go @@ -299,7 +299,6 @@ func (a *FlowableActivity) StartFlow(ctx context.Context, TableMappings: input.FlowConnectionConfigs.TableMappings, StagingPath: input.FlowConnectionConfigs.CdcStagingPath, }) - if err != nil { slog.Warn("failed to push records", slog.Any("error", err)) a.Alerter.LogFlowError(ctx, flowName, err) diff --git a/flow/connectors/clickhouse/cdc.go b/flow/connectors/clickhouse/cdc.go index 5a5ad6326a..66a5c0634e 100644 --- a/flow/connectors/clickhouse/cdc.go +++ b/flow/connectors/clickhouse/cdc.go @@ -9,10 +9,11 @@ import ( _ "github.com/ClickHouse/clickhouse-go/v2" _ "github.com/ClickHouse/clickhouse-go/v2/lib/driver" + "github.com/jackc/pgx/v5/pgtype" + "github.com/PeerDB-io/peer-flow/connectors/utils" "github.com/PeerDB-io/peer-flow/generated/protos" "github.com/PeerDB-io/peer-flow/model" - "github.com/jackc/pgx/v5/pgtype" ) const ( @@ -117,7 +118,7 @@ func (c *ClickhouseConnector) syncRecordsViaAvro( tableNameRowsMapping := make(map[string]uint32) streamReq := model.NewRecordsToStreamRequest(req.Records.GetRecords(), tableNameRowsMapping, syncBatchID) streamRes, err := utils.RecordsToRawTableStream(streamReq) - //x := *&streamRes.Stream + //y := (*x).Records if err != nil { return nil, fmt.Errorf("failed to convert records to raw table stream: %w", err) @@ -139,7 +140,7 @@ func (c *ClickhouseConnector) syncRecordsViaAvro( return nil, err } - //tableSchemaDeltas := req.Records.WaitForSchemaDeltas(req.TableMappings) + err = c.ReplayTableSchemaDeltas(req.FlowJobName, req.Records.SchemaDeltas) if err != nil { return nil, fmt.Errorf("failed to sync schema changes: %w", err) @@ -156,12 +157,12 @@ func (c *ClickhouseConnector) syncRecordsViaAvro( CurrentSyncBatchID: syncBatchID, TableNameRowsMapping: 
tableNameRowsMapping, TableSchemaDeltas: req.Records.SchemaDeltas, - //RelationMessageMapping: <-req.Records.RelationMessageMapping, + // RelationMessageMapping: <-req.Records.RelationMessageMapping, }, nil } func (c *ClickhouseConnector) SyncRecords(req *model.SyncRecordsRequest) (*model.SyncResponse, error) { - //c.config.S3Integration = "s3://avro-clickhouse" + rawTableName := c.getRawTableName(req.FlowJobName) c.logger.Info(fmt.Sprintf("pushing records to Clickhouse table %s", rawTableName)) diff --git a/flow/connectors/clickhouse/clickhouse.go b/flow/connectors/clickhouse/clickhouse.go index 71a7605d17..51dd545f7f 100644 --- a/flow/connectors/clickhouse/clickhouse.go +++ b/flow/connectors/clickhouse/clickhouse.go @@ -8,6 +8,7 @@ import ( _ "github.com/ClickHouse/clickhouse-go/v2" _ "github.com/ClickHouse/clickhouse-go/v2/lib/driver" + metadataStore "github.com/PeerDB-io/peer-flow/connectors/external_metadata" "github.com/PeerDB-io/peer-flow/generated/protos" "github.com/PeerDB-io/peer-flow/shared" diff --git a/flow/connectors/clickhouse/normalize.go b/flow/connectors/clickhouse/normalize.go index c82602c232..f997685954 100644 --- a/flow/connectors/clickhouse/normalize.go +++ b/flow/connectors/clickhouse/normalize.go @@ -140,7 +140,7 @@ func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsReques colSelector := strings.Builder{} colSelector.WriteString("(") - //schema := c.tableSchemaMapping[tbl] + schema := req.TableNameSchemaMapping[tbl] numCols := len(schema.ColumnNames) diff --git a/flow/connectors/clickhouse/qrep_avro_sync.go b/flow/connectors/clickhouse/qrep_avro_sync.go index 6fbe68f1cf..5adad380d4 100644 --- a/flow/connectors/clickhouse/qrep_avro_sync.go +++ b/flow/connectors/clickhouse/qrep_avro_sync.go @@ -35,7 +35,7 @@ func NewClickhouseAvroSyncMethod( func (s *ClickhouseAvroSyncMethod) CopyStageToDestination(avroFile *avro.AvroFile) error { stagingPath := s.config.StagingPath if stagingPath == "" { - stagingPath = s.config.DestinationPeer.GetClickhouseConfig().S3Integration //"s3://avro-clickhouse" + stagingPath = s.config.DestinationPeer.GetClickhouseConfig().S3Integration // "s3://avro-clickhouse" } s3o, err := utils.NewS3BucketAndPrefix(stagingPath) if err != nil { @@ -61,7 +61,7 @@ func (s *ClickhouseAvroSyncMethod) SyncRecords( stream *model.QRecordStream, flowJobName string, ) (int, error) { - //s.config.StagingPath = "s3://avro-clickhouse" + tableLog := slog.String("destinationTable", s.config.DestinationTableIdentifier) dstTableName := s.config.DestinationTableIdentifier @@ -85,7 +85,7 @@ func (s *ClickhouseAvroSyncMethod) SyncRecords( defer avroFile.Cleanup() s.connector.logger.Info(fmt.Sprintf("written %d records to Avro file", avroFile.NumRecords), tableLog) - //Copy stage/avro to destination + err = s.CopyStageToDestination(avroFile) if err != nil { return 0, err @@ -102,7 +102,7 @@ func (s *ClickhouseAvroSyncMethod) SyncQRepRecords( ) (int, error) { startTime := time.Now() dstTableName := config.DestinationTableIdentifier - //s.config.StagingPath = "s3://avro-clickhouse" + stagingPath := s.config.DestinationPeer.GetClickhouseConfig().S3Integration schema, err := stream.Schema() @@ -167,9 +167,9 @@ func (s *ClickhouseAvroSyncMethod) writeToAvroFile( partitionID string, flowJobName string, ) (*avro.AvroFile, error) { - stagingPath := s.config.StagingPath //"s3://avro-clickhouse" + stagingPath := s.config.StagingPath // "s3://avro-clickhouse" if stagingPath == "" { - stagingPath = s.config.DestinationPeer.GetClickhouseConfig().S3Integration 
//"s3://avro-clickhouse" + stagingPath = s.config.DestinationPeer.GetClickhouseConfig().S3Integration // "s3://avro-clickhouse" } ocfWriter := avro.NewPeerDBOCFWriter(s.connector.ctx, stream, avroSchema, avro.CompressZstd, qvalue.QDWHTypeClickhouse) From 80dc51b540b9a0547c6d89ed63e5a2ba25291a70 Mon Sep 17 00:00:00 2001 From: Pankaj B Date: Mon, 29 Jan 2024 18:56:10 +0530 Subject: [PATCH 25/36] remove unused code --- flow/connectors/clickhouse/cdc.go | 152 ------------------------ flow/connectors/clickhouse/normalize.go | 6 - 2 files changed, 158 deletions(-) diff --git a/flow/connectors/clickhouse/cdc.go b/flow/connectors/clickhouse/cdc.go index 66a5c0634e..67d290cdeb 100644 --- a/flow/connectors/clickhouse/cdc.go +++ b/flow/connectors/clickhouse/cdc.go @@ -49,35 +49,9 @@ type MirrorJobRow struct { NormalizeBatchID int } -// func (c *ClickhouseConnector) getMirrorRowByJobNAme(jobName string) (*MirrorJobRow, error) { -// getLastOffsetSQL := "SELECT mirror_job_name, offset, sync_batch_id, normalize_batch_id FROM %s WHERE MIRROR_JOB_NAME=? Limit 1" - -// row := c.database.QueryRowContext(c.ctx, fmt.Sprintf(getLastOffsetSQL, mirrorJobsTableIdentifier), jobName) - -// var result MirrorJobRow - -// err := row.Scan( -// &result.MirrorJobName, -// &result.Offset, -// &result.SyncBatchID, -// &result.NormalizeBatchID, -// ) - -// if err != nil { -// return nil, err -// } - -// return &result, nil -// } - func (c *ClickhouseConnector) CreateRawTable(req *protos.CreateRawTableInput) (*protos.CreateRawTableOutput, error) { rawTableName := c.getRawTableName(req.FlowJobName) - // createRawTableTx, err := c.database.BeginTx(c.ctx, nil) - // if err != nil { - // return nil, fmt.Errorf("unable to begin transaction for creation of raw table: %w", err) - // } - createRawTableSQL := `CREATE TABLE IF NOT EXISTS %s ( _peerdb_uid String NOT NULL, _peerdb_timestamp Int64 NOT NULL, @@ -94,17 +68,6 @@ func (c *ClickhouseConnector) CreateRawTable(req *protos.CreateRawTableInput) (* if err != nil { return nil, fmt.Errorf("unable to create raw table: %w", err) } - // err = createRawTableTx.Commit() - // if err != nil { - // return nil, fmt.Errorf("unable to commit transaction for creation of raw table: %w", err) - // } - - // stage := c.getStageNameForJob(req.FlowJobName) - // err = c.createStage(stage, &protos.QRepConfig{}) - // if err != nil { - // return nil, err - // } - return &protos.CreateRawTableOutput{ TableIdentifier: rawTableName, }, nil @@ -119,7 +82,6 @@ func (c *ClickhouseConnector) syncRecordsViaAvro( streamReq := model.NewRecordsToStreamRequest(req.Records.GetRecords(), tableNameRowsMapping, syncBatchID) streamRes, err := utils.RecordsToRawTableStream(streamReq) - //y := (*x).Records if err != nil { return nil, fmt.Errorf("failed to convert records to raw table stream: %w", err) } @@ -140,7 +102,6 @@ func (c *ClickhouseConnector) syncRecordsViaAvro( return nil, err } - err = c.ReplayTableSchemaDeltas(req.FlowJobName, req.Records.SchemaDeltas) if err != nil { return nil, fmt.Errorf("failed to sync schema changes: %w", err) @@ -157,7 +118,6 @@ func (c *ClickhouseConnector) syncRecordsViaAvro( CurrentSyncBatchID: syncBatchID, TableNameRowsMapping: tableNameRowsMapping, TableSchemaDeltas: req.Records.SchemaDeltas, - // RelationMessageMapping: <-req.Records.RelationMessageMapping, }, nil } @@ -166,42 +126,11 @@ func (c *ClickhouseConnector) SyncRecords(req *model.SyncRecordsRequest) (*model rawTableName := c.getRawTableName(req.FlowJobName) c.logger.Info(fmt.Sprintf("pushing records to Clickhouse table %s", 
rawTableName))
-	// syncBatchID, err := c.GetLastSyncBatchID(req.FlowJobName)
-	// if err != nil {
-	// 	return nil, fmt.Errorf("failed to get previous syncBatchID: %w", err)
-	// }
-	// syncBatchID += 1
-
 	res, err := c.syncRecordsViaAvro(req, rawTableName, req.SyncBatchID)
 	if err != nil {
 		return nil, err
 	}
-	// transaction for SyncRecords
-	// syncRecordsTx, err := c.database.BeginTx(c.ctx, nil)
-	// if err != nil {
-	// 	return nil, err
-	// }
-	// in case we return after error, ensure transaction is rolled back
-	// defer func() {
-	// 	deferErr := syncRecordsTx.Rollback()
-	// 	if deferErr != sql.ErrTxDone && deferErr != nil {
-	// 		c.logger.Error("error while rolling back transaction for SyncRecords: %v",
-	// 			slog.Any("error", deferErr), slog.Int64("syncBatchID", syncBatchID))
-	// 	}
-	// }()
-
-	// updating metadata with new offset and syncBatchID
-	// err = c.updateSyncMetadata(req.FlowJobName, res.LastSyncedCheckPointID, syncBatchID, syncRecordsTx)
-	// if err != nil {
-	// 	return nil, err
-	// }
-	// transaction commits
-	// err = syncRecordsTx.Commit()
-	// if err != nil {
-	// 	return nil, err
-	// }
-
 	lastCheckpoint, err := req.Records.GetLastCheckpoint()
 	if err != nil {
 		return nil, fmt.Errorf("failed to get last checkpoint: %w", err)
@@ -233,74 +162,7 @@ func (c *ClickhouseConnector) jobMetadataExistsTx(tx *sql.Tx, jobName string) (b
 	return result.Bool, nil
 }
 
-// func (c *ClickhouseConnector) updateSyncMetadata(flowJobName string, lastCP int64,
-// 	syncBatchID int64, syncRecordsTx *sql.Tx,
-// ) error {
-// 	jobMetadataExists, err := c.jobMetadataExistsTx(syncRecordsTx, flowJobName)
-// 	if err != nil {
-// 		return fmt.Errorf("failed to get sync status for flow job: %w", err)
-// 	}
-
-// 	if !jobMetadataExists {
-// 		_, err := syncRecordsTx.ExecContext(c.ctx,
-// 			fmt.Sprintf(insertJobMetadataSQL, c.metadataSchema, mirrorJobsTableIdentifier),
-// 			flowJobName, lastCP, syncBatchID, 0)
-// 		if err != nil {
-// 			return fmt.Errorf("failed to insert flow job status: %w", err)
-// 		}
-// 	} else {
-// 		_, err := syncRecordsTx.ExecContext(c.ctx,
-// 			fmt.Sprintf(updateMetadataForSyncRecordsSQL, c.metadataSchema, mirrorJobsTableIdentifier),
-// 			lastCP, syncBatchID, flowJobName)
-// 		if err != nil {
-// 			return fmt.Errorf("failed to update flow job status: %w", err)
-// 		}
-// 	}
-
-// 	return nil
-// }
-
 func (c *ClickhouseConnector) SyncFlowCleanup(jobName string) error {
-	// syncFlowCleanupTx, err := c.database.BeginTx(c.ctx, nil)
-	// if err != nil {
-	// 	return fmt.Errorf("unable to begin transaction for sync flow cleanup: %w", err)
-	// }
-	// defer func() {
-	// 	deferErr := syncFlowCleanupTx.Rollback()
-	// 	if deferErr != sql.ErrTxDone && deferErr != nil {
-	// 		c.logger.Error("error while rolling back transaction for flow cleanup", slog.Any("error", deferErr))
-	// 	}
-	// }()
-
-	// row := syncFlowCleanupTx.QueryRowContext(c.ctx, checkSchemaExistsSQL, c.metadataSchema)
-	// var schemaExists pgtype.Bool
-	// err = row.Scan(&schemaExists)
-	// if err != nil {
-	// 	return fmt.Errorf("unable to check if internal schema exists: %w", err)
-	// }
-
-	// if schemaExists.Bool {
-	// 	_, err = syncFlowCleanupTx.ExecContext(c.ctx, fmt.Sprintf(dropTableIfExistsSQL, c.metadataSchema,
-	// 		getRawTableIdentifier(jobName)))
-	// 	if err != nil {
-	// 		return fmt.Errorf("unable to drop raw table: %w", err)
-	// 	}
-	// 	_, err = syncFlowCleanupTx.ExecContext(c.ctx,
-	// 		fmt.Sprintf(deleteJobMetadataSQL, c.metadataSchema, mirrorJobsTableIdentifier), jobName)
-	// 	if err != nil {
-	// 		return fmt.Errorf("unable to delete job metadata: %w", err)
-	// 	}
-	// }
-
-	// err = syncFlowCleanupTx.Commit()
-	// if err != nil {
-	// 	return fmt.Errorf("unable to commit transaction for sync flow cleanup: %w", err)
-	// }
-
-	// err = c.dropStage("", jobName)
-	// if err != nil {
-	// 	return err
-	// }
 	err := c.pgMetadata.DropMetadata(jobName)
 	if err != nil {
 		return err
 	}
@@ -331,12 +193,6 @@ func (c *ClickhouseConnector) SetupMetadataTables() error {
 	return nil
 }
 
-// func (c *ClickhouseConnector) SetupNormalizedTables(
-// 	req *protos.SetupNormalizedTableBatchInput,
-// ) (*protos.SetupNormalizedTableBatchOutput, error) {
-// 	return nil, nil
-// }
-
 func (c *ClickhouseConnector) GetLastSyncBatchID(jobName string) (int64, error) {
 	return c.pgMetadata.GetLastBatchID(jobName)
 }
@@ -355,11 +211,3 @@ func (c *ClickhouseConnector) SetLastOffset(jobName string, offset int64) error
 
 	return nil
 }
-
-// func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsRequest) (*model.NormalizeResponse, error) {
-// 	return &model.NormalizeResponse{
-// 		Done:          true,
-// 		StartBatchID:  1,
-// 		EndBatchID:    1,
-// 	}, nil
-// }
diff --git a/flow/connectors/clickhouse/normalize.go b/flow/connectors/clickhouse/normalize.go
index f997685954..de1511deac 100644
--- a/flow/connectors/clickhouse/normalize.go
+++ b/flow/connectors/clickhouse/normalize.go
@@ -140,7 +140,6 @@ func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsReques
 	colSelector := strings.Builder{}
 	colSelector.WriteString("(")
-
 	schema := req.TableNameSchemaMapping[tbl]
 	numCols := len(schema.ColumnNames)
@@ -247,11 +246,6 @@ func (c *ClickhouseConnector) getDistinctTableNamesInBatch(
 }
 
 func (c *ClickhouseConnector) GetLastNormalizeBatchID(flowJobName string) (int64, error) {
-	// syncBatchID, err := c.pgMetadata.GetLastBatchID(flowJobName)
-	// if err != nil {
-	// 	return 0, fmt.Errorf("error while getting last sync batch id: %w", err)
-	// }
-
 	normalizeBatchID, err := c.pgMetadata.GetLastNormalizeBatchID(flowJobName)
 	if err != nil {
 		return 0, fmt.Errorf("error while getting last normalize batch id: %w", err)

From d97f69a6edf559042b899e77ce5063e86888f81d Mon Sep 17 00:00:00 2001
From: Pankaj B
Date: Mon, 29 Jan 2024 19:31:15 +0530
Subject: [PATCH 26/36] fix build issue

---
 docker-compose.yml        | 2 +-
 nexus/analyzer/src/lib.rs | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 7ac7e19bb5..ae6ec7bb90 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -181,4 +181,4 @@ services:
 
 volumes:
   pgdata:
-  prometheusdata:
+  prometheusdata:
\ No newline at end of file
diff --git a/nexus/analyzer/src/lib.rs b/nexus/analyzer/src/lib.rs
index 6602782e72..d2856cc381 100644
--- a/nexus/analyzer/src/lib.rs
+++ b/nexus/analyzer/src/lib.rs
@@ -789,6 +789,9 @@ fn parse_db_options(
             Some(config)
         }
         DbType::Clickhouse => {
+            let conn_str = opts.get("metadata_db");
+            let metadata_db = parse_metadata_db_info(conn_str.copied())?;
+
             let s3_int = opts
                 .get("s3_integration")
                 .map(|s| s.to_string())
@@ -814,6 +817,7 @@ fn parse_db_options(
                     .context("no default database specified")?
                     .to_string(),
                 s3_integration: s3_int,
+                metadata_db,
             };
             let config = Config::ClickhouseConfig(clickhouse_config);
             Some(config)

From 56ebce63c69f8762c1329a3d8953daec73af5c6b Mon Sep 17 00:00:00 2001
From: Pankaj B
Date: Mon, 29 Jan 2024 19:43:42 +0530
Subject: [PATCH 27/36] fix build issue

---
 flow/connectors/clickhouse/normalize.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flow/connectors/clickhouse/normalize.go b/flow/connectors/clickhouse/normalize.go
index de1511deac..f7d4060a65 100644
--- a/flow/connectors/clickhouse/normalize.go
+++ b/flow/connectors/clickhouse/normalize.go
@@ -194,7 +194,7 @@ func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsReques
 	insertIntoSelectQuery.WriteString(selectQuery.String())
 
 	q := insertIntoSelectQuery.String()
-	c.logger.InfoContext(c.ctx, "[clickhouse] insert into select query", q)
+	c.logger.InfoContext(c.ctx, fmt.Sprintf("[clickhouse] insert into select query", q))
 
 	_, err = c.database.ExecContext(c.ctx, q)
 	if err != nil {

From 553830fa6f317858be9e1f818d10307b5b76479a Mon Sep 17 00:00:00 2001
From: Pankaj B
Date: Mon, 29 Jan 2024 19:44:24 +0530
Subject: [PATCH 28/36] fix build issue

---
 flow/connectors/clickhouse/normalize.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flow/connectors/clickhouse/normalize.go b/flow/connectors/clickhouse/normalize.go
index f7d4060a65..8439b8e730 100644
--- a/flow/connectors/clickhouse/normalize.go
+++ b/flow/connectors/clickhouse/normalize.go
@@ -194,7 +194,7 @@ func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsReques
 	insertIntoSelectQuery.WriteString(selectQuery.String())
 
 	q := insertIntoSelectQuery.String()
-	c.logger.InfoContext(c.ctx, fmt.Sprintf("[clickhouse] insert into select query", q))
+	c.logger.InfoContext(c.ctx, fmt.Sprintf("[clickhouse] insert into select query %s", q))
 
 	_, err = c.database.ExecContext(c.ctx, q)
 	if err != nil {

From 083a445c18e46f25b9ffca7ef55e81ab16ae9dfe Mon Sep 17 00:00:00 2001
From: Pankaj B
Date: Mon, 29 Jan 2024 21:16:37 +0530
Subject: [PATCH 29/36] fix flow build

---
 flow/model/qvalue/kind.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flow/model/qvalue/kind.go b/flow/model/qvalue/kind.go
index d67feea471..5c1c9271e8 100644
--- a/flow/model/qvalue/kind.go
+++ b/flow/model/qvalue/kind.go
@@ -141,6 +141,6 @@ func (kind QValueKind) ToDWHColumnType(dwhType QDWHType) (string, error) {
 		return "String", nil
 	}
 	default:
-		return "", fmt.Errorf("unknown dwh type: %s", dwhType)
+		return "", fmt.Errorf("unknown dwh type: %v", dwhType)
 	}
 }

From 08f822b976c75f1b8155256260042362a65530dd Mon Sep 17 00:00:00 2001
From: Pankaj B
Date: Mon, 29 Jan 2024 21:59:48 +0530
Subject: [PATCH 30/36] fix lint error in normalize.go

---
 flow/connectors/clickhouse/normalize.go | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/flow/connectors/clickhouse/normalize.go b/flow/connectors/clickhouse/normalize.go
index 8439b8e730..f0aac33fd2 100644
--- a/flow/connectors/clickhouse/normalize.go
+++ b/flow/connectors/clickhouse/normalize.go
@@ -3,6 +3,7 @@ package connclickhouse
 import (
 	"database/sql"
 	"fmt"
+	"strconv"
 	"strings"
 
 	"github.com/PeerDB-io/peer-flow/generated/protos"
@@ -56,7 +57,7 @@ func (c *ClickhouseConnector) SetupNormalizedTables(
 func generateCreateTableSQLForNormalizedTable(
 	normalizedTable string,
 	tableSchema *protos.TableSchema,
-	softDeleteColName string,
+	_ string, // softDeleteColName
 	syncedAtColName string,
 ) (string, error) {
 	var stmtBuilder strings.Builder
@@ -178,9 +179,9 @@ func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsReques
 		selectQuery.WriteString(" FROM ")
 		selectQuery.WriteString(rawTbl)
 		selectQuery.WriteString(" WHERE _peerdb_batch_id > ")
-		selectQuery.WriteString(fmt.Sprintf("%d", normBatchID))
+		selectQuery.WriteString(strconv.FormatInt(normBatchID, 10))
 		selectQuery.WriteString(" AND _peerdb_batch_id <= ")
-		selectQuery.WriteString(fmt.Sprintf("%d", req.SyncBatchID))
+		selectQuery.WriteString(strconv.FormatInt(req.SyncBatchID, 10))
 		selectQuery.WriteString(" AND _peerdb_destination_table_name = '")
 		selectQuery.WriteString(tbl)
 		selectQuery.WriteString("'")
@@ -203,7 +204,12 @@ func (c *ClickhouseConnector) NormalizeRecords(req *model.NormalizeRecordsReques
 	}
 
 	endNormalizeBatchId := normBatchID + 1
-	c.pgMetadata.UpdateNormalizeBatchID(req.FlowJobName, endNormalizeBatchId)
+	err = c.pgMetadata.UpdateNormalizeBatchID(req.FlowJobName, endNormalizeBatchId)
+	if err != nil {
+		c.logger.ErrorContext(c.ctx, "[clickhouse] error while updating normalize batch id", err)
+		return nil, err
+	}
+
 	return &model.NormalizeResponse{
 		Done:          true,
 		StartBatchID:  endNormalizeBatchId,
@@ -218,6 +224,7 @@ func (c *ClickhouseConnector) getDistinctTableNamesInBatch(
 ) ([]string, error) {
 	rawTbl := c.getRawTableName(flowJobName)
 
+	//nolint:gosec
 	q := fmt.Sprintf(
 		`SELECT DISTINCT _peerdb_destination_table_name FROM %s WHERE _peerdb_batch_id > %d AND _peerdb_batch_id <= %d`,
 		rawTbl, normalizeBatchID, syncBatchID)
@@ -226,7 +233,7 @@ func (c *ClickhouseConnector) getDistinctTableNamesInBatch(
 	if err != nil {
 		return nil, fmt.Errorf("error while querying raw table for distinct table names in batch: %w", err)
 	}
-
+	defer rows.Close()
 	var tableNames []string
 	for rows.Next() {
 		var tableName sql.NullString
@@ -242,6 +249,11 @@ func (c *ClickhouseConnector) getDistinctTableNamesInBatch(
 		tableNames = append(tableNames, tableName.String)
 	}
 
+	err = rows.Err()
+	if err != nil {
+		return nil, fmt.Errorf("failed to read rows: %w", err)
+	}
+
 	return tableNames, nil
 }

From 8173deff52a098c4cc734a372cb3620b5e873950 Mon Sep 17 00:00:00 2001
From: Pankaj B
Date: Mon, 29 Jan 2024 22:07:28 +0530
Subject: [PATCH 31/36] fix more lint errors

---
 flow/connectors/clickhouse/cdc.go            | 17 +----------------
 flow/connectors/clickhouse/qrep_avro_sync.go |  3 ---
 2 files changed, 1 insertion(+), 19 deletions(-)

diff --git a/flow/connectors/clickhouse/cdc.go b/flow/connectors/clickhouse/cdc.go
index 67d290cdeb..f867d9d96f 100644
--- a/flow/connectors/clickhouse/cdc.go
+++ b/flow/connectors/clickhouse/cdc.go
@@ -9,7 +9,6 @@ import (
 
 	_ "github.com/ClickHouse/clickhouse-go/v2"
 	_ "github.com/ClickHouse/clickhouse-go/v2/lib/driver"
-	"github.com/jackc/pgx/v5/pgtype"
 
 	"github.com/PeerDB-io/peer-flow/connectors/utils"
 	"github.com/PeerDB-io/peer-flow/generated/protos"
@@ -81,7 +80,6 @@ func (c *ClickhouseConnector) syncRecordsViaAvro(
 	tableNameRowsMapping := make(map[string]uint32)
 	streamReq := model.NewRecordsToStreamRequest(req.Records.GetRecords(), tableNameRowsMapping, syncBatchID)
 	streamRes, err := utils.RecordsToRawTableStream(streamReq)
-
 	if err != nil {
 		return nil, fmt.Errorf("failed to convert records to raw table stream: %w", err)
 	}
@@ -89,7 +87,7 @@ func (c *ClickhouseConnector) syncRecordsViaAvro(
 	qrepConfig := &protos.QRepConfig{
 		StagingPath:                c.config.S3Integration,
 		FlowJobName:                req.FlowJobName,
-		DestinationTableIdentifier: strings.ToLower(fmt.Sprintf("%s", rawTableIdentifier)),
+		DestinationTableIdentifier: strings.ToLower(rawTableIdentifier),
 	}
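	// The CDC path reuses the QRep Avro machinery: the record stream above is
	// serialized to an Avro file staged via the S3 integration (StagingPath),
	// and the staged file is then bulk-loaded into the raw table, rather than
	// issuing per-row INSERTs against ClickHouse.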
 	avroSyncer := NewClickhouseAvroSyncMethod(qrepConfig, c)
 	destinationTableSchema, err := c.getTableSchema(qrepConfig.DestinationTableIdentifier)
@@ -122,7 +120,6 @@ func (c *ClickhouseConnector) syncRecordsViaAvro(
 }
 
 func (c *ClickhouseConnector) SyncRecords(req *model.SyncRecordsRequest) (*model.SyncResponse, error) {
-
 	rawTableName := c.getRawTableName(req.FlowJobName)
 	c.logger.Info(fmt.Sprintf("pushing records to Clickhouse table %s", rawTableName))
 
@@ -150,18 +147,6 @@ func (c *ClickhouseConnector) SyncRecords(req *model.SyncRecordsRequest) (*model
 	return res, nil
 }
 
-func (c *ClickhouseConnector) jobMetadataExistsTx(tx *sql.Tx, jobName string) (bool, error) {
-	checkIfJobMetadataExistsSQL := "SELECT TO_BOOLEAN(COUNT(1)) FROM %s WHERE MIRROR_JOB_NAME=?"
-
-	var result pgtype.Bool
-	err := tx.QueryRowContext(c.ctx,
-		fmt.Sprintf(checkIfJobMetadataExistsSQL, mirrorJobsTableIdentifier), jobName).Scan(&result)
-	if err != nil {
-		return false, fmt.Errorf("error reading result row: %w", err)
-	}
-	return result.Bool, nil
-}
-
 func (c *ClickhouseConnector) SyncFlowCleanup(jobName string) error {
 	err := c.pgMetadata.DropMetadata(jobName)
 	if err != nil {
diff --git a/flow/connectors/clickhouse/qrep_avro_sync.go b/flow/connectors/clickhouse/qrep_avro_sync.go
index 5adad380d4..68129a98d5 100644
--- a/flow/connectors/clickhouse/qrep_avro_sync.go
+++ b/flow/connectors/clickhouse/qrep_avro_sync.go
@@ -61,7 +61,6 @@ func (s *ClickhouseAvroSyncMethod) SyncRecords(
 	stream *model.QRecordStream,
 	flowJobName string,
 ) (int, error) {
-
 	tableLog := slog.String("destinationTable", s.config.DestinationTableIdentifier)
 	dstTableName := s.config.DestinationTableIdentifier
 
@@ -84,8 +83,6 @@ func (s *ClickhouseAvroSyncMethod) SyncRecords(
 	}
 	defer avroFile.Cleanup()
 	s.connector.logger.Info(fmt.Sprintf("written %d records to Avro file", avroFile.NumRecords), tableLog)
-
-
 	err = s.CopyStageToDestination(avroFile)
 	if err != nil {
 		return 0, err

From d6f89fc95fa562064a0e937d84eff798575380ea Mon Sep 17 00:00:00 2001
From: Pankaj B
Date: Mon, 29 Jan 2024 23:02:01 +0530
Subject: [PATCH 32/36] prettier on ui

---
 ui/app/api/peers/getTruePeer.ts              |  6 +++---
 ui/app/peers/create/[peerType]/helpers/ch.ts |  2 +-
 ui/tsconfig.json                             | 10 +++++-----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/ui/app/api/peers/getTruePeer.ts b/ui/app/api/peers/getTruePeer.ts
index 75aa53902b..3cef249e48 100644
--- a/ui/app/api/peers/getTruePeer.ts
+++ b/ui/app/api/peers/getTruePeer.ts
@@ -1,6 +1,7 @@
 import { CatalogPeer } from '@/app/dto/PeersDTO';
 import {
   BigqueryConfig,
+  ClickhouseConfig,
   EventHubConfig,
   EventHubGroupConfig,
   Peer,
@@ -8,7 +9,6 @@ import {
   S3Config,
   SnowflakeConfig,
   SqlServerConfig,
-  ClickhouseConfig,
 } from '@/grpc_generated/peers';
 
 export const getTruePeer = (peer: CatalogPeer) => {
@@ -57,8 +57,8 @@ export const getTruePeer = (peer: CatalogPeer) => {
       break;
     case 8:
       config = ClickhouseConfig.decode(options);
-      newPeer.clickhouseConfig = config;
-      break;
+      newPeer.clickhouseConfig = config;
+      break;
     default:
       return newPeer;
   }
diff --git a/ui/app/peers/create/[peerType]/helpers/ch.ts b/ui/app/peers/create/[peerType]/helpers/ch.ts
index 025d64aa36..ff8615267a 100644
--- a/ui/app/peers/create/[peerType]/helpers/ch.ts
+++ b/ui/app/peers/create/[peerType]/helpers/ch.ts
@@ -57,5 +57,5 @@ export const blankClickhouseSetting: ClickhouseConfig = {
   password: '',
   database: '',
   s3Integration: '',
-  metadataDb: undefined
+  metadataDb: undefined,
 };
diff --git a/ui/tsconfig.json b/ui/tsconfig.json
index ad4b679af6..c443fefcce 100644
--- a/ui/tsconfig.json
+++ b/ui/tsconfig.json
@@ -16,13 +16,13 @@
     "incremental": true,
     "plugins": [
       {
-        "name": "next",
-      },
+        "name": "next"
+      }
     ],
     "paths": {
-      "@/*": ["./*"],
-    },
+      "@/*": ["./*"]
+    }
   },
   "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
-  "exclude": ["node_modules"],
+  "exclude": ["node_modules"]
 }

From 9b9e57e2de01b5bebed2e0a1d0b1041e0bd220ef Mon Sep 17 00:00:00 2001
From: Pankaj B
Date: Mon, 29 Jan 2024 23:33:35 +0530
Subject: [PATCH 33/36] fix build after rebase

---
 flow/connectors/external_metadata/store.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/flow/connectors/external_metadata/store.go b/flow/connectors/external_metadata/store.go
index 08f2938e73..ddf4da7108 100644
--- a/flow/connectors/external_metadata/store.go
+++ b/flow/connectors/external_metadata/store.go
@@ -181,7 +181,7 @@ func (p *PostgresMetadataStore) GetLastBatchID(jobName string) (int64, error) {
 }
 
 func (p *PostgresMetadataStore) GetLastNormalizeBatchID(jobName string) (int64, error) {
-	rows := p.pool.QueryRow(p.ctx, `
+	rows := p.conn.QueryRow(p.ctx, `
 	SELECT normalize_batch_id FROM `+p.schemaName+`.`+lastSyncStateTableName+`
 	WHERE job_name = $1
@@ -254,7 +254,7 @@ func (p *PostgresMetadataStore) IncrementID(jobName string) error {
 
 func (p *PostgresMetadataStore) UpdateNormalizeBatchID(jobName string, batchID int64) error {
 	p.logger.Info("updating normalize batch id for job")
-	_, err := p.pool.Exec(p.ctx, `
+	_, err := p.conn.Exec(p.ctx, `
 	UPDATE `+p.schemaName+`.`+lastSyncStateTableName+`
 		 SET normalize_batch_id=$2 WHERE job_name=$1
 	`, jobName, batchID)

From b7d63a9c274b059c1e06175b254afb93f47d41ce Mon Sep 17 00:00:00 2001
From: Pankaj B
Date: Mon, 29 Jan 2024 23:46:32 +0530
Subject: [PATCH 34/36] add newline back

---
 docker-compose-dev.yml | 2 +-
 docker-compose.yml     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml
index dfd74c0554..8a8181ba61 100644
--- a/docker-compose-dev.yml
+++ b/docker-compose-dev.yml
@@ -215,4 +215,4 @@ services:
       - flow-api
 
 volumes:
-  pgdata:
\ No newline at end of file
+  pgdata:
diff --git a/docker-compose.yml b/docker-compose.yml
index ae6ec7bb90..7ac7e19bb5 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -181,4 +181,4 @@ services:
 
 volumes:
   pgdata:
-  prometheusdata:
\ No newline at end of file
+  prometheusdata:

From 7566de4a03e4f34cc743a0726600d4126563d74b Mon Sep 17 00:00:00 2001
From: Pankaj B
Date: Tue, 30 Jan 2024 00:06:16 +0530
Subject: [PATCH 35/36] ui prettier

---
 ui/tsconfig.json | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ui/tsconfig.json b/ui/tsconfig.json
index c443fefcce..422d661481 100644
--- a/ui/tsconfig.json
+++ b/ui/tsconfig.json
@@ -16,13 +16,13 @@
     "incremental": true,
     "plugins": [
       {
-        "name": "next"
-      }
+        "name": "next",
+      },
     ],
     "paths": {
-      "@/*": ["./*"]
-    }
+      "@/*": ["./*"],
+    },
   },
   "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
-  "exclude": ["node_modules"]
-}
+  "exclude": ["node_modules"],
+}
\ No newline at end of file

From 152b8c849c647c8e200a15ecc2a382a278ee0352 Mon Sep 17 00:00:00 2001
From: Pankaj B
Date: Tue, 30 Jan 2024 00:08:10 +0530
Subject: [PATCH 36/36] ts config new line

---
 ui/tsconfig.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ui/tsconfig.json b/ui/tsconfig.json
index 422d661481..ad4b679af6 100644
--- a/ui/tsconfig.json
+++ b/ui/tsconfig.json
@@ -25,4 +25,4 @@
   },
   "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
   "exclude": ["node_modules"],
-}
\ No newline at end of file
+}