diff --git a/pom.xml b/pom.xml index 8a248f35..5afd8425 100644 --- a/pom.xml +++ b/pom.xml @@ -1,213 +1,217 @@ - - 4.0.0 - - datastax.astra.migrate - cassandra-data-migrator - ${revision} - jar - - - UTF-8 - 3.2.2 - 2.12.17 - 2.12 - 3.3.1 - 3.2.12 - 3.2.0 - 3.11.13 - 4.13.2 - - - - - github - GitHub Packages - https://maven.pkg.github.com/datastax/cassandra-data-migrator - - - - - - org.scala-lang - scala-library - ${scala.version} - - - org.apache.spark - spark-core_${scala.main.version} - ${spark.version} - - - log4j - log4j - - - - - org.apache.spark - spark-sql_${scala.main.version} - ${spark.version} - - - org.apache.spark - spark-hive_${scala.main.version} - ${spark.version} - - - log4j - log4j - - - log4j - apache-log4j-extras - - - - - com.datastax.spark - spark-cassandra-connector_${scala.main.version} - ${connector.version} - - - com.github.jnr - jnr-posix - 3.1.15 - - - - org.apache.logging.log4j - log4j-api - 2.19.0 - - - org.apache.logging.log4j - log4j-core - 2.19.0 - - - org.apache.logging.log4j - log4j-to-slf4j - 2.19.0 - - - - - org.scalatest - scalatest_${scala.main.version} - ${scalatest.version} - test - - - junit - junit - ${junit.version} - test - - - org.apache.cassandra - cassandra-all - ${cassandra.version} - test - - - - org.slf4j - log4j-over-slf4j - - - - - - - - - - src/resources - - - - - net.alchim31.maven - scala-maven-plugin - 4.8.0 - - - process-sources - - compile - testCompile - - - - - - - org.apache.maven.plugins - maven-shade-plugin - 3.4.1 - - - - package - - shade - - - - - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - - - - - - org.apache.maven.plugins - maven-surefire-plugin - 2.22.2 - - true - - - - - org.scalatest - scalatest-maven-plugin - 2.2.0 - - ${project.build.directory}/surefire-reports - . 
- WDF TestSuite.txt - - - - test - - test - - - - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.10.1 - - 1.8 - 1.8 - - - - - + + 4.0.0 + + datastax.astra.migrate + cassandra-data-migrator + 3.0.0 + jar + + + UTF-8 + 2.12.17 + 2.12 + 3.3.1 + 3.2.12 + 3.2.0 + 3.11.13 + 4.13.2 + + + + + github + GitHub Packages + https://maven.pkg.github.com/datastax/cassandra-data-migrator + + + + + + com.google.guava + guava + 31.1-jre + + + org.scala-lang + scala-library + ${scala.version} + + + org.apache.spark + spark-core_${scala.main.version} + ${spark.version} + + + log4j + log4j + + + + + org.apache.spark + spark-sql_${scala.main.version} + ${spark.version} + + + org.apache.spark + spark-hive_${scala.main.version} + ${spark.version} + + + log4j + log4j + + + log4j + apache-log4j-extras + + + + + com.datastax.spark + spark-cassandra-connector_${scala.main.version} + ${connector.version} + + + com.github.jnr + jnr-posix + 3.1.15 + + + + org.apache.logging.log4j + log4j-api + 2.19.0 + + + org.apache.logging.log4j + log4j-core + 2.19.0 + + + org.apache.logging.log4j + log4j-to-slf4j + 2.19.0 + + + + + org.scalatest + scalatest_${scala.main.version} + ${scalatest.version} + test + + + junit + junit + ${junit.version} + test + + + org.apache.cassandra + cassandra-all + ${cassandra.version} + test + + + + org.slf4j + log4j-over-slf4j + + + + + + + + + + src/resources + + + + + net.alchim31.maven + scala-maven-plugin + 3.2.2 + + + process-sources + + compile + testCompile + + + + + + + org.apache.maven.plugins + maven-shade-plugin + 2.4.3 + + + + package + + shade + + + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.7 + + true + + + + + org.scalatest + scalatest-maven-plugin + 1.0 + + ${project.build.directory}/surefire-reports + . 
+ WDF TestSuite.txt + + + + test + + test + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + 1.8 + 1.8 + + + + + diff --git a/src/main/java/datastax/astra/migrate/AbstractJobSession.java b/src/main/java/datastax/astra/migrate/AbstractJobSession.java index e1ad398d..df7f7a2d 100644 --- a/src/main/java/datastax/astra/migrate/AbstractJobSession.java +++ b/src/main/java/datastax/astra/migrate/AbstractJobSession.java @@ -1,300 +1,281 @@ -package datastax.astra.migrate; - -import com.datastax.oss.driver.api.core.CqlSession; -import com.datastax.oss.driver.api.core.cql.BoundStatement; -import com.datastax.oss.driver.api.core.cql.PreparedStatement; -import com.datastax.oss.driver.api.core.cql.Row; -import com.datastax.oss.driver.shaded.guava.common.util.concurrent.RateLimiter; -import org.apache.commons.lang.StringUtils; -import org.apache.spark.SparkConf; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.time.Duration; -import java.time.Instant; -import java.util.List; -import java.util.Optional; -import java.util.stream.IntStream; - -public class AbstractJobSession extends BaseJobSession { - - public Logger logger = LoggerFactory.getLogger(this.getClass().getName()); - - protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sc) { - this(sourceSession, astraSession, sc, false); - } - - protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sc, boolean isJobMigrateRowsFromFile) { - super(sc); - - if (sourceSession == null) { - return; - } - - this.sourceSession = sourceSession; - this.astraSession = astraSession; - - batchSize = new Integer(Util.getSparkPropOr(sc, "spark.batchSize", "5")); - fetchSizeInRows = new Integer(Util.getSparkPropOr(sc, "spark.read.fetch.sizeInRows", "1000")); - printStatsAfter = new Integer(Util.getSparkPropOr(sc, "spark.printStatsAfter", "100000")); - if (printStatsAfter < 1) { - printStatsAfter = 100000; - } - - readLimiter = RateLimiter.create(new Integer(Util.getSparkPropOr(sc, "spark.readRateLimit", "20000"))); - writeLimiter = RateLimiter.create(new Integer(Util.getSparkPropOr(sc, "spark.writeRateLimit", "40000"))); - maxRetries = Integer.parseInt(sc.get("spark.maxRetries", "0")); - - sourceKeyspaceTable = Util.getSparkProp(sc, "spark.origin.keyspaceTable"); - astraKeyspaceTable = Util.getSparkProp(sc, "spark.target.keyspaceTable"); - - String ttlColsStr = Util.getSparkPropOrEmpty(sc, "spark.query.ttl.cols"); - if (null != ttlColsStr && ttlColsStr.trim().length() > 0) { - for (String ttlCol : ttlColsStr.split(",")) { - ttlCols.add(Integer.parseInt(ttlCol)); - } - } - - String writeTimestampColsStr = Util.getSparkPropOrEmpty(sc, "spark.query.writetime.cols"); - if (null != writeTimestampColsStr && writeTimestampColsStr.trim().length() > 0) { - for (String writeTimeStampCol : writeTimestampColsStr.split(",")) { - writeTimeStampCols.add(Integer.parseInt(writeTimeStampCol)); - } - } - - writeTimeStampFilter = Boolean - .parseBoolean(Util.getSparkPropOr(sc, "spark.origin.writeTimeStampFilter", "false")); - // batchsize set to 1 if there is a writeFilter - if (writeTimeStampFilter) { - batchSize = 1; - } - - String minWriteTimeStampFilterStr = - Util.getSparkPropOr(sc, "spark.origin.minWriteTimeStampFilter", "0"); - if (null != minWriteTimeStampFilterStr && minWriteTimeStampFilterStr.trim().length() > 1) { - minWriteTimeStampFilter = Long.parseLong(minWriteTimeStampFilterStr); - } - String maxWriteTimeStampFilterStr = - Util.getSparkPropOr(sc, 
"spark.origin.maxWriteTimeStampFilter", "0"); - if (null != maxWriteTimeStampFilterStr && maxWriteTimeStampFilterStr.trim().length() > 1) { - maxWriteTimeStampFilter = Long.parseLong(maxWriteTimeStampFilterStr); - } - - String customWriteTimeStr = - Util.getSparkPropOr(sc, "spark.target.custom.writeTime", "0"); - if (null != customWriteTimeStr && customWriteTimeStr.trim().length() > 1 && StringUtils.isNumeric(customWriteTimeStr.trim())) { - customWritetime = Long.parseLong(customWriteTimeStr); - } - - logger.info("PARAM -- Read Consistency: {}", readConsistencyLevel); - logger.info("PARAM -- Write Consistency: {}", writeConsistencyLevel); - logger.info("PARAM -- Write Batch Size: {}", batchSize); - logger.info("PARAM -- Max Retries: {}", maxRetries); - logger.info("PARAM -- Read Fetch Size: {}", fetchSizeInRows); - logger.info("PARAM -- Source Keyspace Table: {}", sourceKeyspaceTable); - logger.info("PARAM -- Destination Keyspace Table: {}", astraKeyspaceTable); - logger.info("PARAM -- ReadRateLimit: {}", readLimiter.getRate()); - logger.info("PARAM -- WriteRateLimit: {}", writeLimiter.getRate()); - logger.info("PARAM -- TTLCols: {}", ttlCols); - logger.info("PARAM -- WriteTimestampFilterCols: {}", writeTimeStampCols); - logger.info("PARAM -- WriteTimestampFilter: {}", writeTimeStampFilter); - if (writeTimeStampFilter) { - logger.info("PARAM -- minWriteTimeStampFilter: {} datetime is {}", minWriteTimeStampFilter, - Instant.ofEpochMilli(minWriteTimeStampFilter / 1000)); - logger.info("PARAM -- maxWriteTimeStampFilter: {} datetime is {}", maxWriteTimeStampFilter, - Instant.ofEpochMilli(maxWriteTimeStampFilter / 1000)); - } - - String selectCols = Util.getSparkProp(sc, "spark.query.origin"); - String partitionKey = Util.getSparkProp(sc, "spark.query.origin.partitionKey"); - String sourceSelectCondition = Util.getSparkPropOrEmpty(sc, "spark.query.condition"); - if (!sourceSelectCondition.isEmpty() && !sourceSelectCondition.trim().toUpperCase().startsWith("AND")) { - sourceSelectCondition = " AND " + sourceSelectCondition; - } - - final StringBuilder selectTTLWriteTimeCols = new StringBuilder(); - allCols = selectCols.split(","); - ttlCols.forEach(col -> { - selectTTLWriteTimeCols.append(",ttl(" + allCols[col] + ")"); - }); - writeTimeStampCols.forEach(col -> { - selectTTLWriteTimeCols.append(",writetime(" + allCols[col] + ")"); - }); - selectColTypes = getTypes(Util.getSparkProp(sc, "spark.query.types")); - String idCols = Util.getSparkPropOrEmpty(sc, "spark.query.target.id"); - idColTypes = selectColTypes.subList(0, idCols.split(",").length); - - String insertCols = Util.getSparkPropOrEmpty(sc, "spark.query.target"); - if (null == insertCols || insertCols.trim().isEmpty()) { - insertCols = selectCols; - } - String insertBinds = ""; - for (String str : idCols.split(",")) { - if (insertBinds.isEmpty()) { - insertBinds = str + "= ?"; - } else { - insertBinds += " and " + str + "= ?"; - } - } - - String fullSelectQuery; - if (!isJobMigrateRowsFromFile) { - fullSelectQuery = "select " + selectCols + selectTTLWriteTimeCols + " from " + sourceKeyspaceTable + - " where token(" + partitionKey.trim() + ") >= ? and token(" + partitionKey.trim() + ") <= ? 
" + - sourceSelectCondition + " ALLOW FILTERING"; - } else { - fullSelectQuery = "select " + selectCols + selectTTLWriteTimeCols + " from " + sourceKeyspaceTable + " where " + insertBinds; - } - sourceSelectStatement = sourceSession.prepare(fullSelectQuery); - logger.info("PARAM -- Query used: {}", fullSelectQuery); - - astraSelectStatement = astraSession.prepare( - "select " + insertCols + " from " + astraKeyspaceTable - + " where " + insertBinds); - - hasRandomPartitioner = Boolean.parseBoolean(Util.getSparkPropOr(sc, "spark.origin.hasRandomPartitioner", "false")); - isCounterTable = Boolean.parseBoolean(Util.getSparkPropOr(sc, "spark.counterTable", "false")); - if (isCounterTable) { - String updateSelectMappingStr = Util.getSparkPropOr(sc, "spark.counterTable.cql.index", "0"); - for (String updateSelectIndex : updateSelectMappingStr.split(",")) { - updateSelectMapping.add(Integer.parseInt(updateSelectIndex)); - } - - String counterTableUpdate = Util.getSparkProp(sc, "spark.counterTable.cql"); - astraInsertStatement = astraSession.prepare(counterTableUpdate); - } else { - insertBinds = ""; - for (String str : insertCols.split(",")) { - if (insertBinds.isEmpty()) { - insertBinds += "?"; - } else { - insertBinds += ", ?"; - } - } - - String fullInsertQuery = "insert into " + astraKeyspaceTable + " (" + insertCols + ") VALUES (" + insertBinds + ")"; - if (!ttlCols.isEmpty()) { - fullInsertQuery += " USING TTL ?"; - if (!writeTimeStampCols.isEmpty()) { - fullInsertQuery += " AND TIMESTAMP ?"; - } - } else if (!writeTimeStampCols.isEmpty()) { - fullInsertQuery += " USING TIMESTAMP ?"; - } - astraInsertStatement = astraSession.prepare(fullInsertQuery); - } - - // Handle rows with blank values for 'timestamp' data-type in primary-key fields - tsReplaceValStr = Util.getSparkPropOr(sc, "spark.target.replace.blankTimestampKeyUsingEpoch", ""); - if (!tsReplaceValStr.isEmpty()) { - tsReplaceVal = Long.parseLong(tsReplaceValStr); - } - } - - public BoundStatement bindInsert(PreparedStatement insertStatement, Row sourceRow, Row astraRow) { - BoundStatement boundInsertStatement = insertStatement.bind().setConsistencyLevel(writeConsistencyLevel); - - if (isCounterTable) { - for (int index = 0; index < selectColTypes.size(); index++) { - MigrateDataType dataType = selectColTypes.get(updateSelectMapping.get(index)); - // compute the counter delta if reading from astra for the difference - if (astraRow != null && index < (selectColTypes.size() - idColTypes.size())) { - boundInsertStatement = boundInsertStatement.set(index, (sourceRow.getLong(updateSelectMapping.get(index)) - astraRow.getLong(updateSelectMapping.get(index))), Long.class); - } else { - boundInsertStatement = boundInsertStatement.set(index, getData(dataType, updateSelectMapping.get(index), sourceRow), dataType.typeClass); - } - } - } else { - int index = 0; - for (index = 0; index < selectColTypes.size(); index++) { - boundInsertStatement = getBoundStatement(sourceRow, boundInsertStatement, index, selectColTypes); - if (boundInsertStatement == null) return null; - } - - if (!ttlCols.isEmpty()) { - boundInsertStatement = boundInsertStatement.set(index, getLargestTTL(sourceRow), Integer.class); - index++; - } - if (!writeTimeStampCols.isEmpty()) { - if (customWritetime > 0) { - boundInsertStatement = boundInsertStatement.set(index, customWritetime, Long.class); - } else { - boundInsertStatement = boundInsertStatement.set(index, getLargestWriteTimeStamp(sourceRow), Long.class); - } - } - } - - // Batch insert for large records may take longer, 
hence 10 secs to avoid timeout errors - return boundInsertStatement.setTimeout(Duration.ofSeconds(10)); - } - - public int getLargestTTL(Row sourceRow) { - return IntStream.range(0, ttlCols.size()) - .map(i -> sourceRow.getInt(selectColTypes.size() + i)).max().getAsInt(); - } - - public long getLargestWriteTimeStamp(Row sourceRow) { - return IntStream.range(0, writeTimeStampCols.size()) - .mapToLong(i -> sourceRow.getLong(selectColTypes.size() + ttlCols.size() + i)).max().getAsLong(); - } - - public BoundStatement selectFromAstra(PreparedStatement selectStatement, Row sourceRow) { - BoundStatement boundSelectStatement = selectStatement.bind().setConsistencyLevel(readConsistencyLevel); - for (int index = 0; index < idColTypes.size(); index++) { - boundSelectStatement = getBoundStatement(sourceRow, boundSelectStatement, index, idColTypes); - if (boundSelectStatement == null) return null; - } - - return boundSelectStatement; - } - - private BoundStatement getBoundStatement(Row sourceRow, BoundStatement boundSelectStatement, int index, - List cols) { - MigrateDataType dataTypeObj = cols.get(index); - Object colData = getData(dataTypeObj, index, sourceRow); - - // Handle rows with blank values in primary-key fields - if (index < idColTypes.size()) { - Optional optionalVal = handleBlankInPrimaryKey(index, colData, dataTypeObj.typeClass, sourceRow); - if (!optionalVal.isPresent()) { - return null; - } - colData = optionalVal.get(); - } - boundSelectStatement = boundSelectStatement.set(index, colData, dataTypeObj.typeClass); - return boundSelectStatement; - } - - protected Optional handleBlankInPrimaryKey(int index, Object colData, Class dataType, Row sourceRow) { - return handleBlankInPrimaryKey(index, colData, dataType, sourceRow, true); - } - - protected Optional handleBlankInPrimaryKey(int index, Object colData, Class dataType, Row sourceRow, boolean logWarn) { - // Handle rows with blank values for 'String' data-type in primary-key fields - if (index < idColTypes.size() && colData == null && dataType == String.class) { - if (logWarn) { - logger.warn("For row with Key: {}, found String primary-key column {} with blank value", - getKey(sourceRow), allCols[index]); - } - return Optional.of(""); - } - - // Handle rows with blank values for 'timestamp' data-type in primary-key fields - if (index < idColTypes.size() && colData == null && dataType == Instant.class) { - if (tsReplaceValStr.isEmpty()) { - logger.error("Skipping row with Key: {} as Timestamp primary-key column {} has invalid blank value. " + - "Alternatively rerun the job with --conf spark.target.replace.blankTimestampKeyUsingEpoch=\"\" " + - "option to replace the blanks with a fixed timestamp value", getKey(sourceRow), allCols[index]); - return Optional.empty(); - } - if (logWarn) { - logger.warn("For row with Key: {}, found Timestamp primary-key column {} with invalid blank value. 
" + - "Using value {} instead", getKey(sourceRow), allCols[index], Instant.ofEpochSecond(tsReplaceVal)); - } - return Optional.of(Instant.ofEpochSecond(tsReplaceVal)); - } - - return Optional.of(colData); - } - -} +package datastax.astra.migrate; + +import com.datastax.oss.driver.api.core.CqlSession; +import com.datastax.oss.driver.api.core.cql.BoundStatement; +import com.datastax.oss.driver.api.core.cql.PreparedStatement; +import com.datastax.oss.driver.api.core.cql.Row; +import com.datastax.oss.driver.shaded.guava.common.util.concurrent.RateLimiter; +import org.apache.commons.lang.StringUtils; +import org.apache.spark.SparkConf; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.time.Duration; +import java.time.Instant; +import java.util.Map; +import java.util.stream.IntStream; + +public class AbstractJobSession extends BaseJobSession { + + public Logger logger = LoggerFactory.getLogger(this.getClass().getName()); + + protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sc) { + this(sourceSession, astraSession, sc, false); + } + + protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sc, boolean isJobMigrateRowsFromFile) { + super(sc); + this.sourceSession = sourceSession; + this.astraSession = astraSession; + + batchSize = new Integer(Util.getSparkPropOr(sc, "spark.batchSize", "1")); + fetchSizeInRows = new Integer(Util.getSparkPropOr(sc, "spark.read.fetch.sizeInRows", "1000")); + printStatsAfter = new Integer(Util.getSparkPropOr(sc, "spark.printStatsAfter", "100000")); + if (printStatsAfter < 1) { + printStatsAfter = 100000; + } + + readLimiter = RateLimiter.create(new Integer(Util.getSparkPropOr(sc, "spark.readRateLimit", "20000"))); + writeLimiter = RateLimiter.create(new Integer(Util.getSparkPropOr(sc, "spark.writeRateLimit", "40000"))); + maxRetries = Integer.parseInt(sc.get("spark.maxRetries", "10")); + maxRetriesRowFailure = Integer.parseInt(sc.get("spark.maxRetries.rowFailure", "2")); + + sourceKeyspaceTable = Util.getSparkProp(sc, "spark.origin.keyspaceTable"); + astraKeyspaceTable = Util.getSparkProp(sc, "spark.target.keyspaceTable"); + + tokenRangeExceptionDir = Util.getSparkProp(sc, "spark.tokenRange.exceptionDir"); + rowExceptionDir = Util.getSparkProp(sc, "spark.row.exceptionDir"); + exceptionFileName = sourceKeyspaceTable; + + String ttlColsStr = Util.getSparkPropOrEmpty(sc, "spark.query.ttl.cols"); + if (null != ttlColsStr && ttlColsStr.trim().length() > 0) { + for (String ttlCol : ttlColsStr.split(",")) { + ttlCols.add(Integer.parseInt(ttlCol)); + } + } + + String writeTimestampColsStr = Util.getSparkPropOrEmpty(sc, "spark.query.writetime.cols"); + if (null != writeTimestampColsStr && writeTimestampColsStr.trim().length() > 0) { + for (String writeTimeStampCol : writeTimestampColsStr.split(",")) { + writeTimeStampCols.add(Integer.parseInt(writeTimeStampCol)); + } + } + + writeTimeStampFilter = Boolean + .parseBoolean(Util.getSparkPropOr(sc, "spark.origin.writeTimeStampFilter", "false")); + // batchsize set to 1 if there is a writeFilter + if (writeTimeStampFilter) { + batchSize = 1; + } + + enableDefaultTTL = Boolean + .parseBoolean(Util.getSparkPropOr(sc, "spark.target.default.ttl.enable", "false")); + if (enableDefaultTTL) { + defaultTTL = Integer.parseInt(Util.getSparkPropOr(sc, "spark.target.default.ttl", "7776000")); + } + enableDefaultWriteTime = Boolean + .parseBoolean(Util.getSparkPropOr(sc, "spark.target.default.writetime.enable", "false")); + if 
(enableDefaultWriteTime) { + defaultWriteTime = Long.parseLong(Util.getSparkPropOr(sc, "spark.target.default.writetime", "1640998861000")); + } + + String minWriteTimeStampFilterStr = + Util.getSparkPropOr(sc, "spark.origin.minWriteTimeStampFilter", "0"); + if (null != minWriteTimeStampFilterStr && minWriteTimeStampFilterStr.trim().length() > 1) { + minWriteTimeStampFilter = Long.parseLong(minWriteTimeStampFilterStr); + } + String maxWriteTimeStampFilterStr = + Util.getSparkPropOr(sc, "spark.origin.maxWriteTimeStampFilter", "0"); + if (null != maxWriteTimeStampFilterStr && maxWriteTimeStampFilterStr.trim().length() > 1) { + maxWriteTimeStampFilter = Long.parseLong(maxWriteTimeStampFilterStr); + } + + String customWriteTimeStr = + Util.getSparkPropOr(sc, "spark.target.custom.writeTime", "0"); + if (null != customWriteTimeStr && customWriteTimeStr.trim().length() > 1 && StringUtils.isNumeric(customWriteTimeStr.trim())) { + customWritetime = Long.parseLong(customWriteTimeStr); + } + + logger.info("PARAM -- Read Consistency: {}", readConsistencyLevel); + logger.info("PARAM -- Write Consistency: {}", writeConsistencyLevel); + logger.info("PARAM -- Write Batch Size: {}", batchSize); + logger.info("PARAM -- Read Fetch Size: {}", fetchSizeInRows); + logger.info("PARAM -- Source Keyspace Table: {}", sourceKeyspaceTable); + logger.info("PARAM -- Destination Keyspace Table: {}", astraKeyspaceTable); + logger.info("PARAM -- ReadRateLimit: {}", readLimiter.getRate()); + logger.info("PARAM -- WriteRateLimit: {}", writeLimiter.getRate()); + logger.info("PARAM -- TTLCols: {}", ttlCols); + logger.info("PARAM -- WriteTimestampFilterCols: {}", writeTimeStampCols); + logger.info("PARAM -- WriteTimestampFilter: {}", writeTimeStampFilter); + logger.info("PARAM -- enableDefaultTTL: {}", enableDefaultTTL); + logger.info("PARAM -- defaultTTL: {}", defaultTTL); + logger.info("PARAM -- enableDefaultWriteTime: {}", enableDefaultWriteTime); + logger.info("PARAM -- defaultWriteTime: {}", defaultWriteTime); + + if (writeTimeStampFilter) { + logger.info("PARAM -- minWriteTimeStampFilter: {} datetime is {}", minWriteTimeStampFilter, + Instant.ofEpochMilli(minWriteTimeStampFilter / 1000)); + logger.info("PARAM -- maxWriteTimeStampFilter: {} datetime is {}", maxWriteTimeStampFilter, + Instant.ofEpochMilli(maxWriteTimeStampFilter / 1000)); + } + + String selectCols = Util.getSparkProp(sc, "spark.query.origin"); + String partionKey = Util.getSparkProp(sc, "spark.query.origin.partitionKey"); + String sourceSelectCondition = Util.getSparkPropOrEmpty(sc, "spark.query.condition"); + if (!sourceSelectCondition.isEmpty() && !sourceSelectCondition.trim().toUpperCase().startsWith("AND")) { + sourceSelectCondition = " AND " + sourceSelectCondition; + } + + final StringBuilder selectTTLWriteTimeCols = new StringBuilder(); + String[] allCols = selectCols.split(","); + ttlCols.forEach(col -> { + selectTTLWriteTimeCols.append(",ttl(" + allCols[col] + ")"); + }); + writeTimeStampCols.forEach(col -> { + selectTTLWriteTimeCols.append(",writetime(" + allCols[col] + ")"); + }); + selectColTypes = getTypes(Util.getSparkProp(sc, "spark.query.types")); + String idCols = Util.getSparkPropOrEmpty(sc, "spark.query.target.id"); + idColTypes = selectColTypes.subList(0, idCols.split(",").length); + + String insertCols = Util.getSparkPropOrEmpty(sc, "spark.query.target"); + if (null == insertCols || insertCols.trim().isEmpty()) { + insertCols = selectCols; + } + String insertBinds = ""; + for (String str : idCols.split(",")) { + if 
(insertBinds.isEmpty()) { + insertBinds = str + "= ?"; + } else { + insertBinds += " and " + str + "= ?"; + } + } + + String fullSelectQuery; + String fullSelectLatestQuery; + if (!isJobMigrateRowsFromFile) { + fullSelectQuery = "select " + selectCols + selectTTLWriteTimeCols + " from " + sourceKeyspaceTable + " where token(" + partionKey.trim() + + ") >= ? and token(" + partionKey.trim() + ") <= ? " + sourceSelectCondition + " ALLOW FILTERING"; + } else { + fullSelectQuery = "select " + selectCols + selectTTLWriteTimeCols + " from " + sourceKeyspaceTable + " where " + insertBinds; + } + fullSelectLatestQuery = "select " + selectCols + " from " + sourceKeyspaceTable + " where " + insertBinds; + sourceSelectStatement = sourceSession.prepare(fullSelectQuery); + sourceSelectLatestStatement = sourceSession.prepare(fullSelectLatestQuery); + logger.info("PARAM -- Query used: {}", fullSelectQuery); + logger.info("PARAM -- Latest Query used: {}", fullSelectLatestQuery); + + astraSelectStatement = astraSession.prepare( + "select " + insertCols + " from " + astraKeyspaceTable + + " where " + insertBinds); + + hasRandomPartitioner = Boolean.parseBoolean(Util.getSparkPropOr(sc, "spark.origin.hasRandomPartitioner", "false")); + isCounterTable = Boolean.parseBoolean(Util.getSparkPropOr(sc, "spark.counterTable", "false")); + if (isCounterTable) { + String updateSelectMappingStr = Util.getSparkPropOr(sc, "spark.counterTable.cql.index", "0"); + for (String updateSelectIndex : updateSelectMappingStr.split(",")) { + updateSelectMapping.add(Integer.parseInt(updateSelectIndex)); + } + + String counterTableUpdate = Util.getSparkProp(sc, "spark.counterTable.cql"); + astraInsertStatement = astraSession.prepare(counterTableUpdate); + } else { + insertBinds = ""; + for (String str : insertCols.split(",")) { + if (insertBinds.isEmpty()) { + insertBinds += "?"; + } else { + insertBinds += ", ?"; + } + } + + String fullInsertQuery = "insert into " + astraKeyspaceTable + " (" + insertCols + ") VALUES (" + insertBinds + ")"; + if (!ttlCols.isEmpty() || enableDefaultTTL) { + fullInsertQuery += " USING TTL ?"; + if (!writeTimeStampCols.isEmpty() || enableDefaultWriteTime) { + fullInsertQuery += " AND TIMESTAMP ?"; + } + } else if (!writeTimeStampCols.isEmpty() || enableDefaultWriteTime) { + fullInsertQuery += " USING TIMESTAMP ?"; + } + astraInsertStatement = astraSession.prepare(fullInsertQuery); + } + } + + public BoundStatement bindInsert(PreparedStatement insertStatement, Row sourceRow, Row astraRow) { + BoundStatement boundInsertStatement = insertStatement.bind().setConsistencyLevel(writeConsistencyLevel); + + if (isCounterTable) { + for (int index = 0; index < selectColTypes.size(); index++) { + MigrateDataType dataType = selectColTypes.get(updateSelectMapping.get(index)); + // compute the counter delta if reading from astra for the difference + if (astraRow != null && index < (selectColTypes.size() - idColTypes.size())) { + boundInsertStatement = boundInsertStatement.set(index, (sourceRow.getLong(updateSelectMapping.get(index)) - astraRow.getLong(updateSelectMapping.get(index))), Long.class); + } else { + boundInsertStatement = boundInsertStatement.set(index, getData(dataType, updateSelectMapping.get(index), sourceRow), dataType.typeClass); + } + } + } else { + int index = 0; + for (index = 0; index < selectColTypes.size(); index++) { + MigrateDataType dataTypeObj = selectColTypes.get(index); + Class dataType = dataTypeObj.typeClass; + + try { + Object colData = getData(dataTypeObj, index, sourceRow); + if (index < 
idColTypes.size() && colData == null && dataType == String.class) { + colData = ""; + } + boundInsertStatement = boundInsertStatement.set(index, colData, dataType); + } catch (NullPointerException e) { + // ignore the exception for map values being null + if (dataType != Map.class) { + throw e; + } + } + } + + if (!ttlCols.isEmpty()) { + boundInsertStatement = boundInsertStatement.set(index, getLargestTTL(sourceRow), Integer.class); + index++; + }else if(enableDefaultTTL && defaultTTL > 0) { + boundInsertStatement = boundInsertStatement.set(index, defaultTTL, Integer.class); + index++; + } + if (!writeTimeStampCols.isEmpty()) { + if (customWritetime > 0) { + boundInsertStatement = boundInsertStatement.set(index, customWritetime, Long.class); + } else { + boundInsertStatement = boundInsertStatement.set(index, getLargestWriteTimeStamp(sourceRow), Long.class); + } + }else if(enableDefaultWriteTime && defaultWriteTime > 0l) { + boundInsertStatement = boundInsertStatement.set(index, defaultWriteTime, Long.class); + } + } + + // Batch insert for large records may take longer, hence 10 secs to avoid timeout errors + return boundInsertStatement.setTimeout(Duration.ofSeconds(10)); + } + + public int getLargestTTL(Row sourceRow) { + return IntStream.range(0, ttlCols.size()) + .map(i -> sourceRow.getInt(selectColTypes.size() + i)).max().getAsInt(); + } + + public long getLargestWriteTimeStamp(Row sourceRow) { + return IntStream.range(0, writeTimeStampCols.size()) + .mapToLong(i -> sourceRow.getLong(selectColTypes.size() + ttlCols.size() + i)).max().getAsLong(); + } + + public BoundStatement selectFromAstra(PreparedStatement selectStatement, Row sourceRow) { + BoundStatement boundSelectStatement = selectStatement.bind().setConsistencyLevel(readConsistencyLevel); + for (int index = 0; index < idColTypes.size(); index++) { + MigrateDataType dataType = idColTypes.get(index); + boundSelectStatement = boundSelectStatement.set(index, getData(dataType, index, sourceRow), + dataType.typeClass); + } + + return boundSelectStatement; + } + +} diff --git a/src/main/java/datastax/astra/migrate/BaseJobSession.java b/src/main/java/datastax/astra/migrate/BaseJobSession.java index cf3cfbb5..8aedcfaf 100644 --- a/src/main/java/datastax/astra/migrate/BaseJobSession.java +++ b/src/main/java/datastax/astra/migrate/BaseJobSession.java @@ -1,110 +1,118 @@ -package datastax.astra.migrate; - -import com.datastax.oss.driver.api.core.ConsistencyLevel; -import com.datastax.oss.driver.api.core.CqlSession; -import com.datastax.oss.driver.api.core.cql.PreparedStatement; -import com.datastax.oss.driver.api.core.cql.Row; -import com.datastax.oss.driver.shaded.guava.common.util.concurrent.RateLimiter; -import org.apache.spark.SparkConf; - -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.Set; - -public abstract class BaseJobSession { - - protected PreparedStatement sourceSelectStatement; - protected PreparedStatement astraSelectStatement; - protected PreparedStatement astraInsertStatement; - protected ConsistencyLevel readConsistencyLevel; - protected ConsistencyLevel writeConsistencyLevel; - - // Read/Write Rate limiter - // Determine the total throughput for the entire cluster in terms of wries/sec, - // reads/sec - // then do the following to set the values as they are only applicable per JVM - // (hence spark Executor)... 
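
The read/write limiters declared in BaseJobSession follow the sizing rule spelled out in this comment block: the limiter is created straight from spark.readRateLimit / spark.writeRateLimit, and each Spark executor holds its own JVM-local RateLimiter, so the operator is expected to set those properties to the cluster-wide budget divided by the executor count. A minimal sketch of that rule using plain Guava (the migrator itself uses the driver-shaded copy; the throughput and executor numbers below are illustrative assumptions, and the division is shown explicitly here even though the tool leaves it to the property value):

```java
import com.google.common.util.concurrent.RateLimiter;

public class ExecutorRateLimiterSketch {
    public static void main(String[] args) {
        // Illustrative assumptions: the whole target cluster should see at most
        // 40,000 writes/sec and the Spark job runs with 8 executors.
        double clusterWriteRate = 40000d;
        int totalExecutors = 8;

        // Each executor builds its own JVM-local limiter, so the per-JVM
        // permit rate is the cluster budget divided by the executor count.
        RateLimiter writeLimiter = RateLimiter.create(clusterWriteRate / totalExecutors);

        for (int i = 0; i < 10; i++) {
            writeLimiter.acquire(1);   // blocks just long enough to stay under the configured rate
            // ... issue one write to the target cluster here ...
        }
        System.out.println("Per-executor write rate: " + writeLimiter.getRate());
    }
}
```

With spark.writeRateLimit already set to "total writes/sec divided by executors", the code in this diff simply passes that value to RateLimiter.create() as-is.
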
- // Rate = Total Throughput (write/read per sec) / Total Executors - protected RateLimiter readLimiter; - protected RateLimiter writeLimiter; - protected Integer maxRetries = 10; - - protected CqlSession sourceSession; - protected CqlSession astraSession; - protected List selectColTypes = new ArrayList(); - protected List idColTypes = new ArrayList(); - protected List updateSelectMapping = new ArrayList(); - - protected Integer batchSize = 1; - protected Integer fetchSizeInRows = 1000; - protected Integer printStatsAfter = 100000; - - protected Boolean writeTimeStampFilter = Boolean.FALSE; - protected Long minWriteTimeStampFilter = 0l; - protected Long maxWriteTimeStampFilter = Long.MAX_VALUE; - protected Long customWritetime = 0l; - - protected List writeTimeStampCols = new ArrayList(); - protected List ttlCols = new ArrayList(); - protected Boolean isCounterTable; - - protected String sourceKeyspaceTable; - protected String astraKeyspaceTable; - - protected Boolean hasRandomPartitioner; - protected Boolean filterData; - protected String filterColName; - protected String filterColType; - protected Integer filterColIndex; - protected String filterColValue; - - protected String[] allCols; - protected String tsReplaceValStr; - protected long tsReplaceVal; - - protected BaseJobSession(SparkConf sc) { - readConsistencyLevel = Util.mapToConsistencyLevel(Util.getSparkPropOrEmpty(sc, "spark.consistency.read")); - writeConsistencyLevel = Util.mapToConsistencyLevel(Util.getSparkPropOrEmpty(sc, "spark.consistency.write")); - } - - public String getKey(Row sourceRow) { - StringBuffer key = new StringBuffer(); - for (int index = 0; index < idColTypes.size(); index++) { - MigrateDataType dataType = idColTypes.get(index); - if (index == 0) { - key.append(getData(dataType, index, sourceRow)); - } else { - key.append(" %% " + getData(dataType, index, sourceRow)); - } - } - - return key.toString(); - } - - public List getTypes(String types) { - List dataTypes = new ArrayList(); - for (String type : types.split(",")) { - dataTypes.add(new MigrateDataType(type)); - } - - return dataTypes; - } - - public Object getData(MigrateDataType dataType, int index, Row sourceRow) { - if (dataType.typeClass == Map.class) { - return sourceRow.getMap(index, dataType.subTypes.get(0), dataType.subTypes.get(1)); - } else if (dataType.typeClass == List.class) { - return sourceRow.getList(index, dataType.subTypes.get(0)); - } else if (dataType.typeClass == Set.class) { - return sourceRow.getSet(index, dataType.subTypes.get(0)); - } else if (isCounterTable && dataType.typeClass == Long.class) { - Object data = sourceRow.get(index, dataType.typeClass); - if (data == null) { - return new Long(0); - } - } - - return sourceRow.get(index, dataType.typeClass); - } -} +package datastax.astra.migrate; + +import com.datastax.oss.driver.api.core.ConsistencyLevel; +import com.datastax.oss.driver.api.core.CqlSession; +import com.datastax.oss.driver.api.core.cql.PreparedStatement; +import com.datastax.oss.driver.api.core.cql.Row; +import com.datastax.oss.driver.shaded.guava.common.util.concurrent.RateLimiter; +import org.apache.spark.SparkConf; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +public abstract class BaseJobSession { + + protected PreparedStatement sourceSelectStatement; + protected PreparedStatement sourceSelectLatestStatement; + protected PreparedStatement astraSelectStatement; + protected PreparedStatement astraInsertStatement; + protected ConsistencyLevel 
readConsistencyLevel; + protected ConsistencyLevel writeConsistencyLevel; + + // Read/Write Rate limiter + // Determine the total throughput for the entire cluster in terms of wries/sec, + // reads/sec + // then do the following to set the values as they are only applicable per JVM + // (hence spark Executor)... + // Rate = Total Throughput (write/read per sec) / Total Executors + protected RateLimiter readLimiter; + protected RateLimiter writeLimiter; + protected Integer maxRetries = 10; + protected Integer maxRetriesRowFailure = 2; + + protected CqlSession sourceSession; + protected CqlSession astraSession; + protected List selectColTypes = new ArrayList(); + protected List idColTypes = new ArrayList(); + protected List updateSelectMapping = new ArrayList(); + + protected Integer batchSize = 1; + protected Integer fetchSizeInRows = 1000; + protected Integer printStatsAfter = 100000; + + protected Boolean writeTimeStampFilter = Boolean.FALSE; + protected Long minWriteTimeStampFilter = 0l; + protected Long maxWriteTimeStampFilter = Long.MAX_VALUE; + protected Long customWritetime = 0l; + + protected List writeTimeStampCols = new ArrayList(); + protected List ttlCols = new ArrayList(); + protected Boolean isCounterTable; + + protected String sourceKeyspaceTable; + protected String astraKeyspaceTable; + + protected Boolean hasRandomPartitioner; + protected Boolean filterData; + protected String filterColName; + protected String filterColType; + protected Integer filterColIndex; + protected String filterColValue; + + protected Boolean enableDefaultTTL = Boolean.FALSE; + protected Integer defaultTTL = 7776000; //default TTL as 90days + + protected Boolean enableDefaultWriteTime = Boolean.FALSE; + protected Long defaultWriteTime = 1640998861000l; //default as Saturday, January 1, 2022 2:01:01 AM GMT+01:00 in epoch microseconds + + protected String tokenRangeExceptionDir; + protected String rowExceptionDir; + protected String exceptionFileName; + + protected BaseJobSession(SparkConf sc) { + readConsistencyLevel = Util.mapToConsistencyLevel(Util.getSparkPropOrEmpty(sc, "spark.consistency.read")); + writeConsistencyLevel = Util.mapToConsistencyLevel(Util.getSparkPropOrEmpty(sc, "spark.consistency.write")); + } + + public String getKey(Row sourceRow) { + StringBuffer key = new StringBuffer(); + for (int index = 0; index < idColTypes.size(); index++) { + MigrateDataType dataType = idColTypes.get(index); + if (index == 0) { + key.append(getData(dataType, index, sourceRow)); + } else { + key.append(" %% " + getData(dataType, index, sourceRow)); + } + } + + return key.toString(); + } + + public List getTypes(String types) { + List dataTypes = new ArrayList(); + for (String type : types.split(",")) { + dataTypes.add(new MigrateDataType(type)); + } + + return dataTypes; + } + + public Object getData(MigrateDataType dataType, int index, Row sourceRow) { + if (dataType.typeClass == Map.class) { + return sourceRow.getMap(index, dataType.subTypes.get(0), dataType.subTypes.get(1)); + } else if (dataType.typeClass == List.class) { + return sourceRow.getList(index, dataType.subTypes.get(0)); + } else if (dataType.typeClass == Set.class) { + return sourceRow.getSet(index, dataType.subTypes.get(0)); + } else if (isCounterTable && dataType.typeClass == Long.class) { + Object data = sourceRow.get(index, dataType.typeClass); + if (data == null) { + return new Long(0); + } + } + + return sourceRow.get(index, dataType.typeClass); + } +} diff --git a/src/main/java/datastax/astra/migrate/CopyJobSession.java 
b/src/main/java/datastax/astra/migrate/CopyJobSession.java index 7a027e63..b87f6fd3 100644 --- a/src/main/java/datastax/astra/migrate/CopyJobSession.java +++ b/src/main/java/datastax/astra/migrate/CopyJobSession.java @@ -1,208 +1,181 @@ -package datastax.astra.migrate; - -import com.datastax.oss.driver.api.core.CqlSession; -import com.datastax.oss.driver.api.core.cql.*; -import org.apache.spark.SparkConf; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.math.BigInteger; -import java.util.ArrayList; -import java.util.Collection; -import java.util.concurrent.CompletionStage; -import java.util.concurrent.atomic.AtomicLong; - -public class CopyJobSession extends AbstractJobSession { - - private static CopyJobSession copyJobSession; - public Logger logger = LoggerFactory.getLogger(this.getClass().getName()); - protected AtomicLong readCounter = new AtomicLong(0); - protected AtomicLong skippedCounter = new AtomicLong(0); - protected AtomicLong writeCounter = new AtomicLong(0); - protected AtomicLong errorCounter = new AtomicLong(0); - - protected CopyJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sc) { - super(sourceSession, astraSession, sc); - filterData = Boolean.parseBoolean(sc.get("spark.origin.FilterData", "false")); - filterColName = Util.getSparkPropOrEmpty(sc, "spark.origin.FilterColumn"); - filterColType = Util.getSparkPropOrEmpty(sc, "spark.origin.FilterColumnType"); - filterColIndex = Integer.parseInt(sc.get("spark.origin.FilterColumnIndex", "0")); - filterColValue = Util.getSparkPropOrEmpty(sc, "spark.origin.FilterColumnValue"); - } - - public static CopyJobSession getInstance(CqlSession sourceSession, CqlSession astraSession, SparkConf sc) { - if (copyJobSession == null) { - synchronized (CopyJobSession.class) { - if (copyJobSession == null) { - copyJobSession = new CopyJobSession(sourceSession, astraSession, sc); - } - } - } - - return copyJobSession; - } - - public void getDataAndInsert(BigInteger min, BigInteger max) { - logger.info("ThreadID: {} Processing min: {} max: {}", Thread.currentThread().getId(), min, max); - boolean done = false; - int maxAttempts = maxRetries + 1; - for (int attempts = 1; attempts <= maxAttempts && !done; attempts++) { - long readCnt = 0; - long writeCnt = 0; - long skipCnt = 0; - long errCnt = 0; - try { - ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(hasRandomPartitioner ? - min : min.longValueExact(), hasRandomPartitioner ? 
max : max.longValueExact()) - .setConsistencyLevel(readConsistencyLevel).setPageSize(fetchSizeInRows)); - - Collection> writeResults = new ArrayList>(); - - // cannot do batching if the writeFilter is greater than 0 or - // maxWriteTimeStampFilter is less than max long - // do not batch for counters as it adds latency & increases chance of discrepancy - if (batchSize == 1 || writeTimeStampFilter || isCounterTable) { - for (Row sourceRow : resultSet) { - readLimiter.acquire(1); - readCnt++; - if (readCnt % printStatsAfter == 0) { - printCounts(false); - } - - if (filterData) { - String col = (String) getData(new MigrateDataType(filterColType), filterColIndex, sourceRow); - if (col.trim().equalsIgnoreCase(filterColValue)) { - logger.warn("Skipping row and filtering out: {}", getKey(sourceRow)); - skipCnt++; - continue; - } - } - if (writeTimeStampFilter) { - // only process rows greater than writeTimeStampFilter - Long sourceWriteTimeStamp = getLargestWriteTimeStamp(sourceRow); - if (sourceWriteTimeStamp < minWriteTimeStampFilter - || sourceWriteTimeStamp > maxWriteTimeStampFilter) { - skipCnt++; - continue; - } - } - writeLimiter.acquire(1); - - Row astraRow = null; - if (isCounterTable) { - ResultSet astraReadResultSet = astraSession - .execute(selectFromAstra(astraSelectStatement, sourceRow)); - astraRow = astraReadResultSet.one(); - } - - BoundStatement bInsert = bindInsert(astraInsertStatement, sourceRow, astraRow); - if (null == bInsert) { - skipCnt++; - continue; - } - CompletionStage astraWriteResultSet = astraSession.executeAsync(bInsert); - writeResults.add(astraWriteResultSet); - if (writeResults.size() > fetchSizeInRows) { - writeCnt += iterateAndClearWriteResults(writeResults, 1); - } - } - - // clear the write resultset - writeCnt += iterateAndClearWriteResults(writeResults, 1); - } else { - BatchStatement batchStatement = BatchStatement.newInstance(BatchType.UNLOGGED); - for (Row sourceRow : resultSet) { - readLimiter.acquire(1); - readCnt++; - if (readCnt % printStatsAfter == 0) { - printCounts(false); - } - - if (filterData) { - String colValue = (String) getData(new MigrateDataType(filterColType), filterColIndex, sourceRow); - if (colValue.trim().equalsIgnoreCase(filterColValue)) { - logger.warn("Skipping row and filtering out: {}", getKey(sourceRow)); - skipCnt++; - continue; - } - } - - writeLimiter.acquire(1); - BoundStatement bInsert = bindInsert(astraInsertStatement, sourceRow, null); - if (null == bInsert) { - skipCnt++; - continue; - } - batchStatement = batchStatement.add(bInsert); - - // if batch threshold is met, send the writes and clear the batch - if (batchStatement.size() >= batchSize) { - CompletionStage writeResultSet = astraSession.executeAsync(batchStatement); - writeResults.add(writeResultSet); - batchStatement = BatchStatement.newInstance(BatchType.UNLOGGED); - } - - if (writeResults.size() * batchSize > fetchSizeInRows) { - writeCnt += iterateAndClearWriteResults(writeResults, batchSize); - } - } - - // clear the write resultset - writeCnt += iterateAndClearWriteResults(writeResults, batchSize); - - // if there are any pending writes because the batchSize threshold was not met, then write and clear them - if (batchStatement.size() > 0) { - CompletionStage writeResultSet = astraSession.executeAsync(batchStatement); - writeResults.add(writeResultSet); - writeCnt += iterateAndClearWriteResults(writeResults, batchStatement.size()); - batchStatement = BatchStatement.newInstance(BatchType.UNLOGGED); - } - } - - readCounter.addAndGet(readCnt); - 
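
The batching branch above (and its rewritten counterpart later in this diff) follows one pattern: accumulate bound inserts into an UNLOGGED batch, send the batch asynchronously once spark.batchSize statements are queued, and periodically drain the collected CompletionStage results so the number of in-flight writes stays bounded by fetchSizeInRows. A condensed sketch of that pattern with the DataStax Java driver 4.x (the class and method names below are mine, not part of the migrator):

```java
import com.datastax.oss.driver.api.core.CqlSession;
import com.datastax.oss.driver.api.core.cql.AsyncResultSet;
import com.datastax.oss.driver.api.core.cql.BatchStatement;
import com.datastax.oss.driver.api.core.cql.BatchType;
import com.datastax.oss.driver.api.core.cql.BoundStatement;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CompletionStage;

/** Illustrative helper: groups bound inserts into UNLOGGED batches and flushes on a threshold. */
class BatchWriter {
    private final CqlSession session;
    private final int batchSize;                       // e.g. value of spark.batchSize
    private BatchStatement batch = BatchStatement.newInstance(BatchType.UNLOGGED);
    private final List<CompletionStage<AsyncResultSet>> pending = new ArrayList<>();

    BatchWriter(CqlSession session, int batchSize) {
        this.session = session;
        this.batchSize = batchSize;
    }

    void add(BoundStatement insert) {
        batch = batch.add(insert);                     // BatchStatement is immutable; reassign
        if (batch.size() >= batchSize) {
            pending.add(session.executeAsync(batch));  // send the full batch asynchronously
            batch = BatchStatement.newInstance(BatchType.UNLOGGED);
        }
    }

    /** Sends any partial batch and waits for all in-flight writes. */
    void flush() throws Exception {
        if (batch.size() > 0) {
            pending.add(session.executeAsync(batch));
            batch = BatchStatement.newInstance(BatchType.UNLOGGED);
        }
        for (CompletionStage<AsyncResultSet> stage : pending) {
            stage.toCompletableFuture().get();         // surface write errors here
        }
        pending.clear();
    }
}
```

Note that BatchStatement is immutable in driver 4.x, which is why both the code in this diff and the sketch reassign the result of add().
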
writeCounter.addAndGet(writeCnt); - skippedCounter.addAndGet(skipCnt); - done = true; - } catch (Exception e) { - if (attempts == maxAttempts) { - readCounter.addAndGet(readCnt); - writeCounter.addAndGet(writeCnt); - skippedCounter.addAndGet(skipCnt); - errorCounter.addAndGet(readCnt - writeCnt - skipCnt); - } - logger.error("Error occurred during Attempt#: {}", attempts, e); - logger.error("Error with PartitionRange -- ThreadID: {} Processing min: {} max: {} -- Attempt# {}", - Thread.currentThread().getId(), min, max, attempts); - logger.error("Error stats Read#: {}, Wrote#: {}, Skipped#: {}, Error#: {}", readCnt, writeCnt, skipCnt, (readCnt - writeCnt - skipCnt)); - } - } - } - - public synchronized void printCounts(boolean isFinal) { - String msg = "ThreadID: " + Thread.currentThread().getId(); - if (isFinal) { - msg += " Final"; - logger.info("################################################################################################"); - } - logger.info("{} Read Record Count: {}", msg, readCounter.get()); - logger.info("{} Skipped Record Count: {}", msg, skippedCounter.get()); - logger.info("{} Write Record Count: {}", msg, writeCounter.get()); - logger.info("{} Error Record Count: {}", msg, errorCounter.get()); - if (isFinal) { - logger.info("################################################################################################"); - } - } - - private int iterateAndClearWriteResults(Collection> writeResults, int incrementBy) throws Exception { - int cnt = 0; - for (CompletionStage writeResult : writeResults) { - //wait for the writes to complete for the batch. The Retry policy, if defined, should retry the write on timeouts. - writeResult.toCompletableFuture().get().one(); - cnt += incrementBy; - } - writeResults.clear(); - - return cnt; - } - -} +package datastax.astra.migrate; + +import com.datastax.oss.driver.api.core.CqlSession; +import com.datastax.oss.driver.api.core.cql.*; +import org.apache.spark.SparkConf; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.Collection; +import java.util.concurrent.CompletionStage; +import java.util.concurrent.atomic.AtomicLong; + +public class CopyJobSession extends AbstractJobSession { + + private static CopyJobSession copyJobSession; + public Logger logger = LoggerFactory.getLogger(this.getClass().getName()); + protected AtomicLong readCounter = new AtomicLong(0); + protected AtomicLong skippedCounter = new AtomicLong(0); + protected AtomicLong writeCounter = new AtomicLong(0); + + protected CopyJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sc) { + super(sourceSession, astraSession, sc); + filterData = Boolean.parseBoolean(sc.get("spark.origin.FilterData", "false")); + filterColName = Util.getSparkPropOrEmpty(sc, "spark.origin.FilterColumn"); + filterColType = Util.getSparkPropOrEmpty(sc, "spark.origin.FilterColumnType"); + filterColIndex = Integer.parseInt(sc.get("spark.origin.FilterColumnIndex", "0")); + filterColValue = Util.getSparkPropOrEmpty(sc, "spark.origin.FilterColumnValue"); + } + + public static CopyJobSession getInstance(CqlSession sourceSession, CqlSession astraSession, SparkConf sc) { + if (copyJobSession == null) { + synchronized (CopyJobSession.class) { + if (copyJobSession == null) { + copyJobSession = new CopyJobSession(sourceSession, astraSession, sc); + } + } + } + + return copyJobSession; + } + + public void getDataAndInsert(BigInteger min, BigInteger max) { + logger.info("ThreadID: {} 
Processing min: {} max: {}", Thread.currentThread().getId(), min, max); + int maxAttempts = maxRetries; + for (int retryCount = 1; retryCount <= maxAttempts; retryCount++) { + + try { + ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(hasRandomPartitioner ? + min : min.longValueExact(), hasRandomPartitioner ? max : max.longValueExact()) + .setConsistencyLevel(readConsistencyLevel).setPageSize(fetchSizeInRows)); + + Collection> writeResults = new ArrayList>(); + + // cannot do batching if the writeFilter is greater than 0 or + // maxWriteTimeStampFilter is less than max long + // do not batch for counters as it adds latency & increases chance of discrepancy + if (batchSize == 1 || writeTimeStampFilter || isCounterTable) { + for (Row sourceRow : resultSet) { + readLimiter.acquire(1); + + if (filterData) { + String col = (String) getData(new MigrateDataType(filterColType), filterColIndex, sourceRow); + if (col.trim().equalsIgnoreCase(filterColValue)) { + logger.warn("Skipping row and filtering out: {}", getKey(sourceRow)); + skippedCounter.incrementAndGet(); + continue; + } + } + + if (writeTimeStampFilter) { + // only process rows greater than writeTimeStampFilter + Long sourceWriteTimeStamp = getLargestWriteTimeStamp(sourceRow); + if (sourceWriteTimeStamp < minWriteTimeStampFilter + || sourceWriteTimeStamp > maxWriteTimeStampFilter) { + readCounter.incrementAndGet(); + skippedCounter.incrementAndGet(); + continue; + } + } + + writeLimiter.acquire(1); + if (readCounter.incrementAndGet() % printStatsAfter == 0) { + printCounts(false); + } + Row astraRow = null; + if (isCounterTable) { + ResultSet astraReadResultSet = astraSession + .execute(selectFromAstra(astraSelectStatement, sourceRow)); + astraRow = astraReadResultSet.one(); + } + + + CompletionStage astraWriteResultSet = astraSession + .executeAsync(bindInsert(astraInsertStatement, sourceRow, astraRow)); + writeResults.add(astraWriteResultSet); + if (writeResults.size() > fetchSizeInRows) { + iterateAndClearWriteResults(writeResults, 1); + } + } + + // clear the write resultset + iterateAndClearWriteResults(writeResults, 1); + } else { + BatchStatement batchStatement = BatchStatement.newInstance(BatchType.UNLOGGED); + for (Row sourceRow : resultSet) { + readLimiter.acquire(1); + writeLimiter.acquire(1); + if (readCounter.incrementAndGet() % printStatsAfter == 0) { + printCounts(false); + } + + if (filterData) { + String colValue = (String) getData(new MigrateDataType(filterColType), filterColIndex, sourceRow); + if (colValue.trim().equalsIgnoreCase(filterColValue)) { + logger.warn("Skipping row and filtering out: {}", getKey(sourceRow)); + skippedCounter.incrementAndGet(); + continue; + } + } + + batchStatement = batchStatement.add(bindInsert(astraInsertStatement, sourceRow, null)); + + // if batch threshold is met, send the writes and clear the batch + if (batchStatement.size() >= batchSize) { + CompletionStage writeResultSet = astraSession.executeAsync(batchStatement); + writeResults.add(writeResultSet); + batchStatement = BatchStatement.newInstance(BatchType.UNLOGGED); + } + + if (writeResults.size() * batchSize > fetchSizeInRows) { + iterateAndClearWriteResults(writeResults, batchSize); + } + } + + // clear the write resultset + iterateAndClearWriteResults(writeResults, batchSize); + + // if there are any pending writes because the batchSize threshold was not met, then write and clear them + if (batchStatement.size() > 0) { + CompletionStage writeResultSet = astraSession.executeAsync(batchStatement); + 
writeResults.add(writeResultSet); + iterateAndClearWriteResults(writeResults, batchStatement.size()); + batchStatement = BatchStatement.newInstance(BatchType.UNLOGGED); + } + } + + retryCount = maxAttempts; + } catch (Exception e) { + logger.error("Error occurred retry#: {}", retryCount, e); + logger.error("Error with PartitionRange -- ThreadID: {} Processing min: {} max: {} -- Retry# {}", + Thread.currentThread().getId(), min, max, retryCount); + } + } + } + + public synchronized void printCounts(boolean isFinal) { + String msg = "ThreadID: " + Thread.currentThread().getId(); + if (isFinal) { + msg += " Final"; + logger.info("################################################################################################"); + } + logger.info("{} Read Record Count: {}", msg, readCounter.get()); + logger.info("{} Skipped Record Count: {}", msg, skippedCounter.get()); + logger.info("{} Write Record Count: {}", msg, writeCounter.get()); + if (isFinal) { + logger.info("################################################################################################"); + } + } + + private void iterateAndClearWriteResults(Collection> writeResults, int incrementBy) throws Exception { + for (CompletionStage writeResult : writeResults) { + //wait for the writes to complete for the batch. The Retry policy, if defined, should retry the write on timeouts. + writeResult.toCompletableFuture().get().one(); + writeCounter.addAndGet(incrementBy); + } + writeResults.clear(); + } + +} diff --git a/src/main/java/datastax/astra/migrate/CopyPKJobSession.java b/src/main/java/datastax/astra/migrate/CopyPKJobSession.java index 1dbc1729..93033981 100644 --- a/src/main/java/datastax/astra/migrate/CopyPKJobSession.java +++ b/src/main/java/datastax/astra/migrate/CopyPKJobSession.java @@ -1,87 +1,266 @@ -package datastax.astra.migrate; - -import com.datastax.oss.driver.api.core.CqlSession; -import com.datastax.oss.driver.api.core.cql.BoundStatement; -import com.datastax.oss.driver.api.core.cql.ResultSet; -import com.datastax.oss.driver.api.core.cql.Row; -import org.apache.spark.SparkConf; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.beans.PropertyEditor; -import java.beans.PropertyEditorManager; -import java.util.List; -import java.util.concurrent.atomic.AtomicLong; - -public class CopyPKJobSession extends AbstractJobSession { - - private static CopyPKJobSession copyJobSession; - public Logger logger = LoggerFactory.getLogger(this.getClass().getName()); - protected AtomicLong readCounter = new AtomicLong(0); - protected AtomicLong missingCounter = new AtomicLong(0); - protected AtomicLong writeCounter = new AtomicLong(0); - - protected CopyPKJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sc) { - super(sourceSession, astraSession, sc, true); - } - - public static CopyPKJobSession getInstance(CqlSession sourceSession, CqlSession astraSession, SparkConf sc) { - if (copyJobSession == null) { - synchronized (CopyPKJobSession.class) { - if (copyJobSession == null) { - copyJobSession = new CopyPKJobSession(sourceSession, astraSession, sc); - } - } - } - - return copyJobSession; - } - - public void getRowAndInsert(List rowsList) { - for (SplitPartitions.PKRows rows : rowsList) { - rows.pkRows.parallelStream().forEach(row -> { - readCounter.incrementAndGet(); - String[] pkFields = row.split(" %% "); - int idx = 0; - BoundStatement bspk = sourceSelectStatement.bind().setConsistencyLevel(readConsistencyLevel); - for (MigrateDataType tp : idColTypes) { - bspk = bspk.set(idx, 
convert(tp.typeClass, pkFields[idx]), tp.typeClass); - idx++; - } - Row pkRow = sourceSession.execute(bspk).one(); - if (null == pkRow) { - missingCounter.incrementAndGet(); - logger.error("Could not find row with primary-key: {}", row); - return; - } - ResultSet astraWriteResultSet = astraSession - .execute(bindInsert(astraInsertStatement, pkRow, null)); - writeCounter.incrementAndGet(); - if (readCounter.get() % printStatsAfter == 0) { - printCounts(false); - } - }); - } - - printCounts(true); - } - - public void printCounts(boolean isFinal) { - if (isFinal) { - logger.info("################################################################################################"); - } - logger.info("ThreadID: {} Read Record Count: {}", Thread.currentThread().getId(), readCounter.get()); - logger.info("ThreadID: {} Missing Record Count: {}", Thread.currentThread().getId(), missingCounter.get()); - logger.info("ThreadID: {} Inserted Record Count: {}", Thread.currentThread().getId(), writeCounter.get()); - if (isFinal) { - logger.info("################################################################################################"); - } - } - - private Object convert(Class targetType, String text) { - PropertyEditor editor = PropertyEditorManager.findEditor(targetType); - editor.setAsText(text); - return editor.getValue(); - } - +package datastax.astra.migrate; + +import com.datastax.oss.driver.api.core.CqlSession; +import com.datastax.oss.driver.api.core.cql.BoundStatement; +import com.datastax.oss.driver.api.core.cql.ResultSet; +import com.datastax.oss.driver.api.core.cql.Row; +import com.datastax.oss.driver.api.core.data.UdtValue; + +import org.apache.spark.SparkConf; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.beans.PropertyEditor; +import java.beans.PropertyEditorManager; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.time.Instant; +import java.time.LocalDate; +import java.util.List; +import java.util.UUID; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.IntStream; + +public class CopyPKJobSession extends AbstractJobSession { + + private static CopyPKJobSession copyJobSession; + public Logger logger = LoggerFactory.getLogger(this.getClass().getName()); + protected AtomicLong readCounter = new AtomicLong(0); + protected AtomicLong missingCounter = new AtomicLong(0); + protected AtomicLong writeCounter = new AtomicLong(0); + + private AtomicLong correctedMissingCounter = new AtomicLong(0); + private AtomicLong correctedMismatchCounter = new AtomicLong(0); + private AtomicLong validCounter = new AtomicLong(0); + private AtomicLong mismatchCounter = new AtomicLong(0); + private AtomicLong skippedCounter = new AtomicLong(0); + private AtomicLong failedRowCounter = new AtomicLong(0); + + private Boolean firstRecord = true; + + protected CopyPKJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sc) { + super(sourceSession, astraSession, sc, true); + } + + public static CopyPKJobSession getInstance(CqlSession sourceSession, CqlSession astraSession, SparkConf sc) { + if (copyJobSession == null) { + synchronized (CopyPKJobSession.class) { + if (copyJobSession == null) { + copyJobSession = new CopyPKJobSession(sourceSession, astraSession, sc); + } + } + } + + return copyJobSession; + } + + public void getRowAndInsert(List rowsList) { + for (SplitPartitions.PKRows rows : rowsList) { + rows.pkRows.parallelStream().forEach(row -> { + readCounter.incrementAndGet(); + String[] 
pkFields = row.split(" %% "); + int idx = 0; + BoundStatement bspk = sourceSelectStatement.bind().setConsistencyLevel(readConsistencyLevel); + for (MigrateDataType tp : idColTypes) { + bspk = bspk.set(idx, convert(tp.typeClass, pkFields[idx]), tp.typeClass); + idx++; + } + Row pkRow = sourceSession.execute(bspk).one(); + if (null == pkRow) { + missingCounter.incrementAndGet(); + logger.error("Could not find row with primary-key: {}", row); + return; + } + ResultSet astraWriteResultSet = astraSession.execute(bindInsert(astraInsertStatement, pkRow, null)); + writeCounter.incrementAndGet(); + if (readCounter.get() % printStatsAfter == 0) { + printCounts(false); + } + }); + } + + printCounts(true); + } + + @SuppressWarnings("unchecked") + public void getRowAndDiff(List rowsList) { + for (SplitPartitions.PKRows rows : rowsList) { + rows.pkRows.parallelStream().forEach(row -> { + readCounter.incrementAndGet(); + String[] pkFields = row.split(" %% "); + int idx = 0; + BoundStatement bspk = sourceSelectStatement.bind(); + try { + for (MigrateDataType tp : idColTypes) { + bspk = bspk.set(idx, convertNew(tp.typeClass, pkFields[idx]), tp.typeClass); + idx++; + } + } catch (Exception e) { + logger.error("Error occurred while type conversion {}", e); + throw new RuntimeException("Error occurred while type conversion" + e); + } + int maxAttempts = maxRetriesRowFailure; + Row sourceRow = null; + int diffAttempt = 0; + for (int retryCount = 1; retryCount <= maxAttempts; retryCount++) { + try { + sourceRow = sourceSession.execute(bspk).one(); + if (sourceRow != null) { + Row astraRow = astraSession.execute(selectFromAstra(astraSelectStatement, sourceRow)).one(); + diffAttempt++; + diff(sourceRow, astraRow, diffAttempt); + } else { + logger.error("Could not find row with primary-key: {} on source", row); + } + retryCount = maxAttempts; + } catch (Exception e) { + logger.error("Could not find row with primary-key: {} retry# {}", row, retryCount, e); + if (retryCount == maxAttempts) { + logFailedRecordInFile(sourceRow); + } + } + } + }); + } + printValidationCounts(true); + } + + private void diff(Row sourceRow, Row astraRow, int diffAttempt) { + if (astraRow == null) { + if (diffAttempt == 1) { + missingCounter.incrementAndGet(); + logger.info("Missing target row found for key: {}", getKey(sourceRow)); + } + astraSession.execute(bindInsert(astraInsertStatement, sourceRow, null)); + correctedMissingCounter.incrementAndGet(); + logger.info("Inserted missing row in target: {}", getKey(sourceRow)); + } else { + String diffData = isDifferent(sourceRow, astraRow); + if (!diffData.isEmpty()) { + if (diffAttempt == 1) { + mismatchCounter.incrementAndGet(); + logger.info("Mismatch row found for key: {} Mismatch: {}", getKey(sourceRow), diffData); + } + if (isCounterTable) { + astraSession.execute(bindInsert(astraInsertStatement, sourceRow, astraRow)); + } else { + astraSession.execute(bindInsert(astraInsertStatement, sourceRow, null)); + } + correctedMismatchCounter.incrementAndGet(); + logger.info("Updated mismatch row in target: {}", getKey(sourceRow)); + } else { + validCounter.incrementAndGet(); + } + } + } + + private void logFailedRecordInFile(Row sourceRow) { + try { + failedRowCounter.getAndIncrement(); + Util.FileAppend(rowExceptionDir, exceptionFileName, getKey(sourceRow)); + logger.error("Failed to validate row: {} after {} retry.", getKey(sourceRow)); + } catch (Exception exp) { + logger.error("Error occurred while writing to key {} to file ", getKey(sourceRow), exp); + } + } + + private String 
isDifferent(Row sourceRow, Row astraRow) { + StringBuffer diffData = new StringBuffer(); + IntStream.range(0, selectColTypes.size()).parallel().forEach(index -> { + MigrateDataType dataType = selectColTypes.get(index); + Object source = getData(dataType, index, sourceRow); + Object astra = getData(dataType, index, astraRow); + + boolean isDiff = dataType.diff(source, astra); + if (isDiff) { + if (dataType.typeClass.equals(UdtValue.class)) { + String sourceUdtContent = ((UdtValue) source).getFormattedContents(); + String astraUdtContent = ((UdtValue) astra).getFormattedContents(); + if (!sourceUdtContent.equals(astraUdtContent)) { + diffData.append("(Index: " + index + " Origin: " + sourceUdtContent + " Target: " + + astraUdtContent + ") "); + } + } else { + diffData.append("(Index: " + index + " Origin: " + source + " Target: " + astra + ") "); + } + } + }); + + return diffData.toString(); + } + + public void printValidationCounts(boolean isFinal) { + String msg = "ThreadID: " + Thread.currentThread().getId(); + if (isFinal) { + logger.info( + "################################################################################################"); + + logger.info("ThreadID: {} Read Record Count: {}", Thread.currentThread().getId(), readCounter.get()); + logger.info("{} Mismatch Record Count: {}", msg, mismatchCounter.get()); + logger.info("{} Corrected Mismatch Record Count: {}", msg, correctedMismatchCounter.get()); + logger.info("ThreadID: {} Missing Record Count: {}", Thread.currentThread().getId(), missingCounter.get()); + logger.info("{} Corrected Missing Record Count: {}", msg, correctedMissingCounter.get()); + logger.info("{} Skipped Record Count: {}", msg, skippedCounter.get()); + logger.info("{} Failed row Count: {}", msg, failedRowCounter.get()); + logger.info("{} Valid Record Count: {}", msg, validCounter.get()); + } + + logger.debug("ThreadID: {} Read Record Count: {}", Thread.currentThread().getId(), readCounter.get()); + logger.debug("{} Mismatch Record Count: {}", msg, mismatchCounter.get()); + logger.debug("{} Corrected Mismatch Record Count: {}", msg, correctedMismatchCounter.get()); + logger.debug("ThreadID: {} Missing Record Count: {}", Thread.currentThread().getId(), missingCounter.get()); + logger.debug("{} Corrected Missing Record Count: {}", msg, correctedMissingCounter.get()); + logger.debug("{} Skipped Record Count: {}", msg, skippedCounter.get()); + logger.debug("{} Failed row Count: {}", msg, failedRowCounter.get()); + logger.info("{} Valid Record Count: {}", msg, validCounter.get()); + + if (isFinal) { + logger.info( + "################################################################################################"); + } + } + + public void printCounts(boolean isFinal) { + if (isFinal) { + logger.info( + "################################################################################################"); + } + logger.info("ThreadID: {} Read Record Count: {}", Thread.currentThread().getId(), readCounter.get()); + logger.info("ThreadID: {} Missing Record Count: {}", Thread.currentThread().getId(), missingCounter.get()); + logger.info("ThreadID: {} Inserted Record Count: {}", Thread.currentThread().getId(), writeCounter.get()); + if (isFinal) { + logger.info( + "################################################################################################"); + } + } + + private Object convert(Class targetType, String text) { + PropertyEditor editor = PropertyEditorManager.findEditor(targetType); + editor.setAsText(text); + return editor.getValue(); + } + + private 
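The comparison above walks the selected columns in parallel and, for UDT columns, compares the driver's formatted contents rather than relying on object equality. A stripped-down sketch of that per-column rule (the helper class is illustrative):

import com.datastax.oss.driver.api.core.data.UdtValue;
import java.util.Objects;

// Illustrative per-column comparator: UDT values are compared by their formatted
// contents (field names and values); everything else by null-safe equals().
public final class ColumnDiff {

    public static boolean differs(Object origin, Object target) {
        if (origin instanceof UdtValue && target instanceof UdtValue) {
            return !((UdtValue) origin).getFormattedContents()
                    .equals(((UdtValue) target).getFormattedContents());
        }
        return !Objects.equals(origin, target);
    }

    private ColumnDiff() {
    }
}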
Object convertNew(Class targetType, String text) { + String className = targetType.getSimpleName(); + switch (className) { + case "Instant": + return Instant.parse(text); + case "ByteBuffer": + return ByteBuffer.wrap(text.getBytes()); + case "UUID": + return UUID.fromString(text); + case "BigDecimal": + return new BigDecimal(text); + case "LocalDate": + return LocalDate.parse(text); + case "BigInteger": + return new BigInteger(text); + default: + PropertyEditor editor = PropertyEditorManager.findEditor(targetType); + editor.setAsText(text); + return editor.getValue(); + } + } + } \ No newline at end of file diff --git a/src/main/java/datastax/astra/migrate/DiffJobSession.java b/src/main/java/datastax/astra/migrate/DiffJobSession.java index 32b98e85..887ecdcd 100644 --- a/src/main/java/datastax/astra/migrate/DiffJobSession.java +++ b/src/main/java/datastax/astra/migrate/DiffJobSession.java @@ -1,202 +1,261 @@ -package datastax.astra.migrate; - -import com.datastax.oss.driver.api.core.CqlSession; -import com.datastax.oss.driver.api.core.cql.AsyncResultSet; -import com.datastax.oss.driver.api.core.cql.BoundStatement; -import com.datastax.oss.driver.api.core.cql.ResultSet; -import com.datastax.oss.driver.api.core.cql.Row; -import com.datastax.oss.driver.api.core.data.UdtValue; -import org.apache.spark.SparkConf; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.math.BigInteger; -import java.util.HashMap; -import java.util.Map; -import java.util.Optional; -import java.util.concurrent.CompletionStage; -import java.util.concurrent.atomic.AtomicLong; -import java.util.stream.IntStream; -import java.util.stream.StreamSupport; - -public class DiffJobSession extends CopyJobSession { - - private static DiffJobSession diffJobSession; - public Logger logger = LoggerFactory.getLogger(this.getClass().getName()); - protected Boolean autoCorrectMissing = false; - protected Boolean autoCorrectMismatch = false; - private AtomicLong readCounter = new AtomicLong(0); - private AtomicLong mismatchCounter = new AtomicLong(0); - private AtomicLong missingCounter = new AtomicLong(0); - private AtomicLong correctedMissingCounter = new AtomicLong(0); - private AtomicLong correctedMismatchCounter = new AtomicLong(0); - private AtomicLong validCounter = new AtomicLong(0); - private AtomicLong skippedCounter = new AtomicLong(0); - - private DiffJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sc) { - super(sourceSession, astraSession, sc); - - autoCorrectMissing = Boolean.parseBoolean(Util.getSparkPropOr(sc, "spark.target.autocorrect.missing", "false")); - logger.info("PARAM -- Autocorrect Missing: {}", autoCorrectMissing); - - autoCorrectMismatch = Boolean.parseBoolean(Util.getSparkPropOr(sc, "spark.target.autocorrect.mismatch", "false")); - logger.info("PARAM -- Autocorrect Mismatch: {}", autoCorrectMismatch); - } - - public static DiffJobSession getInstance(CqlSession sourceSession, CqlSession astraSession, SparkConf sparkConf) { - if (diffJobSession == null) { - synchronized (DiffJobSession.class) { - if (diffJobSession == null) { - diffJobSession = new DiffJobSession(sourceSession, astraSession, sparkConf); - } - } - } - - return diffJobSession; - } - - public void getDataAndDiff(BigInteger min, BigInteger max) { - logger.info("ThreadID: {} Processing min: {} max: {}", Thread.currentThread().getId(), min, max); - boolean done = false; - int maxAttempts = maxRetries + 1; - for (int attempts = 1; attempts <= maxAttempts && !done; attempts++) { - try { - // cannot do 
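convertNew() above turns the string fields of a saved primary key back into the Java types expected by the bound statement. A trimmed parsing sketch, assuming the same " %% " field separator and covering only a few types (the class name and type subset are illustrative):

import java.math.BigDecimal;
import java.time.Instant;
import java.util.UUID;

// Illustrative parser for one line of a saved primary-key file: fields are joined
// by " %% " and each is converted to the Java type expected at that key position.
public final class PkFieldParser {

    public static Object parse(Class<?> targetType, String text) {
        if (targetType == Instant.class) return Instant.parse(text);   // e.g. "2023-01-01T00:00:00Z"
        if (targetType == UUID.class) return UUID.fromString(text);
        if (targetType == BigDecimal.class) return new BigDecimal(text);
        if (targetType == Long.class) return Long.valueOf(text);
        if (targetType == Integer.class) return Integer.valueOf(text);
        return text;                                                    // fall back to the raw string
    }

    public static Object[] parseRow(String row, Class<?>[] keyTypes) {
        String[] fields = row.split(" %% ");
        Object[] values = new Object[fields.length];
        for (int i = 0; i < fields.length; i++) {
            values[i] = parse(keyTypes[i], fields[i]);
        }
        return values;
    }

    private PkFieldParser() {
    }
}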
batching if the writeFilter is greater than 0 - ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(hasRandomPartitioner ? - min : min.longValueExact(), hasRandomPartitioner ? max : max.longValueExact()) - .setConsistencyLevel(readConsistencyLevel).setPageSize(fetchSizeInRows)); - - Map> srcToTargetRowMap = new HashMap>(); - StreamSupport.stream(resultSet.spliterator(), false).forEach(srcRow -> { - readLimiter.acquire(1); - // do not process rows less than writeTimeStampFilter - if (!(writeTimeStampFilter && (getLargestWriteTimeStamp(srcRow) < minWriteTimeStampFilter - || getLargestWriteTimeStamp(srcRow) > maxWriteTimeStampFilter))) { - if (readCounter.incrementAndGet() % printStatsAfter == 0) { - printCounts(false); - } - - BoundStatement bSelect = selectFromAstra(astraSelectStatement, srcRow); - if (null == bSelect) { - skippedCounter.incrementAndGet(); - } else { - CompletionStage targetRowFuture = astraSession.executeAsync(bSelect); - srcToTargetRowMap.put(srcRow, targetRowFuture); - if (srcToTargetRowMap.size() > fetchSizeInRows) { - diffAndClear(srcToTargetRowMap); - } - } - } else { - readCounter.incrementAndGet(); - skippedCounter.incrementAndGet(); - } - }); - diffAndClear(srcToTargetRowMap); - done = true; - } catch (Exception e) { - logger.error("Error occurred during Attempt#: {}", attempts, e); - logger.error("Error with PartitionRange -- ThreadID: {} Processing min: {} max: {} -- Attempt# {}", - Thread.currentThread().getId(), min, max, attempts); - } - } - - } - - private void diffAndClear(Map> srcToTargetRowMap) { - for (Row srcRow : srcToTargetRowMap.keySet()) { - try { - Row targetRow = srcToTargetRowMap.get(srcRow).toCompletableFuture().get().one(); - diff(srcRow, targetRow); - } catch (Exception e) { - logger.error("Could not perform diff for Key: {}", getKey(srcRow), e); - } - } - srcToTargetRowMap.clear(); - } - - public synchronized void printCounts(boolean isFinal) { - String msg = "ThreadID: " + Thread.currentThread().getId(); - if (isFinal) { - msg += " Final"; - logger.info("################################################################################################"); - } - logger.info("{} Read Record Count: {}", msg, readCounter.get()); - logger.info("{} Mismatch Record Count: {}", msg, mismatchCounter.get()); - logger.info("{} Corrected Mismatch Record Count: {}", msg, correctedMismatchCounter.get()); - logger.info("{} Missing Record Count: {}", msg, missingCounter.get()); - logger.info("{} Corrected Missing Record Count: {}", msg, correctedMissingCounter.get()); - logger.info("{} Valid Record Count: {}", msg, validCounter.get()); - logger.info("{} Skipped Record Count: {}", msg, skippedCounter.get()); - if (isFinal) { - logger.info("################################################################################################"); - } - } - - private void diff(Row sourceRow, Row astraRow) { - if (astraRow == null) { - missingCounter.incrementAndGet(); - logger.error("Missing target row found for key: {}", getKey(sourceRow)); - //correct data - - if (autoCorrectMissing) { - astraSession.execute(bindInsert(astraInsertStatement, sourceRow, null)); - correctedMissingCounter.incrementAndGet(); - logger.error("Inserted missing row in target: {}", getKey(sourceRow)); - } - - return; - } - - String diffData = isDifferent(sourceRow, astraRow); - if (!diffData.isEmpty()) { - mismatchCounter.incrementAndGet(); - logger.error("Mismatch row found for key: {} Mismatch: {}", getKey(sourceRow), diffData); - - if (autoCorrectMismatch) { - if 
(isCounterTable) { - astraSession.execute(bindInsert(astraInsertStatement, sourceRow, astraRow)); - } else { - astraSession.execute(bindInsert(astraInsertStatement, sourceRow, null)); - } - correctedMismatchCounter.incrementAndGet(); - logger.error("Updated mismatch row in target: {}", getKey(sourceRow)); - } - - return; - } - - validCounter.incrementAndGet(); - } - - private String isDifferent(Row sourceRow, Row astraRow) { - StringBuffer diffData = new StringBuffer(); - IntStream.range(0, selectColTypes.size()).parallel().forEach(index -> { - MigrateDataType dataTypeObj = selectColTypes.get(index); - Object source = getData(dataTypeObj, index, sourceRow); - if (index < idColTypes.size()) { - Optional optionalVal = handleBlankInPrimaryKey(index, source, dataTypeObj.typeClass, sourceRow, false); - if (optionalVal.isPresent()) { - source = optionalVal.get(); - } - } - - Object astra = getData(dataTypeObj, index, astraRow); - - boolean isDiff = dataTypeObj.diff(source, astra); - if (isDiff) { - if (dataTypeObj.typeClass.equals(UdtValue.class)) { - String sourceUdtContent = ((UdtValue) source).getFormattedContents(); - String astraUdtContent = ((UdtValue) astra).getFormattedContents(); - if (!sourceUdtContent.equals(astraUdtContent)) { - diffData.append("(Index: " + index + " Origin: " + sourceUdtContent + " Target: " + astraUdtContent + ") "); - } - } else { - diffData.append("(Index: " + index + " Origin: " + source + " Target: " + astra + ") "); - } - } - }); - - return diffData.toString(); - } - -} +package datastax.astra.migrate; + +import java.math.BigInteger; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.CompletionStage; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.IntStream; +import java.util.stream.StreamSupport; + +import org.apache.spark.SparkConf; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.datastax.oss.driver.api.core.CqlSession; +import com.datastax.oss.driver.api.core.cql.AsyncResultSet; +import com.datastax.oss.driver.api.core.cql.ResultSet; +import com.datastax.oss.driver.api.core.cql.Row; +import com.datastax.oss.driver.api.core.data.UdtValue; + +public class DiffJobSession extends CopyJobSession { + + private static DiffJobSession diffJobSession; + public Logger logger = LoggerFactory.getLogger(this.getClass().getName()); + protected Boolean autoCorrectMissing = false; + protected Boolean autoCorrectMismatch = false; + private AtomicLong readCounter = new AtomicLong(0); + private AtomicLong mismatchCounter = new AtomicLong(0); + private AtomicLong missingCounter = new AtomicLong(0); + private AtomicLong correctedMissingCounter = new AtomicLong(0); + private AtomicLong correctedMismatchCounter = new AtomicLong(0); + private AtomicLong validCounter = new AtomicLong(0); + private AtomicLong skippedCounter = new AtomicLong(0); + private AtomicLong failedRowCounter = new AtomicLong(0); + + private DiffJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sc) { + super(sourceSession, astraSession, sc); + + autoCorrectMissing = Boolean.parseBoolean(Util.getSparkPropOr(sc, "spark.target.autocorrect.missing", "false")); + logger.info("PARAM -- Autocorrect Missing: {}", autoCorrectMissing); + + autoCorrectMismatch = Boolean + .parseBoolean(Util.getSparkPropOr(sc, "spark.target.autocorrect.mismatch", "false")); + logger.info("PARAM -- Autocorrect Mismatch: {}", autoCorrectMismatch); + } + + public static DiffJobSession getInstance(CqlSession sourceSession, CqlSession 
astraSession, SparkConf sparkConf) { + if (diffJobSession == null) { + synchronized (DiffJobSession.class) { + if (diffJobSession == null) { + diffJobSession = new DiffJobSession(sourceSession, astraSession, sparkConf); + } + } + } + + return diffJobSession; + } + + public void getDataAndDiff(BigInteger min, BigInteger max) { + logger.info("ThreadID: {} Processing min: {} max: {}", Thread.currentThread().getId(), min, max); + int maxAttempts = maxRetries; + for (int retryCount = 1; retryCount <= maxAttempts; retryCount++) { + + try { + // cannot do batching if the writeFilter is greater than 0 + ResultSet resultSet = sourceSession.execute(sourceSelectStatement + .bind(hasRandomPartitioner ? min : min.longValueExact(), + hasRandomPartitioner ? max : max.longValueExact()) + .setConsistencyLevel(readConsistencyLevel).setPageSize(fetchSizeInRows)); + + Map> srcToTargetRowMap = new HashMap>(); + StreamSupport.stream(resultSet.spliterator(), false).forEach(srcRow -> { + readLimiter.acquire(1); + // do not process rows less than writeTimeStampFilter + if (!(writeTimeStampFilter && (getLargestWriteTimeStamp(srcRow) < minWriteTimeStampFilter + || getLargestWriteTimeStamp(srcRow) > maxWriteTimeStampFilter))) { + if (readCounter.incrementAndGet() % printStatsAfter == 0) { + printCounts(false); + } + + CompletionStage targetRowFuture = astraSession + .executeAsync(selectFromAstra(astraSelectStatement, srcRow)); + srcToTargetRowMap.put(srcRow, targetRowFuture); + if (srcToTargetRowMap.size() > fetchSizeInRows) { + diffAndClear(srcToTargetRowMap); + } + } else { + readCounter.incrementAndGet(); + skippedCounter.incrementAndGet(); + } + }); + diffAndClear(srcToTargetRowMap); + retryCount = maxAttempts; + } catch (Exception e) { + logger.error("Error occurred retry#: {}", retryCount, e); + logger.error("Error with PartitionRange -- ThreadID: {} Processing min: {} max: {} -- Retry# {}", + Thread.currentThread().getId(), min, max, retryCount); + if (retryCount == maxAttempts) { + logFailedPartitionsInFile(min, max); + } + } + } + + } + + private void logFailedPartitionsInFile(BigInteger min, BigInteger max) { + try { + Util.FileAppend(tokenRangeExceptionDir, exceptionFileName, min + "," + max); + } catch (Exception ee) { + logger.error("Error occurred while writing to token range file min: {} max: {}", min, max, ee); + } + } + + private void logFailedRecordInFile(Row sourceRow) { + try { + failedRowCounter.getAndIncrement(); + Util.FileAppend(rowExceptionDir, exceptionFileName, getKey(sourceRow)); + logger.error("Failed to validate row: {} after {} retry.", getKey(sourceRow)); + } catch (Exception exp) { + logger.error("Error occurred while writing to key {} to file ", getKey(sourceRow), exp); + } + } + + private void diffAndClear(Map> srcToTargetRowMap) { + for (Row srcRow : srcToTargetRowMap.keySet()) { + int maxAttempts = maxRetriesRowFailure; + for (int retryCount = 1; retryCount <= maxAttempts; retryCount++) { + try { + Row targetRow = srcToTargetRowMap.get(srcRow).toCompletableFuture().get().one(); + diff(srcRow, targetRow, retryCount); + retryCount = maxAttempts; + } catch (Exception e) { + logger.error("Could not perform diff for Key: {} -- Retry# {}", getKey(srcRow), retryCount, e); + if (retryCount == maxAttempts) { + logFailedRecordInFile(srcRow); + } + } + } + } + srcToTargetRowMap.clear(); + } + + public synchronized void printCounts(boolean isFinal) { + String msg = "ThreadID: " + Thread.currentThread().getId(); + if (isFinal) { + msg += " Final"; + logger.info( + 
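getDataAndDiff() and diffAndClear() above share the same retry shape: attempt up to a configured maximum, break out of the loop on success, and only record the failure (token range or row key) to an exception file once the final attempt has also failed. A generic sketch of that control flow, with illustrative names:

import java.util.function.Consumer;

// Generic bounded-retry sketch: stop on the first success; hand the last exception
// to a recorder (e.g. append the token range or row key to an exception file) only
// when every attempt has failed.
public final class BoundedRetry {

    public static boolean run(int maxAttempts, Runnable task, Consumer<Exception> onFinalFailure) {
        for (int attempt = 1; attempt <= maxAttempts; attempt++) {
            try {
                task.run();
                return true;                     // success: no further attempts
            } catch (Exception e) {
                if (attempt == maxAttempts) {
                    onFinalFailure.accept(e);    // e.g. record the failed partition or key
                }
            }
        }
        return false;
    }

    private BoundedRetry() {
    }
}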
"################################################################################################"); + logger.info("{} Read Record Count: {}", msg, readCounter.get()); + logger.info("{} Mismatch Record Count: {}", msg, mismatchCounter.get()); + logger.info("{} Corrected Mismatch Record Count: {}", msg, correctedMismatchCounter.get()); + logger.info("{} Missing Record Count: {}", msg, missingCounter.get()); + logger.info("{} Corrected Missing Record Count: {}", msg, correctedMissingCounter.get()); + logger.info("{} Valid Record Count: {}", msg, validCounter.get()); + logger.info("{} Skipped Record Count: {}", msg, skippedCounter.get()); + logger.info("{} Failed row Count: {}", msg, failedRowCounter.get()); + } + logger.debug("{} Read Record Count: {}", msg, readCounter.get()); + logger.debug("{} Mismatch Record Count: {}", msg, mismatchCounter.get()); + logger.debug("{} Corrected Mismatch Record Count: {}", msg, correctedMismatchCounter.get()); + logger.debug("{} Missing Record Count: {}", msg, missingCounter.get()); + logger.debug("{} Corrected Missing Record Count: {}", msg, correctedMissingCounter.get()); + logger.debug("{} Valid Record Count: {}", msg, validCounter.get()); + logger.debug("{} Skipped Record Count: {}", msg, skippedCounter.get()); + logger.debug("{} Failed row Count: {}", msg, failedRowCounter.get()); + if (isFinal) { + logger.info( + "################################################################################################"); + } + } + + private void diff(Row sourceRow, Row astraRow, int retry) { + if (astraRow == null) { + if (retry == 1) { + missingCounter.incrementAndGet(); + logger.error("Missing target row found for key: {}", getKey(sourceRow)); + } + if (autoCorrectMissing) { + astraSession.execute(bindInsert(astraInsertStatement, sourceRow, null)); + correctedMissingCounter.incrementAndGet(); + logger.error("Inserted missing row in target: {}", getKey(sourceRow)); + } + return; + } + + String diffData = isDifferent(sourceRow, astraRow); + if (!diffData.isEmpty()) { + if (retry == 1) { + mismatchCounter.incrementAndGet(); + logger.error("Mismatch row found for key: {} Mismatch: {}", getKey(sourceRow), diffData); + } + if (autoCorrectMismatch) { + if (isCounterTable) { + astraSession.execute(bindInsert(astraInsertStatement, sourceRow, astraRow)); + } else { + if (writeTimeStampCols.isEmpty()) { + checkAndUpdateForComplexTypeCols(sourceRow, astraRow); + } else { + astraSession.execute(bindInsert(astraInsertStatement, sourceRow, null)); + logger.error("Updated mismatch row in target: {}", getKey(sourceRow)); + } + correctedMismatchCounter.incrementAndGet(); + } + } + return; + } + + validCounter.incrementAndGet(); + } + + /** + * Use Case: Program fetches data from source Cassandra and stores it in memory + * and compare one record at a time with target Cassandra. During comparison if + * data is updated on both source and target databases on finding diff program + * will override old data from source over target. Use Case specific to complex + * tables having only set, map, list, UDT etc as column type for non primary + * columns. 
+ * + * @param sourceRow + * @param astraRow + * @param diffData + */ + private void checkAndUpdateForComplexTypeCols(Row sourceRow, Row astraRow) { + Row latestRow = sourceSession.execute(selectFromAstra(sourceSelectLatestStatement, sourceRow)).one(); + String diffData = isDifferent(latestRow, astraRow); + if (!diffData.isEmpty()) { + logger.error("Mismatch found even after matching with latest record found for key: {} Mismatch: {}", + getKey(latestRow), diffData); + astraSession.execute(bindInsert(astraInsertStatement, latestRow, null)); + logger.error("Updated mismatch row in target: {}", getKey(sourceRow)); + } else { + logger.info("No mismatch after matching with latest record for key: {} ", getKey(latestRow)); + } + } + + private String isDifferent(Row sourceRow, Row astraRow) { + StringBuffer diffData = new StringBuffer(); + IntStream.range(0, selectColTypes.size()).parallel().forEach(index -> { + MigrateDataType dataType = selectColTypes.get(index); + Object source = getData(dataType, index, sourceRow); + Object astra = getData(dataType, index, astraRow); + + boolean isDiff = dataType.diff(source, astra); + if (isDiff) { + if (dataType.typeClass.equals(UdtValue.class)) { + String sourceUdtContent = ((UdtValue) source).getFormattedContents(); + String astraUdtContent = ((UdtValue) astra).getFormattedContents(); + if (!sourceUdtContent.equals(astraUdtContent)) { + diffData.append("(Index: " + index + " Origin: " + sourceUdtContent + " Target: " + + astraUdtContent + ") "); + } + } else { + diffData.append("(Index: " + index + " Origin: " + source + " Target: " + astra + ") "); + } + } + }); + + return diffData.toString(); + } + +} \ No newline at end of file diff --git a/src/main/java/datastax/astra/migrate/MigrateDataType.java b/src/main/java/datastax/astra/migrate/MigrateDataType.java index bf9c391c..aab8ea2d 100644 --- a/src/main/java/datastax/astra/migrate/MigrateDataType.java +++ b/src/main/java/datastax/astra/migrate/MigrateDataType.java @@ -1,94 +1,92 @@ -package datastax.astra.migrate; - -import com.datastax.oss.driver.api.core.data.TupleValue; -import com.datastax.oss.driver.api.core.data.UdtValue; - -import java.math.BigDecimal; -import java.math.BigInteger; -import java.nio.ByteBuffer; -import java.time.Instant; -import java.time.LocalDate; -import java.time.LocalTime; -import java.util.*; - -public class MigrateDataType { - Class typeClass = Object.class; - List subTypes = new ArrayList(); - - public MigrateDataType(String dataType) { - if (dataType.contains("%")) { - int count = 1; - for (String type : dataType.split("%")) { - if (count == 1) { - typeClass = getType(Integer.parseInt(type)); - } else { - subTypes.add(getType(Integer.parseInt(type))); - } - count++; - } - } else { - int type = Integer.parseInt(dataType); - typeClass = getType(type); - } - } - - public boolean diff(Object source, Object astra) { - if (source == null && astra == null) { - return false; - } else if (source == null && astra != null) { - return true; - } else if (source != null && astra == null) { - return true; - } - - return !source.equals(astra); - } - - private Class getType(int type) { - switch (type) { - case 0: - return String.class; - case 1: - return Integer.class; - case 2: - return Long.class; - case 3: - return Double.class; - case 4: - return Instant.class; - case 5: - return Map.class; - case 6: - return List.class; - case 7: - return ByteBuffer.class; - case 8: - return Set.class; - case 9: - return UUID.class; - case 10: - return Boolean.class; - case 11: - return 
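checkAndUpdateForComplexTypeCols() above guards against clobbering a concurrently updated row: before correcting a mismatch on a collection/UDT-only table it re-reads the latest origin copy and only upserts if that fresh copy still differs from the target. A compact sketch of the idea, with the bound statements and the column comparison assumed to be prepared elsewhere:

import java.util.function.BiPredicate;

import com.datastax.oss.driver.api.core.CqlSession;
import com.datastax.oss.driver.api.core.cql.BoundStatement;
import com.datastax.oss.driver.api.core.cql.Row;

// Illustrative "re-check before overwrite": re-read the origin row and only push
// it to the target if the fresh copy still differs from the target copy.
public final class RecheckBeforeOverwrite {

    public static void reconcile(CqlSession origin, CqlSession target,
                                 BoundStatement reReadLatestFromOrigin,
                                 BoundStatement upsertIntoTarget,
                                 Row targetRow,
                                 BiPredicate<Row, Row> differs) {
        Row latestOriginRow = origin.execute(reReadLatestFromOrigin).one();
        if (latestOriginRow != null && differs.test(latestOriginRow, targetRow)) {
            target.execute(upsertIntoTarget);   // origin still wins: overwrite the target copy
        }
    }

    private RecheckBeforeOverwrite() {
    }
}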
TupleValue.class; - case 12: - return Float.class; - case 13: - return Byte.class; - case 14: - return BigDecimal.class; - case 15: - return LocalDate.class; - case 16: - return UdtValue.class; - case 17: - return BigInteger.class; - case 18: - return LocalTime.class; - case 19: - return Short.class; - } - - return Object.class; - } - -} +package datastax.astra.migrate; + +import com.datastax.oss.driver.api.core.data.TupleValue; +import com.datastax.oss.driver.api.core.data.UdtValue; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.time.Instant; +import java.time.LocalDate; +import java.util.*; + +public class MigrateDataType { + Class typeClass = Object.class; + List subTypes = new ArrayList(); + + public MigrateDataType(String dataType) { + if (dataType.contains("%")) { + int count = 1; + for (String type : dataType.split("%")) { + if (count == 1) { + typeClass = getType(Integer.parseInt(type)); + } else { + subTypes.add(getType(Integer.parseInt(type))); + } + count++; + } + } else { + int type = Integer.parseInt(dataType); + typeClass = getType(type); + } + } + + public boolean diff(Object source, Object astra) { + if (source == null && astra == null) { + return false; + } else if (source == null && astra != null) { + return true; + } else if (source != null && astra == null) { + return true; + } else if( source instanceof BigDecimal && astra instanceof BigDecimal ) { + BigDecimal sourceConverted = (BigDecimal) source; + BigDecimal astraConverted = (BigDecimal) astra; + return !(sourceConverted.compareTo(astraConverted) == 0); + } + return !source.equals(astra); + } + + private Class getType(int type) { + switch (type) { + case 0: + return String.class; + case 1: + return Integer.class; + case 2: + return Long.class; + case 3: + return Double.class; + case 4: + return Instant.class; + case 5: + return Map.class; + case 6: + return List.class; + case 7: + return ByteBuffer.class; + case 8: + return Set.class; + case 9: + return UUID.class; + case 10: + return Boolean.class; + case 11: + return TupleValue.class; + case 12: + return Float.class; + case 13: + return Byte.class; + case 14: + return BigDecimal.class; + case 15: + return LocalDate.class; + case 16: + return UdtValue.class; + case 17: + return BigInteger.class; + } + + return Object.class; + } + +} diff --git a/src/main/java/datastax/astra/migrate/OriginCountJobSession.java b/src/main/java/datastax/astra/migrate/OriginCountJobSession.java index 10fd422d..53fa9f64 100644 --- a/src/main/java/datastax/astra/migrate/OriginCountJobSession.java +++ b/src/main/java/datastax/astra/migrate/OriginCountJobSession.java @@ -1,160 +1,163 @@ -package datastax.astra.migrate; - -import com.datastax.oss.driver.api.core.CqlSession; -import com.datastax.oss.driver.api.core.cql.*; -import com.datastax.oss.driver.shaded.guava.common.util.concurrent.RateLimiter; -import org.apache.commons.lang.SerializationUtils; -import org.apache.spark.SparkConf; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.Serializable; -import java.math.BigInteger; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; -import java.util.concurrent.CompletionStage; -import java.util.concurrent.atomic.AtomicLong; - -public class OriginCountJobSession extends BaseJobSession { - private static OriginCountJobSession originCountJobSession; - public Logger logger = LoggerFactory.getLogger(this.getClass().getName()); - protected AtomicLong readCounter = new AtomicLong(0); - 
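The new BigDecimal branch in diff() above compares numerically with compareTo() instead of equals(), because BigDecimal.equals() is scale-sensitive and would flag the same value stored with a different scale as a mismatch. A two-line illustration:

import java.math.BigDecimal;

// Why the BigDecimal special case exists: equals() considers scale, compareTo() does not.
public class BigDecimalDiffDemo {
    public static void main(String[] args) {
        BigDecimal origin = new BigDecimal("1.10");
        BigDecimal target = new BigDecimal("1.1");

        System.out.println(origin.equals(target));           // false -- scales differ (2 vs 1)
        System.out.println(origin.compareTo(target) == 0);   // true  -- numerically equal
    }
}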
protected List updateSelectMapping = new ArrayList(); - protected Boolean checkTableforColSize; - protected String checkTableforselectCols; - protected Integer fieldGuardraillimitMB; - protected List checkTableforColSizeTypes = new ArrayList(); - - protected OriginCountJobSession(CqlSession sourceSession, SparkConf sc) { - super(sc); - this.sourceSession = sourceSession; - batchSize = new Integer(sc.get("spark.batchSize", "1")); - printStatsAfter = new Integer(sc.get("spark.printStatsAfter", "100000")); - if (printStatsAfter < 1) { - printStatsAfter = 100000; - } - - readLimiter = RateLimiter.create(new Integer(sc.get("spark.readRateLimit", "20000"))); - sourceKeyspaceTable = sc.get("spark.origin.keyspaceTable"); - - hasRandomPartitioner = Boolean.parseBoolean(sc.get("spark.origin.hasRandomPartitioner", "false")); - isCounterTable = Boolean.parseBoolean(sc.get("spark.counterTable", "false")); - - checkTableforColSize = Boolean.parseBoolean(sc.get("spark.origin.checkTableforColSize", "false")); - checkTableforselectCols = sc.get("spark.origin.checkTableforColSize.cols"); - checkTableforColSizeTypes = getTypes(sc.get("spark.origin.checkTableforColSize.cols.types")); - filterColName = Util.getSparkPropOrEmpty(sc, "spark.origin.FilterColumn"); - filterColType = Util.getSparkPropOrEmpty(sc, "spark.origin.FilterColumnType"); - filterColIndex = Integer.parseInt(sc.get("spark.origin.FilterColumnIndex", "0")); - fieldGuardraillimitMB = Integer.parseInt(sc.get("spark.fieldGuardraillimitMB", "0")); - - String partionKey = sc.get("spark.query.cols.partitionKey"); - idColTypes = getTypes(sc.get("spark.query.cols.id.types")); - - String selectCols = sc.get("spark.query.cols.select"); - String updateSelectMappingStr = sc.get("spark.counterTable.cql.index", "0"); - for (String updateSelectIndex : updateSelectMappingStr.split(",")) { - updateSelectMapping.add(Integer.parseInt(updateSelectIndex)); - } - String sourceSelectCondition = sc.get("spark.query.cols.select.condition", ""); - sourceSelectStatement = sourceSession.prepare( - "select " + selectCols + " from " + sourceKeyspaceTable + " where token(" + partionKey.trim() - + ") >= ? and token(" + partionKey.trim() + ") <= ? " + sourceSelectCondition + " ALLOW FILTERING"); - } - - public static OriginCountJobSession getInstance(CqlSession sourceSession, SparkConf sparkConf) { - if (originCountJobSession == null) { - synchronized (OriginCountJobSession.class) { - if (originCountJobSession == null) { - originCountJobSession = new OriginCountJobSession(sourceSession, sparkConf); - } - } - } - - return originCountJobSession; - } - - public void getData(BigInteger min, BigInteger max) { - logger.info("ThreadID: {} Processing min: {} max: {}", Thread.currentThread().getId(), min, max); - boolean done = false; - int maxAttempts = maxRetries + 1; - for (int attempts = 1; attempts <= maxAttempts && !done; attempts++) { - try { - ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(hasRandomPartitioner ? - min : min.longValueExact(), hasRandomPartitioner ? 
max : max.longValueExact()) - .setConsistencyLevel(readConsistencyLevel).setPageSize(fetchSizeInRows)); - - Collection> writeResults = new ArrayList>(); - - // cannot do batching if the writeFilter is greater than 0 or - // maxWriteTimeStampFilter is less than max long - // do not batch for counters as it adds latency & increases chance of discrepancy - if (batchSize == 1 || writeTimeStampFilter || isCounterTable) { - for (Row sourceRow : resultSet) { - readLimiter.acquire(1); - - if (checkTableforColSize) { - int rowColcnt = GetRowColumnLength(sourceRow, filterColType, filterColIndex); - String result = ""; - if (rowColcnt > fieldGuardraillimitMB * 1048576) { - for (int index = 0; index < checkTableforColSizeTypes.size(); index++) { - MigrateDataType dataType = checkTableforColSizeTypes.get(index); - Object colData = getData(dataType, index, sourceRow); - String[] colName = checkTableforselectCols.split(","); - result = result + " - " + colName[index] + " : " + colData; - } - logger.error("ThreadID: {}{} - {} length: {}", Thread.currentThread().getId(), result, filterColName, rowColcnt); - continue; - } - } - } - } else { - BatchStatement batchStatement = BatchStatement.newInstance(BatchType.UNLOGGED); - for (Row sourceRow : resultSet) { - readLimiter.acquire(1); - writeLimiter.acquire(1); - - if (checkTableforColSize) { - int rowColcnt = GetRowColumnLength(sourceRow, filterColType, filterColIndex); - String result = ""; - if (rowColcnt > fieldGuardraillimitMB * 1048576) { - for (int index = 0; index < checkTableforColSizeTypes.size(); index++) { - MigrateDataType dataType = checkTableforColSizeTypes.get(index); - Object colData = getData(dataType, index, sourceRow); - String[] colName = checkTableforselectCols.split(","); - result = result + " - " + colName[index] + " : " + colData; - } - logger.error("ThreadID: {}{} - {} length: {}", Thread.currentThread().getId(), result, filterColName, rowColcnt); - continue; - } - } - - if (readCounter.incrementAndGet() % 1000 == 0) { - logger.info("ThreadID: {} Read Record Count: {}", Thread.currentThread().getId(), readCounter.get()); - } - - } - } - - logger.info("ThreadID: {} Final Read Record Count: {}", Thread.currentThread().getId(), readCounter.get()); - done = true; - } catch (Exception e) { - logger.error("Error occurred during Attempt#: {}", attempts, e); - logger.error("Error with PartitionRange -- ThreadID: {} Processing min: {} max: {} -- Attempt# {}", - Thread.currentThread().getId(), min, max, attempts); - } - } - } - - private int GetRowColumnLength(Row sourceRow, String filterColType, Integer filterColIndex) { - int sizeInMB = 0; - Object colData = getData(new MigrateDataType(filterColType), filterColIndex, sourceRow); - byte[] colBytes = SerializationUtils.serialize((Serializable) colData); - sizeInMB = colBytes.length; - if (sizeInMB > fieldGuardraillimitMB) - return sizeInMB; - return sizeInMB; - } - -} +package datastax.astra.migrate; + +import com.datastax.oss.driver.api.core.CqlSession; +import com.datastax.oss.driver.api.core.cql.*; +import com.datastax.oss.driver.shaded.guava.common.util.concurrent.RateLimiter; +import org.apache.commons.lang.SerializationUtils; +import org.apache.spark.SparkConf; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Serializable; +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.concurrent.CompletionStage; +import java.util.concurrent.atomic.AtomicLong; + +public class 
OriginCountJobSession extends BaseJobSession { + private static OriginCountJobSession originCountJobSession; + public Logger logger = LoggerFactory.getLogger(this.getClass().getName()); + protected AtomicLong readCounter = new AtomicLong(0); + protected List updateSelectMapping = new ArrayList(); + protected Boolean checkTableforColSize; + protected String checkTableforselectCols; + protected Integer fieldGuardraillimitMB; + protected List checkTableforColSizeTypes = new ArrayList(); + + protected OriginCountJobSession(CqlSession sourceSession, SparkConf sc) { + super(sc); + this.sourceSession = sourceSession; + batchSize = new Integer(sc.get("spark.batchSize", "1")); + printStatsAfter = new Integer(sc.get("spark.printStatsAfter", "100000")); + if (printStatsAfter < 1) { + printStatsAfter = 100000; + } + + readLimiter = RateLimiter.create(new Integer(sc.get("spark.readRateLimit", "20000"))); + sourceKeyspaceTable = sc.get("spark.origin.keyspaceTable"); + + hasRandomPartitioner = Boolean.parseBoolean(sc.get("spark.origin.hasRandomPartitioner", "false")); + isCounterTable = Boolean.parseBoolean(sc.get("spark.counterTable", "false")); + + checkTableforColSize = Boolean.parseBoolean(sc.get("spark.origin.checkTableforColSize", "false")); + checkTableforselectCols = sc.get("spark.origin.checkTableforColSize.cols"); + checkTableforColSizeTypes = getTypes(sc.get("spark.origin.checkTableforColSize.cols.types")); + filterColName = Util.getSparkPropOrEmpty(sc, "spark.origin.FilterColumn"); + filterColType = Util.getSparkPropOrEmpty(sc, "spark.origin.FilterColumnType"); + filterColIndex = Integer.parseInt(sc.get("spark.origin.FilterColumnIndex", "0")); + fieldGuardraillimitMB = Integer.parseInt(sc.get("spark.fieldGuardraillimitMB", "0")); + + String partionKey = sc.get("spark.query.cols.partitionKey"); + idColTypes = getTypes(sc.get("spark.query.cols.id.types")); + + String selectCols = sc.get("spark.query.cols.select"); + String updateSelectMappingStr = sc.get("spark.counterTable.cql.index", "0"); + for (String updateSelectIndex : updateSelectMappingStr.split(",")) { + updateSelectMapping.add(Integer.parseInt(updateSelectIndex)); + } + String sourceSelectCondition = sc.get("spark.query.cols.select.condition", ""); + sourceSelectStatement = sourceSession.prepare( + "select " + selectCols + " from " + sourceKeyspaceTable + " where token(" + partionKey.trim() + + ") >= ? and token(" + partionKey.trim() + ") <= ? " + sourceSelectCondition + " ALLOW FILTERING"); + + } + + public static OriginCountJobSession getInstance(CqlSession sourceSession, SparkConf sparkConf) { + if (originCountJobSession == null) { + synchronized (OriginCountJobSession.class) { + if (originCountJobSession == null) { + originCountJobSession = new OriginCountJobSession(sourceSession, sparkConf); + } + } + } + + return originCountJobSession; + } + + public void getData(BigInteger min, BigInteger max) { + logger.info("ThreadID: {} Processing min: {} max: {}", Thread.currentThread().getId(), min, max); + int maxAttempts = maxRetries; + for (int retryCount = 1; retryCount <= maxAttempts; retryCount++) { + + try { + ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(hasRandomPartitioner ? + min : min.longValueExact(), hasRandomPartitioner ? 
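The constructor above prepares a full-scan SELECT bounded by token() on the partition key, so each Spark split reads a disjoint token slice. A small sketch of how that CQL string is assembled (the column and table names are illustrative, not taken from any particular config):

// Illustrative assembly of the token-range scan query prepared above.
public class TokenRangeQueryDemo {
    public static void main(String[] args) {
        String selectCols = "key, col1, col2";
        String keyspaceTable = "ks.table1";
        String partitionKey = "key";

        String cql = "select " + selectCols + " from " + keyspaceTable
                + " where token(" + partitionKey + ") >= ? and token(" + partitionKey + ") <= ? ALLOW FILTERING";

        // Bound later with a split's min/max token values, one pair per Spark task.
        System.out.println(cql);
    }
}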
max : max.longValueExact()) + .setConsistencyLevel(readConsistencyLevel).setPageSize(fetchSizeInRows)); + + Collection> writeResults = new ArrayList>(); + + // cannot do batching if the writeFilter is greater than 0 or + // maxWriteTimeStampFilter is less than max long + // do not batch for counters as it adds latency & increases chance of discrepancy + if (batchSize == 1 || writeTimeStampFilter || isCounterTable) { + for (Row sourceRow : resultSet) { + readLimiter.acquire(1); + + if (checkTableforColSize) { + int rowColcnt = GetRowColumnLength(sourceRow, filterColType, filterColIndex); + String result = ""; + if (rowColcnt > fieldGuardraillimitMB * 1048576) { + for (int index = 0; index < checkTableforColSizeTypes.size(); index++) { + MigrateDataType dataType = checkTableforColSizeTypes.get(index); + Object colData = getData(dataType, index, sourceRow); + String[] colName = checkTableforselectCols.split(","); + result = result + " - " + colName[index] + " : " + colData; + } + logger.error("ThreadID: {}{} - {} length: {}", Thread.currentThread().getId(), result, filterColName, rowColcnt); + continue; + } + } + } + + } else { + BatchStatement batchStatement = BatchStatement.newInstance(BatchType.UNLOGGED); + for (Row sourceRow : resultSet) { + readLimiter.acquire(1); + writeLimiter.acquire(1); + + if (checkTableforColSize) { + int rowColcnt = GetRowColumnLength(sourceRow, filterColType, filterColIndex); + String result = ""; + if (rowColcnt > fieldGuardraillimitMB * 1048576) { + for (int index = 0; index < checkTableforColSizeTypes.size(); index++) { + MigrateDataType dataType = checkTableforColSizeTypes.get(index); + Object colData = getData(dataType, index, sourceRow); + String[] colName = checkTableforselectCols.split(","); + result = result + " - " + colName[index] + " : " + colData; + } + logger.error("ThreadID: {}{} - {} length: {}", Thread.currentThread().getId(), result, filterColName, rowColcnt); + continue; + } + } + + if (readCounter.incrementAndGet() % 1000 == 0) { + logger.info("ThreadID: {} Read Record Count: {}", Thread.currentThread().getId(), readCounter.get()); + } + + } + } + + logger.info("ThreadID: {} Final Read Record Count: {}", Thread.currentThread().getId(), readCounter.get()); + retryCount = maxAttempts; + } catch (Exception e) { + logger.error("Error occurred retry#: {}", retryCount, e); + logger.error("Error with PartitionRange -- ThreadID: {} Processing min: {} max: {} -- Retry# {}", + Thread.currentThread().getId(), min, max, retryCount); + } + } + + } + + private int GetRowColumnLength(Row sourceRow, String filterColType, Integer filterColIndex) { + int sizeInMB = 0; + Object colData = getData(new MigrateDataType(filterColType), filterColIndex, sourceRow); + byte[] colBytes = SerializationUtils.serialize((Serializable) colData); + sizeInMB = colBytes.length; + if (sizeInMB > fieldGuardraillimitMB) + return sizeInMB; + return sizeInMB; + } + +} diff --git a/src/main/java/datastax/astra/migrate/SplitPartitions.java b/src/main/java/datastax/astra/migrate/SplitPartitions.java index fc8f8895..5127ce5a 100644 --- a/src/main/java/datastax/astra/migrate/SplitPartitions.java +++ b/src/main/java/datastax/astra/migrate/SplitPartitions.java @@ -1,161 +1,240 @@ -package datastax.astra.migrate; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.Serializable; -import java.math.BigInteger; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import 
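GetRowColumnLength() above estimates a column's size by Java-serializing its value and measuring the resulting byte array, and getData() compares that against the guardrail limit given in MB (1 MB = 1048576 bytes). A standalone sketch of that check, with an illustrative helper name and sample value:

import java.io.Serializable;

import org.apache.commons.lang.SerializationUtils;

// Illustrative guardrail check: serialize the column value and compare its length
// against a limit expressed in MB.
public class ColumnSizeGuardrail {

    public static boolean exceedsLimit(Serializable columnValue, int limitMB) {
        byte[] bytes = SerializationUtils.serialize(columnValue);
        return bytes.length > limitMB * 1048576;
    }

    public static void main(String[] args) {
        String twoMbOfText = new String(new char[2 * 1048576]).replace('\0', 'x');
        System.out.println(exceedsLimit(twoMbOfText, 1));   // true: over a 1 MB guardrail
    }
}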
java.util.List; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import java.util.stream.Stream; - -public class SplitPartitions { - - public final static Long MIN_PARTITION = Long.MIN_VALUE; - public final static Long MAX_PARTITION = Long.MAX_VALUE; - public static Logger logger = LoggerFactory.getLogger(SplitPartitions.class.getName()); - - public static void main(String[] args) throws IOException { - Collection partitions = getSubPartitions(2, BigInteger.valueOf(1), - BigInteger.valueOf(1000), 100); -// Collection partitions = getSubPartitionsFromFile(3); - for (Partition partition : partitions) { - System.out.println(partition); - } - } - - public static Collection getRandomSubPartitions(int numSplits, BigInteger min, BigInteger max, int coveragePercent) { - logger.info("ThreadID: {} Splitting min: {} max: {}", Thread.currentThread().getId(), min, max); - List partitions = getSubPartitions(numSplits, min, max, coveragePercent); - Collections.shuffle(partitions); - Collections.shuffle(partitions); - Collections.shuffle(partitions); - Collections.shuffle(partitions); - return partitions; - } - - public static List getSubPartitionsFromFile(int numSplits) throws IOException { - logger.info("ThreadID: {} Splitting partitions in file: ./partitions.csv using a split-size of {}" - , Thread.currentThread().getId(), numSplits); - List partitions = new ArrayList(); - BufferedReader reader = Util.getfileReader("./partitions.csv"); - String line = null; - while ((line = reader.readLine()) != null) { - if (line.startsWith("#")) { - continue; - } - String[] minMax = line.split(","); - try { - partitions.addAll(getSubPartitions(numSplits, new BigInteger(minMax[0]), new BigInteger(minMax[1]), 100)); - } catch (Exception e) { - logger.error("Skipping partition: {}", line, e); - } - } - - return partitions; - } - - public static List getRowPartsFromFile(int numSplits) throws IOException { - logger.info("ThreadID: {} Splitting rows in file: ./primary_key_rows.csv using a split-size of {}" - , Thread.currentThread().getId(), numSplits); - List pkRows = new ArrayList(); - BufferedReader reader = Util.getfileReader("./primary_key_rows.csv"); - String pkRow = null; - while ((pkRow = reader.readLine()) != null) { - if (pkRow.startsWith("#")) { - continue; - } - pkRows.add(pkRow); - } - int partSize = pkRows.size() / numSplits; - if (partSize == 0) { - partSize = pkRows.size(); - } - return batches(pkRows, partSize).map(l -> (new PKRows(l))).collect(Collectors.toList()); - } - - public static Stream> batches(List source, int length) { - if (length <= 0) - throw new IllegalArgumentException("length = " + length); - int size = source.size(); - if (size <= 0) - return Stream.empty(); - int fullChunks = (size - 1) / length; - return IntStream.range(0, fullChunks + 1).mapToObj( - n -> source.subList(n * length, n == fullChunks ? 
size : (n + 1) * length)); - } - - private static List getSubPartitions(int numSplits, BigInteger min, BigInteger max, int coveragePercent) { - if (coveragePercent < 1 || coveragePercent > 100) { - coveragePercent = 100; - } - BigInteger curMax = new BigInteger(min.toString()); - BigInteger partitionSize = max.subtract(min).divide(BigInteger.valueOf(numSplits)); - List partitions = new ArrayList(); - if (partitionSize.compareTo(new BigInteger("0")) == 0) { - partitionSize = new BigInteger("100000"); - } - boolean exausted = false; - while (curMax.compareTo(max) <= 0) { - BigInteger curMin = new BigInteger(curMax.toString()); - BigInteger newCurMax = curMin.add(partitionSize); - if (newCurMax.compareTo(curMax) == -1) { - newCurMax = new BigInteger(max.toString()); - exausted = true; - } - if (newCurMax.compareTo(max) == 1) { - newCurMax = new BigInteger(max.toString()); - exausted = true; - } - curMax = newCurMax; - - BigInteger range = curMax.subtract(curMin); - BigInteger curRange = range.multiply(BigInteger.valueOf(coveragePercent)).divide(BigInteger.valueOf(100)); - partitions.add(new Partition(curMin, curMin.add(curRange))); - if (exausted) { - break; - } - curMax = curMax.add(BigInteger.ONE); - } - - return partitions; - } - - public static class PKRows implements Serializable { - List pkRows; - - public PKRows(List rows) { - pkRows = rows; - } - } - - public static class Partition implements Serializable { - private static final long serialVersionUID = 1L; - - private BigInteger min; - private BigInteger max; - - public Partition(BigInteger min, BigInteger max) { - this.min = min; - this.max = max; - } - - public BigInteger getMin() { - return min; - } - - public BigInteger getMax() { - return max; - } - - public String toString() { - return "Processing partition for token range " + min + " to " + max; - } - } +package datastax.astra.migrate; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.Serializable; +import java.math.BigInteger; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +public class SplitPartitions { + + public final static Long MIN_PARTITION = Long.MIN_VALUE; + public final static Long MAX_PARTITION = Long.MAX_VALUE; + public static Logger logger = LoggerFactory.getLogger(SplitPartitions.class.getName()); + + public static void main(String[] args) throws IOException { + Collection partitions = getSubPartitions(2, BigInteger.valueOf(1), + BigInteger.valueOf(1000), 100); +// Collection partitions = getSubPartitionsFromFile(3); + for (Partition partition : partitions) { + System.out.println(partition); + } + } + + public static Collection getRandomSubPartitions(int splitSize, BigInteger min, BigInteger max, int coveragePercent) { + logger.info("ThreadID: {} Splitting min: {} max: {}", Thread.currentThread().getId(), min, max); + List partitions = getSubPartitions(splitSize, min, max, coveragePercent); + Collections.shuffle(partitions); + Collections.shuffle(partitions); + Collections.shuffle(partitions); + Collections.shuffle(partitions); + return partitions; + } + + public static List getFailedSubPartitionsFromFile(int splitSize, String tokenRangeFile) throws IOException { + logger.info("ThreadID: {} Splitting partitions in file: {} 
using a split-size of {}" + , Thread.currentThread().getId(), tokenRangeFile, splitSize); + + File file = new File(tokenRangeFile); + String renamedFile = tokenRangeFile+"_bkp"; + File rename = new File(renamedFile); + if(rename.exists()) { + rename.delete(); + } + boolean flag = file.renameTo(rename); + if (flag == true) { + logger.info("File Successfully Renamed to : "+renamedFile); + } + else { + logger.info("Operation Failed to rename file : "+tokenRangeFile); + } + + List partitions = new ArrayList(); + BufferedReader reader = Util.getfileReader(renamedFile); + String line = null; + while ((line = reader.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + String[] minMax = line.split(","); + try { + partitions.addAll(getSubPartitions(splitSize, new BigInteger(minMax[0]), new BigInteger(minMax[1]), 100)); + } catch (Exception e) { + logger.error("Skipping partition: {}", line, e); + } + } + + return partitions; + } + + public static List getSubPartitionsFromFile(int splitSize, String tokenRangeFile) throws IOException { + logger.info("ThreadID: {} Splitting partitions in file: {} using a split-size of {}" + , Thread.currentThread().getId(), tokenRangeFile, splitSize); + List partitions = new ArrayList(); + BufferedReader reader = Util.getfileReader(tokenRangeFile); + String line = null; + while ((line = reader.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + String[] minMax = line.split(","); + try { + partitions.addAll(getSubPartitions(splitSize, new BigInteger(minMax[0]), new BigInteger(minMax[1]), 100)); + } catch (Exception e) { + logger.error("Skipping partition: {}", line, e); + } + } + + return partitions; + } + + public static List getFailedRowPartsFromFile(int splitSize, long rowFailureFileSizeLimit, String failedRowsFile) throws IOException { + logger.info("ThreadID: {} Splitting rows in file: {} using a split-size of {}" + , Thread.currentThread().getId(), failedRowsFile, splitSize); + + long bytesSize = Files.size(Paths.get(failedRowsFile)); + + if(bytesSize > rowFailureFileSizeLimit) { + throw new RuntimeException("Row failure file size exceeds permissible limit of " + rowFailureFileSizeLimit + " bytes. 
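getFailedSubPartitionsFromFile() above moves the failure file aside to "<name>_bkp" before re-reading it, so the retry run can append a fresh set of failures under the original name without mixing old and new entries. A compact sketch of that rename step (stricter than the job, which only logs a failed rename):

import java.io.File;

// Illustrative rename-to-backup step used before reprocessing a failure file.
public class BackupBeforeRetry {

    public static File moveAside(String failureFilePath) {
        File original = new File(failureFilePath);
        File backup = new File(failureFilePath + "_bkp");
        if (backup.exists()) {
            backup.delete();                 // keep only the most recent backup
        }
        if (!original.renameTo(backup)) {
            throw new IllegalStateException("Could not rename " + failureFilePath + " to " + backup.getName());
        }
        return backup;                       // caller reads the failed entries from the backup copy
    }
}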
Actual file size is " + bytesSize); + } + + String renameFile = failedRowsFile+"_bkp"; + File file = new File(failedRowsFile); + File rename = new File(renameFile); + if(rename.exists()) { + rename.delete(); + } + boolean flag = file.renameTo(rename); + if (flag == true) { + logger.info("File Successfully Renamed to : "+renameFile); + } + else { + logger.info("Operation Failed to rename file : "+failedRowsFile); + } + + List pkRows = new ArrayList(); + BufferedReader reader = Util.getfileReader(renameFile); + String pkRow = null; + while ((pkRow = reader.readLine()) != null) { + if (pkRow.startsWith("#")) { + continue; + } + pkRows.add(pkRow); + } + int partSize = pkRows.size() / splitSize; + if (partSize == 0) { + partSize = pkRows.size(); + } + return batches(pkRows, partSize).map(l -> (new PKRows(l))).collect(Collectors.toList()); + } + + public static List getRowPartsFromFile(int splitSize, String failedRowsFile) throws IOException { + logger.info("ThreadID: {} Splitting rows in file: {} using a split-size of {}" + , Thread.currentThread().getId(), failedRowsFile, splitSize); + List pkRows = new ArrayList(); + BufferedReader reader = Util.getfileReader(failedRowsFile); + String pkRow = null; + while ((pkRow = reader.readLine()) != null) { + if (pkRow.startsWith("#")) { + continue; + } + pkRows.add(pkRow); + } + int partSize = pkRows.size() / splitSize; + if (partSize == 0) { + partSize = pkRows.size(); + } + return batches(pkRows, partSize).map(l -> (new PKRows(l))).collect(Collectors.toList()); + } + + public static Stream> batches(List source, int length) { + if (length <= 0) + throw new IllegalArgumentException("length = " + length); + int size = source.size(); + if (size <= 0) + return Stream.empty(); + int fullChunks = (size - 1) / length; + return IntStream.range(0, fullChunks + 1).mapToObj( + n -> source.subList(n * length, n == fullChunks ? 
size : (n + 1) * length)); + } + + private static List getSubPartitions(int splitSize, BigInteger min, BigInteger max, int coveragePercent) { + if (coveragePercent < 1 || coveragePercent > 100) { + coveragePercent = 100; + } + BigInteger curMax = new BigInteger(min.toString()); + BigInteger partitionSize = max.subtract(min).divide(BigInteger.valueOf(splitSize)); + List partitions = new ArrayList(); + if (partitionSize.compareTo(new BigInteger("0")) == 0) { + partitionSize = new BigInteger("100000"); + } + boolean exausted = false; + while (curMax.compareTo(max) <= 0) { + BigInteger curMin = new BigInteger(curMax.toString()); + BigInteger newCurMax = curMin.add(partitionSize); + if (newCurMax.compareTo(curMax) == -1) { + newCurMax = new BigInteger(max.toString()); + exausted = true; + } + if (newCurMax.compareTo(max) == 1) { + newCurMax = new BigInteger(max.toString()); + exausted = true; + } + curMax = newCurMax; + + BigInteger range = curMax.subtract(curMin); + BigInteger curRange = range.multiply(BigInteger.valueOf(coveragePercent)).divide(BigInteger.valueOf(100)); + partitions.add(new Partition(curMin, curMin.add(curRange))); + if (exausted) { + break; + } + curMax = curMax.add(BigInteger.ONE); + } + + return partitions; + } + + public static class PKRows implements Serializable { + List pkRows; + + public PKRows(List rows) { + pkRows = rows; + } + } + + public static class Partition implements Serializable { + private static final long serialVersionUID = 1L; + + private BigInteger min; + private BigInteger max; + + public Partition(BigInteger min, BigInteger max) { + this.min = min; + this.max = max; + } + + public BigInteger getMin() { + return min; + } + + public BigInteger getMax() { + return max; + } + + public String toString() { + return "Processing partition for token range " + min + " to " + max; + } + } } \ No newline at end of file diff --git a/src/main/java/datastax/astra/migrate/Util.java b/src/main/java/datastax/astra/migrate/Util.java index 112d2056..5c91c1c2 100644 --- a/src/main/java/datastax/astra/migrate/Util.java +++ b/src/main/java/datastax/astra/migrate/Util.java @@ -1,84 +1,127 @@ -package datastax.astra.migrate; - -import com.datastax.oss.driver.api.core.ConsistencyLevel; -import org.apache.commons.lang.StringUtils; -import org.apache.spark.SparkConf; - -import java.io.BufferedReader; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.util.NoSuchElementException; - -public class Util { - - public static String getSparkProp(SparkConf sc, String prop) { - try { - return sc.get(prop); - } catch (NoSuchElementException nse) { - String newProp = prop.replace("origin", "source").replace("target", "destination"); - return sc.get(newProp); - } - } - - public static String getSparkPropOr(SparkConf sc, String prop, String defaultVal) { - try { - return sc.get(prop); - } catch (NoSuchElementException nse) { - String newProp = prop.replace("origin", "source").replace("target", "destination"); - return sc.get(newProp, defaultVal); - } - } - - public static String getSparkPropOrEmpty(SparkConf sc, String prop) { - return getSparkPropOr(sc, prop, ""); - } - - public static BufferedReader getfileReader(String fileName) { - try { - return new BufferedReader(new FileReader(fileName)); - } catch (FileNotFoundException fnfe) { - throw new RuntimeException("No '" + fileName + "' file found!! 
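As a usage sketch for the splitter above (mirroring the example in SplitPartitions.main), dividing the range 1..1000 into two sub-partitions at 100% coverage works out to the slices 1..500 and 501..1000, returned in shuffled order by getRandomSubPartitions:

import java.math.BigInteger;
import java.util.Collection;

import datastax.astra.migrate.SplitPartitions;

// Usage sketch: split 1..1000 into two randomly ordered sub-partitions.
public class SplitDemo {
    public static void main(String[] args) {
        Collection<SplitPartitions.Partition> parts =
                SplitPartitions.getRandomSubPartitions(2, BigInteger.ONE, BigInteger.valueOf(1000), 100);
        for (SplitPartitions.Partition p : parts) {
            System.out.println(p.getMin() + " .. " + p.getMax());   // 1..500 and 501..1000, shuffled
        }
    }
}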
Add this file in the current folder & rerun!"); - } - } - - public static ConsistencyLevel mapToConsistencyLevel(String level) { - ConsistencyLevel retVal = ConsistencyLevel.LOCAL_QUORUM; - if (StringUtils.isNotEmpty(level)) { - switch (level.toUpperCase()) { - case "ANY": - retVal = ConsistencyLevel.ANY; - break; - case "ONE": - retVal = ConsistencyLevel.ONE; - break; - case "TWO": - retVal = ConsistencyLevel.TWO; - break; - case "THREE": - retVal = ConsistencyLevel.THREE; - break; - case "QUORUM": - retVal = ConsistencyLevel.QUORUM; - break; - case "LOCAL_ONE": - retVal = ConsistencyLevel.LOCAL_ONE; - break; - case "EACH_QUORUM": - retVal = ConsistencyLevel.EACH_QUORUM; - break; - case "SERIAL": - retVal = ConsistencyLevel.SERIAL; - break; - case "LOCAL_SERIAL": - retVal = ConsistencyLevel.LOCAL_SERIAL; - break; - case "ALL": - retVal = ConsistencyLevel.ALL; - break; - } - } - - return retVal; - } - -} +package datastax.astra.migrate; + +import com.datastax.oss.driver.api.core.ConsistencyLevel; +import org.apache.commons.lang.StringUtils; +import org.apache.spark.SparkConf; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.util.Date; +import java.util.NoSuchElementException; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardOpenOption; +import java.text.DateFormat; +import java.text.SimpleDateFormat; + +public class Util { + + public static String getSparkProp(SparkConf sc, String prop) { + try { + return sc.get(prop); + } catch (NoSuchElementException nse) { + String newProp = prop.replace("origin", "source").replace("target", "destination"); + return sc.get(newProp); + } + } + + public static String getSparkPropOr(SparkConf sc, String prop, String defaultVal) { + try { + return sc.get(prop); + } catch (NoSuchElementException nse) { + String newProp = prop.replace("origin", "source").replace("target", "destination"); + return sc.get(newProp, defaultVal); + } + } + + public static String getSparkPropOrEmpty(SparkConf sc, String prop) { + return getSparkPropOr(sc, prop, ""); + } + + public static BufferedReader getfileReader(String fileName) { + try { + return new BufferedReader(new FileReader(fileName)); + } catch (FileNotFoundException fnfe) { + throw new RuntimeException("No '" + fileName + "' file found!! 
Add this file in the current folder & rerun!"); + } + } + private static void appendToFile(Path path, String content) + throws IOException { + // if file not exists, create and write to it + // otherwise append to the end of the file + Files.write(path, content.getBytes(StandardCharsets.UTF_8), + StandardOpenOption.CREATE, + StandardOpenOption.APPEND); + } + + private static void writeToFile(Path path, String content) + throws IOException { + // if file not exists, create and write to it + // otherwise override existing file + Files.write(path, content.getBytes(StandardCharsets.UTF_8), + StandardOpenOption.CREATE, + StandardOpenOption.TRUNCATE_EXISTING); + } + + public static ConsistencyLevel mapToConsistencyLevel(String level) { + ConsistencyLevel retVal = ConsistencyLevel.LOCAL_QUORUM; + if (StringUtils.isNotEmpty(level)) { + switch (level.toUpperCase()) { + case "ANY": + retVal = ConsistencyLevel.ANY; + break; + case "ONE": + retVal = ConsistencyLevel.ONE; + break; + case "TWO": + retVal = ConsistencyLevel.TWO; + break; + case "THREE": + retVal = ConsistencyLevel.THREE; + break; + case "QUORUM": + retVal = ConsistencyLevel.QUORUM; + break; + case "LOCAL_ONE": + retVal = ConsistencyLevel.LOCAL_ONE; + break; + case "EACH_QUORUM": + retVal = ConsistencyLevel.EACH_QUORUM; + break; + case "SERIAL": + retVal = ConsistencyLevel.SERIAL; + break; + case "LOCAL_SERIAL": + retVal = ConsistencyLevel.LOCAL_SERIAL; + break; + case "ALL": + retVal = ConsistencyLevel.ALL; + break; + } + } + + return retVal; + } + + private static final String NEW_LINE = System.lineSeparator(); + + public static void FileAppend(String dir, String fileName, String content) throws IOException { + + //create directory if not already existing + Files.createDirectories(Paths.get(dir)); + Path path = Paths.get(dir + "/" + fileName); + appendToFile(path, content + NEW_LINE); + + } + + public final static String getDateTime() + { + DateFormat df = new SimpleDateFormat("yyyy-MM-dd_hh_mm_ss"); + return df.format(new Date()); + } +} diff --git a/src/main/scala/datastax/astra/migrate/AbstractJob.scala b/src/main/scala/datastax/astra/migrate/AbstractJob.scala index cbce4bff..d2760f08 100644 --- a/src/main/scala/datastax/astra/migrate/AbstractJob.scala +++ b/src/main/scala/datastax/astra/migrate/AbstractJob.scala @@ -1,75 +1,70 @@ -package datastax.astra.migrate - -import com.datastax.spark.connector.cql.CassandraConnector -import org.apache.spark.SparkConf - -class AbstractJob extends BaseJob { - - abstractLogger.info("PARAM -- Min Partition: " + minPartition) - abstractLogger.info("PARAM -- Max Partition: " + maxPartition) - abstractLogger.info("PARAM -- Number of Splits : " + numSplits) - abstractLogger.info("PARAM -- Coverage Percent: " + coveragePercent) - abstractLogger.info("PARAM -- Origin SSL Enabled: {}", sourceSSLEnabled); - abstractLogger.info("PARAM -- Target SSL Enabled: {}", destinationSSLEnabled); - - var sourceConnection = getConnection(true, sourceScbPath, sourceHost, sourcePort, sourceUsername, sourcePassword, sourceSSLEnabled, - sourceTrustStorePath, sourceTrustStorePassword, sourceTrustStoreType, sourceKeyStorePath, sourceKeyStorePassword, sourceEnabledAlgorithms); - - var destinationConnection = getConnection(false, destinationScbPath, destinationHost, destinationPort, destinationUsername, destinationPassword, destinationSSLEnabled, - destinationTrustStorePath, destinationTrustStorePassword, destinationTrustStoreType, destinationKeyStorePath, destinationKeyStorePassword, destinationEnabledAlgorithms); - - 
private def getConnection(isSource: Boolean, scbPath: String, host: String, port: String, username: String, password: String, - sslEnabled: String, trustStorePath: String, trustStorePassword: String, trustStoreType: String, - keyStorePath: String, keyStorePassword: String, enabledAlgorithms: String): CassandraConnector = { - var connType: String = "Source" - if (!isSource) { - connType = "Destination" - } - - var config: SparkConf = sContext.getConf - if (scbPath.nonEmpty) { - abstractLogger.info(connType + ": Connecting to Astra using SCB: " + scbPath); - - return CassandraConnector(config - .set("spark.cassandra.auth.username", username) - .set("spark.cassandra.auth.password", password) - .set("spark.cassandra.input.consistency.level", consistencyLevel) - .set("spark.cassandra.connection.config.cloud.path", scbPath)) - } else if (trustStorePath.nonEmpty) { - abstractLogger.info(connType + ": Connecting (with clientAuth) to Cassandra (or DSE) host:port " + host + ":" + port); - - // Use defaults when not provided - var enabledAlgorithmsVar = enabledAlgorithms - if (enabledAlgorithms == null || enabledAlgorithms.trim.isEmpty) { - enabledAlgorithmsVar = "TLS_RSA_WITH_AES_128_CBC_SHA, TLS_RSA_WITH_AES_256_CBC_SHA" - } - - return CassandraConnector(config - .set("spark.cassandra.auth.username", username) - .set("spark.cassandra.auth.password", password) - .set("spark.cassandra.input.consistency.level", consistencyLevel) - .set("spark.cassandra.connection.host", host) - .set("spark.cassandra.connection.port", port) - .set("spark.cassandra.connection.ssl.enabled", "true") - .set("spark.cassandra.connection.ssl.enabledAlgorithms", enabledAlgorithmsVar) - .set("spark.cassandra.connection.ssl.trustStore.password", trustStorePassword) - .set("spark.cassandra.connection.ssl.trustStore.path", trustStorePath) - .set("spark.cassandra.connection.ssl.keyStore.password", keyStorePassword) - .set("spark.cassandra.connection.ssl.keyStore.path", keyStorePath) - .set("spark.cassandra.connection.ssl.trustStore.type", trustStoreType) - .set("spark.cassandra.connection.ssl.clientAuth.enabled", "true") - ) - } else { - abstractLogger.info(connType + ": Connecting to Cassandra (or DSE) host:port " + host + ":" + port); - - return CassandraConnector(config.set("spark.cassandra.auth.username", username) - .set("spark.cassandra.connection.ssl.enabled", sslEnabled) - .set("spark.cassandra.auth.password", password) - .set("spark.cassandra.input.consistency.level", consistencyLevel) - .set("spark.cassandra.connection.host", host) - .set("spark.cassandra.connection.port", port)) - } - - } - -} +package datastax.astra.migrate + +import com.datastax.spark.connector.cql.CassandraConnector +import org.apache.spark.SparkConf + +class AbstractJob extends BaseJob { + + abstractLogger.info("PARAM -- Min Partition: " + minPartition) + abstractLogger.info("PARAM -- Max Partition: " + maxPartition) + abstractLogger.info("PARAM -- Split Size: " + splitSize) + abstractLogger.info("PARAM -- Coverage Percent: " + coveragePercent) + + var sourceConnection = getConnection(true, sourceScbPath, sourceHost, sourceUsername, sourcePassword, + sourceTrustStorePath, sourceTrustStorePassword, sourceTrustStoreType, sourceKeyStorePath, sourceKeyStorePassword, sourceEnabledAlgorithms); + + var destinationConnection = getConnection(false, destinationScbPath, destinationHost, destinationUsername, destinationPassword, + destinationTrustStorePath, destinationTrustStorePassword, destinationTrustStoreType, destinationKeyStorePath, 
destinationKeyStorePassword, destinationEnabledAlgorithms); + + private def getConnection(isSource: Boolean, scbPath: String, host: String, username: String, password: String, + trustStorePath: String, trustStorePassword: String, trustStoreType: String, + keyStorePath: String, keyStorePassword: String, enabledAlgorithms: String): CassandraConnector = { + var connType: String = "Source" + if (!isSource) { + connType = "Destination" + } + + var config: SparkConf = sContext.getConf + if (scbPath.nonEmpty) { + abstractLogger.info(connType + ": Connecting to Astra using SCB: " + scbPath); + + return CassandraConnector(config + .set("spark.cassandra.auth.username", username) + .set("spark.cassandra.auth.password", password) + .set("spark.cassandra.input.consistency.level", consistencyLevel) + .set("spark.cassandra.connection.config.cloud.path", scbPath)) + } else if (trustStorePath.nonEmpty) { + abstractLogger.info(connType + ": Connecting to Cassandra (or DSE) with SSL host: " + host); + + // Use defaults when not provided + var enabledAlgorithmsVar = enabledAlgorithms + if (enabledAlgorithms == null || enabledAlgorithms.trim.isEmpty) { + enabledAlgorithmsVar = "TLS_RSA_WITH_AES_128_CBC_SHA, TLS_RSA_WITH_AES_256_CBC_SHA" + } + + return CassandraConnector(config + .set("spark.cassandra.auth.username", username) + .set("spark.cassandra.auth.password", password) + .set("spark.cassandra.input.consistency.level", consistencyLevel) + .set("spark.cassandra.connection.host", host) + .set("spark.cassandra.connection.ssl.enabled", "true") + .set("spark.cassandra.connection.ssl.enabledAlgorithms", enabledAlgorithmsVar) + .set("spark.cassandra.connection.ssl.trustStore.password", trustStorePassword) + .set("spark.cassandra.connection.ssl.trustStore.path", trustStorePath) + .set("spark.cassandra.connection.ssl.keyStore.password", keyStorePassword) + .set("spark.cassandra.connection.ssl.keyStore.path", keyStorePath) + .set("spark.cassandra.connection.ssl.trustStore.type", trustStoreType) + .set("spark.cassandra.connection.ssl.clientAuth.enabled", "true") + ) + } else { + abstractLogger.info(connType + ": Connecting to Cassandra (or DSE) host: " + host); + + return CassandraConnector(config.set("spark.cassandra.auth.username", username) + .set("spark.cassandra.auth.password", password) + .set("spark.cassandra.input.consistency.level", consistencyLevel) + .set("spark.cassandra.connection.host", host)) + } + + } + +} diff --git a/src/main/scala/datastax/astra/migrate/BaseJob.scala b/src/main/scala/datastax/astra/migrate/BaseJob.scala index 683a902f..1d816652 100644 --- a/src/main/scala/datastax/astra/migrate/BaseJob.scala +++ b/src/main/scala/datastax/astra/migrate/BaseJob.scala @@ -1,63 +1,61 @@ -package datastax.astra.migrate - -import org.apache.spark.sql.SparkSession -import org.slf4j.LoggerFactory - -import java.math.BigInteger - -class BaseJob extends App { - - val abstractLogger = LoggerFactory.getLogger(this.getClass.getName) - val spark = SparkSession.builder - .appName("Cassandra Data Migrator") - .getOrCreate() - abstractLogger.info("################################################################################################") - abstractLogger.info("############################## Cassandra Data Migrator - Starting ##############################") - abstractLogger.info("################################################################################################") - - val sContext = spark.sparkContext - val sc = sContext.getConf - - val consistencyLevel = Util.getSparkPropOr(sc, 
"spark.read.consistency.level", "LOCAL_QUORUM") - - val sourceScbPath = Util.getSparkPropOrEmpty(sc, "spark.origin.scb") - val sourceHost = Util.getSparkPropOrEmpty(sc, "spark.origin.host") - val sourcePort = Util.getSparkPropOr(sc, "spark.origin.port", "9042") - val sourceUsername = Util.getSparkPropOrEmpty(sc, "spark.origin.username") - val sourcePassword = Util.getSparkPropOrEmpty(sc, "spark.origin.password") - val sourceSSLEnabled = Util.getSparkPropOr(sc, "spark.origin.ssl.enabled", "false") - val sourceTrustStorePath = Util.getSparkPropOrEmpty(sc, "spark.origin.trustStore.path") - val sourceTrustStorePassword = Util.getSparkPropOrEmpty(sc, "spark.origin.trustStore.password") - val sourceTrustStoreType = Util.getSparkPropOr(sc, "spark.origin.trustStore.type", "JKS") - val sourceKeyStorePath = Util.getSparkPropOrEmpty(sc, "spark.origin.keyStore.path") - val sourceKeyStorePassword = Util.getSparkPropOrEmpty(sc, "spark.origin.keyStore.password") - val sourceEnabledAlgorithms = Util.getSparkPropOrEmpty(sc, "spark.origin.enabledAlgorithms") - - val destinationScbPath = Util.getSparkPropOrEmpty(sc, "spark.target.scb") - val destinationHost = Util.getSparkPropOrEmpty(sc, "spark.target.host") - val destinationPort = Util.getSparkPropOr(sc, "spark.target.port", "9042") - val destinationUsername = Util.getSparkProp(sc, "spark.target.username") - val destinationPassword = Util.getSparkProp(sc, "spark.target.password") - val destinationSSLEnabled = Util.getSparkPropOr(sc, "spark.target.ssl.enabled", "false") - val destinationTrustStorePath = Util.getSparkPropOrEmpty(sc, "spark.target.trustStore.path") - val destinationTrustStorePassword = Util.getSparkPropOrEmpty(sc, "spark.target.trustStore.password") - val destinationTrustStoreType = Util.getSparkPropOr(sc, "spark.target.trustStore.type", "JKS") - val destinationKeyStorePath = Util.getSparkPropOrEmpty(sc, "spark.target.keyStore.path") - val destinationKeyStorePassword = Util.getSparkPropOrEmpty(sc, "spark.target.keyStore.password") - val destinationEnabledAlgorithms = Util.getSparkPropOrEmpty(sc, "spark.target.enabledAlgorithms") - - val minPartition = new BigInteger(Util.getSparkPropOr(sc, "spark.origin.minPartition", "-9223372036854775808")) - val maxPartition = new BigInteger(Util.getSparkPropOr(sc, "spark.origin.maxPartition", "9223372036854775807")) - val coveragePercent = Util.getSparkPropOr(sc, "spark.coveragePercent", "100") - val splitSizeBackwardCompatibility = Util.getSparkPropOr(sc, "spark.splitSize", "10000") - val numSplits = Integer.parseInt(Util.getSparkPropOr(sc, "spark.numSplits", splitSizeBackwardCompatibility)) - - protected def exitSpark() = { - spark.stop() - abstractLogger.info("################################################################################################") - abstractLogger.info("############################## Cassandra Data Migrator - Stopped ###############################") - abstractLogger.info("################################################################################################") - sys.exit(0) - } - -} +package datastax.astra.migrate + +import org.apache.spark.sql.SparkSession +import org.slf4j.LoggerFactory + +import java.math.BigInteger + +class BaseJob extends App { + + val abstractLogger = LoggerFactory.getLogger(this.getClass.getName) + val spark = SparkSession.builder + .appName("Cassandra Data Migrator") + .getOrCreate() + abstractLogger.info("################################################################################################") + 
abstractLogger.info("############################## Cassandra Data Migrator - Starting ##############################") + abstractLogger.info("################################################################################################") + + val sContext = spark.sparkContext + val sc = sContext.getConf + + val consistencyLevel = Util.getSparkPropOr(sc, "spark.read.consistency.level", "LOCAL_QUORUM") + + val sourceScbPath = Util.getSparkPropOrEmpty(sc, "spark.origin.scb") + val sourceHost = Util.getSparkPropOrEmpty(sc, "spark.origin.host") + val sourceUsername = Util.getSparkPropOrEmpty(sc, "spark.origin.username") + val sourcePassword = Util.getSparkPropOrEmpty(sc, "spark.origin.password") + val sourceTrustStorePath = Util.getSparkPropOrEmpty(sc, "spark.origin.trustStore.path") + val sourceTrustStorePassword = Util.getSparkPropOrEmpty(sc, "spark.origin.trustStore.password") + val sourceTrustStoreType = Util.getSparkPropOr(sc, "spark.origin.trustStore.type", "JKS") + val sourceKeyStorePath = Util.getSparkPropOrEmpty(sc, "spark.origin.keyStore.path") + val sourceKeyStorePassword = Util.getSparkPropOrEmpty(sc, "spark.origin.keyStore.password") + val sourceEnabledAlgorithms = Util.getSparkPropOrEmpty(sc, "spark.origin.enabledAlgorithms") + + val destinationScbPath = Util.getSparkPropOrEmpty(sc, "spark.target.scb") + val destinationHost = Util.getSparkPropOrEmpty(sc, "spark.target.host") + val destinationUsername = Util.getSparkProp(sc, "spark.target.username") + val destinationPassword = Util.getSparkProp(sc, "spark.target.password") + val destinationTrustStorePath = Util.getSparkPropOrEmpty(sc, "spark.target.trustStore.path") + val destinationTrustStorePassword = Util.getSparkPropOrEmpty(sc, "spark.target.trustStore.password") + val destinationTrustStoreType = Util.getSparkPropOr(sc, "spark.target.trustStore.type", "JKS") + val destinationKeyStorePath = Util.getSparkPropOrEmpty(sc, "spark.target.keyStore.path") + val destinationKeyStorePassword = Util.getSparkPropOrEmpty(sc, "spark.target.keyStore.password") + val destinationEnabledAlgorithms = Util.getSparkPropOrEmpty(sc, "spark.target.enabledAlgorithms") + + val minPartition = new BigInteger(Util.getSparkPropOr(sc, "spark.origin.minPartition", "-9223372036854775808")) + val maxPartition = new BigInteger(Util.getSparkPropOr(sc, "spark.origin.maxPartition", "9223372036854775807")) + val coveragePercent = Util.getSparkPropOr(sc, "spark.coveragePercent", "100") + val splitSize = Integer.parseInt(Util.getSparkPropOr(sc, "spark.splitSize", "10000")) + val rowFailureFileSizeLimit = Util.getSparkPropOr(sc, "spark.rowfailure.filesize.limit", "200000000").toLong + + val tokenRangeFile= Util.getSparkPropOr(sc, "spark.input.partitionFile", "./partitions.csv") + val failedRowsFile= Util.getSparkPropOr(sc, "spark.input.failedRowsFile", "./failedRows.csv") + protected def exitSpark() = { + spark.stop() + abstractLogger.info("################################################################################################") + abstractLogger.info("############################## Cassandra Data Migrator - Stopped ###############################") + abstractLogger.info("################################################################################################") + sys.exit(0) + } + +} diff --git a/src/main/scala/datastax/astra/migrate/DiffData.scala b/src/main/scala/datastax/astra/migrate/DiffData.scala index c99f8ee7..aa6e23b1 100644 --- a/src/main/scala/datastax/astra/migrate/DiffData.scala +++ b/src/main/scala/datastax/astra/migrate/DiffData.scala 
@@ -1,34 +1,33 @@ -package datastax.astra.migrate - -import com.datastax.spark.connector.cql.CassandraConnector -import org.slf4j.LoggerFactory - -import org.apache.spark.SparkConf -import scala.collection.JavaConversions._ - -object DiffData extends AbstractJob { - - val logger = LoggerFactory.getLogger(this.getClass.getName) - logger.info("Started Data Validation App") - - diffTable(sourceConnection, destinationConnection, sc) - - exitSpark - - private def diffTable(sourceConnection: CassandraConnector, destinationConnection: CassandraConnector, config: SparkConf) = { - val partitions = SplitPartitions.getRandomSubPartitions(numSplits, minPartition, maxPartition, Integer.parseInt(coveragePercent)) - logger.info("PARAM Calculated -- Total Partitions: " + partitions.size()) - val parts = sContext.parallelize(partitions.toSeq, partitions.size); - logger.info("Spark parallelize created : " + parts.count() + " parts!"); - - parts.foreach(part => { - sourceConnection.withSessionDo(sourceSession => - destinationConnection.withSessionDo(destinationSession => - DiffJobSession.getInstance(sourceSession, destinationSession, config) - .getDataAndDiff(part.getMin, part.getMax))) - }) - - DiffJobSession.getInstance(null, null, config).printCounts(true); - } - -} +package datastax.astra.migrate + +import com.datastax.spark.connector.cql.CassandraConnector +import org.slf4j.LoggerFactory + +import scala.collection.JavaConversions._ + +object DiffData extends AbstractJob { + + val logger = LoggerFactory.getLogger(this.getClass.getName) + logger.info("Started Data Validation App") + + diffTable(sourceConnection, destinationConnection) + + exitSpark + + private def diffTable(sourceConnection: CassandraConnector, destinationConnection: CassandraConnector) = { + val partitions = SplitPartitions.getRandomSubPartitions(splitSize, minPartition, maxPartition, Integer.parseInt(coveragePercent)) + logger.info("PARAM Calculated -- Total Partitions: " + partitions.size()) + val parts = sContext.parallelize(partitions.toSeq, partitions.size); + logger.info("Spark parallelize created : " + parts.count() + " parts!"); + + parts.foreach(part => { + sourceConnection.withSessionDo(sourceSession => + destinationConnection.withSessionDo(destinationSession => + DiffJobSession.getInstance(sourceSession, destinationSession, sc) + .getDataAndDiff(part.getMin, part.getMax))) + }) + + DiffJobSession.getInstance(null, null, sc).printCounts(true); + } + +} diff --git a/src/main/scala/datastax/astra/migrate/DiffDataFailedPartitionsFromFile.scala b/src/main/scala/datastax/astra/migrate/DiffDataFailedPartitionsFromFile.scala new file mode 100644 index 00000000..e4e7bfdb --- /dev/null +++ b/src/main/scala/datastax/astra/migrate/DiffDataFailedPartitionsFromFile.scala @@ -0,0 +1,36 @@ +package datastax.astra.migrate + +import com.datastax.spark.connector.cql.CassandraConnector +import org.slf4j.LoggerFactory + +import scala.collection.JavaConversions._ + +object DiffDataFailedPartitionsFromFile extends AbstractJob { + + val logger = LoggerFactory.getLogger(this.getClass.getName) + logger.info("Started MigratePartitionsFromFile App") + + migrateTable(sourceConnection, destinationConnection) + + exitSpark + + private def migrateTable(sourceConnection: CassandraConnector, destinationConnection: CassandraConnector) = { + val partitions = SplitPartitions.getFailedSubPartitionsFromFile(splitSize, tokenRangeFile) + logger.info("PARAM Calculated -- Total Partitions: " + partitions.size()) + val parts = sContext.parallelize(partitions.toSeq, 
partitions.size); + logger.info("Spark parallelize created : " + parts.count() + " parts!"); + + parts.foreach(part => { + sourceConnection.withSessionDo(sourceSession => + destinationConnection.withSessionDo(destinationSession => + DiffJobSession.getInstance(sourceSession, destinationSession, sc) + .getDataAndDiff(part.getMin, part.getMax))) + }) + + DiffJobSession.getInstance(null, null, sc).printCounts(true); + } + +} + + + diff --git a/src/main/scala/datastax/astra/migrate/DiffDataFailedRowsFromFile.scala b/src/main/scala/datastax/astra/migrate/DiffDataFailedRowsFromFile.scala new file mode 100644 index 00000000..7f71ad89 --- /dev/null +++ b/src/main/scala/datastax/astra/migrate/DiffDataFailedRowsFromFile.scala @@ -0,0 +1,26 @@ +package datastax.astra.migrate + +import com.datastax.spark.connector.cql.CassandraConnector +import org.slf4j.LoggerFactory + +object DiffDataFailedRowsFromFile extends AbstractJob { + + val logger = LoggerFactory.getLogger(this.getClass.getName) + logger.info("Started MigrateRowsFromFile App") + + migrateTable(sourceConnection, destinationConnection) + + exitSpark + + private def migrateTable(sourceConnection: CassandraConnector, destinationConnection: CassandraConnector) = { + val listOfPKRows = SplitPartitions.getFailedRowPartsFromFile(splitSize, rowFailureFileSizeLimit, failedRowsFile) + logger.info("PARAM Calculated -- Number of PKRows: " + listOfPKRows.size()) + + sourceConnection.withSessionDo(sourceSession => + destinationConnection.withSessionDo(destinationSession => + CopyPKJobSession.getInstance(sourceSession, destinationSession, sc) + .getRowAndDiff(listOfPKRows))) + + } + +} diff --git a/src/main/scala/datastax/astra/migrate/Migrate.scala b/src/main/scala/datastax/astra/migrate/Migrate.scala index 800db223..772e35b0 100644 --- a/src/main/scala/datastax/astra/migrate/Migrate.scala +++ b/src/main/scala/datastax/astra/migrate/Migrate.scala @@ -1,39 +1,38 @@ -package datastax.astra.migrate - -import com.datastax.spark.connector.cql.CassandraConnector -import org.slf4j.LoggerFactory -import org.apache.spark.SparkConf - -import scala.collection.JavaConversions._ - -// http://www.russellspitzer.com/2016/02/16/Multiple-Clusters-SparkSql-Cassandra/ - -object Migrate extends AbstractJob { - - val logger = LoggerFactory.getLogger(this.getClass.getName) - logger.info("Started Migration App") - - migrateTable(sourceConnection, destinationConnection, sc) - - exitSpark - - private def migrateTable(sourceConnection: CassandraConnector, destinationConnection: CassandraConnector, config: SparkConf) = { - val partitions = SplitPartitions.getRandomSubPartitions(numSplits, minPartition, maxPartition, Integer.parseInt(coveragePercent)) - logger.info("PARAM Calculated -- Total Partitions: " + partitions.size()) - val parts = sContext.parallelize(partitions.toSeq, partitions.size); - logger.info("Spark parallelize created : " + parts.count() + " parts!"); - - parts.foreach(part => { - sourceConnection.withSessionDo(sourceSession => - destinationConnection.withSessionDo(destinationSession => - CopyJobSession.getInstance(sourceSession, destinationSession, config) - .getDataAndInsert(part.getMin, part.getMax))) - }) - - CopyJobSession.getInstance(null, null, sc).printCounts(true); - } - -} - - - +package datastax.astra.migrate + +import com.datastax.spark.connector.cql.CassandraConnector +import org.slf4j.LoggerFactory + +import scala.collection.JavaConversions._ + +// http://www.russellspitzer.com/2016/02/16/Multiple-Clusters-SparkSql-Cassandra/ + +object Migrate extends 
AbstractJob { + + val logger = LoggerFactory.getLogger(this.getClass.getName) + logger.info("Started Migration App") + + migrateTable(sourceConnection, destinationConnection) + + exitSpark + + private def migrateTable(sourceConnection: CassandraConnector, destinationConnection: CassandraConnector) = { + val partitions = SplitPartitions.getRandomSubPartitions(splitSize, minPartition, maxPartition, Integer.parseInt(coveragePercent)) + logger.info("PARAM Calculated -- Total Partitions: " + partitions.size()) + val parts = sContext.parallelize(partitions.toSeq, partitions.size); + logger.info("Spark parallelize created : " + parts.count() + " parts!"); + + parts.foreach(part => { + sourceConnection.withSessionDo(sourceSession => + destinationConnection.withSessionDo(destinationSession => + CopyJobSession.getInstance(sourceSession, destinationSession, sc) + .getDataAndInsert(part.getMin, part.getMax))) + }) + + CopyJobSession.getInstance(null, null, sc).printCounts(true); + } + +} + + + diff --git a/src/main/scala/datastax/astra/migrate/MigratePartitionsFromFile.scala b/src/main/scala/datastax/astra/migrate/MigratePartitionsFromFile.scala index 97e1c325..1cc7315b 100644 --- a/src/main/scala/datastax/astra/migrate/MigratePartitionsFromFile.scala +++ b/src/main/scala/datastax/astra/migrate/MigratePartitionsFromFile.scala @@ -1,36 +1,36 @@ -package datastax.astra.migrate - -import com.datastax.spark.connector.cql.CassandraConnector -import org.slf4j.LoggerFactory - -import scala.collection.JavaConversions._ - -object MigratePartitionsFromFile extends AbstractJob { - - val logger = LoggerFactory.getLogger(this.getClass.getName) - logger.info("Started MigratePartitionsFromFile App") - - migrateTable(sourceConnection, destinationConnection) - - exitSpark - - private def migrateTable(sourceConnection: CassandraConnector, destinationConnection: CassandraConnector) = { - val partitions = SplitPartitions.getSubPartitionsFromFile(numSplits) - logger.info("PARAM Calculated -- Total Partitions: " + partitions.size()) - val parts = sContext.parallelize(partitions.toSeq, partitions.size); - logger.info("Spark parallelize created : " + parts.count() + " parts!"); - - parts.foreach(part => { - sourceConnection.withSessionDo(sourceSession => - destinationConnection.withSessionDo(destinationSession => - CopyJobSession.getInstance(sourceSession, destinationSession, sc) - .getDataAndInsert(part.getMin, part.getMax))) - }) - - CopyJobSession.getInstance(null, null, sc).printCounts(true); - } - -} - - - +package datastax.astra.migrate + +import com.datastax.spark.connector.cql.CassandraConnector +import org.slf4j.LoggerFactory + +import scala.collection.JavaConversions._ + +object MigratePartitionsFromFile extends AbstractJob { + + val logger = LoggerFactory.getLogger(this.getClass.getName) + logger.info("Started MigratePartitionsFromFile App") + + migrateTable(sourceConnection, destinationConnection) + + exitSpark + + private def migrateTable(sourceConnection: CassandraConnector, destinationConnection: CassandraConnector) = { + val partitions = SplitPartitions.getSubPartitionsFromFile(splitSize, tokenRangeFile) + logger.info("PARAM Calculated -- Total Partitions: " + partitions.size()) + val parts = sContext.parallelize(partitions.toSeq, partitions.size); + logger.info("Spark parallelize created : " + parts.count() + " parts!"); + + parts.foreach(part => { + sourceConnection.withSessionDo(sourceSession => + destinationConnection.withSessionDo(destinationSession => + CopyJobSession.getInstance(sourceSession, 
destinationSession, sc) + .getDataAndInsert(part.getMin, part.getMax))) + }) + + CopyJobSession.getInstance(null, null, sc).printCounts(true); + } + +} + + + diff --git a/src/main/scala/datastax/astra/migrate/MigrateRowsFromFile.scala b/src/main/scala/datastax/astra/migrate/MigrateRowsFromFile.scala index 07dda966..757138f6 100644 --- a/src/main/scala/datastax/astra/migrate/MigrateRowsFromFile.scala +++ b/src/main/scala/datastax/astra/migrate/MigrateRowsFromFile.scala @@ -1,25 +1,25 @@ -package datastax.astra.migrate - -import com.datastax.spark.connector.cql.CassandraConnector -import org.slf4j.LoggerFactory - -object MigrateRowsFromFile extends AbstractJob { - - val logger = LoggerFactory.getLogger(this.getClass.getName) - logger.info("Started MigrateRowsFromFile App") - - migrateTable(sourceConnection, destinationConnection) - - exitSpark - - private def migrateTable(sourceConnection: CassandraConnector, destinationConnection: CassandraConnector) = { - val listOfPKRows = SplitPartitions.getRowPartsFromFile(numSplits) - logger.info("PARAM Calculated -- Number of PKRows: " + listOfPKRows.size()) - - sourceConnection.withSessionDo(sourceSession => - destinationConnection.withSessionDo(destinationSession => - CopyPKJobSession.getInstance(sourceSession, destinationSession, sc) - .getRowAndInsert(listOfPKRows))) - } - -} +package datastax.astra.migrate + +import com.datastax.spark.connector.cql.CassandraConnector +import org.slf4j.LoggerFactory + +object MigrateRowsFromFile extends AbstractJob { + + val logger = LoggerFactory.getLogger(this.getClass.getName) + logger.info("Started MigrateRowsFromFile App") + + migrateTable(sourceConnection, destinationConnection) + + exitSpark + + private def migrateTable(sourceConnection: CassandraConnector, destinationConnection: CassandraConnector) = { + val listOfPKRows = SplitPartitions.getRowPartsFromFile(splitSize, failedRowsFile) + logger.info("PARAM Calculated -- Number of PKRows: " + listOfPKRows.size()) + + sourceConnection.withSessionDo(sourceSession => + destinationConnection.withSessionDo(destinationSession => + CopyPKJobSession.getInstance(sourceSession, destinationSession, sc) + .getRowAndInsert(listOfPKRows))) + } + +} diff --git a/src/main/scala/datastax/astra/migrate/OriginData.scala b/src/main/scala/datastax/astra/migrate/OriginData.scala index 28081b2c..a718e5da 100644 --- a/src/main/scala/datastax/astra/migrate/OriginData.scala +++ b/src/main/scala/datastax/astra/migrate/OriginData.scala @@ -1,80 +1,80 @@ -package datastax.astra.migrate - -import com.datastax.spark.connector.cql.CassandraConnector -import org.slf4j.LoggerFactory - -import scala.collection.JavaConversions._ - -object OriginData extends BaseJob { - - val logger = LoggerFactory.getLogger(this.getClass.getName) - logger.info("Started Migration App") - var sourceConnection = getConnection(true, sourceScbPath, sourceHost, sourceUsername, sourcePassword, - sourceTrustStorePath, sourceTrustStorePassword, sourceTrustStoreType, sourceKeyStorePath, sourceKeyStorePassword, sourceEnabledAlgorithms); - analyzeSourceTable(sourceConnection) - exitSpark - - - private def getConnection(isSource: Boolean, scbPath: String, host: String, username: String, password: String, - trustStorePath: String, trustStorePassword: String, trustStoreType: String, - keyStorePath: String, keyStorePassword: String, enabledAlgorithms: String): CassandraConnector = { - var connType: String = "Source" - - if (scbPath.nonEmpty) { - abstractLogger.info(connType + ": Connected to Astra!"); - - return 
CassandraConnector(sc - .set("spark.cassandra.auth.username", username) - .set("spark.cassandra.auth.password", password) - .set("spark.cassandra.input.consistency.level", consistencyLevel) - .set("spark.cassandra.connection.config.cloud.path", scbPath)) - } else if (trustStorePath.nonEmpty) { - abstractLogger.info(connType + ": Connected to Cassandra (or DSE) with SSL!"); - - // Use defaults when not provided - var enabledAlgorithmsVar = enabledAlgorithms - if (enabledAlgorithms == null || enabledAlgorithms.trim.isEmpty) { - enabledAlgorithmsVar = "TLS_RSA_WITH_AES_128_CBC_SHA, TLS_RSA_WITH_AES_256_CBC_SHA" - } - - return CassandraConnector(sc - .set("spark.cassandra.auth.username", username) - .set("spark.cassandra.auth.password", password) - .set("spark.cassandra.input.consistency.level", consistencyLevel) - .set("spark.cassandra.connection.host", host) - .set("spark.cassandra.connection.ssl.enabled", "true") - .set("spark.cassandra.connection.ssl.enabledAlgorithms", enabledAlgorithmsVar) - .set("spark.cassandra.connection.ssl.trustStore.password", trustStorePassword) - .set("spark.cassandra.connection.ssl.trustStore.path", trustStorePath) - .set("spark.cassandra.connection.ssl.keyStore.password", keyStorePassword) - .set("spark.cassandra.connection.ssl.keyStore.path", keyStorePath) - .set("spark.cassandra.connection.ssl.trustStore.type", trustStoreType) - .set("spark.cassandra.connection.ssl.clientAuth.enabled", "true") - ) - } else { - abstractLogger.info(connType + ": Connected to Cassandra (or DSE)!"); - - return CassandraConnector(sc.set("spark.cassandra.auth.username", username) - .set("spark.cassandra.auth.password", password) - .set("spark.cassandra.input.consistency.level", consistencyLevel) - .set("spark.cassandra.connection.host", host)) - } - - } - - private def analyzeSourceTable(sourceConnection: CassandraConnector) = { - val partitions = SplitPartitions.getRandomSubPartitions(numSplits, minPartition, maxPartition, Integer.parseInt(coveragePercent)) - logger.info("PARAM Calculated -- Total Partitions: " + partitions.size()) - val parts = sContext.parallelize(partitions.toSeq, partitions.size); - logger.info("Spark parallelize created : " + parts.count() + " parts!"); - - parts.foreach(part => { - sourceConnection.withSessionDo(sourceSession => - OriginCountJobSession.getInstance(sourceSession, sc) - .getData(part.getMin, part.getMax)) - }) - - } - -} - +package datastax.astra.migrate + +import com.datastax.spark.connector.cql.CassandraConnector +import org.slf4j.LoggerFactory + +import scala.collection.JavaConversions._ + +object OriginData extends BaseJob { + + val logger = LoggerFactory.getLogger(this.getClass.getName) + logger.info("Started Migration App") + var sourceConnection = getConnection(true, sourceScbPath, sourceHost, sourceUsername, sourcePassword, + sourceTrustStorePath, sourceTrustStorePassword, sourceTrustStoreType, sourceKeyStorePath, sourceKeyStorePassword, sourceEnabledAlgorithms); + analyzeSourceTable(sourceConnection) + exitSpark + + + private def getConnection(isSource: Boolean, scbPath: String, host: String, username: String, password: String, + trustStorePath: String, trustStorePassword: String, trustStoreType: String, + keyStorePath: String, keyStorePassword: String, enabledAlgorithms: String): CassandraConnector = { + var connType: String = "Source" + + if (scbPath.nonEmpty) { + abstractLogger.info(connType + ": Connected to Astra!"); + + return CassandraConnector(sc + .set("spark.cassandra.auth.username", username) + 
.set("spark.cassandra.auth.password", password) + .set("spark.cassandra.input.consistency.level", consistencyLevel) + .set("spark.cassandra.connection.config.cloud.path", scbPath)) + } else if (trustStorePath.nonEmpty) { + abstractLogger.info(connType + ": Connected to Cassandra (or DSE) with SSL!"); + + // Use defaults when not provided + var enabledAlgorithmsVar = enabledAlgorithms + if (enabledAlgorithms == null || enabledAlgorithms.trim.isEmpty) { + enabledAlgorithmsVar = "TLS_RSA_WITH_AES_128_CBC_SHA, TLS_RSA_WITH_AES_256_CBC_SHA" + } + + return CassandraConnector(sc + .set("spark.cassandra.auth.username", username) + .set("spark.cassandra.auth.password", password) + .set("spark.cassandra.input.consistency.level", consistencyLevel) + .set("spark.cassandra.connection.host", host) + .set("spark.cassandra.connection.ssl.enabled", "true") + .set("spark.cassandra.connection.ssl.enabledAlgorithms", enabledAlgorithmsVar) + .set("spark.cassandra.connection.ssl.trustStore.password", trustStorePassword) + .set("spark.cassandra.connection.ssl.trustStore.path", trustStorePath) + .set("spark.cassandra.connection.ssl.keyStore.password", keyStorePassword) + .set("spark.cassandra.connection.ssl.keyStore.path", keyStorePath) + .set("spark.cassandra.connection.ssl.trustStore.type", trustStoreType) + .set("spark.cassandra.connection.ssl.clientAuth.enabled", "true") + ) + } else { + abstractLogger.info(connType + ": Connected to Cassandra (or DSE)!"); + + return CassandraConnector(sc.set("spark.cassandra.auth.username", username) + .set("spark.cassandra.auth.password", password) + .set("spark.cassandra.input.consistency.level", consistencyLevel) + .set("spark.cassandra.connection.host", host)) + } + + } + + private def analyzeSourceTable(sourceConnection: CassandraConnector) = { + val partitions = SplitPartitions.getRandomSubPartitions(splitSize, minPartition, maxPartition, Integer.parseInt(coveragePercent)) + logger.info("PARAM Calculated -- Total Partitions: " + partitions.size()) + val parts = sContext.parallelize(partitions.toSeq, partitions.size); + logger.info("Spark parallelize created : " + parts.count() + " parts!"); + + parts.foreach(part => { + sourceConnection.withSessionDo(sourceSession => + OriginCountJobSession.getInstance(sourceSession, sc) + .getData(part.getMin, part.getMax)) + }) + + } + +} + diff --git a/src/resources/autoSparkConfig.py b/src/resources/autoSparkConfig.py new file mode 100644 index 00000000..65e93bbb --- /dev/null +++ b/src/resources/autoSparkConfig.py @@ -0,0 +1,431 @@ +#!/usr/bin/env python3 + +#pip install xlsxwriter +#pip install pandas + +# tool imports +import os.path +from os import path +import sys +import re +import math + +# Astra Spark Migration Data Model Extrator +version = "1.0.0" + +# Retrieve default values +cfg_base_array = { + 'beta' : 'false', + 'source_host': 'localhost', + 'source_read_consistancy_level' : 'LOCAL_QUORUM', + 'astra_read_consistancy_level' : 'LOCAL_QUORUM', + 'max_retries' : '10', + 'read_rate_limit' : '40000', + 'write_rate_limit' : '40000', + 'split_size' : '10000', + 'batch_size' : '5', + 'print_stats_after' : '100000', + 'counter_table' : 'false' +} + +field_type_array = { + 'ascii': '0', + 'text': '0', + 'varchar': '0', + 'int': '1', + 'varint': '1', # confirm + 'smallint': '1', + 'bigint': '2', + 'counter': '2', + 'double': '3', + 'time': '4', + 'timestamp':'4', + 'map': '5', + 'list': '6', + 'blob': '7', + 'set': '8', + 'uuid': '9', + 'timeuuid': '9', + 'boolean': '10', + 'tuple': '11', + 'float': '12', + 'tinyint': '13', + 
'decimal': '14', + 'date': '1', + 'inet': '0' + +} +system_keyspace = ['OpsCenter','dse_insights_local','solr_admin','test','dse_system','dse_analytics','system_auth','system_traces','system','dse_system_local','system_distributed','system_schema','dse_perf','dse_insights','dse_security','dse_system','killrvideo','dse_leases','dsefs_c4z','HiveMetaStore','dse_analytics','dsefs','spark_system'] +tp_tbl_data = { + 'Materialized Views':{}, + 'Secondary Indexes':{}, + 'Storage-Attached Indexes':{} +} + +def field_type_comment(tbl,fieldName,fieldType): + if fieldType=='date': + print('Alert for '+tbl+'.'+fieldName+'\n\tValues of the date type are encoded as 32-bit unsigned integers representing a number of days with “the epoch” at the center of the range (2^31). Epoch is January 1st, 1970\n\tFor timestamps, a date can be input either as an integer or using a date string. In the later case, the format should be yyyy-mm-dd (so 2011-02-03 for instance).\n\tAdditional customization required for this table.\n') + elif fieldType=='decimal': + print('Alert for '+tbl+'.'+fieldName+'\n\tValues of the decimal type require specific variable-precision at the field level./n/tAdditional customization required for this table.\n') + elif fieldType=='double': + print('Alert for '+tbl+'.'+fieldName+'\n\tValues of the double type require specific variable-precision at the field level./n/tAdditional customization required for this table.\n') + elif fieldType=='float': + print('Alert for '+tbl+'.'+fieldName+'\n\tValues of the float type require additonal work at the field level./n/tAdditional customization required for this table.\n') + elif fieldType=='time' or fieldType=='timestamp': + print('Alert for '+tbl+'.'+fieldName+'\n\tValues of the time type are encoded as 64-bit signed integers representing the number of nanoseconds since midnight.\n\tFor timestamps, a time can be input either as an integer or using a string representing the time. In the later case, the format should be hh:mm:ss[.fffffffff] (where the sub-second precision is optional and if provided, can be less than the nanosecond). 
So for instance, the following are valid inputs for a time:\n\t\t08:12:54\n\t\t08:12:54.123\n\t\t08:12:54.123456\n\t\t08:12:54.123456789\n\tAdditional customization required for this table.\n') + + +# 0: String [ascii, text, varchar] +# 1: Integer [int, smallint] +# 2: Long [bigint] +# 3: Double [double] +# 4: Instant [time, timestamp] +# 5: Map (separate type by %) [map] - Example: 5%1%0 for map +# 6: List (separate type by %) [list] - Example: 6%0 for list +# 7: ByteBuffer [blob] +# 8: Set (seperate type by %) [set] - Example: 8%0 for set +# 9: UUID [uuid, timeuuid] +#10: Boolean [boolean] +#11: TupleValue [tuple] +#12: Float (float) +#13: TinyInt [tinyint] +#14: BigDecimal (decimal) + +# initialize script variables +field_types = field_type_array.keys() +migrate_ks = '' +show_help = '' +target_field_type = [] +migrate_tbl_data = {} +target_table = [] +dm_path = '' +cf_file = 'sparkConfigDefaults.txt' +template_file = 'sparkConfTemplate.txt' +fieldData = {} +cfg_array = {} +schema_name = 'schema' + +def add_tp_tbl(gr,ks,tbl,src_ks,src_tbl): + if src_ks not in system_keyspace: + try: + type(tp_tbl_data[gr][src_ks]) + except: + tp_tbl_data[gr][src_ks]={} + try: + type(tp_tbl_data[gr][src_ks][src_tbl]) + except: + tp_tbl_data[gr][src_ks][src_tbl] = [] + if (ks+'.'+tbl) not in tp_tbl_data[gr][src_ks][src_tbl]: + tp_tbl_data[gr][src_ks][src_tbl].append(ks+'.'+tbl) + +def process_field(tbl,fieldName,fieldType,cql=''): + if 'map<' in fieldType: + mapData = fieldType.split('<')[1].split('>')[0].split(',') + fieldValue = field_type_array['map'] + for mapType in mapData: + fieldValue += '%' + field_type_array[mapType.strip()] + field_type_comment(tbl,fieldName,mapType.strip()) + elif 'set<' in fieldType: + mapData = fieldType.split('<')[1].split('>')[0].split(',') + fieldValue = field_type_array['set'] + for mapType in mapData: + fieldValue += '%' + field_type_array[mapType.strip()] + field_type_comment(tbl,fieldName,mapType.strip()) + elif 'list<' in fieldType: + mapData = fieldType.split('<')[1].split('>')[0].split(',') + fieldValue = field_type_array['list'] + for mapType in mapData: + fieldValue += '%' + field_type_array[mapType.strip()] + field_type_comment(tbl,fieldName,mapType.strip()) + elif fieldType in field_types: + fieldValue = field_type_array[fieldType] + else: + exit('Error: unknown field type: ' + fieldType +'\n'+cql) + cfg_array['fields'] += fieldName + cfg_array['field_types'] += fieldValue + + if fieldType == 'counter': + cfg_array['counter_table'] = 'true' + # more work here + elif fieldType == 'map' or fieldType == 'list' or fieldType == 'set': + exit('Error: Build fieldType: ' + fieldType) + field_type_comment(tbl,fieldName,fieldType) + +# communicate command line help +for argnum,arg in enumerate(sys.argv): + if(arg=='-h' or arg =='--help'): + help_content = \ + '\n'\ + 'Script for creating migration support files\n'\ + 'usage: autoSparkConfig.py -k KEYSPACE [-p PATH_MIGRATION_SUPPORT_FILES] [-c] [-t TABLE1] [-t TABLE2]\n'\ + 'optional arguments:\n'\ + '-v, --version Version\n'\ + '-h, --help This help info\n'\ + '-p, --path Path to data model file\n'\ + '-s, --schema Name of schema file - Default schema\n'\ + '-k, --keyspace *Required: keyspace\n'\ + '-c, --counter Generates files for tables with counters\n'\ + '-t, --table Generates file for table(s)\n'\ + '\n'\ + 'Configuration Elements\n'\ + '-cf, --config Configuration File - Default: migrateConfig.txt\n'\ + ' *Required elements in Config File\n'\ + ' source_host\n'\ + ' source_username\n'\ + ' source_password\n'\ + ' 
astra_scb\n'\ + ' astra_username\n'\ + ' astra_password\n'\ + ' Optional Elements in Config File\n'\ + ' beta Default: '+cfg_base_array['beta']+'\n'\ + ' source_read_consistancy_level Default: '+cfg_base_array['source_read_consistancy_level']+'\n'\ + ' astra_read_consistancy_level Default: '+cfg_base_array['astra_read_consistancy_level']+'\n'\ + ' max_retries Default: '+cfg_base_array['max_retries']+'\n'\ + ' read_rate_limit Default: '+cfg_base_array['read_rate_limit']+'\n'\ + ' write_rate_Limit Default: '+cfg_base_array['write_rate_limit']+'\n'\ + ' split_size Default: '+cfg_base_array['split_size']+'\n'\ + ' batch_size Default: '+cfg_base_array['batch_size']+'\n'\ + ' print_stats_after Default: '+cfg_base_array['print_stats_after']+'\n'\ + '\n' + + exit(help_content) + elif(arg=='-v' or arg =='--version'): + exit("Version " + version) + elif(arg=='-s' or arg =='--schema'): + schema_name = sys.argv[argnum+1] + elif(arg=='-k' or arg =='--keyspace'): + migrate_ks = sys.argv[argnum+1] + elif(arg=='-c' or arg =='--counter'): + target_field_type.append('counter') + elif(arg=='-p' or arg =='--path'): + dm_path = sys.argv[argnum+1] + elif(arg=='-t' or arg =='--table'): + target_table.append(sys.argv[argnum+1]) + elif(arg=='-cf' or arg =='--config'): + cf_file = sys.argv[argnum+1] + +if (migrate_ks==''): + exit("keyspace required") + +info_box = 'Astra Spark Migration Data Model Extrator\n'\ + 'Version '+version+'\n'\ + 'Supported data in separate spreadsheet tabs'\ + +# initialize database vaariables +is_index = 0 +ks_array = [] +count = 0 +row={} +end_row={} + +# collect and analyze schema +ks = '' +tbl = '' +tbl_data = {} + +if path.isfile(dm_path + schema_name): + schemaFile = open(dm_path + schema_name, 'r') +for line in schemaFile: + line = line.strip('\n').strip() + if (line==''): tbl='' + elif("CREATE KEYSPACE" in line): + prev_ks = ks + ks = line.split()[2].strip('"') + tbl_data[ks] = {'cql':line,'table':{}} + tbl='' + elif ks != '' and ks==migrate_ks: + if('CREATE INDEX' in line): + prev_tbl = tbl + tbl = line.split()[2].strip('"') + tbl_data[ks]['table'][tbl] = {'type':'Index', 'cql':line} + src_ks = line.split('ON')[1].split('.')[0].strip().strip('"') + src_tbl = line.split('ON')[1].split('.')[1].split()[0].strip() + add_tp_tbl('Secondary Indexes',ks,tbl,src_ks,src_tbl) + tbl='' + elif('CREATE CUSTOM INDEX' in line): + prev_tbl = tbl + tbl = line.split()[3].strip('"') + tbl_data[ks]['table'][tbl] = {'type':'Storage-Attached Index', 'cql':line} + src_ks = line.split('ON')[1].split('.')[0].strip().strip('"') + src_tbl = line.split('ON')[1].split('.')[1].split()[0].strip() + add_tp_tbl('Storage-Attached Indexes',ks,tbl,src_ks,src_tbl) + tbl='' + elif('CREATE TYPE' in line): + prev_tbl = tbl + tbl_line = line.split()[2].strip('"') + tbl = tbl_line.split('.')[1].strip().strip('"') + tbl_data[ks]['table'][tbl] = {'type':'Type', 'cql':line} + tbl_data[ks]['table'][tbl]['field'] = {} + elif('CREATE AGGREGATE' in line): + prev_tbl = tbl + if 'IF NOT EXISTS' in line: + tbl = line.split()[5].strip('"') + else: + tbl = line.split()[2].strip('"') + tbl_data[ks]['table'][tbl] = {'type':'UDA', 'cql':line} + tbl_data[ks]['table'][tbl]['field'] = {} + try: + warnings['Astra Guardrails']['User-Defined Aggregate'].append = 'UDA '+tbl+' in '+ks + except: + warnings['Astra Guardrails']['User-Defined Aggregate'] = ['UDA '+tbl+' in '+ks] + elif('CREATE OR REPLACE FUNCTION' in line): + prev_tbl = tbl + tbl = line.split()[4].strip('"') + tbl_data[ks]['table'][tbl] = {'type':'UDF', 'cql':line} + 
tbl_data[ks]['table'][tbl]['field'] = {} + try: + warnings['Astra Guardrails']['User-Defined Function'].append = 'UDF '+tbl+' in '+ks + except: + warnings['Astra Guardrails']['User-Defined Function'] = ['UDF '+tbl+' in '+ks] + elif 'CREATE FUNCTION' in line: + prev_tbl = tbl + tbl = line.split()[2].strip('"') + tbl_data[ks]['table'][tbl] = {'type':'UDF', 'cql':line} + tbl_data[ks]['table'][tbl]['field'] = {} + try: + warnings['Astra Guardrails']['User-Defined Function'].append = 'UDF '+tbl+' in '+ks + except: + warnings['Astra Guardrails']['User-Defined Function'] = ['UDF '+tbl+' in '+ks] + elif('CREATE TABLE' in line): + prev_tbl = tbl + tbl_line = line.split()[2].strip('"') + tbl = tbl_line.split('.')[1].strip().strip('"') + tbl_data[ks]['table'][tbl] = {'type':'Table', 'cql':line} + tbl_data[ks]['table'][tbl]['field'] = {} + elif('CREATE MATERIALIZED VIEW' in line ): + prev_tbl = tbl + tbl_line = line.split()[3].strip('"') + tbl = tbl_line.split('.')[1].strip().strip('"') + tbl_data[ks]['table'][tbl] = {'type':'Materialized Views', 'cql':line} + tbl_data[ks]['table'][tbl]['field'] = {} + if (tbl !=''): + if('FROM' in line and tbl_data[ks][tbl]['type']=='Materialized Views'): + src_ks = line.split('.')[0].split()[1].strip('"') + src_tbl = line.split('.')[1].strip('"') + add_tp_tbl('Materialized Views',ks,tbl,src_ks,src_tbl) + elif('PRIMARY KEY' in line): + if(line.count('(') == 1): + tbl_data[ks]['table'][tbl]['pk'] = [line.split('(')[1].split(')')[0].split(', ')[0]] + tbl_data[ks]['table'][tbl]['cc'] = line.split('(')[1].split(')')[0].split(', ') + del tbl_data[ks]['table'][tbl]['cc'][0] + elif(line.count('(') == 2): + tbl_data[ks]['table'][tbl]['pk'] = line.split('(')[2].split(')')[0].split(', ') + tbl_data[ks]['table'][tbl]['cc'] = line.split('(')[2].split(')')[1].lstrip(', ').split(', ') + elif(line.split()[2]=='PRIMARY'): + fld_name = line.split()[0] + fld_type = line.split()[1].strip(',') + tbl_data[ks]['table'][tbl]['field'][fld_name]=fld_type + tbl_data[ks]['table'][tbl]['pk'] = [fld_name] + tbl_data[ks]['table'][tbl]['cc'] = [] + tbl_data[ks]['table'][tbl]['cql'] += ' ' + line.strip() + elif line.strip() != ');': + try: + tbl_data[ks]['table'][tbl]['cql'] += ' ' + line + if('AND ' not in line and ' WITH ' not in line): + fld_name = line.split()[0] + fld_type = line.replace(fld_name + ' ','').strip(',') + if (fld_name!='CREATE'): + tbl_data[ks]['table'][tbl]['field'][fld_name]=fld_type + except: + print(('Error1:' + ks + '.' 
+ tbl + ' - ' + line)) + +# Add tables to be migrated +for ks, ksData in list(tbl_data.items()): + if (migrate_ks == '' or migrate_ks == ks): + for tbl, tblData in list(ksData['table'].items()): + if tblData['type'] == 'Table': + if len(target_table)>0: + if (tbl in target_table): + migrate_tbl_data[tbl]=tbl_data[ks]['table'][tbl] + if len(target_field_type)>0: + for field, fieldType in list(tblData['field'].items()): + if fieldType in target_field_type: + try: + type(migrate_tbl_data[tbl]) + except: + migrate_tbl_data[tbl]=tbl_data[ks]['table'][tbl] + +#exit(migrate_tbl_data) + + +for tbl,tblData in list(migrate_tbl_data.items()): + + # Retrieve config file values + if path.isfile(dm_path + cf_file): + defaultsFile = open(dm_path + cf_file, 'r') + + # reset config data + cfg_array = cfg_base_array + for line in defaultsFile: + line = line.strip('\n').strip() + if len(line.split()) > 1: + cfg_array[line.split()[0]] = line.split()[1] + + defaultsFile.close() + config_file_data = '' + fieldData = {} + + # add table elements + cfg_array['keyspace_table'] = migrate_ks + "." + tbl + + # add field elements + # primary key(s) + if len(tblData['pk'])>1: cfg_array['fields'] = '(' + else: cfg_array['fields'] = '' + cfg_array['field_types'] = '' + first_field = 1 + for key in tblData['pk']: + if first_field == 1: + first_field = 0 + cfg_array['partition_keys'] = key + else: + cfg_array['fields'] += ',' + cfg_array['field_types'] += ',' + cfg_array['partition_keys'] += ','+key + process_field(tbl,key,tblData['field'][key],tblData['cql']) + if len(tblData['pk'])>1: cfg_array['fields'] += '),(' + cfg_array['fields'] += ',' + if len(tblData['cc'])>1: cfg_array['fields'] += '(' + # clustering column(s) + first_field = 1 + for key in tblData['cc']: + if first_field == 1: + first_field = 0 + cfg_array['field_types'] += ',' + else: + cfg_array['fields'] += ',' + cfg_array['field_types'] += ',' + process_field(tbl,key,tblData['field'][key],tblData['cql']) + if len(tblData['cc'])>1 : cfg_array['fields'] += ')' + + # non-primary field(s) + for fieldName,fieldType in list(tblData['field'].items()): + if fieldName not in tblData['pk'] and fieldName not in tblData['cc']: + cfg_array['fields'] += ',' + cfg_array['field_types'] += ',' + process_field(tbl,fieldName,fieldType,tblData['cql']) + + # Retrieve migration config template + if path.isfile(dm_path + template_file): + templateFile = open(dm_path + template_file, 'r') + + # create table spark migration config file + for line in templateFile: + newline = line + if '<<' in line and '>>' in line: + cfg_param = (line.split('<<'))[1].split('>>')[0] + if cfg_param in cfg_array.keys(): + newline = line.replace('<<'+cfg_param+'>>',cfg_array[cfg_param]) + + config_file_data += newline + + templateFile.close() + + # add sql reference + config_file_data += '\n/* CQL Reference:\n' + tblData['cql'] + '\n/*' + + # Create Spark Migration Table Config File + cfgFile = open(dm_path+migrate_ks+'_'+tbl+'_SparkConfig.properties', 'w') + cfgFile.write(config_file_data) + print('Migration Config File Created: '+dm_path+migrate_ks+'_'+tbl+'_SparkConfig.properties') + cfgFile.close() diff --git a/src/resources/migrate_data.sh b/src/resources/migrate_data.sh index f30c71a3..1c64379d 100644 --- a/src/resources/migrate_data.sh +++ b/src/resources/migrate_data.sh @@ -1,64 +1,64 @@ -#! 
/bin/bash - -########################################################################################################################### -# -# This script can be used to Migrate data between two Cassandra Clusters (including Astra) in chunks. It migrates data -# sequentially in progressive token-range slices. It also helps to restart migration from a point where the previous -# run might have stopped/failed for whatever reasons. -# -# Before running the script, update the below params -# SPARK_SUBMIT - Path to the spark-submit command -# PROPS_FILE - Path to the spark configuration for the table -# S_IDX - Change this value only if you want to set a custom starting point (e.g. after a previous incomplete run) -# -# *** IMP Note: Run this script using nohup in background using a logfile and tail the logfile to monitor progress *** -# e.g. nohup ./migrate_data.sh > logs/spark/migrate_data.out & -# -# To monitor migration progress, you could use the below command -# grep "Running Migrate for Partition Range" logs/spark/migrate_data.out -# -########################################################################################################################### - -# Path to spark-submit -SPARK_SUBMIT=/home/ubuntu/spark-3.3.1-bin-hadoop3/bin/spark-submit - -# Path to spark configuration for the table -PROPS_FILE=/home/ubuntu/sparkConf.properties - -# Starting partition token (Default is Min possible value of a Cassandra token - min long value in Java). -# Change this value only if you want to start from a custom partition token (e.g. when a migrate job failed midway) -S_IDX=-9223372036854775808 - -# ** DO NOT CHANGE ANYTHING BELOW THIS ** -SLICE=999999999999999999 - -echo "Starting Migration using $PROPS_FILE !!" - -# Migrate initial partition tokens from min-long to -9000000000000000000 -if [ $S_IDX -lt -9000000000000000000 ] -then - E_IDX=-9000000000000000001 - echo "Running Migrate for Partition Range $S_IDX to $E_IDX .." - $SPARK_SUBMIT --properties-file $PROPS_FILE --master "local[*]" --conf spark.origin.minPartition=$S_IDX --conf spark.origin.maxPartition=$E_IDX --class datastax.astra.migrate.Migrate cassandra-data-migrator-*.jar - S_IDX=-9000000000000000000 -fi - -# Migrate partition tokens from -9000000000000000000 to 8999999999999999999 in slices of 1000000000000000000 -while [ $S_IDX -lt 9000000000000000000 ] -do - if [ $S_IDX -gt 8223372036854775807 ] - then - E_IDX=8999999999999999999 - else - E_IDX=$(( $S_IDX + $SLICE )) - fi - echo "Running Migrate for Partition Range $S_IDX to $E_IDX .." - $SPARK_SUBMIT --properties-file $PROPS_FILE --master "local[*]" --conf spark.origin.minPartition=$S_IDX --conf spark.origin.maxPartition=$E_IDX --class datastax.astra.migrate.Migrate cassandra-data-migrator-*.jar - S_IDX=$(( $E_IDX + 1 )) -done - -# Migrate final partition tokens from 9000000000000000000 to max-long -E_IDX=9223372036854775807 -echo "Running Migrate for Partition Range $S_IDX to 9223372036854775807 .." -$SPARK_SUBMIT --properties-file $PROPS_FILE --master "local[*]" --conf spark.origin.minPartition=$S_IDX --conf spark.origin.maxPartition=$E_IDX --class datastax.astra.migrate.Migrate cassandra-data-migrator-*.jar -echo "Completed Migration using $PROPS_FILE !!" +#! /bin/bash + +########################################################################################################################### +# +# This script can be used to Migrate data between two Cassandra Clusters (including Astra) in chunks. It migrates data +# sequentially in progressive token-range slices. 
It also helps to restart migration from a point where the previous +# run might have stopped/failed for whatever reasons. +# +# Before running the script, update the below params +# SPARK_SUBMIT - Path to the spark-submit command +# PROPS_FILE - Path to the spark configuration for the table +# S_IDX - Change this value only if you want to set a custom starting point (e.g. after a previous incomplete run) +# +# *** IMP Note: Run this script using nohup in background using a logfile and tail the logfile to monitor progress *** +# e.g. nohup ./migrate_data.sh > logs/spark/migrate_data.out & +# +# To monitor migration progress, you could use the below command +# grep "Running Migrate for Partition Range" logs/spark/migrate_data.out +# +########################################################################################################################### + +# Path to spark-submit +SPARK_SUBMIT=/home/ubuntu/spark-2.4.8-bin-hadoop2.6/bin/spark-submit + +# Path to spark configuration for the table +PROPS_FILE=/home/ubuntu/sparkConf.properties + +# Starting partition token (Default is Min possible value of a Cassandra token - min long value in Java). +# Change this value only if you want to start from a custom partition token (e.g. when a migrate job failed midway) +S_IDX=-9223372036854775808 + +# ** DO NOT CHANGE ANYTHING BELOW THIS ** +SLICE=999999999999999999 + +echo "Starting Migration using $PROPS_FILE !!" + +# Migrate initial partition tokens from min-long to -9000000000000000000 +if [ $S_IDX -lt -9000000000000000000 ] +then + E_IDX=-9000000000000000001 + echo "Running Migrate for Partition Range $S_IDX to $E_IDX .." + $SPARK_SUBMIT --properties-file $PROPS_FILE --master "local[*]" --conf spark.origin.minPartition=$S_IDX --conf spark.origin.maxPartition=$E_IDX --class datastax.astra.migrate.Migrate cassandra-data-migrator-*.jar + S_IDX=-9000000000000000000 +fi + +# Migrate partition tokens from -9000000000000000000 to 8999999999999999999 in slices of 1000000000000000000 +while [ $S_IDX -lt 9000000000000000000 ] +do + if [ $S_IDX -gt 8223372036854775807 ] + then + E_IDX=8999999999999999999 + else + E_IDX=$(( $S_IDX + $SLICE )) + fi + echo "Running Migrate for Partition Range $S_IDX to $E_IDX .." + $SPARK_SUBMIT --properties-file $PROPS_FILE --master "local[*]" --conf spark.origin.minPartition=$S_IDX --conf spark.origin.maxPartition=$E_IDX --class datastax.astra.migrate.Migrate cassandra-data-migrator-*.jar + S_IDX=$(( $E_IDX + 1 )) +done + +# Migrate final partition tokens from 9000000000000000000 to max-long +E_IDX=9223372036854775807 +echo "Running Migrate for Partition Range $S_IDX to 9223372036854775807 .." +$SPARK_SUBMIT --properties-file $PROPS_FILE --master "local[*]" --conf spark.origin.minPartition=$S_IDX --conf spark.origin.maxPartition=$E_IDX --class datastax.astra.migrate.Migrate cassandra-data-migrator-*.jar +echo "Completed Migration using $PROPS_FILE !!" 
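Aside on the slicing above (not part of the diff): migrate_data.sh walks the full Murmur3 token space, from -2^63 to 2^63-1, in slices of roughly 10^18 tokens and can resume from any S_IDX. The following is a minimal Python sketch, not part of the repo, that reproduces the same boundary arithmetic so the ranges can be inspected or sanity-checked; all constants are copied from the script.

# Minimal sketch (not part of migrate_data.sh): reproduces the script's
# token-range slicing in Python so the boundaries can be inspected or tested.
# All constants below are copied from the shell script above.

MIN_TOKEN = -9223372036854775808   # Murmur3 minimum token (Java Long.MIN_VALUE)
MAX_TOKEN = 9223372036854775807    # Murmur3 maximum token (Java Long.MAX_VALUE)
SLICE = 999999999999999999         # same SLICE value as the script

def token_slices(s_idx=MIN_TOKEN):
    """Yield (minPartition, maxPartition) pairs in the order the script runs them."""
    # Initial range: min-long up to -9000000000000000001
    if s_idx < -9000000000000000000:
        yield (s_idx, -9000000000000000001)
        s_idx = -9000000000000000000
    # Middle ranges: -9000000000000000000 .. 8999999999999999999 in SLICE-sized steps
    while s_idx < 9000000000000000000:
        e_idx = 8999999999999999999 if s_idx > 8223372036854775807 else s_idx + SLICE
        yield (s_idx, e_idx)
        s_idx = e_idx + 1
    # Final range: 9000000000000000000 .. max-long
    yield (s_idx, MAX_TOKEN)

if __name__ == "__main__":
    ranges = list(token_slices())
    # Sanity checks: the ranges start at min-long, end at max-long, and are contiguous.
    assert ranges[0][0] == MIN_TOKEN and ranges[-1][1] == MAX_TOKEN
    assert all(b[0] == a[1] + 1 for a, b in zip(ranges, ranges[1:]))
    print(len(ranges), "ranges; first:", ranges[0], "last:", ranges[-1])

Resuming an interrupted run corresponds to calling token_slices(S_IDX) with the last partition token the script logged, which mirrors how S_IDX is meant to be overridden before restarting the script.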
diff --git a/src/resources/partitions.csv b/src/resources/partitions.csv index 95701f5f..a3c706ec 100644 --- a/src/resources/partitions.csv +++ b/src/resources/partitions.csv @@ -1,7 +1,7 @@ -# This is a sample input file for job: MigratePartitionsFromFile -# list of primary-key fields separated by ' %% ' --6220480106867985210,-6120480106867985210 --4782224463879838018,-4682224463879838018 --507900353496146534,-407285462027022883 -2637884402540451982,4638499294009575633 -798869613692279889,8699484505161403540 +# This is a sample input file for job: MigratePartitionsFromFile +# list of primary-key fields separated by ' %% ' +-6220480106867985210,-6120480106867985210 +-4782224463879838018,-4682224463879838018 +-507900353496146534,-407285462027022883 +2637884402540451982,4638499294009575633 +798869613692279889,8699484505161403540 diff --git a/src/resources/primary_key_rows.csv b/src/resources/primary_key_rows.csv index 20ce1ab7..5aed7f57 100644 --- a/src/resources/primary_key_rows.csv +++ b/src/resources/primary_key_rows.csv @@ -1,4 +1,4 @@ -# This is a sample input file for job: MigrateRowsFromFile -# list of primary-key fields separated by ' %% ' --1000154815969456717 %% 0 %% 10 %% 1024 --1000154815969456717 %% 0 %% 10 %% 1025 +# This is a sample input file for job: MigrateRowsFromFile +# list of primary-key fields separated by ' %% ' +-1000154815969456717 %% 0 %% 10 %% 1024 +-1000154815969456717 %% 0 %% 10 %% 1025 diff --git a/src/resources/runCommands.txt b/src/resources/runCommands.txt index 9dc36c66..e57aafb2 100644 --- a/src/resources/runCommands.txt +++ b/src/resources/runCommands.txt @@ -1,14 +1,14 @@ -// Download dependencies -curl -OL https://downloads.datastax.com/dsbulk/dsbulk.tar.gz -curl -OL https://downloads.datastax.com/enterprise/cqlsh-astra.tar.gz -wget https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz - -// Migrate -spark-submit --properties-file //sparkConf.properties --verbose --master "local[*]" --class datastax.astra.migrate.Migrate //cassandra-data-migrator-3.*.jar -spark-submit --properties-file //sparkConf.properties --master "local[*]" --driver-memory 25G --executor-memory 25G --class datastax.astra.migrate.Migrate //cassandra-data-migrator-3.*.jar &> table_out.log - -// Random Partitioner Run Command -spark-submit --properties-file //sparkConf.properties --verbose --master "local[*]" --conf spark.origin.minPartition=-1 --conf spark.origin.maxPartition=170141183460469231731687303715884105728 --class datastax.astra.migrate.Migrate //cassandra-data-migrator-3.*.jar - -// Validate -spark-submit --properties-file //sparkConf.properties --master "local[*]" --driver-memory 25G --executor-memory 25G --class datastax.astra.migrate.DiffData //cassandra-data-migrator-3.*.jar &> table_out.log +// Download dependencies +curl -OL https://downloads.datastax.com/dsbulk/dsbulk.tar.gz +curl -OL https://downloads.datastax.com/enterprise/cqlsh-astra.tar.gz +wget https://archive.apache.org/dist/spark/spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz + +// Migrate +spark-submit --properties-file //sparkConf.properties --verbose --master "local[8]" --class datastax.astra.migrate.Migrate //cassandra-data-migrator-2.*.jar +spark-submit --properties-file //sparkConf.properties --master "local[8]" --driver-memory 25G --executor-memory 25G --class datastax.astra.migrate.Migrate //cassandra-data-migrator-2.*.jar &> table_out.log + +// Random Partitioner Run Command +spark-submit --properties-file //sparkConf.properties --verbose --master "local[8]" --conf 
spark.origin.minPartition=-1 --conf spark.origin.maxPartition=170141183460469231731687303715884105728 --class datastax.astra.migrate.Migrate //cassandra-data-migrator-2.*.jar + +// Validate +spark-submit --properties-file //sparkConf.properties --master "local[8]" --driver-memory 25G --executor-memory 25G --class datastax.astra.migrate.DiffData //cassandra-data-migrator-2.*.jar &> table_out.log diff --git a/src/resources/sparkConf.properties b/src/resources/sparkConf.properties index 71c37640..1dac9741 100644 --- a/src/resources/sparkConf.properties +++ b/src/resources/sparkConf.properties @@ -1,142 +1,145 @@ -# Origin cluster credentials (use "host + port" OR "secure-connect-bundle" but not both) -spark.origin.host localhost -spark.origin.port 9042 -#spark.origin.scb file:///aaa/bbb/secure-connect-enterprise.zip -spark.origin.username some-username -spark.origin.password some-secret-password -spark.origin.keyspaceTable test.a1 - -# Target cluster credentials (use "host + port" OR "secure-connect-bundle" but not both) -#spark.target.host localhost -#spark.target.port 9042 -spark.target.scb file:///aaa/bbb/secure-connect-enterprise.zip -spark.target.username client-id -spark.target.password client-secret -spark.target.keyspaceTable test.a2 - -# Add 'missing' rows (during 'Validation') in 'Target' from 'Origin'. N/A for 'Migration' -spark.target.autocorrect.missing false -# Update 'mismatched' rows (during 'Validation') in 'Target' to match 'Origin'. N/A for 'Migration' -spark.target.autocorrect.mismatch false - -# Read & Write rate-limits(rows/second). Higher value will improve performance and put more load on cluster -spark.readRateLimit 20000 -spark.writeRateLimit 20000 - -# Used to split Cassandra token-range into slices and migrate random slices one at a time -# 10K splits usually works for tables up to 100GB (uncompressed) with balanced token distribution -# For larger tables, test on 1% volume (using param coveragePercent) and increase the number-of-splits as needed -spark.numSplits 10000 - -# Use a value of 1 (disable batching) when primary-key and partition-key are same -# For tables with high avg count of rows/partition, use higher value to improve performance -spark.batchSize 10 - -# Below 'query' properties are set based on table schema -spark.query.origin comma-separated-partition-key,comma-separated-clustering-key,comma-separated-other-columns -spark.query.origin.partitionKey comma-separated-partition-key -spark.query.target.id comma-separated-partition-key,comma-separated-clustering-key -# Comma separated numeric data-type mapping (e.g. 
'text' will map to '0') for all columns listed in "spark.query.origin" -spark.query.types 9,1,4,3 -############################################################################################################# -# Following are the supported data types and their corresponding [Cassandra data-types] mapping -# 0: ascii, text, varchar -# 1: int -# 2: bigint, counter -# 3: double -# 4: timestamp -# 5: map (separate type by %) - Example: 5%1%0 for map -# 6: list (separate type by %) - Example: 6%0 for list -# 7: blob -# 8: set (separate type by %) - Example: 8%0 for set -# 9: uuid, timeuuid -# 10: boolean -# 11: tuple -# 12: float -# 13: tinyint -# 14: decimal -# 15: date -# 16: UDT [any user-defined-type created using 'CREATE TYPE'] -# 17: varint -# 18: time -# 19: smallint -# Note: Ignore "Frozen" while mapping Collections (Map/List/Set) - Example: 5%1%0 for frozen> -############################################################################################################# - -# ENABLE ONLY IF COLUMN NAMES ON TARGET IS DIFFERENT FROM ORIGIN (SCHEMA & DATA-TYPES MUST BE SAME) -#spark.query.target comma-separated-partition-key,comma-separated-clustering-key,comma-separated-other-columns - -# The tool adds TTL & Writetime at row-level (not field-level). -# The largest TTL & Writetime values are used if multiple indexes are listed (comma separated) -# Comma separated column indexes from "spark.query.origin" used to find largest TTL or Writetime -spark.query.ttl.cols 2,3 -spark.query.writetime.cols 2,3 - -# ENABLE ONLY IF YOU WANT TO MIGRATE/VALIDATE ROWS BASED ON CQL FILTER -#spark.query.condition - -# ENABLE ONLY IF IT IS A COUNTER TABLE -#spark.counterTable false -#spark.counterTable.cql -#spark.counterTable.cql.index 0 - -# ENABLE ONLY IF YOU WANT TO FILTER BASED ON WRITE-TIME (values must be in microseconds) -#spark.origin.writeTimeStampFilter false -#spark.origin.minWriteTimeStampFilter 0 -#spark.origin.maxWriteTimeStampFilter 4102444800000000 - -# ENABLE ONLY IF retries needed (Retry a slice of token-range if an exception occurs) -#spark.maxRetries 0 - -# ENABLE ONLY IF YOU WANT TO MIGRATE/VALIDATE SOME % OF ROWS (NOT 100%) -#spark.coveragePercent 100 - -# ENABLE ONLY IF WANT LOG STATS MORE OR LESS FREQUENTLY THAN DEFAULT -#spark.printStatsAfter 100000 - -# ENABLE ONLY IF YOU WANT TO USE READ AND/OR WRITE CONSISTENCY OTHER THAN LOCAL_QUORUM -#spark.consistency.read LOCAL_QUORUM -#spark.consistency.write LOCAL_QUORUM - -# ENABLE ONLY IF YOU WANT TO REDUCE FETCH-SIZE TO AVOID FrameTooLongException -#spark.read.fetch.sizeInRows 1000 - -# ENABLE ONLY IF YOU WANT TO USE CUSTOM FIXED WRITETIME VALUE ON TARGET -#spark.target.custom.writeTime 0 - -# ENABLE ONLY TO SKIP recs greater than 10MB from Origin (to avoid Astra Guardrail error) -#spark.fieldGuardraillimitMB 10 - -# ENABLE ONLY TO count of recs greater than 10MB from Origin needed -#spark.origin.checkTableforColSize false -#spark.origin.checkTableforColSize.cols partition-key,clustering-key -#spark.origin.checkTableforColSize.cols.types 9,1 - -# ENABLE ONLY TO filter data from Origin -#spark.origin.FilterData false -#spark.origin.FilterColumn test -#spark.origin.FilterColumnIndex 2 -#spark.origin.FilterColumnType 6%16 -#spark.origin.FilterColumnValue test - -# ONLY USE if SSL is enabled on origin Cassandra/DSE (e.g. 
Azure Cosmos Cassandra DB) -#spark.origin.ssl.enabled true - -# ONLY USE if SSL clientAuth is enabled on origin Cassandra/DSE -#spark.origin.trustStore.path -#spark.origin.trustStore.password -#spark.origin.trustStore.type JKS -#spark.origin.keyStore.path -#spark.origin.keyStore.password -#spark.origin.enabledAlgorithms TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA - -# ONLY USE if SSL is enabled on target Cassandra/DSE -#spark.target.ssl.enabled true - -# ONLY USE if SSL clientAuth is enabled on target Cassandra/DSE -#spark.target.trustStore.path -#spark.target.trustStore.password -#spark.target.trustStore.type JKS -#spark.target.keyStore.path -#spark.target.keyStore.password -#spark.target.enabledAlgorithms TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA +spark.origin.isAstra false +#spark.origin.host +#spark.origin.username +#spark.origin.password +#spark.origin.keyspaceTable + +spark.target.isAstra true +#spark.target.scb +#spark.target.username +#spark.target.password +#spark.target.keyspaceTable +spark.target.autocorrect.missing true +spark.target.autocorrect.mismatch true +#spark.files +#spark.tokenRange.exceptionDir +#spark.row.exceptionDir +#Max file size allowed in bytes. 200000000 = 200MB +spark.rowfailure.filesize.limit 200000000 + +spark.maxRetries 10 +spark.maxRetries.rowFailure 2 +spark.readRateLimit 200000 +spark.writeRateLimit 200000 +spark.batchSize 5 + +#spark.query.origin db_name, tenant_id, tenant_uuid +#spark.query.origin.partitionKey db_name, tenant_id +#spark.query.target db_name, tenant_id, tenant_uuid +#spark.query.target.id db_name, tenant_id +#spark.query.types 0,0,9 +#spark.query.ttl.cols 2 +#spark.query.writetime.cols 2 + +spark.target.default.ttl.enable false +#spark.target.default.ttl 7776000 +spark.target.default.writetime.enable false +#spark.target.default.writetime 1640998861000 + +spark.counterTable false +spark.counterTable.cql +spark.counterTable.cql.index 0 + +spark.splitSize 60000 + +#spark.origin.writeTimeStampFilter false +#spark.origin.minWriteTimeStampFilter 0 +#spark.origin.maxWriteTimeStampFilter 9223372036854775807 + +spark.coveragePercent 100 + +spark.origin.read.consistency.level LOCAL_QUORUM +spark.target.read.consistency.level LOCAL_QUORUM + +##### ENABLE ONLY IF COLUMN NAMES ON TARGET IS DIFFERENT FROM ORIGIN (SCHEMA & DATA-TYPES MUST BE SAME) ##### +#spark.query.target partition-key,clustering-key,order-date,amount + +################# ENABLE ONLY IF YOU WANT TO MIGRATE/VALIDATE SOME DATA BASED ON CQL FILTER ################# +#spark.query.condition + +################# ENABLE ONLY IF YOU WANT TO MIGRATE/VALIDATE SOME % (NOT 100%) DATA ###################### +#spark.coveragePercent 10 + +#################### ENABLE ONLY IF WANT LOG STATS MORE OR LESS FREQUENTLY THAN DEFAULT ##################### +#spark.printStatsAfter 100000 + +################################# ENABLE ONLY IF IT IS A COUNTER TABLE ###################################### +#spark.counterTable false +#spark.counterTable.cql +#spark.counterTable.cql.index 0 + +######## ENABLE ONLY IF YOU WANT TO FILTER BASED ON WRITE-TIME (values must be in microseconds) ############# +#spark.origin.writeTimeStampFilter false +#spark.origin.minWriteTimeStampFilter 0 +#spark.origin.maxWriteTimeStampFilter 4102444800000000 + +######## ENABLE ONLY IF YOU WANT TO USE READ AND/OR WRITE CONSISTENCY OTHER THAN LOCAL_QUORUM ############## +#spark.consistency.read LOCAL_QUORUM +#spark.consistency.write LOCAL_QUORUM + +############# ENABLE ONLY IF YOU WANT TO REDUCE FETCH-SIZE 
TO AVOID FrameTooLongException ################## +#spark.read.fetch.sizeInRows 1000 + +############### ENABLE ONLY IF YOU WANT TO USE CUSTOM FIXED WRITETIME VALUE ON TARGET ###################### +#spark.target.custom.writeTime 0 + +#################### ONLY USE if SKIPPING recs greater than 10MB from Origin needed ######################### +#spark.fieldGuardraillimitMB 10 + +#################### ONLY USE if count of recs greater than 10MB from Origin needed ######################### +#spark.origin.checkTableforColSize false +#spark.origin.checkTableforColSize.cols partition-key,clustering-key +#spark.origin.checkTableforColSize.cols.types 9,1 + +############################ ONLY USE if needing to filter data from Origin ################################# +#spark.origin.FilterData false +#spark.origin.FilterColumn test +#spark.origin.FilterColumnIndex 2 +#spark.origin.FilterColumnType 6%16 +#spark.origin.FilterColumnValue test + +########################## ONLY USE if SSL clientAuth is enabled on origin Cassandra/DSE #################### +#spark.origin.trustStore.path +#spark.origin.trustStore.password +#spark.origin.trustStore.type JKS +#spark.origin.keyStore.path +#spark.origin.keyStore.password +#spark.origin.enabledAlgorithms TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA + +####################### ONLY USE if SSL clientAuth is enabled on target Cassandra/DSE ####################### +#spark.target.trustStore.path +#spark.target.trustStore.password +#spark.target.trustStore.type JKS +#spark.target.keyStore.path +#spark.target.keyStore.password +#spark.target.enabledAlgorithms TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA + +############################################################################################################# +# Following are the supported data types and their corresponding [Cassandra data-types] +# 0: ascii, text, varchar +# 1: int, smallint +# 2: bigint, counter +# 3: double +# 4: time, timestamp +# 5: map (separate type by %) - Example: 5%1%0 for map +# 6: list (separate type by %) - Example: 6%0 for list +# 7: blob +# 8: set (separate type by %) - Example: 8%0 for set +# 9: uuid, timeuuid +# 10: boolean +# 11: tuple +# 12: float +# 13: tinyint +# 14: decimal +# 15: date +# 16: UDT [any user-defined-type created using 'CREATE TYPE'] +# 17: varint + +# Note: Ignore "Frozen" while mapping Collections (Map/List/Set) - Example: 5%1%0 for frozen> +# +# "spark.query.ttl.cols" - Comma separated column indexes from "spark.query.origin" used to find largest TTL. +# "spark.query.writetime.cols" - Comma separated column indexes from "spark.query.origin" used to find largest writetime. +# Note: The tool migrates TTL & Writetimes at row-level and not field-level. +# Migration will use the largest TTL & Writetimes value per row. +# +############################################################################################################# \ No newline at end of file
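Aside on the type legend above (not part of the diff): spark.query.types is a comma-separated list of these numeric codes, one per column listed in spark.query.origin, with collection element types joined by '%' (e.g. 5%1%0 for a map of int to text). The sketch below illustrates that translation in Python; the code table is copied from the legend, while the example columns and helper name are made up for illustration and are not part of the tool.

# Illustrative sketch only (not part of cassandra-data-migrator): derives the
# "spark.query.types" string from CQL column types using the legend above.
CQL_TYPE_CODES = {
    "ascii": "0", "text": "0", "varchar": "0",
    "int": "1", "smallint": "1",
    "bigint": "2", "counter": "2",
    "double": "3",
    "time": "4", "timestamp": "4",
    "blob": "7",
    "uuid": "9", "timeuuid": "9",
    "boolean": "10",
    "float": "12", "tinyint": "13", "decimal": "14",
    "date": "15", "varint": "17",
    # 5/6/8 (map/list/set) are composed below; 11 (tuple) and 16 (UDT) depend on
    # the declared type and are left out of this simple sketch.
}
COLLECTION_CODES = {"map": "5", "list": "6", "set": "8"}

def type_code(cql_type: str) -> str:
    """Translate one CQL type into its numeric code; 'frozen<...>' wrappers are ignored."""
    t = cql_type.strip().lower()
    if t.startswith("frozen<") and t.endswith(">"):
        t = t[len("frozen<"):-1]
    if "<" in t:  # collection such as map<int,text> (simple element types only)
        outer, inner = t.split("<", 1)
        parts = [CQL_TYPE_CODES[p.strip()] for p in inner.rstrip(">").split(",")]
        return "%".join([COLLECTION_CODES[outer.strip()]] + parts)
    return CQL_TYPE_CODES[t]

# Hypothetical columns, in the same order they would appear in spark.query.origin
columns = {"order_id": "uuid", "region": "text", "amount": "double",
           "tags": "set<text>", "updated_at": "timestamp"}
print(",".join(type_code(t) for t in columns.values()))   # -> 9,0,3,8%0,4

The printed string is what would be placed in spark.query.types for that hypothetical column list; the corresponding spark.query.ttl.cols and spark.query.writetime.cols entries are zero-based indexes into the same list, per the notes at the end of the properties file.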