From 572abb6d809702e78bc25befa01a65f067fc100b Mon Sep 17 00:00:00 2001 From: beajohnson Date: Mon, 15 Apr 2024 11:17:58 -0700 Subject: [PATCH 01/32] new pages for cdm --- modules/ROOT/pages/cdm-parameters.adoc | 468 +++++++++++++++++++++++++ modules/ROOT/pages/cdm-steps.adoc | 119 +++++++ 2 files changed, 587 insertions(+) create mode 100644 modules/ROOT/pages/cdm-parameters.adoc create mode 100644 modules/ROOT/pages/cdm-steps.adoc diff --git a/modules/ROOT/pages/cdm-parameters.adoc b/modules/ROOT/pages/cdm-parameters.adoc new file mode 100644 index 00000000..830f7ae2 --- /dev/null +++ b/modules/ROOT/pages/cdm-parameters.adoc @@ -0,0 +1,468 @@ += {cstar-data-migrator} parameters + +Each parameter below offers a different connection. Review each option to determine what is best for your organization. + +[[cdm-origin-schema-params]] +=== Origin schema parameters + +[cols="3,1,5a"] +|=== +|Property | Default | Notes + +| `spark.cdm.schema.origin.keyspaceTable` +| +| Required - the `.` of the table to be migrated. +Table must exist in Origin. + +| `spark.cdm.schema.origin.column.ttl.automatic` +| `true` +| Default is `true`, unless `spark.cdm.schema.origin.column.ttl.names` is specified. +When `true`, the Time To Live (TTL) of the Target record will be determined by finding the maximum TTL of all Origin columns that can have TTL set (which excludes partition key, clustering key, collections/UDT/tuple, and frozen columns). +When `false`, and `spark.cdm.schema.origin.column.ttl.names` is not set, the Target record will have the TTL determined by the Target table configuration. + +| `spark.cdm.schema.origin.column.ttl.names` +| +| Default is empty, meaning the names will be determined automatically if `spark.cdm.schema.origin.column.ttl.automatic` is set. +Specify a subset of eligible columns that are used to calculate the TTL of the Target record. + +| `spark.cdm.schema.origin.column.writetime.automatic` +| `true` +| Default is `true`, unless `spark.cdm.schema.origin.column.writetime.names` is specified. +When `true`, the `WRITETIME` of the Target record will be determined by finding the maximum `WRITETIME` of all Origin columns that can have `WRITETIME` set (which excludes partition key, clustering key, collections/UDT/tuple, and frozen columns). +When `false`, and `spark.cdm.schema.origin.column.writetime.names` is not set, the Target record will have the `WRITETIME` determined by the Target table configuration. +[NOTE] +==== +The `spark.cdm.transform.custom.writetime` property, if set, would override `spark.cdm.schema.origin.column.writetime`. +==== + +| `spark.cdm.schema.origin.column.writetime.names` +| +| Default is empty, meaning the names will be determined automatically if `spark.cdm.schema.origin.column.writetime.automatic` is set. +Otherwise, specify a subset of eligible columns that are used to calculate the WRITETIME of the Target record. +Example: `data_col1,data_col2,...` + +| `spark.cdm.schema.origin.column.names.to.target` +| +| Default is empty. +If column names are changed between Origin and Target, then this mapped list provides a mechanism to associate the two. +The format is `:`. +The list is comma-separated. +You only need to list renamed columns. + +|=== + +[NOTE] +==== +For optimization reasons, {cstar-data-migrator} does not migrate TTL and writetime at the field-level. +Instead, {cstar-data-migrator} finds the field with the highest TTL, and the field with the highest writetime within an Origin table row, and uses those values on the entire Target table row. 
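+If you need to control which columns feed those row-level maximums, the `spark.cdm.schema.origin.column.ttl.names` and `spark.cdm.schema.origin.column.writetime.names` parameters described above can name an explicit subset of eligible columns.
+A minimal, hypothetical `cdm*.properties` fragment might look like the following; the column names are placeholders only:
+
+[source,properties]
+----
+spark.cdm.schema.origin.column.ttl.names        data_col1,data_col2
+spark.cdm.schema.origin.column.writetime.names  data_col1,data_col2
+----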
+==== + +[[cdm-target-schema-params]] +=== Target schema parameter + +[cols="3,1,2"] +|=== +|Property | Default | Notes + +| `spark.cdm.schema.target.keyspaceTable` +| Equals the value of `spark.cdm.schema.origin.keyspaceTable` +| This parameter is commented out. +It's the `.` of the table to be migrated into the Target. +Table must exist in Target. + +|=== + + +[[cdm-auto-correction-params]] +=== Auto-correction parameters + +Auto-correction parameters allow {cstar-data-migrator} to correct data differences found between Origin and Target when you run the `DiffData` program. +Typically, these are run disabled (for "what if" migration testing), which will generate a list of data discrepancies. +The reasons for these discrepancies can then be investigated, and if necessary the parameters below can be enabled. + +For information about invoking `DiffData` in a {cstar-data-migrator} command, see xref:#cdm-validation-steps[{cstar-data-migrator} steps in validation mode] in this topic. + +[cols="2,2,3a"] +|=== +|Property | Default | Notes + +| `spark.cdm.autocorrect.missing` +| `false` +| When `true`, data that is missing in Target but is found in Origin will be re-migrated to Target. + +| `spark.cdm.autocorrect.mismatch` +| `false` +| When `true`, data that is different between Origin and Target will be reconciled. +[NOTE] +==== +The `TIMESTAMP` of records may have an effect. +If the `WRITETIME` of the Origin record (determined with `.writetime.names`) is earlier than the `WRITETIME` of the Target record, the change will not appear in Target. +This comparative state may be particularly challenging to troubleshoot if individual columns (cells) have been modified in Target. +==== + +| `spark.cdm.autocorrect.missing.counter` +| `false` +| Commented out. +By default, Counter tables are not copied when missing, unless explicitly set. + +| `spark.tokenrange.partitionFile` +| `./._partitions.csv` +| Commented out. +This CSV file is used as input, as well as output when applicable. +If the file exists, only the partition ranges in this file will be migrated or validated. +Similarly, if exceptions occur while migrating or validating, partition ranges with exceptions will be logged to this file. + +|=== + + +[[cdm-performance-operations-params]] +=== Performance and operations parameters + +Performance and operations parameters that can affect migration throughput, error handling, and similar concerns. + +[cols="4,1,3"] +|=== +|Property | Default | Notes + +| `spark.cdm.perfops.numParts` +| `10000` +| In standard operation, the full token range (-2^63 .. 2^63-1) is divided into a number of parts, which will be parallel-processed. +You should aim for each part to comprise a total of ≈1-10GB of data to migrate. +During initial testing, you may want this to be a small number (such as `1`). + +| `spark.cdm.perfops.batchSize` +| `5` +| When writing to Target, this comprises the number of records that will be put into an `UNLOGGED` batch. +{cstar-data-migrator} will tend to work on the same partition at a time. +Thus if your partition sizes are larger, this number may be increased. +If the `spark.cdm.perfops.batchSize` would mean that more than 1 partition is often contained in a batch, reduce this parameter's value. +Ideally < 1% of batches have more than 1 partition. + +| `spark.cdm.perfops.ratelimit.origin` +| `20000` +| Concurrent number of operations across all parallel threads from Origin. 
+This value may be adjusted up (or down), depending on the amount of data and the processing capacity of the Origin cluster. + +| `spark.cdm.perfops.ratelimit.target` +| `40000` +| Concurrent number of operations across all parallel threads from Target. +This may be adjusted up (or down), depending on the amount of data and the processing capacity of the Target cluster. + +| `spark.cdm.perfops.consistency.read` +| `LOCAL_QUORUM` +| Commented out. +Read consistency from Origin, and also from Target when records are read for comparison purposes. +The consistency parameters may be one of: `ANY`, `ONE`, `TWO`, `THREE`, `QUORUM`, `LOCAL_ONE`, `EACH_QUORUM`, `LOCAL_QUORUM`, `SERIAL`, `LOCAL_SERIAL`, `ALL`. + +| `spark.cdm.perfops.consistency.write` +| `LOCAL_QUORUM` +| Commented out. +Write consistency to Target. +The consistency parameters may be one of: `ANY`, `ONE`, `TWO`, `THREE`, `QUORUM`, `LOCAL_ONE`, `EACH_QUORUM`, `LOCAL_QUORUM`, `SERIAL`, `LOCAL_SERIAL`, `ALL`. + +| `spark.cdm.perfops.printStatsAfter` +| `100000` +| Commented out. +Number of rows of processing after which a progress log entry will be made. + +| `spark.cdm.perfops.fetchSizeInRows` +| `1000` +| Commented out. +This parameter affects the frequency of reads from Origin, and also the frequency of flushes to Target. + +| `spark.cdm.perfops.errorLimit` +| `0` +| Commented out. +Controls how many errors a thread may encounter during `MigrateData` and `DiffData` operations before failing. +Recommendation: set this parameter to a non-zero value **only when not doing** a mutation-type operation, such as when you're running `DiffData` without `.autocorrect`. + +|=== + + +[[cdm-transformation-params]] +=== Transformation parameters + +Parameters to perform schema transformations between Origin and Target. + +By default, these parameters are commented out. + +[cols="2,1,4a"] +|=== +|Property | Default | Notes + +| `spark.cdm.transform.missing.key.ts.replace.value` +| `1685577600000` +| Timestamp value in milliseconds. +Partition and clustering columns cannot have null values, but if these are added as part of a schema transformation between Origin and Target, it is possible that the Origin side is null. +In this case, the `Migrate` data operation would fail. +This parameter allows a crude constant value to be used in its place, separate from the Constant values feature. + +| `spark.cdm.transform.custom.writetime` +| `0` +| Default is 0 (disabled). +Timestamp value in microseconds to use as the `WRITETIME` for the Target record. +This is useful when the `WRITETIME` of the record in Origin cannot be determined (such as when the only non-key columns are collections). +This parameter allows a crude constant value to be used in its place, and overrides `spark.cdm.schema.origin.column.writetime.names`. + +| `spark.cdm.transform.custom.writetime.incrementBy` +| `0` +| Default is `0`. +This is useful when you have a List that is not frozen, and you are updating this via the autocorrect feature. +Lists are not idempotent, and subsequent UPSERTs would add duplicates to the list. + +| `spark.cdm.transform.codecs` +| +| Default is empty. +A comma-separated list of additional codecs to enable. + + * `INT_STRING` : int stored in a String. + * `DOUBLE_STRING` : double stored in a String. + * `BIGINT_STRING` : bigint stored in a String. + * `DECIMAL_STRING` : decimal stored in a String. + * `TIMESTAMP_STRING_MILLIS` : timestamp stored in a String, as Epoch milliseconds. 
+ * `TIMESTAMP_STRING_FORMAT` : timestamp stored in a String, with a custom format. + +[NOTE] +==== +Where there are multiple type pair options, such as with `TIMESTAMP_STRING_*`, only one can be configured at a time with the `spark.cdm.transform.codecs` parameter. +==== + +| `spark.cdm.transform.codecs.timestamp.string.format` +| `yyyyMMddHHmmss` +| Configuration for `CQL_TIMESTAMP_TO_STRING_FORMAT` codec. +Default format is `yyyyMMddHHmmss`; `DateTimeFormatter.ofPattern(formatString)` + + +| `spark.cdm.transform.codecs.timestamp.string.zone` +| `UTC` +| Default is `UTC`. +Must be in `ZoneRulesProvider.getAvailableZoneIds()`. + +|=== + + +[[cdm-cassandra-filter-params]] +=== Cassandra filter parameters + +Cassandra filters are applied on the coordinator node. +Note that, depending on the filter, the coordinator node may need to do a lot more work than is normal, notably because {cstar-data-migrator} specifies `ALLOW FILTERING`. + +By default, these parameters are commented out. + +[cols="3,1,3"] +|=== +|Property | Default | Notes + +| `spark.cdm.filter.cassandra.partition.min` +| `-9223372036854775808` +| Default is `0` (when using `RandomPartitioner`) and `-9223372036854775808` (-2^63) otherwise. +Lower partition bound (inclusive). + +| `spark.cdm.filter.cassandra.partition.max` +| `9223372036854775807` +| Default is `2^127-1` (when using `RandomPartitioner`) and `9223372036854775807` (2^63-1) otherwise. +Upper partition bound (inclusive). + +| `spark.cdm.filter.cassandra.whereCondition` +| +| CQL added to the `WHERE` clause of `SELECT` statements from Origin. + +|=== + + +[[cdm-java-filter-params]] +=== Java filter parameters + +Java filters are applied on the client node. +Data must be pulled from the Origin cluster and then filtered. +However, this option may have a lower impact on the production cluster than xref:cdm-cassandra-filter-params[Cassandra filters]. +Java filters put load onto the {cstar-data-migrator} processing node, by sending more data from Cassandra. +Cassandra filters put load on the Cassandra nodes, notably because {cstar-data-migrator} specifies `ALLOW FILTERING`, which could cause the coordinator node to perform a lot more work. + +By default, these parameters are commented out. + +[cols="2,1,4"] +|=== +|Property | Default | Notes + +| `spark.cdm.filter.java.token.percent` +| `100` +| Percent (between 1 and 100) of the token in each Split that will be migrated. +This property is used to do a wide and random sampling of the data. +The percentage value is applied to each split. +Invalid percentages will be treated as 100. + +| `spark.cdm.filter.java.writetime.min` +| `0` +| The lowest (inclusive) writetime values to be migrated. +Using the `spark.cdm.filter.java.writetime.min` and `spark.cdm.filter.java.writetime.max` thresholds, {cstar-data-migrator} can filter records based on their writetimes. +The maximum writetime of the columns configured at `spark.cdm.schema.origin.column.writetime.names` will be compared to the `.min` and `.max` thresholds, which must be in **microseconds since the epoch**. +If the `spark.cdm.schema.origin.column.writetime.names` are not specified, or the thresholds are null or otherwise invalid, the filter will be ignored. +Note that `spark.cdm.s.perfops.batchSize` will be ignored when this filter is in place; a value of 1 will be used instead. + +| `spark.cdm.filter.java.writetime.max` +| `9223372036854775807` +| The highest (inclusive) writetime values to be migrated. 
+Maximum timestamp of the columns specified by `spark.cdm.schema.origin.column.writetime.names`; if that property is not specified, or is for some reason null, the filter is ignored. + +| `spark.cdm.filter.java.column.name` +| +| Filter rows based on matching a configured value. +With `spark.cdm.filter.java.column.name`, specify the column name against which the `spark.cdm.filter.java.column.value` is compared. +Must be on the column list specified at `spark.cdm.schema.origin.column.names`. +The column value will be converted to a String, trimmed of whitespace on both ends, and compared. + +| `spark.cdm.filter.java.column.value` +| +| String value to use as comparison. +Whitespace on the ends of `spark.cdm.filter.java.column.value` will be trimmed. +|=== + + +[[cdm-constant-column-feature-params]] +=== Constant column feature parameters + +The constant columns feature allows you to add constant columns to the target table. +If used, the `spark.cdm.feature.constantColumns.names`, `spark.cdm.feature.constantColumns.types`, and `spark.cdm.feature.constantColumns.values` lists must all be the same length. + +By default, these parameters are commented out. + +[cols="2,1,3"] +|=== +|Property | Default | Notes + +| `spark.cdm.feature.constantColumns.names` +| +| A comma-separated list of column names, such as `const1,const2`. + +| `spark.cdm.feature.constantColumns.type` +| +| A comma-separated list of column types. + +| `spark.cdm.feature.constantColumns.values` +| +| A comma-separated list of hard-coded values. +Each value should be provided as you would use on the `CQLSH` command line. +Examples: `'abcd'` for a string; `1234` for an int, and so on. + +| `spark.cdm.feature.constantColumns.splitRegex` +| `,` +| Defaults to comma, but can be any regex character that works with `String.split(regex)`; this option is needed because some type values contain commas, such as in lists, maps, and sets. + +|=== + + +[[cdm-explode-map-feature-params]] +=== Explode map feature parameters + +The explode map feature allows you convert an Origin table Map into multiple Target table records. + +By default, these parameters are commented out. + +[cols="3,3"] +|=== +|Property | Notes + +| `spark.cdm.feature.explodeMap.origin.name` +| The name of the map column, such as `my_map`. +Must be defined on `spark.cdm.schema.origin.column.names`, and the corresponding type on `spark.cdm.schema.origin.column.types` must be a map. + +| `spark.cdm.feature.explodeMap.origin.name.key` +| The name of the column on the Target table that will hold the map key, such as `my_map_key`. +This key must be present on the Target primary key `spark.cdm.schema.target.column.id.names`. + +| `spark.cdm.feature.explodeMap.origin.value` +| The name of the column on the Target table that will hold the map value, such as `my_map_value`. +|=== + + +[[cdm-guardrail-feature-params]] +=== Guardrail feature parameter + +The guardrail feature manages records that exceed guardrail checks. +The Guardrail job will generate a report; other jobs will skip records that exceed the guardrail limit. + +By default, these parameters are commented out. + +[cols="3,1,3"] +|=== +|Property | Default | Notes + +| `spark.cdm.feature.guardrail.colSizeInKB` +| `0` +| The `0` default means the guardrail check is not done. +If set, table records with one or more fields that exceed the column size in kB will be flagged. +Note this is kB (base 10), not kiB (base 2). 
+ +|=== + + +[[cdm-tls-ssl-connection-params]] +=== TLS (SSL) connection parameters + +TLS (SSL) connection parameters, if configured, for Origin and Target. +Note that a secure connect bundle (SCB) embeds these details. + +By default, these parameters are commented out. + +[cols="3,3,3"] +|=== +|Property | Default | Notes + +| `spark.cdm.connect.origin.tls.enabled` +| `false` +| If TLS is used, set to `true`. + +| `spark.cdm.connect.origin.tls.trustStore.path` +| +| Path to the Java truststore file. + +| `spark.cdm.connect.origin.tls.trustStore.password` +| +| Password needed to open the truststore. + +| `spark.cdm.connect.origin.tls.trustStore.type` +| `JKS` +| + +| `spark.cdm.connect.origin.tls.keyStore.path` +| +| Path to the Java keystore file. + +| `spark.cdm.connect.origin.tls.keyStore.password` +| +| Password needed to open the keystore. + +| `spark.cdm.connect.origin.tls.enabledAlgorithms` +| `TLS_RSA_WITH_AES_128_CBC_SHA`,`TLS_RSA_WITH_AES_256_CBC_SHA` +| + +| `spark.cdm.connect.target.tls.enabled` +| `false` +| If TLS is used, set to `true`. + +| `spark.cdm.connect.target.tls.trustStore.path` +| +| Path to the Java truststore file. + +| `spark.cdm.connect.target.tls.trustStore.password` +| +| Password needed to open the truststore. + +| `spark.cdm.connect.target.tls.trustStore.type` +| `JKS` +| + +| `spark.cdm.connect.target.tls.keyStore.path` +| +| Path to the Java keystore file. + +| `spark.cdm.connect.target.tls.keyStore.password` +| +| Password needed to open the keystore. + +| `spark.cdm.connect.target.tls.enabledAlgorithms` +| `TLS_RSA_WITH_AES_128_CBC_SHA`,`TLS_RSA_WITH_AES_256_CBC_SHA` +| + +|=== \ No newline at end of file diff --git a/modules/ROOT/pages/cdm-steps.adoc b/modules/ROOT/pages/cdm-steps.adoc new file mode 100644 index 00000000..e5b0cb0b --- /dev/null +++ b/modules/ROOT/pages/cdm-steps.adoc @@ -0,0 +1,119 @@ += Steps to use {cstar-data-migrator} for data migration + +Use {cstar-data-migrator} to migrate and validate tables between Origin and Target Cassandra clusters, with available logging and reconciliation support. + +[[cdm-prereqs]] +== {cstar-data-migrator} prerequisites + +* Install or switch to Java 11. +The Spark binaries are compiled with this version of Java. +* Install https://archive.apache.org/dist/spark/spark-3.5.1/[Spark 3.5.1] on a single VM (no cluster necessary) where you want to run this job. +* Optionally, install https://maven.apache.org/download.cgi[Maven] 3.9.x if you want to build the JAR for local development. + +You can install Apache Spark by running the following commands: + +[source,bash] +---- +wget https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3-scala2.13.tgz + +tar -xvzf spark-3.5.1-bin-hadoop3-scala2.13.tgz +---- + + +== {cstar-data-migrator} steps + +1. Configure for your environment the `cdm*.properties` file that's provided in the {cstar-data-migrator} https://github.com/datastax/cassandra-data-migrator/tree/main/src/resources[GitHub repo]. +The file can have any name. +It does not need to be `cdm.properties` or `cdm-detailed.properties`. +In both versions, only the parameters that aren't commented out will be processed by the `spark-submit` job. +Other parameter values use defaults or are ignored. +See the descriptions and defaults in each file. +Refer to: + * The simplified sample properties configuration, https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm.properties[cdm.properties]. + This file contains only those parameters that are commonly configured. 
+ * The complete sample properties configuration, https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm-detailed.properties[cdm-detailed.properties], for the full set of configurable settings. + +2. Place the properties file that you elected to use and customize where it can be accessed while running the job via `spark-submit`. + +3. Run the job using `spark-submit` command: + +[source,bash] +---- +./spark-submit --properties-file cdm.properties \ +--conf spark.cdm.schema.origin.keyspaceTable="." \ +--master "local[*]" --driver-memory 25G --executor-memory 25G \ +--class com.datastax.cdm.job.Migrate cassandra-data-migrator-x.y.z.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt +---- + +[TIP] +==== +* Above command generates a log file `logfile_name_*.txt` to avoid log output on the console. +* Update the memory options (driver & executor memory) based on your use-case +==== + +[[cdm-validation-steps]] +== {cstar-data-migrator} steps in validation mode + +To run your migration job with {cstar-data-migrator} in **data validation mode**, use class option `--class com.datastax.cdm.job.DiffData`. +Example: + +[source,bash] +---- +./spark-submit --properties-file cdm.properties \ +--conf spark.cdm.schema.origin.keyspaceTable="." \ +--master "local[*]" --driver-memory 25G --executor-memory 25G \ +--class com.datastax.cdm.job.DiffData cassandra-data-migrator-x.y.z.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt +---- + +The {cstar-data-migrator} validation job will report differences as `ERROR` entries in the log file. +Example: + +[source,bash] +---- +23/04/06 08:43:06 ERROR DiffJobSession: Mismatch row found for key: [key3] Mismatch: Target Index: 1 Origin: valueC Target: value999) +23/04/06 08:43:06 ERROR DiffJobSession: Corrected mismatch row in target: [key3] +23/04/06 08:43:06 ERROR DiffJobSession: Missing target row found for key: [key2] +23/04/06 08:43:06 ERROR DiffJobSession: Inserted missing row in target: [key2] +---- + +[TIP] +==== +To get the list of missing or mismatched records, grep for all `ERROR` entries in the log files. +Differences noted in the log file are listed by primary-key values. +==== + +You can also run the {cstar-data-migrator} validation job in an **AutoCorrect** mode. This mode can: + +* Add any missing records from Origin to Target. +* Update any mismatched records between Origin and Target; this action makes Target the same as Origin. + +To enable or disable this feature, use one or both of the following settings in your `*.properties` configuration file. + +[source,properties] +---- +spark.cdm.autocorrect.missing false|true +spark.cdm.autocorrect.mismatch false|true +---- + +[IMPORTANT] +==== +The {cstar-data-migrator} validation job will never delete records from Target. +The job only adds or updates data on Target. +==== + +[[cdm-guardrail-checks]] +== Perform large-field guardrail violation checks + +Use {cstar-data-migrator} to identify large fields from a table that may break your cluster guardrails. +For example, {astra_db} has a 10MB limit for a single large field. +Specify `--class com.datastax.cdm.job.GuardrailCheck` on the command. +Example: + +[source,bash] +---- +./spark-submit --properties-file cdm.properties \ +--conf spark.cdm.schema.origin.keyspaceTable="." 
\ +--conf spark.cdm.feature.guardrail.colSizeInKB=10000 \ +--master "local[*]" --driver-memory 25G --executor-memory 25G \ +--class com.datastax.cdm.job.GuardrailCheck cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt +---- From 27590696a4029ab0f0cc46f2d6a02010df21e4fd Mon Sep 17 00:00:00 2001 From: beajohnson Date: Mon, 15 Apr 2024 11:57:52 -0700 Subject: [PATCH 02/32] fixed build failures --- modules/ROOT/pages/cdm-parameters.adoc | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/modules/ROOT/pages/cdm-parameters.adoc b/modules/ROOT/pages/cdm-parameters.adoc index 830f7ae2..8c5cb3a4 100644 --- a/modules/ROOT/pages/cdm-parameters.adoc +++ b/modules/ROOT/pages/cdm-parameters.adoc @@ -3,7 +3,7 @@ Each parameter below offers a different connection. Review each option to determine what is best for your organization. [[cdm-origin-schema-params]] -=== Origin schema parameters +== Origin schema parameters [cols="3,1,5a"] |=== @@ -58,7 +58,7 @@ Instead, {cstar-data-migrator} finds the field with the highest TTL, and the fie ==== [[cdm-target-schema-params]] -=== Target schema parameter +== Target schema parameter [cols="3,1,2"] |=== @@ -74,7 +74,7 @@ Table must exist in Target. [[cdm-auto-correction-params]] -=== Auto-correction parameters +== Auto-correction parameters Auto-correction parameters allow {cstar-data-migrator} to correct data differences found between Origin and Target when you run the `DiffData` program. Typically, these are run disabled (for "what if" migration testing), which will generate a list of data discrepancies. @@ -116,7 +116,7 @@ Similarly, if exceptions occur while migrating or validating, partition ranges w [[cdm-performance-operations-params]] -=== Performance and operations parameters +== Performance and operations parameters Performance and operations parameters that can affect migration throughput, error handling, and similar concerns. @@ -180,7 +180,7 @@ Recommendation: set this parameter to a non-zero value **only when not doing** a [[cdm-transformation-params]] -=== Transformation parameters +== Transformation parameters Parameters to perform schema transformations between Origin and Target. @@ -242,7 +242,7 @@ Must be in `ZoneRulesProvider.getAvailableZoneIds()`. [[cdm-cassandra-filter-params]] -=== Cassandra filter parameters +== Cassandra filter parameters Cassandra filters are applied on the coordinator node. Note that, depending on the filter, the coordinator node may need to do a lot more work than is normal, notably because {cstar-data-migrator} specifies `ALLOW FILTERING`. @@ -271,7 +271,7 @@ Upper partition bound (inclusive). [[cdm-java-filter-params]] -=== Java filter parameters +== Java filter parameters Java filters are applied on the client node. Data must be pulled from the Origin cluster and then filtered. @@ -320,7 +320,7 @@ Whitespace on the ends of `spark.cdm.filter.java.column.value` will be trimmed. [[cdm-constant-column-feature-params]] -=== Constant column feature parameters +== Constant column feature parameters The constant columns feature allows you to add constant columns to the target table. If used, the `spark.cdm.feature.constantColumns.names`, `spark.cdm.feature.constantColumns.types`, and `spark.cdm.feature.constantColumns.values` lists must all be the same length. @@ -353,7 +353,7 @@ Examples: `'abcd'` for a string; `1234` for an int, and so on. 
[[cdm-explode-map-feature-params]] -=== Explode map feature parameters +== Explode map feature parameters The explode map feature allows you convert an Origin table Map into multiple Target table records. @@ -377,7 +377,7 @@ This key must be present on the Target primary key `spark.cdm.schema.target.colu [[cdm-guardrail-feature-params]] -=== Guardrail feature parameter +== Guardrail feature parameter The guardrail feature manages records that exceed guardrail checks. The Guardrail job will generate a report; other jobs will skip records that exceed the guardrail limit. @@ -398,7 +398,7 @@ Note this is kB (base 10), not kiB (base 2). [[cdm-tls-ssl-connection-params]] -=== TLS (SSL) connection parameters +== TLS (SSL) connection parameters TLS (SSL) connection parameters, if configured, for Origin and Target. Note that a secure connect bundle (SCB) embeds these details. From 4abd7c521f27baf47776f4aa47421f1c75c3f7da Mon Sep 17 00:00:00 2001 From: beajohnson Date: Mon, 15 Apr 2024 14:25:20 -0700 Subject: [PATCH 03/32] updated nav --- modules/ROOT/nav.adoc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index f6cb01fd..e4010fa1 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -14,9 +14,13 @@ ** xref:connect-clients-to-proxy.adoc[] ** xref:metrics.adoc[] ** xref:manage-proxy-instances.adoc[] +.{cstar-data-migrator} //phase 2 * xref:migrate-and-validate-data.adoc[] ** xref:cassandra-data-migrator.adoc[] +* xref:cdm-steps.adoc[] +* xref:cdm-parameteres.adoc[] +.{dsbulk-migrator} ** xref:dsbulk-migrator.adoc[] //phase 3 * xref:enable-async-dual-reads.adoc[] From 4ab35686c5fca24c0cc4708cb638f54d6661ec8e Mon Sep 17 00:00:00 2001 From: beajohnson Date: Mon, 22 Apr 2024 14:21:41 -0700 Subject: [PATCH 04/32] corrected nav --- modules/ROOT/nav.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index e4010fa1..324a7e3b 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -19,7 +19,7 @@ * xref:migrate-and-validate-data.adoc[] ** xref:cassandra-data-migrator.adoc[] * xref:cdm-steps.adoc[] -* xref:cdm-parameteres.adoc[] +* xref:cdm-parameters.adoc[] .{dsbulk-migrator} ** xref:dsbulk-migrator.adoc[] //phase 3 From a9940b9542d663ecd65e01da5b838e223d938cdf Mon Sep 17 00:00:00 2001 From: beajohnson Date: Mon, 22 Apr 2024 16:54:44 -0700 Subject: [PATCH 05/32] check fails --- modules/ROOT/pages/cdm-parameters.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ROOT/pages/cdm-parameters.adoc b/modules/ROOT/pages/cdm-parameters.adoc index 8c5cb3a4..131126f9 100644 --- a/modules/ROOT/pages/cdm-parameters.adoc +++ b/modules/ROOT/pages/cdm-parameters.adoc @@ -3,7 +3,7 @@ Each parameter below offers a different connection. Review each option to determine what is best for your organization. 
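+The parameters on this page are normally set in the `cdm*.properties` file that you pass to `spark-submit`, and they can generally also be supplied on the command line with `--conf`, as the `spark-submit` examples in this guide do for `spark.cdm.schema.origin.keyspaceTable`.
+A minimal, hypothetical fragment follows; the keyspace and table names are placeholders:
+
+[source,properties]
+----
+spark.cdm.schema.origin.keyspaceTable  origin_ks.origin_table
+spark.cdm.schema.target.keyspaceTable  target_ks.target_table
+----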
[[cdm-origin-schema-params]] -== Origin schema parameters +== Origin schema parameters [cols="3,1,5a"] |=== From 67595095fdfaf35780c85b5c5ac9640cf67d6a6f Mon Sep 17 00:00:00 2001 From: beajohnson Date: Fri, 26 Apr 2024 09:53:21 -0700 Subject: [PATCH 06/32] updates for cdm --- modules/ROOT/nav.adoc | 8 +-- modules/ROOT/pages/cdm-parameters.adoc | 47 +++++++++--------- modules/ROOT/pages/cdm-prereqs.adoc | 67 ++++++++++++++++++++++++++ modules/ROOT/pages/cdm-steps.adoc | 30 +++--------- 4 files changed, 102 insertions(+), 50 deletions(-) create mode 100644 modules/ROOT/pages/cdm-prereqs.adoc diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index 324a7e3b..9a3f00a6 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -14,13 +14,9 @@ ** xref:connect-clients-to-proxy.adoc[] ** xref:metrics.adoc[] ** xref:manage-proxy-instances.adoc[] -.{cstar-data-migrator} //phase 2 * xref:migrate-and-validate-data.adoc[] ** xref:cassandra-data-migrator.adoc[] -* xref:cdm-steps.adoc[] -* xref:cdm-parameters.adoc[] -.{dsbulk-migrator} ** xref:dsbulk-migrator.adoc[] //phase 3 * xref:enable-async-dual-reads.adoc[] @@ -28,6 +24,10 @@ * xref:change-read-routing.adoc[] //phase 5 * xref:connect-clients-to-target.adoc[] +* xref:cassandra-data-migrator.adoc[] +** xref:cdm-prereqs.adoc[Prerequisites] +** xref:cdm-steps.adoc[Migrate data] +** xref:cdm-parameters.adoc[Parameters] * Troubleshooting ** xref:troubleshooting.adoc[] ** xref:troubleshooting-tips.adoc[] diff --git a/modules/ROOT/pages/cdm-parameters.adoc b/modules/ROOT/pages/cdm-parameters.adoc index 131126f9..3c43b754 100644 --- a/modules/ROOT/pages/cdm-parameters.adoc +++ b/modules/ROOT/pages/cdm-parameters.adoc @@ -37,7 +37,7 @@ The `spark.cdm.transform.custom.writetime` property, if set, would override `spa | `spark.cdm.schema.origin.column.writetime.names` | -| Default is empty, meaning the names will be determined automatically if `spark.cdm.schema.origin.column.writetime.automatic` is set. +| Default is empty, meaning the names are determined automatically if `spark.cdm.schema.origin.column.writetime.automatic` is set. Otherwise, specify a subset of eligible columns that are used to calculate the WRITETIME of the Target record. Example: `data_col1,data_col2,...` @@ -88,15 +88,15 @@ For information about invoking `DiffData` in a {cstar-data-migrator} command, se | `spark.cdm.autocorrect.missing` | `false` -| When `true`, data that is missing in Target but is found in Origin will be re-migrated to Target. +| When `true`, data that is missing in Target but is found in Origin is re-migrated to Target. | `spark.cdm.autocorrect.mismatch` | `false` -| When `true`, data that is different between Origin and Target will be reconciled. +| When `true`, data that is different between Origin and Target is reconciled. [NOTE] ==== The `TIMESTAMP` of records may have an effect. -If the `WRITETIME` of the Origin record (determined with `.writetime.names`) is earlier than the `WRITETIME` of the Target record, the change will not appear in Target. +If the `WRITETIME` of the Origin record (determined with `.writetime.names`) is earlier than the `WRITETIME` of the Target record, the change does appear in Target. This comparative state may be particularly challenging to troubleshoot if individual columns (cells) have been modified in Target. ==== @@ -109,8 +109,8 @@ By default, Counter tables are not copied when missing, unless explicitly set. | `./._partitions.csv` | Commented out. This CSV file is used as input, as well as output when applicable. 
-If the file exists, only the partition ranges in this file will be migrated or validated. -Similarly, if exceptions occur while migrating or validating, partition ranges with exceptions will be logged to this file. +If the file exists, only the partition ranges in this file are migrated or validated. +Similarly, if exceptions occur while migrating or validating, partition ranges with exceptions are logged to this file. |=== @@ -132,9 +132,9 @@ During initial testing, you may want this to be a small number (such as `1`). | `spark.cdm.perfops.batchSize` | `5` -| When writing to Target, this comprises the number of records that will be put into an `UNLOGGED` batch. -{cstar-data-migrator} will tend to work on the same partition at a time. -Thus if your partition sizes are larger, this number may be increased. +| When writing to Target, this comprises the number of records that are put into an `UNLOGGED` batch. +{cstar-data-migrator} tends to work on the same partition at a time. +If your partition sizes are larger, this number may be increased. If the `spark.cdm.perfops.batchSize` would mean that more than 1 partition is often contained in a batch, reduce this parameter's value. Ideally < 1% of batches have more than 1 partition. @@ -193,7 +193,8 @@ By default, these parameters are commented out. | `spark.cdm.transform.missing.key.ts.replace.value` | `1685577600000` | Timestamp value in milliseconds. -Partition and clustering columns cannot have null values, but if these are added as part of a schema transformation between Origin and Target, it is possible that the Origin side is null. +Partition and clustering columns cannot have null values. +If they are added as part of a schema transformation between Origin and Target, it is possible that the Origin side is null. In this case, the `Migrate` data operation would fail. This parameter allows a crude constant value to be used in its place, separate from the Constant values feature. @@ -207,7 +208,7 @@ This parameter allows a crude constant value to be used in its place, and overri | `spark.cdm.transform.custom.writetime.incrementBy` | `0` | Default is `0`. -This is useful when you have a List that is not frozen, and you are updating this via the autocorrect feature. +This is useful when you have a List that is not frozen, and you are updating this using the autocorrect feature. Lists are not idempotent, and subsequent UPSERTs would add duplicates to the list. | `spark.cdm.transform.codecs` @@ -245,7 +246,7 @@ Must be in `ZoneRulesProvider.getAvailableZoneIds()`. == Cassandra filter parameters Cassandra filters are applied on the coordinator node. -Note that, depending on the filter, the coordinator node may need to do a lot more work than is normal, notably because {cstar-data-migrator} specifies `ALLOW FILTERING`. +Depending on the filter, the coordinator node may need to do a lot more work than is normal, notably because {cstar-data-migrator} specifies `ALLOW FILTERING`. By default, these parameters are commented out. @@ -296,26 +297,27 @@ Invalid percentages will be treated as 100. | `0` | The lowest (inclusive) writetime values to be migrated. Using the `spark.cdm.filter.java.writetime.min` and `spark.cdm.filter.java.writetime.max` thresholds, {cstar-data-migrator} can filter records based on their writetimes. -The maximum writetime of the columns configured at `spark.cdm.schema.origin.column.writetime.names` will be compared to the `.min` and `.max` thresholds, which must be in **microseconds since the epoch**. 
-If the `spark.cdm.schema.origin.column.writetime.names` are not specified, or the thresholds are null or otherwise invalid, the filter will be ignored. -Note that `spark.cdm.s.perfops.batchSize` will be ignored when this filter is in place; a value of 1 will be used instead. +The maximum writetime of the columns configured at `spark.cdm.schema.origin.column.writetime.names` are compared to the `.min` and `.max` thresholds, which must be in **microseconds since the epoch**. +If the `spark.cdm.schema.origin.column.writetime.names` are not specified or the thresholds are null or otherwise invalid, the filter is ignored. +Note that `spark.cdm.s.perfops.batchSize` is ignored when this filter is in place; a value of 1 is used instead. | `spark.cdm.filter.java.writetime.max` | `9223372036854775807` | The highest (inclusive) writetime values to be migrated. -Maximum timestamp of the columns specified by `spark.cdm.schema.origin.column.writetime.names`; if that property is not specified, or is for some reason null, the filter is ignored. +Maximum timestamp of the columns specified by `spark.cdm.schema.origin.column.writetime.names`. +If that property is not specified or is for some reason null, the filter is ignored. | `spark.cdm.filter.java.column.name` | | Filter rows based on matching a configured value. With `spark.cdm.filter.java.column.name`, specify the column name against which the `spark.cdm.filter.java.column.value` is compared. Must be on the column list specified at `spark.cdm.schema.origin.column.names`. -The column value will be converted to a String, trimmed of whitespace on both ends, and compared. +The column value is converted to a String, trimmed of whitespace on both ends, and compared. | `spark.cdm.filter.java.column.value` | | String value to use as comparison. -Whitespace on the ends of `spark.cdm.filter.java.column.value` will be trimmed. +The whitespace on the ends of `spark.cdm.filter.java.column.value` is trimmed. |=== @@ -347,7 +349,8 @@ Examples: `'abcd'` for a string; `1234` for an int, and so on. | `spark.cdm.feature.constantColumns.splitRegex` | `,` -| Defaults to comma, but can be any regex character that works with `String.split(regex)`; this option is needed because some type values contain commas, such as in lists, maps, and sets. +| Defaults to comma, but can be any regex character that works with `String.split(regex)`. +This option is needed because some type values contain commas, such as in lists, maps, and sets. |=== @@ -368,11 +371,11 @@ By default, these parameters are commented out. Must be defined on `spark.cdm.schema.origin.column.names`, and the corresponding type on `spark.cdm.schema.origin.column.types` must be a map. | `spark.cdm.feature.explodeMap.origin.name.key` -| The name of the column on the Target table that will hold the map key, such as `my_map_key`. +| The name of the column on the Target table that holds the map key, such as `my_map_key`. This key must be present on the Target primary key `spark.cdm.schema.target.column.id.names`. | `spark.cdm.feature.explodeMap.origin.value` -| The name of the column on the Target table that will hold the map value, such as `my_map_value`. +| The name of the column on the Target table that holds the map value, such as `my_map_value`. |=== @@ -380,7 +383,7 @@ This key must be present on the Target primary key `spark.cdm.schema.target.colu == Guardrail feature parameter The guardrail feature manages records that exceed guardrail checks. 
-The Guardrail job will generate a report; other jobs will skip records that exceed the guardrail limit. +The Guardrail job generates a report; other jobs skip records that exceed the guardrail limit. By default, these parameters are commented out. diff --git a/modules/ROOT/pages/cdm-prereqs.adoc b/modules/ROOT/pages/cdm-prereqs.adoc new file mode 100644 index 00000000..47cd9940 --- /dev/null +++ b/modules/ROOT/pages/cdm-prereqs.adoc @@ -0,0 +1,67 @@ += {cstar-data-migrator} prerequisites + +Use {cstar-data-migrator} to migrate and validate tables between Origin and Target Cassandra clusters, with available logging and reconciliation support. + +* Install or switch to Java 11. +The Spark binaries are compiled with this version of Java. +* Install https://archive.apache.org/dist/spark/spark-3.5.1/[Spark 3.5.1] on a single VM (no cluster necessary) where you want to run this job. +* Optionally, install https://maven.apache.org/download.cgi[Maven] 3.9.x if you want to build the JAR for local development. + +You can install Apache Spark by running the following commands: + +[source,bash] +---- +wget https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3-scala2.13.tgz + +tar -xvzf spark-3.5.1-bin-hadoop3-scala2.13.tgz +---- + +* Install or switch to Java 11. +The Spark binaries are compiled with this version of Java. +* Install https://archive.apache.org/dist/spark/spark-3.5.1/[Spark 3.5.1] on a single VM (no cluster necessary) where you want to run this job. +* Optionally, install https://maven.apache.org/download.cgi[Maven] 3.9.x if you want to build the JAR for local development. + +You can install Apache Spark by running the following commands: + +[source,bash] +---- +wget https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3-scala2.13.tgz + +tar -xvzf spark-3.5.1-bin-hadoop3-scala2.13.tgz +---- + +[[cdm-install-as-container]] +== Install {cstar-data-migrator} as a Container + +Get the latest image that includes all dependencies from https://hub.docker.com/r/datastax/cassandra-data-migrator[DockerHub]. + +All migration tools (`cassandra-data-migrator` + `dsbulk` + `cqlsh`) are available in the `/assets/` folder of the container. + +[[cdm-install-as-jar]] +== Install {cstar-data-migrator} as a JAR file + +Download the *latest* JAR file from the {cstar-data-migrator} https://github.com/datastax/cassandra-data-migrator/packages/1832128[GitHub repo]. +image:https://img.shields.io/github/v/release/datastax/cassandra-data-migrator?color=green[Latest release] + +[NOTE] +==== +Version 4.x of {cstar-data-migrator} is not backward-compatible with `*.properties` files created in previous versions, and package names have changed. +If you're starting new, we recommended that you use the latest released version. +==== + +[[cdm-build-jar-local]] +== Build {cstar-data-migrator} JAR for local development (optional) + +Optionally, you can build the {cstar-data-migrator} JAR for local development. (You'll need https://maven.apache.org/download.cgi[Maven] 3.9.x.) + +Example: + +[source,bash] +---- +cd ~/github +git clone git@github.com:datastax/cassandra-data-migrator.git +cd cassandra-data-migrator +mvn clean package +---- + +The fat jar (`cassandra-data-migrator-x.y.z.jar`) file should be present now in the `target` folder. 
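+Once the build completes, you can point `spark-submit` at the locally built JAR instead of a downloaded release.
+A hypothetical invocation is shown below; the keyspace, table, and version values are placeholders:
+
+[source,bash]
+----
+./spark-submit --properties-file cdm.properties \
+--conf spark.cdm.schema.origin.keyspaceTable="<keyspace>.<table>" \
+--master "local[*]" \
+--class com.datastax.cdm.job.Migrate target/cassandra-data-migrator-x.y.z.jar
+----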
diff --git a/modules/ROOT/pages/cdm-steps.adoc b/modules/ROOT/pages/cdm-steps.adoc index e5b0cb0b..976a277c 100644 --- a/modules/ROOT/pages/cdm-steps.adoc +++ b/modules/ROOT/pages/cdm-steps.adoc @@ -1,31 +1,13 @@ -= Steps to use {cstar-data-migrator} for data migration += Migrate {cstar-data-migrator} Use {cstar-data-migrator} to migrate and validate tables between Origin and Target Cassandra clusters, with available logging and reconciliation support. -[[cdm-prereqs]] -== {cstar-data-migrator} prerequisites - -* Install or switch to Java 11. -The Spark binaries are compiled with this version of Java. -* Install https://archive.apache.org/dist/spark/spark-3.5.1/[Spark 3.5.1] on a single VM (no cluster necessary) where you want to run this job. -* Optionally, install https://maven.apache.org/download.cgi[Maven] 3.9.x if you want to build the JAR for local development. - -You can install Apache Spark by running the following commands: - -[source,bash] ----- -wget https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3-scala2.13.tgz - -tar -xvzf spark-3.5.1-bin-hadoop3-scala2.13.tgz ----- - - == {cstar-data-migrator} steps -1. Configure for your environment the `cdm*.properties` file that's provided in the {cstar-data-migrator} https://github.com/datastax/cassandra-data-migrator/tree/main/src/resources[GitHub repo]. +. Configure for your environment the `cdm*.properties` file that's provided in the {cstar-data-migrator} https://github.com/datastax/cassandra-data-migrator/tree/main/src/resources[GitHub repo]. The file can have any name. It does not need to be `cdm.properties` or `cdm-detailed.properties`. -In both versions, only the parameters that aren't commented out will be processed by the `spark-submit` job. +In both versions, the `spark-submit` job processes only the parameters that aren't commented out. Other parameter values use defaults or are ignored. See the descriptions and defaults in each file. Refer to: @@ -33,9 +15,9 @@ Refer to: This file contains only those parameters that are commonly configured. * The complete sample properties configuration, https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm-detailed.properties[cdm-detailed.properties], for the full set of configurable settings. -2. Place the properties file that you elected to use and customize where it can be accessed while running the job via `spark-submit`. +. Place the properties file that you elected to use and customize where it can be accessed while running the job using `spark-submit`. -3. Run the job using `spark-submit` command: +. Run the job using `spark-submit` command: [source,bash] ---- @@ -65,7 +47,7 @@ Example: --class com.datastax.cdm.job.DiffData cassandra-data-migrator-x.y.z.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt ---- -The {cstar-data-migrator} validation job will report differences as `ERROR` entries in the log file. +The {cstar-data-migrator} validation job reports differences as `ERROR` entries in the log file. 
Example: [source,bash] From 13323ee1fbddea2fe512404569a47a94f1e00fff Mon Sep 17 00:00:00 2001 From: beajohnson Date: Tue, 4 Jun 2024 11:56:08 -0700 Subject: [PATCH 07/32] minor grammar updates --- modules/ROOT/pages/cdm-parameters.adoc | 34 ++++++++++++++------------ modules/ROOT/pages/cdm-prereqs.adoc | 4 +-- modules/ROOT/pages/cdm-steps.adoc | 4 +-- 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/modules/ROOT/pages/cdm-parameters.adoc b/modules/ROOT/pages/cdm-parameters.adoc index 3c43b754..7aa3e0a3 100644 --- a/modules/ROOT/pages/cdm-parameters.adoc +++ b/modules/ROOT/pages/cdm-parameters.adoc @@ -17,22 +17,24 @@ Table must exist in Origin. | `spark.cdm.schema.origin.column.ttl.automatic` | `true` | Default is `true`, unless `spark.cdm.schema.origin.column.ttl.names` is specified. -When `true`, the Time To Live (TTL) of the Target record will be determined by finding the maximum TTL of all Origin columns that can have TTL set (which excludes partition key, clustering key, collections/UDT/tuple, and frozen columns). -When `false`, and `spark.cdm.schema.origin.column.ttl.names` is not set, the Target record will have the TTL determined by the Target table configuration. +When `true`, determine the Time To Live (TTL) of the Target record. +Find the maximum TTL of all Origin columns that can have TTL set (which excludes partition key, clustering key, collections/UDT/tuple, and frozen columns). +When `false`, and `spark.cdm.schema.origin.column.ttl.names` is not set, the Target record has the TTL determined by the Target table configuration. | `spark.cdm.schema.origin.column.ttl.names` | -| Default is empty, meaning the names will be determined automatically if `spark.cdm.schema.origin.column.ttl.automatic` is set. +| Default is empty, meaning the names are determined automatically if `spark.cdm.schema.origin.column.ttl.automatic` is set. Specify a subset of eligible columns that are used to calculate the TTL of the Target record. | `spark.cdm.schema.origin.column.writetime.automatic` | `true` | Default is `true`, unless `spark.cdm.schema.origin.column.writetime.names` is specified. -When `true`, the `WRITETIME` of the Target record will be determined by finding the maximum `WRITETIME` of all Origin columns that can have `WRITETIME` set (which excludes partition key, clustering key, collections/UDT/tuple, and frozen columns). -When `false`, and `spark.cdm.schema.origin.column.writetime.names` is not set, the Target record will have the `WRITETIME` determined by the Target table configuration. +When `true`, determine the `WRITETIME` of the Target record. +Find the maximum `WRITETIME` of all Origin columns that can have `WRITETIME` set (which excludes partition key, clustering key, collections/UDT/tuple, and frozen columns). +When `false`, and `spark.cdm.schema.origin.column.writetime.names` is not set, the Target record has the `WRITETIME` determined by the Target table configuration. [NOTE] ==== -The `spark.cdm.transform.custom.writetime` property, if set, would override `spark.cdm.schema.origin.column.writetime`. +The `spark.cdm.transform.custom.writetime` property, if set, overrides `spark.cdm.schema.origin.column.writetime`. ==== | `spark.cdm.schema.origin.column.writetime.names` @@ -77,7 +79,7 @@ Table must exist in Target. == Auto-correction parameters Auto-correction parameters allow {cstar-data-migrator} to correct data differences found between Origin and Target when you run the `DiffData` program. 
-Typically, these are run disabled (for "what if" migration testing), which will generate a list of data discrepancies. +Typically, these are run-disabled (for "what if" migration testing), which generate a list of data discrepancies. The reasons for these discrepancies can then be investigated, and if necessary the parameters below can be enabled. For information about invoking `DiffData` in a {cstar-data-migrator} command, see xref:#cdm-validation-steps[{cstar-data-migrator} steps in validation mode] in this topic. @@ -126,7 +128,7 @@ Performance and operations parameters that can affect migration throughput, erro | `spark.cdm.perfops.numParts` | `10000` -| In standard operation, the full token range (-2^63 .. 2^63-1) is divided into a number of parts, which will be parallel-processed. +| In standard operation, the full token range (-2^63 .. 2^63-1) is divided into a number of parts, which are parallel-processed. You should aim for each part to comprise a total of ≈1-10GB of data to migrate. During initial testing, you may want this to be a small number (such as `1`). @@ -163,12 +165,12 @@ The consistency parameters may be one of: `ANY`, `ONE`, `TWO`, `THREE`, `QUORUM` | `spark.cdm.perfops.printStatsAfter` | `100000` | Commented out. -Number of rows of processing after which a progress log entry will be made. +Number of rows of processing after which a progress log entry is made. | `spark.cdm.perfops.fetchSizeInRows` | `1000` | Commented out. -This parameter affects the frequency of reads from Origin, and also the frequency of flushes to Target. +This parameter affects the frequency of reads from Origin and the frequency of flushes to Target. | `spark.cdm.perfops.errorLimit` | `0` @@ -196,7 +198,7 @@ By default, these parameters are commented out. Partition and clustering columns cannot have null values. If they are added as part of a schema transformation between Origin and Target, it is possible that the Origin side is null. In this case, the `Migrate` data operation would fail. -This parameter allows a crude constant value to be used in its place, separate from the Constant values feature. +This parameter allows a crude constant value to be used in its place that is separate from the Constant values feature. | `spark.cdm.transform.custom.writetime` | `0` @@ -208,8 +210,8 @@ This parameter allows a crude constant value to be used in its place, and overri | `spark.cdm.transform.custom.writetime.incrementBy` | `0` | Default is `0`. -This is useful when you have a List that is not frozen, and you are updating this using the autocorrect feature. -Lists are not idempotent, and subsequent UPSERTs would add duplicates to the list. +This is useful when you have a list that is not frozen, and you are updating this using the autocorrect feature. +Lists are not idempotent, and subsequent UPSERTs add duplicates to the list. | `spark.cdm.transform.codecs` | @@ -288,10 +290,10 @@ By default, these parameters are commented out. | `spark.cdm.filter.java.token.percent` | `100` -| Percent (between 1 and 100) of the token in each Split that will be migrated. +| Percent (between 1 and 100) of the token in each Split that is migrated. This property is used to do a wide and random sampling of the data. The percentage value is applied to each split. -Invalid percentages will be treated as 100. +Invalid percentages are treated as 100. | `spark.cdm.filter.java.writetime.min` | `0` @@ -394,7 +396,7 @@ By default, these parameters are commented out. 
| `spark.cdm.feature.guardrail.colSizeInKB`
| `0`
| The `0` default means the guardrail check is not done.
-If set, table records with one or more fields that exceed the column size in kB will be flagged.
+If set, table records with one or more fields that exceed the column size in kB are flagged.
Note this is kB (base 10), not kiB (base 2).

|===

diff --git a/modules/ROOT/pages/cdm-prereqs.adoc b/modules/ROOT/pages/cdm-prereqs.adoc
index 47cd9940..89a1b8c3 100644
--- a/modules/ROOT/pages/cdm-prereqs.adoc
+++ b/modules/ROOT/pages/cdm-prereqs.adoc
@@ -1,6 +1,6 @@
= {cstar-data-migrator} prerequisites

-Use {cstar-data-migrator} to migrate and validate tables between Origin and Target Cassandra clusters, with available logging and reconciliation support.
+Read the prerequisites below before using the Cassandra Data Migrator.

* Install or switch to Java 11.
The Spark binaries are compiled with this version of Java.
@@ -21,7 +21,7 @@ The Spark binaries are compiled with this version of Java.
* Install https://archive.apache.org/dist/spark/spark-3.5.1/[Spark 3.5.1] on a single VM (no cluster necessary) where you want to run this job.
* Optionally, install https://maven.apache.org/download.cgi[Maven] 3.9.x if you want to build the JAR for local development.

-You can install Apache Spark by running the following commands:
+Run the following commands to install Apache Spark:

[source,bash]
----
wget https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3-scala2.13.tgz

tar -xvzf spark-3.5.1-bin-hadoop3-scala2.13.tgz
----
diff --git a/modules/ROOT/pages/cdm-steps.adoc b/modules/ROOT/pages/cdm-steps.adoc
index 976a277c..71fedb0f 100644
--- a/modules/ROOT/pages/cdm-steps.adoc
+++ b/modules/ROOT/pages/cdm-steps.adoc
@@ -79,14 +79,14 @@ spark.cdm.autocorrect.mismatch false|true

[IMPORTANT]
====
-The {cstar-data-migrator} validation job will never delete records from Target.
+The {cstar-data-migrator} validation job never deletes records from Target.
The job only adds or updates data on Target.
====

[[cdm-guardrail-checks]]
== Perform large-field guardrail violation checks

-Use {cstar-data-migrator} to identify large fields from a table that may break your cluster guardrails.
+Use {cstar-data-migrator} to identify large fields from a table that may break your https://docs.datastax.com/en/astra-db-serverless/cql/cassandra-guardrails.html[cluster guardrails].
For example, {astra_db} has a 10MB limit for a single large field.
Specify `--class com.datastax.cdm.job.GuardrailCheck` on the command.
Example: From 362464472aa8fe9fef14d596b4d213e295efa682 Mon Sep 17 00:00:00 2001 From: beajohnson Date: Wed, 5 Jun 2024 08:46:26 -0700 Subject: [PATCH 08/32] rebuild draft From 51eb84d204f7f41ea2706221b321d76987ca679b Mon Sep 17 00:00:00 2001 From: beajohnson Date: Wed, 5 Jun 2024 08:54:17 -0700 Subject: [PATCH 09/32] getting draft to review --- modules/ROOT/nav.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index 9a3f00a6..fb4cf552 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -29,7 +29,7 @@ ** xref:cdm-steps.adoc[Migrate data] ** xref:cdm-parameters.adoc[Parameters] * Troubleshooting -** xref:troubleshooting.adoc[] +** xref:troubleshooting.adoc[] ** xref:troubleshooting-tips.adoc[] ** xref:troubleshooting-scenarios.adoc[] * xref:faqs.adoc[] From b39c353dc572e71005311c9d88c2b9d1bf447cfa Mon Sep 17 00:00:00 2001 From: beajohnson Date: Wed, 5 Jun 2024 08:59:25 -0700 Subject: [PATCH 10/32] rebuild draft From 4fdc9c83917278a39d7a369d4539c3579e63529a Mon Sep 17 00:00:00 2001 From: beajohnson Date: Wed, 5 Jun 2024 09:03:22 -0700 Subject: [PATCH 11/32] rebuild draft From fb6b102659e613f03f32267d9fb63aa789266425 Mon Sep 17 00:00:00 2001 From: beajohnson Date: Fri, 7 Jun 2024 12:54:47 -0700 Subject: [PATCH 12/32] small updates --- modules/ROOT/nav.adoc | 5 ++-- modules/ROOT/pages/cdm-parameters.adoc | 35 +++++++++++++------------- modules/ROOT/pages/cdm-prereqs.adoc | 24 +++++------------- modules/ROOT/pages/cdm-steps.adoc | 14 +++++------ 4 files changed, 34 insertions(+), 44 deletions(-) diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index fb4cf552..6e99c264 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -16,7 +16,7 @@ ** xref:manage-proxy-instances.adoc[] //phase 2 * xref:migrate-and-validate-data.adoc[] -** xref:cassandra-data-migrator.adoc[] +//** xref:cassandra-data-migrator.adoc[] ** xref:dsbulk-migrator.adoc[] //phase 3 * xref:enable-async-dual-reads.adoc[] @@ -24,7 +24,8 @@ * xref:change-read-routing.adoc[] //phase 5 * xref:connect-clients-to-target.adoc[] -* xref:cassandra-data-migrator.adoc[] +* Cassandra Data Migrator +** xref:cassandra-data-migrator.adoc[] ** xref:cdm-prereqs.adoc[Prerequisites] ** xref:cdm-steps.adoc[Migrate data] ** xref:cdm-parameters.adoc[Parameters] diff --git a/modules/ROOT/pages/cdm-parameters.adoc b/modules/ROOT/pages/cdm-parameters.adoc index 7aa3e0a3..eb624252 100644 --- a/modules/ROOT/pages/cdm-parameters.adoc +++ b/modules/ROOT/pages/cdm-parameters.adoc @@ -18,7 +18,7 @@ Table must exist in Origin. | `true` | Default is `true`, unless `spark.cdm.schema.origin.column.ttl.names` is specified. When `true`, determine the Time To Live (TTL) of the Target record. -Find the maximum TTL of all Origin columns that can have TTL set (which excludes partition key, clustering key, collections/UDT/tuple, and frozen columns). +Find the maximum TTL of all Origin columns that can have TTL set. This excludes partition key, clustering key, collections/UDT/tuple, and frozen columns. When `false`, and `spark.cdm.schema.origin.column.ttl.names` is not set, the Target record has the TTL determined by the Target table configuration. | `spark.cdm.schema.origin.column.ttl.names` @@ -30,8 +30,9 @@ Specify a subset of eligible columns that are used to calculate the TTL of the T | `true` | Default is `true`, unless `spark.cdm.schema.origin.column.writetime.names` is specified. When `true`, determine the `WRITETIME` of the Target record. 
-Find the maximum `WRITETIME` of all Origin columns that can have `WRITETIME` set (which excludes partition key, clustering key, collections/UDT/tuple, and frozen columns). -When `false`, and `spark.cdm.schema.origin.column.writetime.names` is not set, the Target record has the `WRITETIME` determined by the Target table configuration. +Find the maximum `WRITETIME` of all Origin columns that can have `WRITETIME` set. This excludes partition key, clustering key, collections/UDT/tuple, and frozen columns. +When `false`, and `spark.cdm.schema.origin.column.writetime.names` is not set, the Target table configuration determines the Target record's `WRITETIME`. + [NOTE] ==== The `spark.cdm.transform.custom.writetime` property, if set, overrides `spark.cdm.schema.origin.column.writetime`. @@ -55,7 +56,7 @@ You only need to list renamed columns. [NOTE] ==== -For optimization reasons, {cstar-data-migrator} does not migrate TTL and writetime at the field-level. +For optimization reasons, {cstar-data-migrator} does not migrate TTL and writetime at the field level. Instead, {cstar-data-migrator} finds the field with the highest TTL, and the field with the highest writetime within an Origin table row, and uses those values on the entire Target table row. ==== @@ -79,10 +80,10 @@ Table must exist in Target. == Auto-correction parameters Auto-correction parameters allow {cstar-data-migrator} to correct data differences found between Origin and Target when you run the `DiffData` program. -Typically, these are run-disabled (for "what if" migration testing), which generate a list of data discrepancies. +Typically, these are run-disabled for "what if" migration testing, and generate a list of data discrepancies. The reasons for these discrepancies can then be investigated, and if necessary the parameters below can be enabled. -For information about invoking `DiffData` in a {cstar-data-migrator} command, see xref:#cdm-validation-steps[{cstar-data-migrator} steps in validation mode] in this topic. +For information about invoking `DiffData` in a {cstar-data-migrator} command, see xref:#cdm-validation-steps[{cstar-data-migrator} steps in validation mode]. [cols="2,2,3a"] |=== @@ -105,7 +106,7 @@ This comparative state may be particularly challenging to troubleshoot if indivi | `spark.cdm.autocorrect.missing.counter` | `false` | Commented out. -By default, Counter tables are not copied when missing, unless explicitly set. +By default, counter tables are not copied when missing, unless explicitly set. | `spark.tokenrange.partitionFile` | `./._partitions.csv` @@ -197,8 +198,8 @@ By default, these parameters are commented out. | Timestamp value in milliseconds. Partition and clustering columns cannot have null values. If they are added as part of a schema transformation between Origin and Target, it is possible that the Origin side is null. -In this case, the `Migrate` data operation would fail. -This parameter allows a crude constant value to be used in its place that is separate from the Constant values feature. +In this case, the `Migrate` data operation fails. +This parameter allows a crude constant value to be used in its place that is separate from the constant values feature. | `spark.cdm.transform.custom.writetime` | `0` @@ -218,12 +219,12 @@ Lists are not idempotent, and subsequent UPSERTs add duplicates to the list. | Default is empty. A comma-separated list of additional codecs to enable. - * `INT_STRING` : int stored in a String. - * `DOUBLE_STRING` : double stored in a String. 
- * `BIGINT_STRING` : bigint stored in a String.
- * `DECIMAL_STRING` : decimal stored in a String.
- * `TIMESTAMP_STRING_MILLIS` : timestamp stored in a String, as Epoch milliseconds.
- * `TIMESTAMP_STRING_FORMAT` : timestamp stored in a String, with a custom format.
+ * `INT_STRING` : int stored in a string.
+ * `DOUBLE_STRING` : double stored in a string.
+ * `BIGINT_STRING` : bigint stored in a string.
+ * `DECIMAL_STRING` : decimal stored in a string.
+ * `TIMESTAMP_STRING_MILLIS` : timestamp stored in a string, as Epoch milliseconds.
+ * `TIMESTAMP_STRING_FORMAT` : timestamp stored in a string with a custom format.

[NOTE]
====
@@ -306,7 +307,7 @@ Note that `spark.cdm.s.perfops.batchSize` is ignored when this filter is in plac
| `spark.cdm.filter.java.writetime.max`
| `9223372036854775807`
| The highest (inclusive) writetime values to be migrated.
-Maximum timestamp of the columns specified by `spark.cdm.schema.origin.column.writetime.names`.
+This is the maximum timestamp of the columns specified by `spark.cdm.schema.origin.column.writetime.names`.
If that property is not specified or is for some reason null, the filter is ignored.

| `spark.cdm.filter.java.column.name`
@@ -314,7 +315,7 @@ If that property is not specified or is for some reason null, the filter is igno
| Filter rows based on matching a configured value.
With `spark.cdm.filter.java.column.name`, specify the column name against which the `spark.cdm.filter.java.column.value` is compared.
Must be on the column list specified at `spark.cdm.schema.origin.column.names`.
-The column value is converted to a String, trimmed of whitespace on both ends, and compared.
+The column value is converted to a string, trimmed of whitespace on both ends, and compared.

| `spark.cdm.filter.java.column.value`
|
diff --git a/modules/ROOT/pages/cdm-prereqs.adoc b/modules/ROOT/pages/cdm-prereqs.adoc
index 89a1b8c3..3a078368 100644
--- a/modules/ROOT/pages/cdm-prereqs.adoc
+++ b/modules/ROOT/pages/cdm-prereqs.adoc
@@ -4,21 +4,8 @@ Read the prerequisites below before using the Cassandra Data Migrator.

* Install or switch to Java 11.
The Spark binaries are compiled with this version of Java.
-* Install https://archive.apache.org/dist/spark/spark-3.5.1/[Spark 3.5.1] on a single VM (no cluster necessary) where you want to run this job.
-* Optionally, install https://maven.apache.org/download.cgi[Maven] 3.9.x if you want to build the JAR for local development.
-
-You can install Apache Spark by running the following commands:
-
-[source,bash]
-----
-wget https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3-scala2.13.tgz
-
-tar -xvzf spark-3.5.1-bin-hadoop3-scala2.13.tgz
-----
-
-* Install or switch to Java 11.
-The Spark binaries are compiled with this version of Java.
-* Install https://archive.apache.org/dist/spark/spark-3.5.1/[Spark 3.5.1] on a single VM (no cluster necessary) where you want to run this job.
+* Install https://archive.apache.org/dist/spark/spark-3.5.1/[Spark 3.5.1] on a single VM where you want to run this job.
+No cluster is necessary.
* Optionally, install https://maven.apache.org/download.cgi[Maven] 3.9.x if you want to build the JAR for local development.
Run the following commands to install Apache Spark:

@@ -31,11 +18,11 @@ tar -xvzf spark-3.5.1-bin-hadoop3-scala2.13.tgz
----

[[cdm-install-as-container]]
-== Install {cstar-data-migrator} as a Container
+== Install {cstar-data-migrator} as a container

Get the latest image that includes all dependencies from https://hub.docker.com/r/datastax/cassandra-data-migrator[DockerHub].

-All migration tools (`cassandra-data-migrator` + `dsbulk` + `cqlsh`) are available in the `/assets/` folder of the container.
+All migration tools, `cassandra-data-migrator` + `dsbulk` + `cqlsh`, are available in the `/assets/` folder of the container.

[[cdm-install-as-jar]]
== Install {cstar-data-migrator} as a JAR file
@@ -52,7 +39,8 @@ If you're starting new, we recommended that you use the latest released version.
[[cdm-build-jar-local]]
== Build {cstar-data-migrator} JAR for local development (optional)

-Optionally, you can build the {cstar-data-migrator} JAR for local development. (You'll need https://maven.apache.org/download.cgi[Maven] 3.9.x.)
+Optionally, you can build the {cstar-data-migrator} JAR for local development.
+You'll need https://maven.apache.org/download.cgi[Maven] 3.9.x.

Example:

diff --git a/modules/ROOT/pages/cdm-steps.adoc b/modules/ROOT/pages/cdm-steps.adoc
index 71fedb0f..7aa2891d 100644
--- a/modules/ROOT/pages/cdm-steps.adoc
+++ b/modules/ROOT/pages/cdm-steps.adoc
@@ -1,8 +1,8 @@
-= Migrate {cstar-data-migrator}
+= {cstar-data-migrator}

Use {cstar-data-migrator} to migrate and validate tables between Origin and Target Cassandra clusters, with available logging and reconciliation support.

-== {cstar-data-migrator} steps
+== Use {cstar-data-migrator}

. Configure for your environment the `cdm*.properties` file that's provided in the {cstar-data-migrator} https://github.com/datastax/cassandra-data-migrator/tree/main/src/resources[GitHub repo].
The file can have any name.
@@ -10,7 +10,7 @@ It does not need to be `cdm.properties` or `cdm-detailed.properties`.
In both versions, the `spark-submit` job processes only the parameters that aren't commented out.
Other parameter values use defaults or are ignored.
See the descriptions and defaults in each file.
-Refer to:
+For more information, see the following:

* The simplified sample properties configuration, https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm.properties[cdm.properties].
This file contains only those parameters that are commonly configured.
* The complete sample properties configuration, https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm-detailed.properties[cdm-detailed.properties], for the full set of configurable settings.
@@ -29,12 +29,12 @@ Refer to:

[TIP]
====
-* Above command generates a log file `logfile_name_*.txt` to avoid log output on the console.
-* Update the memory options (driver & executor memory) based on your use-case
+* The command generates a log file `logfile_name_*.txt` to prevent log output on the console.
+* Update the memory options, driver and executor memory, based on your use case.
====

[[cdm-validation-steps]]
-== {cstar-data-migrator} steps in validation mode
+== Validation mode for {cstar-data-migrator}

To run your migration job with {cstar-data-migrator} in **data validation mode**, use class option `--class com.datastax.cdm.job.DiffData`.
Example:

@@ -64,7 +64,7 @@ To get the list of missing or mismatched records, grep for all `ERROR` entries i
Differences noted in the log file are listed by primary-key values.
====

-You can also run the {cstar-data-migrator} validation job in an **AutoCorrect** mode. This mode can:
+You can also run the {cstar-data-migrator} validation job in an **AutoCorrect** mode, which can:

* Add any missing records from Origin to Target.
* Update any mismatched records between Origin and Target; this action makes Target the same as Origin.

From 9161fcb39c272d9beff5a2bba4589e03526c4b42 Mon Sep 17 00:00:00 2001
From: beajohnson
Date: Tue, 11 Jun 2024 06:52:15 -0700
Subject: [PATCH 13/32] updates from pravin on main cassandra page

---
 modules/ROOT/pages/cassandra-data-migrator.adoc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/ROOT/pages/cassandra-data-migrator.adoc b/modules/ROOT/pages/cassandra-data-migrator.adoc
index 3f8cba33..13762354 100644
--- a/modules/ROOT/pages/cassandra-data-migrator.adoc
+++ b/modules/ROOT/pages/cassandra-data-migrator.adoc
@@ -133,8 +133,8 @@ spark.cdm.autocorrect.mismatch false|true

[IMPORTANT]
====
-The {cstar-data-migrator} validation job will never delete records from Target.
-The job only adds or updates data on Target.
+You may still have differences in your clusters after running a validation job because {cstar-data-migrator} does not delete records from Target that don't exist on Origin.
+The job only adds or updates records on Target that exist on Origin.
====

[[cdm--partition-ranges]]
@@ -161,7 +161,7 @@ Example:
----
./spark-submit --properties-file cdm.properties \
 --conf spark.cdm.schema.origin.keyspaceTable="." \
- --conf spark.cdm.tokenRange.partitionFile="//" \
+ --conf spark.cdm.tokenrange.partitionFile.input="//" \
 --master "local[*]" --driver-memory 25G --executor-memory 25G \
 --class com.datastax.cdm.job. cassandra-data-migrator-x.y.z.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
----
From 98347347023b047c95c5391975d688e6f486ed46 Mon Sep 17 00:00:00 2001
From: beajohnson
Date: Tue, 11 Jun 2024 11:19:37 -0700
Subject: [PATCH 14/32] checking grammar

---
 .../ROOT/pages/cassandra-data-migrator.adoc   | 558 +-----------------
 modules/ROOT/pages/cdm-parameters.adoc        |  71 +--
 modules/ROOT/pages/cdm-prereqs.adoc           |   4 +-
 modules/ROOT/pages/cdm-steps.adoc             |  10 +-
 4 files changed, 62 insertions(+), 581 deletions(-)

diff --git a/modules/ROOT/pages/cassandra-data-migrator.adoc b/modules/ROOT/pages/cassandra-data-migrator.adoc
index 13762354..a8da212d 100644
--- a/modules/ROOT/pages/cassandra-data-migrator.adoc
+++ b/modules/ROOT/pages/cassandra-data-migrator.adoc
@@ -61,7 +61,7 @@ The fat jar (`cassandra-data-migrator-x.y.z.jar`) file should be present now in
1. Configure for your environment the `cdm*.properties` file that's provided in the {cstar-data-migrator} https://github.com/datastax/cassandra-data-migrator/tree/main/src/resources[GitHub repo].
The file can have any name.
It does not need to be `cdm.properties` or `cdm-detailed.properties`.
-In both versions, only the parameters that aren't commented out will be processed by the `spark-submit` job.
+In both versions, only the parameters that aren't commented out are processed by the `spark-submit` job.
Other parameter values use defaults or are ignored.
See the descriptions and defaults in each file.
Refer to:
* The simplified sample properties configuration, https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm.properties[cdm.properties].
This file contains only those parameters that are commonly configured.
* The complete sample properties configuration, https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm-detailed.properties[cdm-detailed.properties], for the full set of configurable settings.

-2. 
Place the properties file that you elected to use and customize where it can be accessed while running the job via `spark-submit`.
+2. Place the properties file that you elected to use and customize where it can be accessed while running the job using `spark-submit`.

3. Run the job using `spark-submit` command:

@@ -101,7 +101,7 @@ Example:
--class com.datastax.cdm.job.DiffData cassandra-data-migrator-x.y.z.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
----

-The {cstar-data-migrator} validation job will report differences as `ERROR` entries in the log file.
+The {cstar-data-migrator} validation job reports differences as `ERROR` entries in the log file.
Example:

[source,bash]
@@ -120,7 +120,7 @@ Differences noted in the log file are listed by primary-key values.

You can also run the {cstar-data-migrator} validation job in an **AutoCorrect** mode. This mode can:

-* Add any missing records from Origin to Target.
+* Add any missing records from the Origin to Target cluster.
* Update any mismatched records between Origin and Target; this action makes Target the same as Origin.

To enable or disable this feature, use one or both of the following settings in your `*.properties` configuration file.
@@ -134,7 +134,7 @@ spark.cdm.autocorrect.mismatch false|true

[IMPORTANT]
====
You may still have differences in your clusters after running a validation job because {cstar-data-migrator} does not delete records from Target that don't exist on Origin.
-The job only adds or updates records on Target that exist on Origin.
+The job only adds or updates records on the Target cluster that exist on the Origin cluster.
====

[[cdm--partition-ranges]]
@@ -154,7 +154,7 @@ Example:

Each line in the CSV represents a partition-range (`min,max`).

-Alternatively, you can also pass the partition-file via a command-line parameter.
+Alternatively, you can also pass the partition-file with a command-line parameter.
Example:

[source,bash]
@@ -170,7 +170,7 @@ This mode is specifically useful to processes a subset of partition-ranges that

[NOTE]
====
-A file named `./._partitions.csv` is auto-generated by the migration & validation jobs, in the format shown above.
+A file named `./._partitions.csv` is auto-generated by the migration and validation jobs, in the format shown above.
The file contains any failed partition ranges.
No file is created if there were no failed partitions.
You can use the CSV as input to process any failed partition in a subsequent run.
@@ -196,535 +196,15 @@ Example: [[cdm-reference]] == {cstar-data-migrator} reference -* xref:#cdm-connection-params[Common connection parameters for Origin and Target] -* xref:#cdm-origin-schema-params[Origin schema parameters] -* xref:#cdm-target-schema-params[Target schema parameter] -* xref:#cdm-auto-correction-params[Auto-correction parameters] -* xref:#cdm-performance-operations-params[Performance and operations parameters] -* xref:#cdm-transformation-params[Transformation parameters] -* xref:#cdm-cassandra-filter-params[Cassandra filter parameters] -* xref:#cdm-java-filter-params[Java filter parameters] -* xref:#cdm-constant-column-feature-params[Constant column feature parameters] -* xref:#cdm-explode-map-feature-params[Explode map feature parameters] -* xref:#cdm-guardrail-feature-params[Guardrail feature parameters] -* xref:#cdm-tls-ssl-connection-params[TLS (SSL) connection parameters] - -[[cdm-connection-params]] -=== Common connection parameters for Origin and Target - -[cols="5,2,4"] -|=== -|Property | Default | Notes - -| `spark.cdm.connect.origin.host` -| `localhost` -| Hostname/IP address of the cluster. -May be a comma-separated list, and can follow the `:` convention. - -| `spark.cdm.connect.origin.port` -| `9042` -| Port number to use if not specified on `spark.cdm.connect.origin.host`. - -| `spark.cdm.connect.origin.scb` -| (Not set) -| Secure Connect Bundle, used to connect to an Astra DB database. -Example: `file:///aaa/bbb/scb-enterprise.zip`. - -| `spark.cdm.connect.origin.username` -| `cassandra` -| Username (or `client_id` value) used to authenticate. - -| `spark.cdm.connect.origin.password` -| `cassandra` -| Password (or `client_secret` value) used to authenticate. - -| `spark.cdm.connect.target.host` -| `localhost` -| Hostname/IP address of the cluster. -May be a comma-separated list, and can follow the `:` convention. - -| `spark.cdm.connect.target.port` -| `9042` -| Port number to use if not specified on `spark.cdm.connect.origin.host`. - -| `spark.cdm.connect.target.scb` -| (Not set) -| Secure Connect Bundle, used to connect to an Astra DB database. -Default is not set. -Example if set: `file:///aaa/bbb/my-scb.zip`. - -| `spark.cdm.connect.target.username` -| `cassandra` -| Username (or `client_id` value) used to authenticate. - -| `spark.cdm.connect.origin.password` -| `cassandra` -| Password (or `client_secret` value) used to authenticate. - -|=== - - -[[cdm-origin-schema-params]] -=== Origin schema parameters - -[cols="3,1,5a"] -|=== -|Property | Default | Notes - -| `spark.cdm.schema.origin.keyspaceTable` -| -| Required - the `.` of the table to be migrated. -Table must exist in Origin. - -| `spark.cdm.schema.origin.column.ttl.automatic` -| `true` -| Default is `true`, unless `spark.cdm.schema.origin.column.ttl.names` is specified. -When `true`, the Time To Live (TTL) of the Target record will be determined by finding the maximum TTL of all Origin columns that can have TTL set (which excludes partition key, clustering key, collections/UDT/tuple, and frozen columns). -When `false`, and `spark.cdm.schema.origin.column.ttl.names` is not set, the Target record will have the TTL determined by the Target table configuration. - -| `spark.cdm.schema.origin.column.ttl.names` -| -| Default is empty, meaning the names will be determined automatically if `spark.cdm.schema.origin.column.ttl.automatic` is set. -Specify a subset of eligible columns that are used to calculate the TTL of the Target record. 
- -| `spark.cdm.schema.origin.column.writetime.automatic` -| `true` -| Default is `true`, unless `spark.cdm.schema.origin.column.writetime.names` is specified. -When `true`, the `WRITETIME` of the Target record will be determined by finding the maximum `WRITETIME` of all Origin columns that can have `WRITETIME` set (which excludes partition key, clustering key, collections/UDT/tuple, and frozen columns). -When `false`, and `spark.cdm.schema.origin.column.writetime.names` is not set, the Target record will have the `WRITETIME` determined by the Target table configuration. -[NOTE] -==== -The `spark.cdm.transform.custom.writetime` property, if set, would override `spark.cdm.schema.origin.column.writetime`. -==== - -| `spark.cdm.schema.origin.column.writetime.names` -| -| Default is empty, meaning the names will be determined automatically if `spark.cdm.schema.origin.column.writetime.automatic` is set. -Otherwise, specify a subset of eligible columns that are used to calculate the WRITETIME of the Target record. -Example: `data_col1,data_col2,...` - -| `spark.cdm.schema.origin.column.names.to.target` -| -| Default is empty. -If column names are changed between Origin and Target, then this mapped list provides a mechanism to associate the two. -The format is `:`. -The list is comma-separated. -You only need to list renamed columns. - -|=== - -[NOTE] -==== -For optimization reasons, {cstar-data-migrator} does not migrate TTL and writetime at the field-level. -Instead, {cstar-data-migrator} finds the field with the highest TTL, and the field with the highest writetime within an Origin table row, and uses those values on the entire Target table row. -==== - -[[cdm-target-schema-params]] -=== Target schema parameter - -[cols="3,1,2"] -|=== -|Property | Default | Notes - -| `spark.cdm.schema.target.keyspaceTable` -| Equals the value of `spark.cdm.schema.origin.keyspaceTable` -| This parameter is commented out. -It's the `.` of the table to be migrated into the Target. -Table must exist in Target. - -|=== - - -[[cdm-auto-correction-params]] -=== Auto-correction parameters - -Auto-correction parameters allow {cstar-data-migrator} to correct data differences found between Origin and Target when you run the `DiffData` program. -Typically, these are run disabled (for "what if" migration testing), which will generate a list of data discrepancies. -The reasons for these discrepancies can then be investigated, and if necessary the parameters below can be enabled. - -For information about invoking `DiffData` in a {cstar-data-migrator} command, see xref:#cdm-validation-steps[{cstar-data-migrator} steps in validation mode] in this topic. - -[cols="2,2,3a"] -|=== -|Property | Default | Notes - -| `spark.cdm.autocorrect.missing` -| `false` -| When `true`, data that is missing in Target but is found in Origin will be re-migrated to Target. - -| `spark.cdm.autocorrect.mismatch` -| `false` -| When `true`, data that is different between Origin and Target will be reconciled. -[NOTE] -==== -The `TIMESTAMP` of records may have an effect. -If the `WRITETIME` of the Origin record (determined with `.writetime.names`) is earlier than the `WRITETIME` of the Target record, the change will not appear in Target. -This comparative state may be particularly challenging to troubleshoot if individual columns (cells) have been modified in Target. -==== - -| `spark.cdm.autocorrect.missing.counter` -| `false` -| Commented out. -By default, Counter tables are not copied when missing, unless explicitly set. 
- -| `spark.tokenrange.partitionFile` -| `./._partitions.csv` -| Commented out. -This CSV file is used as input, as well as output when applicable. -If the file exists, only the partition ranges in this file will be migrated or validated. -Similarly, if exceptions occur while migrating or validating, partition ranges with exceptions will be logged to this file. - -|=== - - -[[cdm-performance-operations-params]] -=== Performance and operations parameters - -Performance and operations parameters that can affect migration throughput, error handling, and similar concerns. - -[cols="4,1,3"] -|=== -|Property | Default | Notes - -| `spark.cdm.perfops.numParts` -| `10000` -| In standard operation, the full token range (-2^63 .. 2^63-1) is divided into a number of parts, which will be parallel-processed. -You should aim for each part to comprise a total of ≈1-10GB of data to migrate. -During initial testing, you may want this to be a small number (such as `1`). - -| `spark.cdm.perfops.batchSize` -| `5` -| When writing to Target, this comprises the number of records that will be put into an `UNLOGGED` batch. -{cstar-data-migrator} will tend to work on the same partition at a time. -Thus if your partition sizes are larger, this number may be increased. -If the `spark.cdm.perfops.batchSize` would mean that more than 1 partition is often contained in a batch, reduce this parameter's value. -Ideally < 1% of batches have more than 1 partition. - -| `spark.cdm.perfops.ratelimit.origin` -| `20000` -| Concurrent number of operations across all parallel threads from Origin. -This value may be adjusted up (or down), depending on the amount of data and the processing capacity of the Origin cluster. - -| `spark.cdm.perfops.ratelimit.target` -| `40000` -| Concurrent number of operations across all parallel threads from Target. -This may be adjusted up (or down), depending on the amount of data and the processing capacity of the Target cluster. - -| `spark.cdm.perfops.consistency.read` -| `LOCAL_QUORUM` -| Commented out. -Read consistency from Origin, and also from Target when records are read for comparison purposes. -The consistency parameters may be one of: `ANY`, `ONE`, `TWO`, `THREE`, `QUORUM`, `LOCAL_ONE`, `EACH_QUORUM`, `LOCAL_QUORUM`, `SERIAL`, `LOCAL_SERIAL`, `ALL`. - -| `spark.cdm.perfops.consistency.write` -| `LOCAL_QUORUM` -| Commented out. -Write consistency to Target. -The consistency parameters may be one of: `ANY`, `ONE`, `TWO`, `THREE`, `QUORUM`, `LOCAL_ONE`, `EACH_QUORUM`, `LOCAL_QUORUM`, `SERIAL`, `LOCAL_SERIAL`, `ALL`. - -| `spark.cdm.perfops.printStatsAfter` -| `100000` -| Commented out. -Number of rows of processing after which a progress log entry will be made. - -| `spark.cdm.perfops.fetchSizeInRows` -| `1000` -| Commented out. -This parameter affects the frequency of reads from Origin, and also the frequency of flushes to Target. - -| `spark.cdm.perfops.errorLimit` -| `0` -| Commented out. -Controls how many errors a thread may encounter during `MigrateData` and `DiffData` operations before failing. -Recommendation: set this parameter to a non-zero value **only when not doing** a mutation-type operation, such as when you're running `DiffData` without `.autocorrect`. - -|=== - - -[[cdm-transformation-params]] -=== Transformation parameters - -Parameters to perform schema transformations between Origin and Target. - -By default, these parameters are commented out. 
- -[cols="2,1,4a"] -|=== -|Property | Default | Notes - -| `spark.cdm.transform.missing.key.ts.replace.value` -| `1685577600000` -| Timestamp value in milliseconds. -Partition and clustering columns cannot have null values, but if these are added as part of a schema transformation between Origin and Target, it is possible that the Origin side is null. -In this case, the `Migrate` data operation would fail. -This parameter allows a crude constant value to be used in its place, separate from the Constant values feature. - -| `spark.cdm.transform.custom.writetime` -| `0` -| Default is 0 (disabled). -Timestamp value in microseconds to use as the `WRITETIME` for the Target record. -This is useful when the `WRITETIME` of the record in Origin cannot be determined (such as when the only non-key columns are collections). -This parameter allows a crude constant value to be used in its place, and overrides `spark.cdm.schema.origin.column.writetime.names`. - -| `spark.cdm.transform.custom.writetime.incrementBy` -| `0` -| Default is `0`. -This is useful when you have a List that is not frozen, and you are updating this via the autocorrect feature. -Lists are not idempotent, and subsequent UPSERTs would add duplicates to the list. - -| `spark.cdm.transform.codecs` -| -| Default is empty. -A comma-separated list of additional codecs to enable. - - * `INT_STRING` : int stored in a String. - * `DOUBLE_STRING` : double stored in a String. - * `BIGINT_STRING` : bigint stored in a String. - * `DECIMAL_STRING` : decimal stored in a String. - * `TIMESTAMP_STRING_MILLIS` : timestamp stored in a String, as Epoch milliseconds. - * `TIMESTAMP_STRING_FORMAT` : timestamp stored in a String, with a custom format. - -[NOTE] -==== -Where there are multiple type pair options, such as with `TIMESTAMP_STRING_*`, only one can be configured at a time with the `spark.cdm.transform.codecs` parameter. -==== - -| `spark.cdm.transform.codecs.timestamp.string.format` -| `yyyyMMddHHmmss` -| Configuration for `CQL_TIMESTAMP_TO_STRING_FORMAT` codec. -Default format is `yyyyMMddHHmmss`; `DateTimeFormatter.ofPattern(formatString)` - - -| `spark.cdm.transform.codecs.timestamp.string.zone` -| `UTC` -| Default is `UTC`. -Must be in `ZoneRulesProvider.getAvailableZoneIds()`. - -|=== - - -[[cdm-cassandra-filter-params]] -=== Cassandra filter parameters - -Cassandra filters are applied on the coordinator node. -Note that, depending on the filter, the coordinator node may need to do a lot more work than is normal, notably because {cstar-data-migrator} specifies `ALLOW FILTERING`. - -By default, these parameters are commented out. - -[cols="3,1,3"] -|=== -|Property | Default | Notes - -| `spark.cdm.filter.cassandra.partition.min` -| `-9223372036854775808` -| Default is `0` (when using `RandomPartitioner`) and `-9223372036854775808` (-2^63) otherwise. -Lower partition bound (inclusive). - -| `spark.cdm.filter.cassandra.partition.max` -| `9223372036854775807` -| Default is `2^127-1` (when using `RandomPartitioner`) and `9223372036854775807` (2^63-1) otherwise. -Upper partition bound (inclusive). - -| `spark.cdm.filter.cassandra.whereCondition` -| -| CQL added to the `WHERE` clause of `SELECT` statements from Origin. - -|=== - - -[[cdm-java-filter-params]] -=== Java filter parameters - -Java filters are applied on the client node. -Data must be pulled from the Origin cluster and then filtered. -However, this option may have a lower impact on the production cluster than xref:cdm-cassandra-filter-params[Cassandra filters]. 
-Java filters put load onto the {cstar-data-migrator} processing node, by sending more data from Cassandra. -Cassandra filters put load on the Cassandra nodes, notably because {cstar-data-migrator} specifies `ALLOW FILTERING`, which could cause the coordinator node to perform a lot more work. - -By default, these parameters are commented out. - -[cols="2,1,4"] -|=== -|Property | Default | Notes - -| `spark.cdm.filter.java.token.percent` -| `100` -| Percent (between 1 and 100) of the token in each Split that will be migrated. -This property is used to do a wide and random sampling of the data. -The percentage value is applied to each split. -Invalid percentages will be treated as 100. - -| `spark.cdm.filter.java.writetime.min` -| `0` -| The lowest (inclusive) writetime values to be migrated. -Using the `spark.cdm.filter.java.writetime.min` and `spark.cdm.filter.java.writetime.max` thresholds, {cstar-data-migrator} can filter records based on their writetimes. -The maximum writetime of the columns configured at `spark.cdm.schema.origin.column.writetime.names` will be compared to the `.min` and `.max` thresholds, which must be in **microseconds since the epoch**. -If the `spark.cdm.schema.origin.column.writetime.names` are not specified, or the thresholds are null or otherwise invalid, the filter will be ignored. -Note that `spark.cdm.s.perfops.batchSize` will be ignored when this filter is in place; a value of 1 will be used instead. - -| `spark.cdm.filter.java.writetime.max` -| `9223372036854775807` -| The highest (inclusive) writetime values to be migrated. -Maximum timestamp of the columns specified by `spark.cdm.schema.origin.column.writetime.names`; if that property is not specified, or is for some reason null, the filter is ignored. - -| `spark.cdm.filter.java.column.name` -| -| Filter rows based on matching a configured value. -With `spark.cdm.filter.java.column.name`, specify the column name against which the `spark.cdm.filter.java.column.value` is compared. -Must be on the column list specified at `spark.cdm.schema.origin.column.names`. -The column value will be converted to a String, trimmed of whitespace on both ends, and compared. - -| `spark.cdm.filter.java.column.value` -| -| String value to use as comparison. -Whitespace on the ends of `spark.cdm.filter.java.column.value` will be trimmed. -|=== - - -[[cdm-constant-column-feature-params]] -=== Constant column feature parameters - -The constant columns feature allows you to add constant columns to the target table. -If used, the `spark.cdm.feature.constantColumns.names`, `spark.cdm.feature.constantColumns.types`, and `spark.cdm.feature.constantColumns.values` lists must all be the same length. - -By default, these parameters are commented out. - -[cols="2,1,3"] -|=== -|Property | Default | Notes - -| `spark.cdm.feature.constantColumns.names` -| -| A comma-separated list of column names, such as `const1,const2`. - -| `spark.cdm.feature.constantColumns.type` -| -| A comma-separated list of column types. - -| `spark.cdm.feature.constantColumns.values` -| -| A comma-separated list of hard-coded values. -Each value should be provided as you would use on the `CQLSH` command line. -Examples: `'abcd'` for a string; `1234` for an int, and so on. - -| `spark.cdm.feature.constantColumns.splitRegex` -| `,` -| Defaults to comma, but can be any regex character that works with `String.split(regex)`; this option is needed because some type values contain commas, such as in lists, maps, and sets. 
- -|=== - - -[[cdm-explode-map-feature-params]] -=== Explode map feature parameters - -The explode map feature allows you convert an Origin table Map into multiple Target table records. - -By default, these parameters are commented out. - -[cols="3,3"] -|=== -|Property | Notes - -| `spark.cdm.feature.explodeMap.origin.name` -| The name of the map column, such as `my_map`. -Must be defined on `spark.cdm.schema.origin.column.names`, and the corresponding type on `spark.cdm.schema.origin.column.types` must be a map. - -| `spark.cdm.feature.explodeMap.origin.name.key` -| The name of the column on the Target table that will hold the map key, such as `my_map_key`. -This key must be present on the Target primary key `spark.cdm.schema.target.column.id.names`. - -| `spark.cdm.feature.explodeMap.origin.value` -| The name of the column on the Target table that will hold the map value, such as `my_map_value`. -|=== - - -[[cdm-guardrail-feature-params]] -=== Guardrail feature parameter - -The guardrail feature manages records that exceed guardrail checks. -The Guardrail job will generate a report; other jobs will skip records that exceed the guardrail limit. - -By default, these parameters are commented out. - -[cols="3,1,3"] -|=== -|Property | Default | Notes - -| `spark.cdm.feature.guardrail.colSizeInKB` -| `0` -| The `0` default means the guardrail check is not done. -If set, table records with one or more fields that exceed the column size in kB will be flagged. -Note this is kB (base 10), not kiB (base 2). - -|=== - - -[[cdm-tls-ssl-connection-params]] -=== TLS (SSL) connection parameters - -TLS (SSL) connection parameters, if configured, for Origin and Target. -Note that a secure connect bundle (SCB) embeds these details. - -By default, these parameters are commented out. - -[cols="3,3,3"] -|=== -|Property | Default | Notes - -| `spark.cdm.connect.origin.tls.enabled` -| `false` -| If TLS is used, set to `true`. - -| `spark.cdm.connect.origin.tls.trustStore.path` -| -| Path to the Java truststore file. - -| `spark.cdm.connect.origin.tls.trustStore.password` -| -| Password needed to open the truststore. - -| `spark.cdm.connect.origin.tls.trustStore.type` -| `JKS` -| - -| `spark.cdm.connect.origin.tls.keyStore.path` -| -| Path to the Java keystore file. - -| `spark.cdm.connect.origin.tls.keyStore.password` -| -| Password needed to open the keystore. - -| `spark.cdm.connect.origin.tls.enabledAlgorithms` -| `TLS_RSA_WITH_AES_128_CBC_SHA`,`TLS_RSA_WITH_AES_256_CBC_SHA` -| - -| `spark.cdm.connect.target.tls.enabled` -| `false` -| If TLS is used, set to `true`. - -| `spark.cdm.connect.target.tls.trustStore.path` -| -| Path to the Java truststore file. - -| `spark.cdm.connect.target.tls.trustStore.password` -| -| Password needed to open the truststore. - -| `spark.cdm.connect.target.tls.trustStore.type` -| `JKS` -| - -| `spark.cdm.connect.target.tls.keyStore.path` -| -| Path to the Java keystore file. - -| `spark.cdm.connect.target.tls.keyStore.password` -| -| Password needed to open the keystore. 
- -| `spark.cdm.connect.target.tls.enabledAlgorithms` -| `TLS_RSA_WITH_AES_128_CBC_SHA`,`TLS_RSA_WITH_AES_256_CBC_SHA` -| - -|=== \ No newline at end of file +* xref:cdm-parameters.adoc#cdm-connection-params[Common connection parameters for Origin and Target] +* xref:cdm-parameters.adoc#cdm-origin-schema-params[Origin schema parameters] +* xref:cdm-parameters.adoc#cdm-target-schema-params[Target schema parameter] +* xref:cdm-parameters.adoc#cdm-auto-correction-params[Auto-correction parameters] +* xref:cdm-parameters.adoc#cdm-performance-operations-params[Performance and operations parameters] +* xref:cdm-parameters.adoc#cdm-transformation-params[Transformation parameters] +* xref:cdm-parameters.adoc#cdm-cassandra-filter-params[Cassandra filter parameters] +* xref:cdm-parameters.adoc#cdm-java-filter-params[Java filter parameters] +* xref:cdm-parameters.adoc#cdm-constant-column-feature-params[Constant column feature parameters] +* xref:cdm-parameters.adoc#cdm-explode-map-feature-params[Explode map feature parameters] +* xref:cdm-parameters.adoc#cdm-guardrail-feature-params[Guardrail feature parameters] +* xref:cdm-parameters.adoc#cdm-tls-ssl-connection-params[TLS (SSL) connection parameters] diff --git a/modules/ROOT/pages/cdm-parameters.adoc b/modules/ROOT/pages/cdm-parameters.adoc index eb624252..27524b6e 100644 --- a/modules/ROOT/pages/cdm-parameters.adoc +++ b/modules/ROOT/pages/cdm-parameters.adoc @@ -12,14 +12,14 @@ Each parameter below offers a different connection. Review each option to determ | `spark.cdm.schema.origin.keyspaceTable` | | Required - the `.` of the table to be migrated. -Table must exist in Origin. +Table must exist in the Origin cluster. | `spark.cdm.schema.origin.column.ttl.automatic` | `true` | Default is `true`, unless `spark.cdm.schema.origin.column.ttl.names` is specified. When `true`, determine the Time To Live (TTL) of the Target record. Find the maximum TTL of all Origin columns that can have TTL set. This excludes partition key, clustering key, collections/UDT/tuple, and frozen columns. -When `false`, and `spark.cdm.schema.origin.column.ttl.names` is not set, the Target record has the TTL determined by the Target table configuration. +When `false`, and `spark.cdm.schema.origin.column.ttl.names` is not set, the Target record has the Target table configuration determine the TTL. | `spark.cdm.schema.origin.column.ttl.names` | @@ -47,7 +47,7 @@ Example: `data_col1,data_col2,...` | `spark.cdm.schema.origin.column.names.to.target` | | Default is empty. -If column names are changed between Origin and Target, then this mapped list provides a mechanism to associate the two. +If column names are changed between the Origin and Target clusters, then this mapped list provides a mechanism to associate the two. The format is `:`. The list is comma-separated. You only need to list renamed columns. @@ -57,7 +57,7 @@ You only need to list renamed columns. [NOTE] ==== For optimization reasons, {cstar-data-migrator} does not migrate TTL and writetime at the field level. -Instead, {cstar-data-migrator} finds the field with the highest TTL, and the field with the highest writetime within an Origin table row, and uses those values on the entire Target table row. +Instead, {cstar-data-migrator} finds the field with the highest TTL and the field with the highest writetime within an Origin table row, and uses those values on the entire Target table row. 
====

[[cdm-target-schema-params]]
@@ -71,7 +71,7 @@ Instead, {cstar-data-migrator} finds the field with the highest TTL, and the fie
| Equals the value of `spark.cdm.schema.origin.keyspaceTable`
| This parameter is commented out.
It's the `.` of the table to be migrated into the Target.
-Table must exist in Target.
+Table must exist in the Target cluster.

|===

@@ -79,11 +79,11 @@ Table must exist in Target.
[[cdm-auto-correction-params]]
== Auto-correction parameters

-Auto-correction parameters allow {cstar-data-migrator} to correct data differences found between Origin and Target when you run the `DiffData` program.
+Auto-correction parameters allow {cstar-data-migrator} to correct data differences found between the Origin and Target clusters when you run the `DiffData` program.
Typically, these are run disabled for "what if" migration testing, and generate a list of data discrepancies.
The reasons for these discrepancies can then be investigated, and if necessary the parameters below can be enabled.

-For information about invoking `DiffData` in a {cstar-data-migrator} command, see xref:#cdm-validation-steps[{cstar-data-migrator} steps in validation mode].
+For information about invoking `DiffData` in a {cstar-data-migrator} command, see xref:cassandra-data-migrator.adoc#cdm-validation-steps[{cstar-data-migrator} steps in validation mode].

[cols="2,2,3a"]
|===
@@ -91,16 +91,16 @@ For information about invoking `DiffData` in a {cstar-data-migrator} command, se

| `spark.cdm.autocorrect.missing`
| `false`
-| When `true`, data that is missing in Target but is found in Origin is re-migrated to Target.
+| When `true`, data that is missing in the Target cluster but is found in the Origin cluster is re-migrated to the Target cluster.

| `spark.cdm.autocorrect.mismatch`
| `false`
-| When `true`, data that is different between Origin and Target is reconciled.
+| When `true`, data that is different between the Origin and Target clusters is reconciled.
[NOTE]
====
The `TIMESTAMP` of records may have an effect.
-If the `WRITETIME` of the Origin record (determined with `.writetime.names`) is earlier than the `WRITETIME` of the Target record, the change does appear in Target.
-This comparative state may be particularly challenging to troubleshoot if individual columns (cells) have been modified in Target.
+If the `WRITETIME` of the Origin record (determined with `.writetime.names`) is earlier than the `WRITETIME` of the Target record, the change does not appear in the Target cluster.
+This comparative state may be particularly challenging to troubleshoot if individual columns (cells) have been modified in the Target cluster.
====

| `spark.cdm.autocorrect.missing.counter`
@@ -111,7 +111,7 @@ By default, counter tables are not copied when missing, unless explicitly set.
| `spark.tokenrange.partitionFile`
| `./._partitions.csv`
| Commented out.
-This CSV file is used as input, as well as output when applicable.
+This CSV file is used as input, as well as output, when applicable.
If the file exists, only the partition ranges in this file are migrated or validated.
Similarly, if exceptions occur while migrating or validating, partition ranges with exceptions are logged to this file.

@@ -129,9 +129,9 @@ Performance and operations parameters that can affect migration throughput, erro

| `spark.cdm.perfops.numParts`
| `10000`
-| In standard operation, the full token range (-2^63 .. 2^63-1) is divided into a number of parts, which are parallel-processed.
+| In standard operation, the full token range of -2^63 .. 
2^63-1 is divided into a number of parts, which are parallel-processed. You should aim for each part to comprise a total of ≈1-10GB of data to migrate. -During initial testing, you may want this to be a small number (such as `1`). +During initial testing, you may want this to be a small number, such as `1`. | `spark.cdm.perfops.batchSize` | `5` @@ -143,24 +143,24 @@ Ideally < 1% of batches have more than 1 partition. | `spark.cdm.perfops.ratelimit.origin` | `20000` -| Concurrent number of operations across all parallel threads from Origin. -This value may be adjusted up (or down), depending on the amount of data and the processing capacity of the Origin cluster. +| Concurrent number of operations across all parallel threads from the Origin cluster. +This value may be adjusted up or down, depending on the amount of data and the processing capacity of the Origin cluster. | `spark.cdm.perfops.ratelimit.target` | `40000` -| Concurrent number of operations across all parallel threads from Target. +| Concurrent number of operations across all parallel threads from the Target cluster. This may be adjusted up (or down), depending on the amount of data and the processing capacity of the Target cluster. | `spark.cdm.perfops.consistency.read` | `LOCAL_QUORUM` | Commented out. -Read consistency from Origin, and also from Target when records are read for comparison purposes. +Read consistency from Origin, and also from the Target cluster when records are read for comparison purposes. The consistency parameters may be one of: `ANY`, `ONE`, `TWO`, `THREE`, `QUORUM`, `LOCAL_ONE`, `EACH_QUORUM`, `LOCAL_QUORUM`, `SERIAL`, `LOCAL_SERIAL`, `ALL`. | `spark.cdm.perfops.consistency.write` | `LOCAL_QUORUM` | Commented out. -Write consistency to Target. +Write consistency to the Target cluster. The consistency parameters may be one of: `ANY`, `ONE`, `TWO`, `THREE`, `QUORUM`, `LOCAL_ONE`, `EACH_QUORUM`, `LOCAL_QUORUM`, `SERIAL`, `LOCAL_SERIAL`, `ALL`. | `spark.cdm.perfops.printStatsAfter` @@ -171,7 +171,7 @@ Number of rows of processing after which a progress log entry is made. | `spark.cdm.perfops.fetchSizeInRows` | `1000` | Commented out. -This parameter affects the frequency of reads from Origin and the frequency of flushes to Target. +This parameter affects the frequency of reads from the Origin cluster and the frequency of flushes to the Target cluster. | `spark.cdm.perfops.errorLimit` | `0` @@ -185,7 +185,7 @@ Recommendation: set this parameter to a non-zero value **only when not doing** a [[cdm-transformation-params]] == Transformation parameters -Parameters to perform schema transformations between Origin and Target. +Parameters to perform schema transformations between the Origin and Target clusters. By default, these parameters are commented out. @@ -197,7 +197,7 @@ By default, these parameters are commented out. | `1685577600000` | Timestamp value in milliseconds. Partition and clustering columns cannot have null values. -If they are added as part of a schema transformation between Origin and Target, it is possible that the Origin side is null. +If they are added as part of a schema transformation between the Origin and Target clusters, it is possible that the Origin side is null. In this case, the `Migrate` data operation fails. This parameter allows a crude constant value to be used in its place that is separate from the constant values feature. @@ -205,13 +205,13 @@ This parameter allows a crude constant value to be used in its place that is sep | `0` | Default is 0 (disabled). 
Timestamp value in microseconds to use as the `WRITETIME` for the Target record. -This is useful when the `WRITETIME` of the record in Origin cannot be determined (such as when the only non-key columns are collections). +This is useful when the `WRITETIME` of the record in the Origin cluster cannot be determined Such an example is when the only non-key columns are collections. This parameter allows a crude constant value to be used in its place, and overrides `spark.cdm.schema.origin.column.writetime.names`. | `spark.cdm.transform.custom.writetime.incrementBy` | `0` | Default is `0`. -This is useful when you have a list that is not frozen, and you are updating this using the autocorrect feature. +This is useful when you have a list that is not frozen and you are updating this using the autocorrect feature. Lists are not idempotent, and subsequent UPSERTs add duplicates to the list. | `spark.cdm.transform.codecs` @@ -259,17 +259,17 @@ By default, these parameters are commented out. | `spark.cdm.filter.cassandra.partition.min` | `-9223372036854775808` -| Default is `0` (when using `RandomPartitioner`) and `-9223372036854775808` (-2^63) otherwise. -Lower partition bound (inclusive). +| Default is `0` when using `RandomPartitioner` and `-9223372036854775808` or -2^63 otherwise. +Lower partition bound of the range is inclusive. | `spark.cdm.filter.cassandra.partition.max` | `9223372036854775807` -| Default is `2^127-1` (when using `RandomPartitioner`) and `9223372036854775807` (2^63-1) otherwise. -Upper partition bound (inclusive). +| Default is `2^127-1` when using `RandomPartitioner` and `9223372036854775807` or 2^63-1 otherwise. +Upper partition bound of the range is inclusive. | `spark.cdm.filter.cassandra.whereCondition` | -| CQL added to the `WHERE` clause of `SELECT` statements from Origin. +| CQL added to the `WHERE` clause of `SELECT` statements from the Origin cluster. |=== @@ -280,8 +280,9 @@ Upper partition bound (inclusive). Java filters are applied on the client node. Data must be pulled from the Origin cluster and then filtered. However, this option may have a lower impact on the production cluster than xref:cdm-cassandra-filter-params[Cassandra filters]. -Java filters put load onto the {cstar-data-migrator} processing node, by sending more data from Cassandra. -Cassandra filters put load on the Cassandra nodes, notably because {cstar-data-migrator} specifies `ALLOW FILTERING`, which could cause the coordinator node to perform a lot more work. +Java filters put a load onto the {cstar-data-migrator} processing node. +They send more data from Cassandra. +Cassandra filters put a load on the Cassandra nodes because {cstar-data-migrator} specifies `ALLOW FILTERING`, which could cause the coordinator node to perform a lot more work. By default, these parameters are commented out. @@ -291,7 +292,7 @@ By default, these parameters are commented out. | `spark.cdm.filter.java.token.percent` | `100` -| Percent (between 1 and 100) of the token in each Split that is migrated. +| Between 1 and 100 percent of the token in each Split that is migrated. This property is used to do a wide and random sampling of the data. The percentage value is applied to each split. Invalid percentages are treated as 100. @@ -361,7 +362,7 @@ This option is needed because some type values contain commas, such as in lists, [[cdm-explode-map-feature-params]] == Explode map feature parameters -The explode map feature allows you convert an Origin table Map into multiple Target table records. 
+The explode map feature allows you to convert an Origin table map into multiple Target table records.

By default, these parameters are commented out.

@@ -398,7 +399,7 @@ By default, these parameters are commented out.
| `0`
| The `0` default means the guardrail check is not done.
If set, table records with one or more fields that exceed the column size in kB are flagged.
-Note this is kB (base 10), not kiB (base 2).
+Note this is kB, which is base 10, not kiB, which is base 2.

|===

@@ -406,7 +407,7 @@
[[cdm-tls-ssl-connection-params]]
== TLS (SSL) connection parameters

-TLS (SSL) connection parameters, if configured, for Origin and Target.
+These are TLS (SSL) connection parameters, if configured, for the Origin and Target clusters.
Note that a secure connect bundle (SCB) embeds these details.

By default, these parameters are commented out.
diff --git a/modules/ROOT/pages/cdm-prereqs.adoc b/modules/ROOT/pages/cdm-prereqs.adoc
index 3a078368..f2845bba 100644
--- a/modules/ROOT/pages/cdm-prereqs.adoc
+++ b/modules/ROOT/pages/cdm-prereqs.adoc
@@ -33,7 +33,7 @@ image:https://img.shields.io/github/v/release/datastax/cassandra-data-migrator?c
[NOTE]
====
Version 4.x of {cstar-data-migrator} is not backward-compatible with `*.properties` files created in previous versions, and package names have changed.
-If you're starting new, we recommended that you use the latest released version.
+If you're starting new, DataStax recommends that you use the latest released version.
====

[[cdm-build-jar-local]]
@@ -52,4 +52,4 @@ cd cassandra-data-migrator
mvn clean package
----

-The fat jar (`cassandra-data-migrator-x.y.z.jar`) file should be present now in the `target` folder.
+The fat jar file, or `cassandra-data-migrator-x.y.z.jar`, should be present now in the `target` folder.
diff --git a/modules/ROOT/pages/cdm-steps.adoc b/modules/ROOT/pages/cdm-steps.adoc
index 7aa2891d..e25d2d18 100644
--- a/modules/ROOT/pages/cdm-steps.adoc
+++ b/modules/ROOT/pages/cdm-steps.adoc
@@ -1,6 +1,6 @@
= {cstar-data-migrator}

-Use {cstar-data-migrator} to migrate and validate tables between Origin and Target Cassandra clusters, with available logging and reconciliation support.
+Use {cstar-data-migrator} to migrate and validate tables between the Origin and Target Cassandra clusters, with available logging and reconciliation support.

== Use {cstar-data-migrator}

@@ -66,8 +66,8 @@ Differences noted in the log file are listed by primary-key values.

You can also run the {cstar-data-migrator} validation job in an **AutoCorrect** mode, which can:

-* Add any missing records from Origin to Target.
-* Update any mismatched records between Origin and Target; this action makes Target the same as Origin.
+* Add any missing records from the Origin to Target cluster.
+* Update any mismatched records between the Origin and Target clusters; this action makes the Target cluster the same as the Origin cluster.

To enable or disable this feature, use one or both of the following settings in your `*.properties` configuration file.

@@ -79,8 +79,8 @@ spark.cdm.autocorrect.mismatch false|true

[IMPORTANT]
====
-The {cstar-data-migrator} validation job never deletes records from Target.
-The job only adds or updates data on Target.
+The {cstar-data-migrator} validation job never deletes records from the Target cluster.
+The job only adds or updates data on the Target cluster.
==== [[cdm-guardrail-checks]] From 9ac17f510b3c60fd92f0a8569ae47ef1fed3136b Mon Sep 17 00:00:00 2001 From: beajohnson Date: Wed, 12 Jun 2024 08:50:49 -0700 Subject: [PATCH 15/32] moved content around --- modules/ROOT/nav.adoc | 2 +- .../ROOT/pages/cassandra-data-migrator.adoc | 159 +----------------- modules/ROOT/pages/cdm-parameters.adoc | 80 ++++----- modules/ROOT/pages/cdm-prereqs.adoc | 37 +--- modules/ROOT/pages/cdm-steps.adoc | 49 +++++- 5 files changed, 89 insertions(+), 238 deletions(-) diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index 6e99c264..7a17267d 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -25,7 +25,7 @@ //phase 5 * xref:connect-clients-to-target.adoc[] * Cassandra Data Migrator -** xref:cassandra-data-migrator.adoc[] +** xref:cassandra-data-migrator.adoc[Overview] ** xref:cdm-prereqs.adoc[Prerequisites] ** xref:cdm-steps.adoc[Migrate data] ** xref:cdm-parameters.adoc[Parameters] diff --git a/modules/ROOT/pages/cassandra-data-migrator.adoc b/modules/ROOT/pages/cassandra-data-migrator.adoc index a8da212d..0e3476e0 100644 --- a/modules/ROOT/pages/cassandra-data-migrator.adoc +++ b/modules/ROOT/pages/cassandra-data-migrator.adoc @@ -1,23 +1,7 @@ = {cstar-data-migrator} -Use {cstar-data-migrator} to migrate and validate tables between Origin and Target Cassandra clusters, with available logging and reconciliation support. +Use {cstar-data-migrator} to migrate and validate tables between origin and target Cassandra clusters, with available logging and reconciliation support. -[[cdm-prereqs]] -== {cstar-data-migrator} prerequisites - -* Install or switch to Java 11. -The Spark binaries are compiled with this version of Java. -* Install https://archive.apache.org/dist/spark/spark-3.5.1/[Spark 3.5.1] on a single VM (no cluster necessary) where you want to run this job. -* Optionally, install https://maven.apache.org/download.cgi[Maven] 3.9.x if you want to build the JAR for local development. - -You can install Apache Spark by running the following commands: - -[source,bash] ----- -wget https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3-scala2.13.tgz - -tar -xvzf spark-3.5.1-bin-hadoop3-scala2.13.tgz ----- [[cdm-install-as-container]] == Install {cstar-data-migrator} as a Container @@ -35,7 +19,7 @@ image:https://img.shields.io/github/v/release/datastax/cassandra-data-migrator?c [NOTE] ==== Version 4.x of {cstar-data-migrator} is not backward-compatible with `*.properties` files created in previous versions, and package names have changed. -If you're starting new, we recommended that you use the latest released version. +If you're starting new, use the latest released version if possible. ==== [[cdm-build-jar-local]] @@ -53,145 +37,8 @@ cd cassandra-data-migrator mvn clean package ---- -The fat jar (`cassandra-data-migrator-x.y.z.jar`) file should be present now in the `target` folder. - -[[cdm-steps]] -== {cstar-data-migrator} steps - -1. Configure for your environment the `cdm*.properties` file that's provided in the {cstar-data-migrator} https://github.com/datastax/cassandra-data-migrator/tree/main/src/resources[GitHub repo]. -The file can have any name. -It does not need to be `cdm.properties` or `cdm-detailed.properties`. -In both versions, only the parameters that aren't commented out are processed with the `spark-submit` job. -Other parameter values use defaults or are ignored. -See the descriptions and defaults in each file. 
-Refer to: - * The simplified sample properties configuration, https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm.properties[cdm.properties]. - This file contains only those parameters that are commonly configured. - * The complete sample properties configuration, https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm-detailed.properties[cdm-detailed.properties], for the full set of configurable settings. - -2. Place the properties file that you elected to use and customize where it can be accessed while running the job using `spark-submit`. - -3. Run the job using `spark-submit` command: - -[source,bash] ----- -./spark-submit --properties-file cdm.properties \ ---conf spark.cdm.schema.origin.keyspaceTable="." \ ---master "local[*]" --driver-memory 25G --executor-memory 25G \ ---class com.datastax.cdm.job.Migrate cassandra-data-migrator-x.y.z.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt ----- - -[TIP] -==== -* Above command generates a log file `logfile_name_*.txt` to avoid log output on the console. -* Update the memory options (driver & executor memory) based on your use-case -==== - -[[cdm-validation-steps]] -== {cstar-data-migrator} steps in validation mode - -To run your migration job with {cstar-data-migrator} in **data validation mode**, use class option `--class com.datastax.cdm.job.DiffData`. -Example: - -[source,bash] ----- -./spark-submit --properties-file cdm.properties \ ---conf spark.cdm.schema.origin.keyspaceTable="." \ ---master "local[*]" --driver-memory 25G --executor-memory 25G \ ---class com.datastax.cdm.job.DiffData cassandra-data-migrator-x.y.z.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt ----- - -The {cstar-data-migrator} validation job reports differences as `ERROR` entries in the log file. -Example: - -[source,bash] ----- -23/04/06 08:43:06 ERROR DiffJobSession: Mismatch row found for key: [key3] Mismatch: Target Index: 1 Origin: valueC Target: value999) -23/04/06 08:43:06 ERROR DiffJobSession: Corrected mismatch row in target: [key3] -23/04/06 08:43:06 ERROR DiffJobSession: Missing target row found for key: [key2] -23/04/06 08:43:06 ERROR DiffJobSession: Inserted missing row in target: [key2] ----- - -[TIP] -==== -To get the list of missing or mismatched records, grep for all `ERROR` entries in the log files. -Differences noted in the log file are listed by primary-key values. -==== +The fat jar, or`cassandra-data-migrator-x.y.z.jar`, file should be present now in the `target` folder. -You can also run the {cstar-data-migrator} validation job in an **AutoCorrect** mode. This mode can: - -* Add any missing records from the Origin to Target cluster. -* Update any mismatched records between Origin and Target; this action makes Target the same as Origin. - -To enable or disable this feature, use one or both of the following settings in your `*.properties` configuration file. - -[source,properties] ----- -spark.cdm.autocorrect.missing false|true -spark.cdm.autocorrect.mismatch false|true ----- - -[IMPORTANT] -==== -You may still have differences in your clusters after running validation job as {cstar-data-migrator} does not delete records from Target that don't exists on Origin. -The job only adds or updates records on the Target cluster that exists on the Origin cluster. -==== - -[[cdm--partition-ranges]] -== Migrating or validating specific partition ranges - -You can also use {cstar-data-migrator} to migrate or validate specific partition ranges, by using a **partition-file** with the name `./._partitions.csv`. 
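As a small illustration of the log-scanning step described above, assuming the `logfile_name_*.txt` naming produced by the `spark-submit` commands shown earlier:

[source,bash]
----
# List every difference the validation job reported
grep ERROR logfile_name_*.txt

# Show only mismatched rows, or only rows missing from the target table
grep "Mismatch row found" logfile_name_*.txt
grep "Missing target row found" logfile_name_*.txt
----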
-Use the following format in the CSV file, in the current folder as input. -Example: - -[source,csv] ----- --507900353496146534,-107285462027022883 --506781526266485690,1506166634797362039 -2637884402540451982,4638499294009575633 -798869613692279889,8699484505161403540 ----- - -Each line in the CSV represents a partition-range (`min,max`). - -Alternatively, you can also pass the partition-file with a command-line parameter. -Example: - -[source,bash] ----- -./spark-submit --properties-file cdm.properties \ - --conf spark.cdm.schema.origin.keyspaceTable="." \ - --conf spark.cdm.tokenrange.partitionFile.input="//" \ - --master "local[*]" --driver-memory 25G --executor-memory 25G \ - --class com.datastax.cdm.job. cassandra-data-migrator-x.y.z.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt ----- - -This mode is specifically useful to processes a subset of partition-ranges that may have failed during a previous run. - -[NOTE] -==== -A file named `./._partitions.csv` is auto-generated by the migration and validation jobs, in the format shown above. -The file contains any failed partition ranges. -No file is created if there were no failed partitions. -You can use the CSV as input to process any failed partition in a subsequent run. -==== - -[[cdm-guardrail-checks]] -== Perform large-field guardrail violation checks - -Use {cstar-data-migrator} to identify large fields from a table that may break your cluster guardrails. -For example, {astra_db} has a 10MB limit for a single large field. -Specify `--class com.datastax.cdm.job.GuardrailCheck` on the command. -Example: - -[source,bash] ----- -./spark-submit --properties-file cdm.properties \ ---conf spark.cdm.schema.origin.keyspaceTable="." \ ---conf spark.cdm.feature.guardrail.colSizeInKB=10000 \ ---master "local[*]" --driver-memory 25G --executor-memory 25G \ ---class com.datastax.cdm.job.GuardrailCheck cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt ----- [[cdm-reference]] == {cstar-data-migrator} reference diff --git a/modules/ROOT/pages/cdm-parameters.adoc b/modules/ROOT/pages/cdm-parameters.adoc index 27524b6e..a6668e5a 100644 --- a/modules/ROOT/pages/cdm-parameters.adoc +++ b/modules/ROOT/pages/cdm-parameters.adoc @@ -12,26 +12,26 @@ Each parameter below offers a different connection. Review each option to determ | `spark.cdm.schema.origin.keyspaceTable` | | Required - the `.` of the table to be migrated. -Table must exist in the Origin cluster. +Table must exist in the origin cluster. | `spark.cdm.schema.origin.column.ttl.automatic` | `true` | Default is `true`, unless `spark.cdm.schema.origin.column.ttl.names` is specified. -When `true`, determine the Time To Live (TTL) of the Target record. -Find the maximum TTL of all Origin columns that can have TTL set. This excludes partition key, clustering key, collections/UDT/tuple, and frozen columns. -When `false`, and `spark.cdm.schema.origin.column.ttl.names` is not set, the Target record has the Target table configuration determine the TTL. +When `true`, determine the Time To Live (TTL) of the target record. +Find the maximum TTL of all origin columns that can have TTL set. This excludes partition key, clustering key, collections/UDT/tuple, and frozen columns. +When `false`, and `spark.cdm.schema.origin.column.ttl.names` is not set, the target record has the target table configuration determine the TTL. 
| `spark.cdm.schema.origin.column.ttl.names` | | Default is empty, meaning the names are determined automatically if `spark.cdm.schema.origin.column.ttl.automatic` is set. -Specify a subset of eligible columns that are used to calculate the TTL of the Target record. +Specify a subset of eligible columns that are used to calculate the TTL of the target record. | `spark.cdm.schema.origin.column.writetime.automatic` | `true` | Default is `true`, unless `spark.cdm.schema.origin.column.writetime.names` is specified. -When `true`, determine the `WRITETIME` of the Target record. -Find the maximum `WRITETIME` of all Origin columns that can have `WRITETIME` set. This excludes partition key, clustering key, collections/UDT/tuple, and frozen columns. -When `false`, and `spark.cdm.schema.origin.column.writetime.names` is not set, the Target table configuration determines the Target record's `WRITETIME`. +When `true`, determine the `WRITETIME` of the target record. +Find the maximum `WRITETIME` of all origin columns that can have `WRITETIME` set. This excludes partition key, clustering key, collections/UDT/tuple, and frozen columns. +When `false`, and `spark.cdm.schema.origin.column.writetime.names` is not set, the target table configuration determines the target record's `WRITETIME`. [NOTE] ==== @@ -41,13 +41,13 @@ The `spark.cdm.transform.custom.writetime` property, if set, overrides `spark.cd | `spark.cdm.schema.origin.column.writetime.names` | | Default is empty, meaning the names are determined automatically if `spark.cdm.schema.origin.column.writetime.automatic` is set. -Otherwise, specify a subset of eligible columns that are used to calculate the WRITETIME of the Target record. +Otherwise, specify a subset of eligible columns that are used to calculate the WRITETIME of the target record. Example: `data_col1,data_col2,...` | `spark.cdm.schema.origin.column.names.to.target` | | Default is empty. -If column names are changed between the Origin and Target clusters, then this mapped list provides a mechanism to associate the two. +If column names are changed between the origin and target clusters, then this mapped list provides a mechanism to associate the two. The format is `:`. The list is comma-separated. You only need to list renamed columns. @@ -57,7 +57,7 @@ You only need to list renamed columns. [NOTE] ==== For optimization reasons, {cstar-data-migrator} does not migrate TTL and writetime at the field level. -Instead, {cstar-data-migrator} finds the field with the highest TTL and the field with the highest writetime within an Origin table row, and uses those values on the entire Target table row. +Instead, {cstar-data-migrator} finds the field with the highest TTL and the field with the highest writetime within an origin table row, and uses those values on the entire target table row. ==== [[cdm-target-schema-params]] @@ -70,8 +70,8 @@ Instead, {cstar-data-migrator} finds the field with the highest TTL and the fiel | `spark.cdm.schema.target.keyspaceTable` | Equals the value of `spark.cdm.schema.origin.keyspaceTable` | This parameter is commented out. -It's the `.` of the table to be migrated into the Target. -Table must exist in the Target cluster. +It's the `.` of the table to be migrated into the target. +Table must exist in the target cluster. |=== @@ -79,7 +79,7 @@ Table must exist in the Target cluster. 
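As a sketch of how these origin schema overrides look together in a `*.properties` file, with illustrative column names, and assuming each rename entry takes the form `origin_column:target_column` as implied by the format note above:

[source,properties]
----
# Compute the TTL and WRITETIME of each target row from specific origin columns
spark.cdm.schema.origin.column.ttl.names        data_col1,data_col2
spark.cdm.schema.origin.column.writetime.names  data_col1,data_col2

# Map renamed columns; only renamed columns need to be listed
spark.cdm.schema.origin.column.names.to.target  old_col:new_col,old_col2:new_col2
----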
[[cdm-auto-correction-params]] == Auto-correction parameters -Auto-correction parameters allow {cstar-data-migrator} to correct data differences found between the Origin and Target clusters when you run the `DiffData` program. +Auto-correction parameters allow {cstar-data-migrator} to correct data differences found between the origin and target clusters when you run the `DiffData` program. Typically, these are run-disabled for "what if" migration testing, and generate a list of data discrepancies. The reasons for these discrepancies can then be investigated, and if necessary the parameters below can be enabled. @@ -91,16 +91,16 @@ For information about invoking `DiffData` in a {cstar-data-migrator} command, se | `spark.cdm.autocorrect.missing` | `false` -| When `true`, data that is missing in the Target cluster but is found in the Origin cluster is re-migrated to the Target cluster. +| When `true`, data that is missing in the target cluster but is found in the origin cluster is re-migrated to the target cluster. | `spark.cdm.autocorrect.mismatch` | `false` -| When `true`, data that is different between the Origin and Target clusters is reconciled. +| When `true`, data that is different between the origin and target clusters is reconciled. [NOTE] ==== The `TIMESTAMP` of records may have an effect. -If the `WRITETIME` of the Origin record (determined with `.writetime.names`) is earlier than the `WRITETIME` of the target record, the change does appear in the Target cluster. -This comparative state may be particularly challenging to troubleshoot if individual columns (cells) have been modified in theTarget cluster. +If the `WRITETIME` of the origin record that is determined with `.writetime.names` is earlier than the `WRITETIME` of the target record, the change does appear in the target cluster. +This comparative state may be particularly challenging to troubleshoot if individual columns or cells have been modified in the target cluster. ==== | `spark.cdm.autocorrect.missing.counter` @@ -129,13 +129,13 @@ Performance and operations parameters that can affect migration throughput, erro | `spark.cdm.perfops.numParts` | `10000` -| In standard operation, the full token range of -2^63 .. 2^63-1 is divided into a number of parts, which are parallel-processed. +| In standard operation, the full token range of -2^63 to 2^63-1 is divided into a number of parts, which are parallel processed. You should aim for each part to comprise a total of ≈1-10GB of data to migrate. During initial testing, you may want this to be a small number, such as `1`. | `spark.cdm.perfops.batchSize` | `5` -| When writing to Target, this comprises the number of records that are put into an `UNLOGGED` batch. +| When writing to the target cluster, this comprises the number of records that are put into an `UNLOGGED` batch. {cstar-data-migrator} tends to work on the same partition at a time. If your partition sizes are larger, this number may be increased. If the `spark.cdm.perfops.batchSize` would mean that more than 1 partition is often contained in a batch, reduce this parameter's value. @@ -143,24 +143,24 @@ Ideally < 1% of batches have more than 1 partition. | `spark.cdm.perfops.ratelimit.origin` | `20000` -| Concurrent number of operations across all parallel threads from the Origin cluster. -This value may be adjusted up or down, depending on the amount of data and the processing capacity of the Origin cluster. +| Concurrent number of operations across all parallel threads from the origin cluster. 
+This value may be adjusted up or down, depending on the amount of data and the processing capacity of the origin cluster. | `spark.cdm.perfops.ratelimit.target` | `40000` -| Concurrent number of operations across all parallel threads from the Target cluster. -This may be adjusted up (or down), depending on the amount of data and the processing capacity of the Target cluster. +| Concurrent number of operations across all parallel threads from the target cluster. +This may be adjusted up or down, depending on the amount of data and the processing capacity of the target cluster. | `spark.cdm.perfops.consistency.read` | `LOCAL_QUORUM` | Commented out. -Read consistency from Origin, and also from the Target cluster when records are read for comparison purposes. +Read consistency from the origin cluster and from the target cluster when records are read for comparison purposes. The consistency parameters may be one of: `ANY`, `ONE`, `TWO`, `THREE`, `QUORUM`, `LOCAL_ONE`, `EACH_QUORUM`, `LOCAL_QUORUM`, `SERIAL`, `LOCAL_SERIAL`, `ALL`. | `spark.cdm.perfops.consistency.write` | `LOCAL_QUORUM` | Commented out. -Write consistency to the Target cluster. +Write consistency to the arget cluster. The consistency parameters may be one of: `ANY`, `ONE`, `TWO`, `THREE`, `QUORUM`, `LOCAL_ONE`, `EACH_QUORUM`, `LOCAL_QUORUM`, `SERIAL`, `LOCAL_SERIAL`, `ALL`. | `spark.cdm.perfops.printStatsAfter` @@ -171,7 +171,7 @@ Number of rows of processing after which a progress log entry is made. | `spark.cdm.perfops.fetchSizeInRows` | `1000` | Commented out. -This parameter affects the frequency of reads from the Origin cluster and the frequency of flushes to the Target cluster. +This parameter affects the frequency of reads from the origin cluster and the frequency of flushes to the target cluster. | `spark.cdm.perfops.errorLimit` | `0` @@ -185,7 +185,7 @@ Recommendation: set this parameter to a non-zero value **only when not doing** a [[cdm-transformation-params]] == Transformation parameters -Parameters to perform schema transformations between the Origin and Target clusters. +Parameters to perform schema transformations between the origin and target clusters. By default, these parameters are commented out. @@ -197,16 +197,16 @@ By default, these parameters are commented out. | `1685577600000` | Timestamp value in milliseconds. Partition and clustering columns cannot have null values. -If they are added as part of a schema transformation between the Origin and Target clusters, it is possible that the Origin side is null. +If they are added as part of a schema transformation between the origin and target clusters, it is possible that the origin side is null. In this case, the `Migrate` data operation fails. This parameter allows a crude constant value to be used in its place that is separate from the constant values feature. | `spark.cdm.transform.custom.writetime` | `0` | Default is 0 (disabled). -Timestamp value in microseconds to use as the `WRITETIME` for the Target record. -This is useful when the `WRITETIME` of the record in the Origin cluster cannot be determined Such an example is when the only non-key columns are collections. -This parameter allows a crude constant value to be used in its place, and overrides `spark.cdm.schema.origin.column.writetime.names`. +Timestamp value in microseconds to use as the `WRITETIME` for the target record. +This is useful when the `WRITETIME` of the record in the origin cluster cannot be determined. Such an example is when the only non-key columns are collections. 
+This parameter allows a crude constant value to be used in its place and overrides `spark.cdm.schema.origin.column.writetime.names`. | `spark.cdm.transform.custom.writetime.incrementBy` | `0` @@ -269,7 +269,7 @@ Upper partition bound of the range is inclusive. | `spark.cdm.filter.cassandra.whereCondition` | -| CQL added to the `WHERE` clause of `SELECT` statements from the Origin cluster. +| CQL added to the `WHERE` clause of `SELECT` statements from the origin cluster. |=== @@ -278,7 +278,7 @@ Upper partition bound of the range is inclusive. == Java filter parameters Java filters are applied on the client node. -Data must be pulled from the Origin cluster and then filtered. +Data must be pulled from the origin cluster and then filtered. However, this option may have a lower impact on the production cluster than xref:cdm-cassandra-filter-params[Cassandra filters]. Java filters put a load onto the {cstar-data-migrator} processing node. They send more data from Cassandra. @@ -292,7 +292,7 @@ By default, these parameters are commented out. | `spark.cdm.filter.java.token.percent` | `100` -| Between 1 and 100 percent of the token in each Split that is migrated. +| Between 1 and 100 percent of the token in each split that is migrated. This property is used to do a wide and random sampling of the data. The percentage value is applied to each split. Invalid percentages are treated as 100. @@ -362,7 +362,7 @@ This option is needed because some type values contain commas, such as in lists, [[cdm-explode-map-feature-params]] == Explode map feature parameters -The explode map feature allows you convert an Origin table map into multiple Target table records. +The explode map feature allows you convert an origin table map into multiple target table records. By default, these parameters are commented out. @@ -375,11 +375,11 @@ By default, these parameters are commented out. Must be defined on `spark.cdm.schema.origin.column.names`, and the corresponding type on `spark.cdm.schema.origin.column.types` must be a map. | `spark.cdm.feature.explodeMap.origin.name.key` -| The name of the column on the Target table that holds the map key, such as `my_map_key`. -This key must be present on the Target primary key `spark.cdm.schema.target.column.id.names`. +| The name of the column on the target table that holds the map key, such as `my_map_key`. +This key must be present on the target primary key `spark.cdm.schema.target.column.id.names`. | `spark.cdm.feature.explodeMap.origin.value` -| The name of the column on the Target table that holds the map value, such as `my_map_value`. +| The name of the column on the target table that holds the map value, such as `my_map_value`. |=== @@ -407,7 +407,7 @@ Note this is kB which is base 10, not kiB which is base 2. [[cdm-tls-ssl-connection-params]] == TLS (SSL) connection parameters -These are TLS (SSL) connection parameters, if configured, for the Origin and Target clusters. +These are TLS (SSL) connection parameters, if configured, for the origin and target clusters. Note that a secure connect bundle (SCB) embeds these details. By default, these parameters are commented out. diff --git a/modules/ROOT/pages/cdm-prereqs.adoc b/modules/ROOT/pages/cdm-prereqs.adoc index f2845bba..a548a70d 100644 --- a/modules/ROOT/pages/cdm-prereqs.adoc +++ b/modules/ROOT/pages/cdm-prereqs.adoc @@ -1,6 +1,6 @@ = {cstar-data-migrator} prerequisites -Read the prerequistes below before using the Cassandra Data Migrator. +Read the prerequisites below before using the Cassandra Data Migrator. 
* Install or switch to Java 11. The Spark binaries are compiled with this version of Java. @@ -17,39 +17,4 @@ wget https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3-s tar -xvzf spark-3.5.1-bin-hadoop3-scala2.13.tgz ---- -[[cdm-install-as-container]] -== Install {cstar-data-migrator} as a container -Get the latest image that includes all dependencies from https://hub.docker.com/r/datastax/cassandra-data-migrator[DockerHub]. - -All migration tools, `cassandra-data-migrator` + `dsbulk` + `cqlsh`, are available in the `/assets/` folder of the container. - -[[cdm-install-as-jar]] -== Install {cstar-data-migrator} as a JAR file - -Download the *latest* JAR file from the {cstar-data-migrator} https://github.com/datastax/cassandra-data-migrator/packages/1832128[GitHub repo]. -image:https://img.shields.io/github/v/release/datastax/cassandra-data-migrator?color=green[Latest release] - -[NOTE] -==== -Version 4.x of {cstar-data-migrator} is not backward-compatible with `*.properties` files created in previous versions, and package names have changed. -If you're starting new, DataStax recommendeds that you use the latest released version. -==== - -[[cdm-build-jar-local]] -== Build {cstar-data-migrator} JAR for local development (optional) - -Optionally, you can build the {cstar-data-migrator} JAR for local development. -You'll need https://maven.apache.org/download.cgi[Maven] 3.9.x. - -Example: - -[source,bash] ----- -cd ~/github -git clone git@github.com:datastax/cassandra-data-migrator.git -cd cassandra-data-migrator -mvn clean package ----- - -The fat jar file, or `cassandra-data-migrator-x.y.z.jar`, should be present now in the `target` folder. diff --git a/modules/ROOT/pages/cdm-steps.adoc b/modules/ROOT/pages/cdm-steps.adoc index e25d2d18..42313663 100644 --- a/modules/ROOT/pages/cdm-steps.adoc +++ b/modules/ROOT/pages/cdm-steps.adoc @@ -1,6 +1,6 @@ = {cstar-data-migrator} -Use {cstar-data-migrator} to migrate and validate tables between the Origin and Target Cassandra clusters, with available logging and reconciliation support. +Use {cstar-data-migrator} to migrate and validate tables between the origin and target Cassandra clusters, with available logging and reconciliation support. == Use {cstar-data-migrator} @@ -66,8 +66,8 @@ Differences noted in the log file are listed by primary-key values. You can also run the {cstar-data-migrator} validation job in an **AutoCorrect** mode, which can: -* Add any missing records from the Origin to Target cluster. -* Update any mismatched records between the Origin and Target clusters; this action makes the Target cluster the same as the Origin cluster. +* Add any missing records from the origin to target cluster. +* Update any mismatched records between the origin and target clusters; this action makes the target cluster the same as the origin cluster. To enable or disable this feature, use one or both of the following settings in your `*.properties` configuration file. @@ -79,8 +79,47 @@ spark.cdm.autocorrect.mismatch false|true [IMPORTANT] ==== -The {cstar-data-migrator} validation job nevers delete records from the Target cluster. -The job only adds or updates data on the Target cluster. +The {cstar-data-migrator} validation job nevers delete records from the target cluster. +The job only adds or updates data on the target cluster. 
+====
+
+[[cdm--partition-ranges]]
+== Migrating or validating specific partition ranges
+
+You can also use {cstar-data-migrator} to migrate or validate specific partition ranges, by using a **partition-file** with the name `./._partitions.csv`.
+Use the following format in the CSV file, in the current folder as input.
+Example:
+
+[source,csv]
+----
+-507900353496146534,-107285462027022883
+-506781526266485690,1506166634797362039
+2637884402540451982,4638499294009575633
+798869613692279889,8699484505161403540
+----
+
+Each line in the CSV represents a partition-range (`min,max`).
+
+Alternatively, you can also pass the partition-file with a command-line parameter.
+Example:
+
+[source,bash]
+----
+./spark-submit --properties-file cdm.properties \
+ --conf spark.cdm.schema.origin.keyspaceTable="." \
+ --conf spark.cdm.tokenrange.partitionFile.input="//" \
+ --master "local[*]" --driver-memory 25G --executor-memory 25G \
+ --class com.datastax.cdm.job. cassandra-data-migrator-x.y.z.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt
+----
+
+This mode is specifically useful to process a subset of partition-ranges that may have failed during a previous run.
+
+[NOTE]
+====
+A file named `./._partitions.csv` is autogenerated by the migration and validation jobs, in the format shown above.
+The file contains any failed partition ranges.
+No file is created if there were no failed partitions.
+You can use the CSV as input to process any failed partition in a subsequent run.
 ====
 
 [[cdm-guardrail-checks]]

From f474d44d33561f5f6b00bb609e02a575936cef31 Mon Sep 17 00:00:00 2001
From: beajohnson
Date: Wed, 12 Jun 2024 12:36:08 -0700
Subject: [PATCH 16/32] double check

---
 modules/ROOT/pages/cdm-parameters.adoc | 4 ++--
 modules/ROOT/pages/cdm-steps.adoc | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/modules/ROOT/pages/cdm-parameters.adoc b/modules/ROOT/pages/cdm-parameters.adoc
index a6668e5a..9f129085 100644
--- a/modules/ROOT/pages/cdm-parameters.adoc
+++ b/modules/ROOT/pages/cdm-parameters.adoc
@@ -49,7 +49,7 @@ Example: `data_col1,data_col2,...`
 | Default is empty.
 If column names are changed between the origin and target clusters, then this mapped list provides a mechanism to associate the two.
 The format is `:`.
-The list is comma-separated.
+The list is comma separated.
 You only need to list renamed columns.
 
 |===
@@ -83,7 +83,7 @@ Auto-correction parameters allow {cstar-data-migrator} to correct data differenc
 Typically, these are run-disabled for "what if" migration testing, and generate a list of data discrepancies.
 The reasons for these discrepancies can then be investigated, and if necessary the parameters below can be enabled.
 
-For information about invoking `DiffData` in a {cstar-data-migrator} command, see xref:#cassandra-data-migrator.adoc#cdm-validation-steps[{cstar-data-migrator} steps in validation mode].
+For information about invoking `DiffData` in a {cstar-data-migrator} command, see https://docs.datastax.com/en/data-migration/cdm.html#cdm-validation-steps[{cstar-data-migrator} steps in validation mode].
 
diff --git a/modules/ROOT/pages/cdm-steps.adoc b/modules/ROOT/pages/cdm-steps.adoc
index e25d2d18..42313663 100644
--- a/modules/ROOT/pages/cdm-steps.adoc
+++ b/modules/ROOT/pages/cdm-steps.adoc
@@ -10,7 +10,7 @@ It does not need to be `cdm.properties` or `cdm-detailed.properties`.
 In both versions, the `spark-submit` job processes only the parameters that aren't commented out.
 Other parameter values use defaults or are ignored.
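As a concrete sketch of that first configuration step, assuming the repository has been cloned as described in the build instructions, the commonly configured sample file can be copied and inspected before editing:

[source,bash]
----
# Start from the simplified sample and edit it for your clusters
cp cassandra-data-migrator/src/resources/cdm.properties ./cdm.properties

# Only uncommented parameters are processed by the spark-submit job
grep -v -e '^#' -e '^$' cdm.properties
----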
See the descriptions and defaults in each file. -For more information,see the following: +For more information, see the following: * The simplified sample properties configuration, https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm.properties[cdm.properties]. This file contains only those parameters that are commonly configured. * The complete sample properties configuration, https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm-detailed.properties[cdm-detailed.properties], for the full set of configurable settings. @@ -30,11 +30,11 @@ For more information,see the following: [TIP] ==== * The command generates a log file `logfile_name_*.txt` to prevent log output on the console. -* Update the memory options, driver & executor memory, based on your use-case +* Update the memory options, driver & executor memory, based on your use case. ==== [[cdm-validation-steps]] -== Validation mode for {cstar-data-migrator} +== {cstar-data-migrator} steps in validation mode To run your migration job with {cstar-data-migrator} in **data validation mode**, use class option `--class com.datastax.cdm.job.DiffData`. Example: @@ -116,7 +116,7 @@ This mode is specifically useful to processes a subset of partition-ranges that [NOTE] ==== -A file named `./._partitions.csv` is auto-generated by the migration and validation jobs, in the format shown above. +A file named `./._partitions.csv` is autogenerated by the migration and validation jobs, in the format shown above. The file contains any failed partition ranges. No file is created if there were no failed partitions. You can use the CSV as input to process any failed partition in a subsequent run. From 2e685cf59350e50c0126794a353a9d972e59d473 Mon Sep 17 00:00:00 2001 From: beajohnson Date: Wed, 12 Jun 2024 18:42:43 -0700 Subject: [PATCH 17/32] updated with partials --- modules/ROOT/pages/cdm-parameters.adoc | 432 +----------------- modules/ROOT/pages/cdm-steps.adoc | 126 +---- .../partials/auto-correction-parameters.adoc | 37 ++ .../partials/cassandra-filter-parameters.adoc | 24 + .../ROOT/partials/cdm-guardrail-checks.adoc | 13 + .../ROOT/partials/cdm-partition-ranges.adoc | 35 ++ .../ROOT/partials/cdm-validation-steps.adoc | 46 ++ .../constant-column-feature-parameters.adoc | 29 ++ .../explode-map-feature-parameters.adoc | 19 + .../guardrail-feature-parameters.adoc | 16 + .../ROOT/partials/java-filter-parameters.adoc | 46 ++ .../partials/origin-schema-parameters.adoc | 54 +++ ...performance-and-operations-parameters.adoc | 59 +++ .../partials/target-schema-parameters.adoc | 11 + .../partials/tls-connection-parameters.adoc | 66 +++ .../partials/transformation-parameters.adoc | 58 +++ modules/ROOT/partials/use-cdm-migrator.adoc | 28 ++ 17 files changed, 556 insertions(+), 543 deletions(-) create mode 100644 modules/ROOT/partials/auto-correction-parameters.adoc create mode 100644 modules/ROOT/partials/cassandra-filter-parameters.adoc create mode 100644 modules/ROOT/partials/cdm-guardrail-checks.adoc create mode 100644 modules/ROOT/partials/cdm-partition-ranges.adoc create mode 100644 modules/ROOT/partials/cdm-validation-steps.adoc create mode 100644 modules/ROOT/partials/constant-column-feature-parameters.adoc create mode 100644 modules/ROOT/partials/explode-map-feature-parameters.adoc create mode 100644 modules/ROOT/partials/guardrail-feature-parameters.adoc create mode 100644 modules/ROOT/partials/java-filter-parameters.adoc create mode 100644 modules/ROOT/partials/origin-schema-parameters.adoc create mode 
100644 modules/ROOT/partials/performance-and-operations-parameters.adoc create mode 100644 modules/ROOT/partials/target-schema-parameters.adoc create mode 100644 modules/ROOT/partials/tls-connection-parameters.adoc create mode 100644 modules/ROOT/partials/transformation-parameters.adoc create mode 100644 modules/ROOT/partials/use-cdm-migrator.adoc diff --git a/modules/ROOT/pages/cdm-parameters.adoc b/modules/ROOT/pages/cdm-parameters.adoc index 9f129085..de978e60 100644 --- a/modules/ROOT/pages/cdm-parameters.adoc +++ b/modules/ROOT/pages/cdm-parameters.adoc @@ -5,471 +5,61 @@ Each parameter below offers a different connection. Review each option to determ [[cdm-origin-schema-params]] == Origin schema parameters -[cols="3,1,5a"] -|=== -|Property | Default | Notes - -| `spark.cdm.schema.origin.keyspaceTable` -| -| Required - the `.` of the table to be migrated. -Table must exist in the origin cluster. - -| `spark.cdm.schema.origin.column.ttl.automatic` -| `true` -| Default is `true`, unless `spark.cdm.schema.origin.column.ttl.names` is specified. -When `true`, determine the Time To Live (TTL) of the target record. -Find the maximum TTL of all origin columns that can have TTL set. This excludes partition key, clustering key, collections/UDT/tuple, and frozen columns. -When `false`, and `spark.cdm.schema.origin.column.ttl.names` is not set, the target record has the target table configuration determine the TTL. - -| `spark.cdm.schema.origin.column.ttl.names` -| -| Default is empty, meaning the names are determined automatically if `spark.cdm.schema.origin.column.ttl.automatic` is set. -Specify a subset of eligible columns that are used to calculate the TTL of the target record. - -| `spark.cdm.schema.origin.column.writetime.automatic` -| `true` -| Default is `true`, unless `spark.cdm.schema.origin.column.writetime.names` is specified. -When `true`, determine the `WRITETIME` of the target record. -Find the maximum `WRITETIME` of all origin columns that can have `WRITETIME` set. This excludes partition key, clustering key, collections/UDT/tuple, and frozen columns. -When `false`, and `spark.cdm.schema.origin.column.writetime.names` is not set, the target table configuration determines the target record's `WRITETIME`. - -[NOTE] -==== -The `spark.cdm.transform.custom.writetime` property, if set, overrides `spark.cdm.schema.origin.column.writetime`. -==== - -| `spark.cdm.schema.origin.column.writetime.names` -| -| Default is empty, meaning the names are determined automatically if `spark.cdm.schema.origin.column.writetime.automatic` is set. -Otherwise, specify a subset of eligible columns that are used to calculate the WRITETIME of the target record. -Example: `data_col1,data_col2,...` - -| `spark.cdm.schema.origin.column.names.to.target` -| -| Default is empty. -If column names are changed between the origin and target clusters, then this mapped list provides a mechanism to associate the two. -The format is `:`. -The list is comma separated. -You only need to list renamed columns. - -|=== - -[NOTE] -==== -For optimization reasons, {cstar-data-migrator} does not migrate TTL and writetime at the field level. -Instead, {cstar-data-migrator} finds the field with the highest TTL and the field with the highest writetime within an origin table row, and uses those values on the entire target table row. 
-==== +include::partial$origin-schema-parameters.adoc[] [[cdm-target-schema-params]] == Target schema parameter -[cols="3,1,2"] -|=== -|Property | Default | Notes - -| `spark.cdm.schema.target.keyspaceTable` -| Equals the value of `spark.cdm.schema.origin.keyspaceTable` -| This parameter is commented out. -It's the `.` of the table to be migrated into the target. -Table must exist in the target cluster. - -|=== - +include::partial$target-schema-parameters.adoc[] [[cdm-auto-correction-params]] == Auto-correction parameters -Auto-correction parameters allow {cstar-data-migrator} to correct data differences found between the origin and target clusters when you run the `DiffData` program. -Typically, these are run-disabled for "what if" migration testing, and generate a list of data discrepancies. -The reasons for these discrepancies can then be investigated, and if necessary the parameters below can be enabled. - -For information about invoking `DiffData` in a {cstar-data-migrator} command, see https://docs.datastax.com/en/data-migration/cdm.html#cdm-validation-steps[{cstar-data-migrator} steps in validation mode]. - -[cols="2,2,3a"] -|=== -|Property | Default | Notes - -| `spark.cdm.autocorrect.missing` -| `false` -| When `true`, data that is missing in the target cluster but is found in the origin cluster is re-migrated to the target cluster. - -| `spark.cdm.autocorrect.mismatch` -| `false` -| When `true`, data that is different between the origin and target clusters is reconciled. -[NOTE] -==== -The `TIMESTAMP` of records may have an effect. -If the `WRITETIME` of the origin record that is determined with `.writetime.names` is earlier than the `WRITETIME` of the target record, the change does appear in the target cluster. -This comparative state may be particularly challenging to troubleshoot if individual columns or cells have been modified in the target cluster. -==== - -| `spark.cdm.autocorrect.missing.counter` -| `false` -| Commented out. -By default, counter tables are not copied when missing, unless explicitly set. - -| `spark.tokenrange.partitionFile` -| `./._partitions.csv` -| Commented out. -This CSV file is used as input, as well as output, when applicable. -If the file exists, only the partition ranges in this file are migrated or validated. -Similarly, if exceptions occur while migrating or validating, partition ranges with exceptions are logged to this file. - -|=== +include::partial$auto-correction-parameters.adoc[] [[cdm-performance-operations-params]] == Performance and operations parameters -Performance and operations parameters that can affect migration throughput, error handling, and similar concerns. - -[cols="4,1,3"] -|=== -|Property | Default | Notes - -| `spark.cdm.perfops.numParts` -| `10000` -| In standard operation, the full token range of -2^63 to 2^63-1 is divided into a number of parts, which are parallel processed. -You should aim for each part to comprise a total of ≈1-10GB of data to migrate. -During initial testing, you may want this to be a small number, such as `1`. - -| `spark.cdm.perfops.batchSize` -| `5` -| When writing to the target cluster, this comprises the number of records that are put into an `UNLOGGED` batch. -{cstar-data-migrator} tends to work on the same partition at a time. -If your partition sizes are larger, this number may be increased. -If the `spark.cdm.perfops.batchSize` would mean that more than 1 partition is often contained in a batch, reduce this parameter's value. -Ideally < 1% of batches have more than 1 partition. 
- -| `spark.cdm.perfops.ratelimit.origin` -| `20000` -| Concurrent number of operations across all parallel threads from the origin cluster. -This value may be adjusted up or down, depending on the amount of data and the processing capacity of the origin cluster. - -| `spark.cdm.perfops.ratelimit.target` -| `40000` -| Concurrent number of operations across all parallel threads from the target cluster. -This may be adjusted up or down, depending on the amount of data and the processing capacity of the target cluster. - -| `spark.cdm.perfops.consistency.read` -| `LOCAL_QUORUM` -| Commented out. -Read consistency from the origin cluster and from the target cluster when records are read for comparison purposes. -The consistency parameters may be one of: `ANY`, `ONE`, `TWO`, `THREE`, `QUORUM`, `LOCAL_ONE`, `EACH_QUORUM`, `LOCAL_QUORUM`, `SERIAL`, `LOCAL_SERIAL`, `ALL`. - -| `spark.cdm.perfops.consistency.write` -| `LOCAL_QUORUM` -| Commented out. -Write consistency to the arget cluster. -The consistency parameters may be one of: `ANY`, `ONE`, `TWO`, `THREE`, `QUORUM`, `LOCAL_ONE`, `EACH_QUORUM`, `LOCAL_QUORUM`, `SERIAL`, `LOCAL_SERIAL`, `ALL`. - -| `spark.cdm.perfops.printStatsAfter` -| `100000` -| Commented out. -Number of rows of processing after which a progress log entry is made. - -| `spark.cdm.perfops.fetchSizeInRows` -| `1000` -| Commented out. -This parameter affects the frequency of reads from the origin cluster and the frequency of flushes to the target cluster. - -| `spark.cdm.perfops.errorLimit` -| `0` -| Commented out. -Controls how many errors a thread may encounter during `MigrateData` and `DiffData` operations before failing. -Recommendation: set this parameter to a non-zero value **only when not doing** a mutation-type operation, such as when you're running `DiffData` without `.autocorrect`. - -|=== +include::partial$performance-and-operations-parameters.adoc[] [[cdm-transformation-params]] == Transformation parameters -Parameters to perform schema transformations between the origin and target clusters. - -By default, these parameters are commented out. - -[cols="2,1,4a"] -|=== -|Property | Default | Notes - -| `spark.cdm.transform.missing.key.ts.replace.value` -| `1685577600000` -| Timestamp value in milliseconds. -Partition and clustering columns cannot have null values. -If they are added as part of a schema transformation between the origin and target clusters, it is possible that the origin side is null. -In this case, the `Migrate` data operation fails. -This parameter allows a crude constant value to be used in its place that is separate from the constant values feature. - -| `spark.cdm.transform.custom.writetime` -| `0` -| Default is 0 (disabled). -Timestamp value in microseconds to use as the `WRITETIME` for the target record. -This is useful when the `WRITETIME` of the record in the origin cluster cannot be determined. Such an example is when the only non-key columns are collections. -This parameter allows a crude constant value to be used in its place and overrides `spark.cdm.schema.origin.column.writetime.names`. - -| `spark.cdm.transform.custom.writetime.incrementBy` -| `0` -| Default is `0`. -This is useful when you have a list that is not frozen and you are updating this using the autocorrect feature. -Lists are not idempotent, and subsequent UPSERTs add duplicates to the list. - -| `spark.cdm.transform.codecs` -| -| Default is empty. -A comma-separated list of additional codecs to enable. - - * `INT_STRING` : int stored in a string. 
- * `DOUBLE_STRING` : double stored in a string. - * `BIGINT_STRING` : bigint stored in a string. - * `DECIMAL_STRING` : decimal stored in a string. - * `TIMESTAMP_STRING_MILLIS` : timestamp stored in a string, as Epoch milliseconds. - * `TIMESTAMP_STRING_FORMAT` : timestamp stored in a string with a custom format. - -[NOTE] -==== -Where there are multiple type pair options, such as with `TIMESTAMP_STRING_*`, only one can be configured at a time with the `spark.cdm.transform.codecs` parameter. -==== - -| `spark.cdm.transform.codecs.timestamp.string.format` -| `yyyyMMddHHmmss` -| Configuration for `CQL_TIMESTAMP_TO_STRING_FORMAT` codec. -Default format is `yyyyMMddHHmmss`; `DateTimeFormatter.ofPattern(formatString)` - - -| `spark.cdm.transform.codecs.timestamp.string.zone` -| `UTC` -| Default is `UTC`. -Must be in `ZoneRulesProvider.getAvailableZoneIds()`. - -|=== +include::partial$transformation-parameters.adoc[] [[cdm-cassandra-filter-params]] == Cassandra filter parameters -Cassandra filters are applied on the coordinator node. -Depending on the filter, the coordinator node may need to do a lot more work than is normal, notably because {cstar-data-migrator} specifies `ALLOW FILTERING`. - -By default, these parameters are commented out. - -[cols="3,1,3"] -|=== -|Property | Default | Notes - -| `spark.cdm.filter.cassandra.partition.min` -| `-9223372036854775808` -| Default is `0` when using `RandomPartitioner` and `-9223372036854775808` or -2^63 otherwise. -Lower partition bound of the range is inclusive. - -| `spark.cdm.filter.cassandra.partition.max` -| `9223372036854775807` -| Default is `2^127-1` when using `RandomPartitioner` and `9223372036854775807` or 2^63-1 otherwise. -Upper partition bound of the range is inclusive. - -| `spark.cdm.filter.cassandra.whereCondition` -| -| CQL added to the `WHERE` clause of `SELECT` statements from the origin cluster. - -|=== +include::partial$cassandra-filter-parameters.adoc[] [[cdm-java-filter-params]] == Java filter parameters -Java filters are applied on the client node. -Data must be pulled from the origin cluster and then filtered. -However, this option may have a lower impact on the production cluster than xref:cdm-cassandra-filter-params[Cassandra filters]. -Java filters put a load onto the {cstar-data-migrator} processing node. -They send more data from Cassandra. -Cassandra filters put a load on the Cassandra nodes because {cstar-data-migrator} specifies `ALLOW FILTERING`, which could cause the coordinator node to perform a lot more work. - -By default, these parameters are commented out. - -[cols="2,1,4"] -|=== -|Property | Default | Notes - -| `spark.cdm.filter.java.token.percent` -| `100` -| Between 1 and 100 percent of the token in each split that is migrated. -This property is used to do a wide and random sampling of the data. -The percentage value is applied to each split. -Invalid percentages are treated as 100. - -| `spark.cdm.filter.java.writetime.min` -| `0` -| The lowest (inclusive) writetime values to be migrated. -Using the `spark.cdm.filter.java.writetime.min` and `spark.cdm.filter.java.writetime.max` thresholds, {cstar-data-migrator} can filter records based on their writetimes. -The maximum writetime of the columns configured at `spark.cdm.schema.origin.column.writetime.names` are compared to the `.min` and `.max` thresholds, which must be in **microseconds since the epoch**. -If the `spark.cdm.schema.origin.column.writetime.names` are not specified or the thresholds are null or otherwise invalid, the filter is ignored. 
-Note that `spark.cdm.s.perfops.batchSize` is ignored when this filter is in place; a value of 1 is used instead. - -| `spark.cdm.filter.java.writetime.max` -| `9223372036854775807` -| The highest (inclusive) writetime values to be migrated. -The `spark.cdm.schema.origin.column.writetime.names` specifies the maximum timestamp of the columns. -If that property is not specified or is for some reason null, the filter is ignored. - -| `spark.cdm.filter.java.column.name` -| -| Filter rows based on matching a configured value. -With `spark.cdm.filter.java.column.name`, specify the column name against which the `spark.cdm.filter.java.column.value` is compared. -Must be on the column list specified at `spark.cdm.schema.origin.column.names`. -The column value is converted to a string, trimmed of whitespace on both ends, and compared. - -| `spark.cdm.filter.java.column.value` -| -| String value to use as comparison. -The whitespace on the ends of `spark.cdm.filter.java.column.value` is trimmed. -|=== +include::partial$java-filter-parameters.adoc[] [[cdm-constant-column-feature-params]] == Constant column feature parameters -The constant columns feature allows you to add constant columns to the target table. -If used, the `spark.cdm.feature.constantColumns.names`, `spark.cdm.feature.constantColumns.types`, and `spark.cdm.feature.constantColumns.values` lists must all be the same length. - -By default, these parameters are commented out. - -[cols="2,1,3"] -|=== -|Property | Default | Notes - -| `spark.cdm.feature.constantColumns.names` -| -| A comma-separated list of column names, such as `const1,const2`. - -| `spark.cdm.feature.constantColumns.type` -| -| A comma-separated list of column types. - -| `spark.cdm.feature.constantColumns.values` -| -| A comma-separated list of hard-coded values. -Each value should be provided as you would use on the `CQLSH` command line. -Examples: `'abcd'` for a string; `1234` for an int, and so on. - -| `spark.cdm.feature.constantColumns.splitRegex` -| `,` -| Defaults to comma, but can be any regex character that works with `String.split(regex)`. -This option is needed because some type values contain commas, such as in lists, maps, and sets. - -|=== +include::partial$constant-column-feature-parameters.adoc[] [[cdm-explode-map-feature-params]] == Explode map feature parameters -The explode map feature allows you convert an origin table map into multiple target table records. - -By default, these parameters are commented out. - -[cols="3,3"] -|=== -|Property | Notes - -| `spark.cdm.feature.explodeMap.origin.name` -| The name of the map column, such as `my_map`. -Must be defined on `spark.cdm.schema.origin.column.names`, and the corresponding type on `spark.cdm.schema.origin.column.types` must be a map. - -| `spark.cdm.feature.explodeMap.origin.name.key` -| The name of the column on the target table that holds the map key, such as `my_map_key`. -This key must be present on the target primary key `spark.cdm.schema.target.column.id.names`. - -| `spark.cdm.feature.explodeMap.origin.value` -| The name of the column on the target table that holds the map value, such as `my_map_value`. -|=== +include::partial$explode-map-feature-parameters.adoc[] [[cdm-guardrail-feature-params]] == Guardrail feature parameter -The guardrail feature manages records that exceed guardrail checks. -The Guardrail job generates a report; other jobs skip records that exceed the guardrail limit. - -By default, these parameters are commented out. 
- -[cols="3,1,3"] -|=== -|Property | Default | Notes - -| `spark.cdm.feature.guardrail.colSizeInKB` -| `0` -| The `0` default means the guardrail check is not done. -If set, table records with one or more fields that exceed the column size in kB are flagged. -Note this is kB which is base 10, not kiB which is base 2. - -|=== - +include::partial$guardrail-feature-parameters.adoc[] [[cdm-tls-ssl-connection-params]] == TLS (SSL) connection parameters -These are TLS (SSL) connection parameters, if configured, for the origin and target clusters. -Note that a secure connect bundle (SCB) embeds these details. - -By default, these parameters are commented out. - -[cols="3,3,3"] -|=== -|Property | Default | Notes - -| `spark.cdm.connect.origin.tls.enabled` -| `false` -| If TLS is used, set to `true`. - -| `spark.cdm.connect.origin.tls.trustStore.path` -| -| Path to the Java truststore file. - -| `spark.cdm.connect.origin.tls.trustStore.password` -| -| Password needed to open the truststore. - -| `spark.cdm.connect.origin.tls.trustStore.type` -| `JKS` -| - -| `spark.cdm.connect.origin.tls.keyStore.path` -| -| Path to the Java keystore file. - -| `spark.cdm.connect.origin.tls.keyStore.password` -| -| Password needed to open the keystore. - -| `spark.cdm.connect.origin.tls.enabledAlgorithms` -| `TLS_RSA_WITH_AES_128_CBC_SHA`,`TLS_RSA_WITH_AES_256_CBC_SHA` -| - -| `spark.cdm.connect.target.tls.enabled` -| `false` -| If TLS is used, set to `true`. - -| `spark.cdm.connect.target.tls.trustStore.path` -| -| Path to the Java truststore file. - -| `spark.cdm.connect.target.tls.trustStore.password` -| -| Password needed to open the truststore. - -| `spark.cdm.connect.target.tls.trustStore.type` -| `JKS` -| - -| `spark.cdm.connect.target.tls.keyStore.path` -| -| Path to the Java keystore file. - -| `spark.cdm.connect.target.tls.keyStore.password` -| -| Password needed to open the keystore. - -| `spark.cdm.connect.target.tls.enabledAlgorithms` -| `TLS_RSA_WITH_AES_128_CBC_SHA`,`TLS_RSA_WITH_AES_256_CBC_SHA` -| - -|=== \ No newline at end of file +include::partial$tls-connection-parameters.adoc[] \ No newline at end of file diff --git a/modules/ROOT/pages/cdm-steps.adoc b/modules/ROOT/pages/cdm-steps.adoc index 5f518fcb..02ad1826 100644 --- a/modules/ROOT/pages/cdm-steps.adoc +++ b/modules/ROOT/pages/cdm-steps.adoc @@ -4,137 +4,19 @@ Use {cstar-data-migrator} to migrate and validate tables between the origin and == Use {cstar-data-migrator} -. Configure for your environment the `cdm*.properties` file that's provided in the {cstar-data-migrator} https://github.com/datastax/cassandra-data-migrator/tree/main/src/resources[GitHub repo]. -The file can have any name. -It does not need to be `cdm.properties` or `cdm-detailed.properties`. -In both versions, the `spark-submit` job processes only the parameters that aren't commented out. -Other parameter values use defaults or are ignored. -See the descriptions and defaults in each file. -For more information, see the following: - * The simplified sample properties configuration, https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm.properties[cdm.properties]. - This file contains only those parameters that are commonly configured. - * The complete sample properties configuration, https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm-detailed.properties[cdm-detailed.properties], for the full set of configurable settings. - -. 
Place the properties file that you elected to use and customize where it can be accessed while running the job using `spark-submit`. - -. Run the job using `spark-submit` command: - -[source,bash] ----- -./spark-submit --properties-file cdm.properties \ ---conf spark.cdm.schema.origin.keyspaceTable="." \ ---master "local[*]" --driver-memory 25G --executor-memory 25G \ ---class com.datastax.cdm.job.Migrate cassandra-data-migrator-x.y.z.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt ----- - -[TIP] -==== -* The command generates a log file `logfile_name_*.txt` to prevent log output on the console. -* Update the memory options, driver & executor memory, based on your use case. -==== +include::partial$use-cdm-migrator.adoc[] [[cdm-validation-steps]] == {cstar-data-migrator} steps in validation mode -To run your migration job with {cstar-data-migrator} in **data validation mode**, use class option `--class com.datastax.cdm.job.DiffData`. -Example: - -[source,bash] ----- -./spark-submit --properties-file cdm.properties \ ---conf spark.cdm.schema.origin.keyspaceTable="." \ ---master "local[*]" --driver-memory 25G --executor-memory 25G \ ---class com.datastax.cdm.job.DiffData cassandra-data-migrator-x.y.z.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt ----- - -The {cstar-data-migrator} validation job reports differences as `ERROR` entries in the log file. -Example: - -[source,bash] ----- -23/04/06 08:43:06 ERROR DiffJobSession: Mismatch row found for key: [key3] Mismatch: Target Index: 1 Origin: valueC Target: value999) -23/04/06 08:43:06 ERROR DiffJobSession: Corrected mismatch row in target: [key3] -23/04/06 08:43:06 ERROR DiffJobSession: Missing target row found for key: [key2] -23/04/06 08:43:06 ERROR DiffJobSession: Inserted missing row in target: [key2] ----- - -[TIP] -==== -To get the list of missing or mismatched records, grep for all `ERROR` entries in the log files. -Differences noted in the log file are listed by primary-key values. -==== - -You can also run the {cstar-data-migrator} validation job in an **AutoCorrect** mode, which can: - -* Add any missing records from the origin to target cluster. -* Update any mismatched records between the origin and target clusters; this action makes the target cluster the same as the origin cluster. - -To enable or disable this feature, use one or both of the following settings in your `*.properties` configuration file. - -[source,properties] ----- -spark.cdm.autocorrect.missing false|true -spark.cdm.autocorrect.mismatch false|true ----- - -[IMPORTANT] -==== -The {cstar-data-migrator} validation job nevers delete records from the target cluster. -The job only adds or updates data on the target cluster. -==== +include::partial$cdm-validation-steps.adoc[] [[cdm--partition-ranges]] == Migrating or validating specific partition ranges -You can also use {cstar-data-migrator} to migrate or validate specific partition ranges, by using a **partition-file** with the name `./._partitions.csv`. -Use the following format in the CSV file, in the current folder as input. -Example: - -[source,csv] ----- --507900353496146534,-107285462027022883 --506781526266485690,1506166634797362039 -2637884402540451982,4638499294009575633 -798869613692279889,8699484505161403540 ----- - -Each line in the CSV represents a partition-range (`min,max`). - -Alternatively, you can also pass the partition-file with a command-line parameter. -Example: - -[source,bash] ----- -./spark-submit --properties-file cdm.properties \ - --conf spark.cdm.schema.origin.keyspaceTable="." 
\ - --conf spark.cdm.tokenrange.partitionFile.input="//" \ - --master "local[*]" --driver-memory 25G --executor-memory 25G \ - --class com.datastax.cdm.job. cassandra-data-migrator-x.y.z.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt ----- - -This mode is specifically useful to processes a subset of partition-ranges that may have failed during a previous run. - -[NOTE] -==== -A file named `./._partitions.csv` is autogenerated by the migration and validation jobs, in the format shown above. -The file contains any failed partition ranges. -No file is created if there were no failed partitions. -You can use the CSV as input to process any failed partition in a subsequent run. -==== +include::partial$cdm-partition-ranges.adoc[] [[cdm-guardrail-checks]] == Perform large-field guardrail violation checks -Use {cstar-data-migrator} to identify large fields from a table that may break your https://docs.datastax.com/en/astra-db-serverless/cql/cassandra-guardrails.html[cluster guardrails]. -For example, {astra_db} has a 10MB limit for a single large field. -Specify `--class com.datastax.cdm.job.GuardrailCheck` on the command. -Example: - -[source,bash] ----- -./spark-submit --properties-file cdm.properties \ ---conf spark.cdm.schema.origin.keyspaceTable="." \ ---conf spark.cdm.feature.guardrail.colSizeInKB=10000 \ ---master "local[*]" --driver-memory 25G --executor-memory 25G \ ---class com.datastax.cdm.job.GuardrailCheck cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt ----- +include::partial$cdm-guardrail-checks.adoc[] diff --git a/modules/ROOT/partials/auto-correction-parameters.adoc b/modules/ROOT/partials/auto-correction-parameters.adoc new file mode 100644 index 00000000..f7b7a68e --- /dev/null +++ b/modules/ROOT/partials/auto-correction-parameters.adoc @@ -0,0 +1,37 @@ +Auto-correction parameters allow {cstar-data-migrator} to correct data differences found between the origin and target clusters when you run the `DiffData` program. +Typically, these parameters are run-disabled for "what if" migration testing, and generate a list of data discrepancies. +The reasons for these discrepancies can then be investigated, and if necessary the parameters below can be enabled. + +For information about invoking `DiffData` in a {cstar-data-migrator} command, see https://docs.datastax.com/en/data-migration/cdm.html#cdm-validation-steps[{cstar-data-migrator} steps in validation mode]. + +[cols="2,2,3a"] +|=== +|Property | Default | Notes + +| `spark.cdm.autocorrect.missing` +| `false` +| When `true`, data that is missing in the target cluster but is found in the origin cluster is re-migrated to the target cluster. + +| `spark.cdm.autocorrect.mismatch` +| `false` +| When `true`, data that is different between the origin and target clusters is reconciled. +[NOTE] +==== +The `TIMESTAMP` of records may have an effect. +If the `WRITETIME` of the origin record that is determined with `.writetime.names` is earlier than the `WRITETIME` of the target record, the change does appear in the target cluster. +This comparative state may be particularly challenging to troubleshoot if individual columns or cells have been modified in the target cluster. +==== + +| `spark.cdm.autocorrect.missing.counter` +| `false` +| Commented out. +By default, counter tables are not copied when missing, unless explicitly set. + +| `spark.tokenrange.partitionFile` +| `./._partitions.csv` +| Commented out. +This CSV file is used as input, as well as output, when applicable. 
+If the file exists, only the partition ranges in this file are migrated or validated. +Similarly, if exceptions occur while migrating or validating, partition ranges with exceptions are logged to this file. + +|=== \ No newline at end of file diff --git a/modules/ROOT/partials/cassandra-filter-parameters.adoc b/modules/ROOT/partials/cassandra-filter-parameters.adoc new file mode 100644 index 00000000..f9c46c88 --- /dev/null +++ b/modules/ROOT/partials/cassandra-filter-parameters.adoc @@ -0,0 +1,24 @@ +Cassandra filters are applied on the coordinator node. +Depending on the filter, the coordinator node may need to do a lot more work than is normal, notably because {cstar-data-migrator} specifies `ALLOW FILTERING`. + +By default, these parameters are commented out. + +[cols="3,1,3"] +|=== +|Property | Default | Notes + +| `spark.cdm.filter.cassandra.partition.min` +| `-9223372036854775808` +| Default is `0` when using `RandomPartitioner` and `-9223372036854775808` or -2^63 otherwise. +Lower partition bound of the range is inclusive. + +| `spark.cdm.filter.cassandra.partition.max` +| `9223372036854775807` +| Default is `2^127-1` when using `RandomPartitioner` and `9223372036854775807` or 2^63-1 otherwise. +Upper partition bound of the range is inclusive. + +| `spark.cdm.filter.cassandra.whereCondition` +| +| CQL added to the `WHERE` clause of `SELECT` statements from the origin cluster. + +|=== \ No newline at end of file diff --git a/modules/ROOT/partials/cdm-guardrail-checks.adoc b/modules/ROOT/partials/cdm-guardrail-checks.adoc new file mode 100644 index 00000000..b83d372b --- /dev/null +++ b/modules/ROOT/partials/cdm-guardrail-checks.adoc @@ -0,0 +1,13 @@ +Use {cstar-data-migrator} to identify large fields from a table that may break your https://docs.datastax.com/en/astra-db-serverless/cql/cassandra-guardrails.html[cluster guardrails]. +For example, {astra_db} has a 10MB limit for a single large field. +Specify `--class com.datastax.cdm.job.GuardrailCheck` on the command. +Example: + +[source,bash] +---- +./spark-submit --properties-file cdm.properties \ +--conf spark.cdm.schema.origin.keyspaceTable="." \ +--conf spark.cdm.feature.guardrail.colSizeInKB=10000 \ +--master "local[*]" --driver-memory 25G --executor-memory 25G \ +--class com.datastax.cdm.job.GuardrailCheck cassandra-data-migrator-4.x.x.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt +---- diff --git a/modules/ROOT/partials/cdm-partition-ranges.adoc b/modules/ROOT/partials/cdm-partition-ranges.adoc new file mode 100644 index 00000000..b86d4e98 --- /dev/null +++ b/modules/ROOT/partials/cdm-partition-ranges.adoc @@ -0,0 +1,35 @@ +You can also use {cstar-data-migrator} to migrate or validate specific partition ranges, by using a **partition-file** with the name `./._partitions.csv`. +Use the following format in the CSV file, in the current folder as input. +Example: + +[source,csv] +---- +-507900353496146534,-107285462027022883 +-506781526266485690,1506166634797362039 +2637884402540451982,4638499294009575633 +798869613692279889,8699484505161403540 +---- + +Each line in the CSV represents a partition-range (`min,max`). + +Alternatively, you can also pass the partition-file with a command-line parameter. +Example: + +[source,bash] +---- +./spark-submit --properties-file cdm.properties \ + --conf spark.cdm.schema.origin.keyspaceTable="." \ + --conf spark.cdm.tokenrange.partitionFile.input="//" \ + --master "local[*]" --driver-memory 25G --executor-memory 25G \ + --class com.datastax.cdm.job. 
cassandra-data-migrator-x.y.z.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt +---- + +This mode is specifically useful to processes a subset of partition-ranges that may have failed during a previous run. + +[NOTE] +==== +A file named `./._partitions.csv` is autogenerated by the migration and validation jobs, in the format shown above. +The file contains any failed partition ranges. +No file is created if there were no failed partitions. +You can use the CSV as input to process any failed partition in a subsequent run. +==== \ No newline at end of file diff --git a/modules/ROOT/partials/cdm-validation-steps.adoc b/modules/ROOT/partials/cdm-validation-steps.adoc new file mode 100644 index 00000000..46e05376 --- /dev/null +++ b/modules/ROOT/partials/cdm-validation-steps.adoc @@ -0,0 +1,46 @@ +To run your migration job with {cstar-data-migrator} in **data validation mode**, use class option `--class com.datastax.cdm.job.DiffData`. +Example: + +[source,bash] +---- +./spark-submit --properties-file cdm.properties \ +--conf spark.cdm.schema.origin.keyspaceTable="." \ +--master "local[*]" --driver-memory 25G --executor-memory 25G \ +--class com.datastax.cdm.job.DiffData cassandra-data-migrator-x.y.z.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt +---- + +The {cstar-data-migrator} validation job reports differences as `ERROR` entries in the log file. +Example: + +[source,bash] +---- +23/04/06 08:43:06 ERROR DiffJobSession: Mismatch row found for key: [key3] Mismatch: Target Index: 1 Origin: valueC Target: value999) +23/04/06 08:43:06 ERROR DiffJobSession: Corrected mismatch row in target: [key3] +23/04/06 08:43:06 ERROR DiffJobSession: Missing target row found for key: [key2] +23/04/06 08:43:06 ERROR DiffJobSession: Inserted missing row in target: [key2] +---- + +[TIP] +==== +To get the list of missing or mismatched records, grep for all `ERROR` entries in the log files. +Differences noted in the log file are listed by primary-key values. +==== + +You can also run the {cstar-data-migrator} validation job in an **AutoCorrect** mode, which can: + +* Add any missing records from the origin to target cluster. +* Update any mismatched records between the origin and target clusters; this action makes the target cluster the same as the origin cluster. + +To enable or disable this feature, use one or both of the following settings in your `*.properties` configuration file. + +[source,properties] +---- +spark.cdm.autocorrect.missing false|true +spark.cdm.autocorrect.mismatch false|true +---- + +[IMPORTANT] +==== +The {cstar-data-migrator} validation job nevers delete records from the target cluster. +The job only adds or updates data on the target cluster. +==== \ No newline at end of file diff --git a/modules/ROOT/partials/constant-column-feature-parameters.adoc b/modules/ROOT/partials/constant-column-feature-parameters.adoc new file mode 100644 index 00000000..3098cc92 --- /dev/null +++ b/modules/ROOT/partials/constant-column-feature-parameters.adoc @@ -0,0 +1,29 @@ +The constant columns feature allows you to add constant columns to the target table. +If used, the `spark.cdm.feature.constantColumns.names`, `spark.cdm.feature.constantColumns.types`, and `spark.cdm.feature.constantColumns.values` lists must all be the same length. + +By default, these parameters are commented out. + +[cols="2,1,3"] +|=== +|Property | Default | Notes + +| `spark.cdm.feature.constantColumns.names` +| +| A comma-separated list of column names, such as `const1,const2`. 
+ +| `spark.cdm.feature.constantColumns.type` +| +| A comma-separated list of column types. + +| `spark.cdm.feature.constantColumns.values` +| +| A comma-separated list of hard-coded values. +Each value should be provided as you would use on the `CQLSH` command line. +Examples: `'abcd'` for a string; `1234` for an int, and so on. + +| `spark.cdm.feature.constantColumns.splitRegex` +| `,` +| Defaults to comma, but can be any regex character that works with `String.split(regex)`. +This option is needed because some data values contain commas, such as in lists, maps, and sets. + +|=== \ No newline at end of file diff --git a/modules/ROOT/partials/explode-map-feature-parameters.adoc b/modules/ROOT/partials/explode-map-feature-parameters.adoc new file mode 100644 index 00000000..f88880f0 --- /dev/null +++ b/modules/ROOT/partials/explode-map-feature-parameters.adoc @@ -0,0 +1,19 @@ +The explode map feature allows you convert an origin table map into multiple target table records. + +By default, these parameters are commented out. + +[cols="3,3"] +|=== +|Property | Notes + +| `spark.cdm.feature.explodeMap.origin.name` +| The name of the map column, such as `my_map`. +Must be defined on `spark.cdm.schema.origin.column.names`, and the corresponding type on `spark.cdm.schema.origin.column.types` must be a map. + +| `spark.cdm.feature.explodeMap.origin.name.key` +| The name of the column on the target table that holds the map key, such as `my_map_key`. +This key must be present on the target primary key `spark.cdm.schema.target.column.id.names`. + +| `spark.cdm.feature.explodeMap.origin.value` +| The name of the column on the target table that holds the map value, such as `my_map_value`. +|=== \ No newline at end of file diff --git a/modules/ROOT/partials/guardrail-feature-parameters.adoc b/modules/ROOT/partials/guardrail-feature-parameters.adoc new file mode 100644 index 00000000..2773c0d9 --- /dev/null +++ b/modules/ROOT/partials/guardrail-feature-parameters.adoc @@ -0,0 +1,16 @@ +The guardrail feature manages records that exceed guardrail checks. +The Guardrail job generates a report; other jobs skip records that exceed the guardrail limit. + +By default, these parameters are commented out. + +[cols="3,1,3"] +|=== +|Property | Default | Notes + +| `spark.cdm.feature.guardrail.colSizeInKB` +| `0` +| The `0` default means the guardrail check is not done. +If set, table records with one or more fields that exceed the column size in kB are flagged. +Note this is kB which is base 10, not kiB which is base 2. + +|=== diff --git a/modules/ROOT/partials/java-filter-parameters.adoc b/modules/ROOT/partials/java-filter-parameters.adoc new file mode 100644 index 00000000..329a6c95 --- /dev/null +++ b/modules/ROOT/partials/java-filter-parameters.adoc @@ -0,0 +1,46 @@ +Java filters are applied on the client node. +Data must be pulled from the origin cluster and then filtered. +However, this option may have a lower impact on the production cluster than xref:cdm-cassandra-filter-params[Cassandra filters]. +Java filters put a load onto the {cstar-data-migrator} processing node. +They send more data from Cassandra. +Cassandra filters put a load on the Cassandra nodes because {cstar-data-migrator} specifies `ALLOW FILTERING`, which could cause the coordinator node to perform a lot more work. + +By default, these parameters are commented out. 
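If it helps to see these settings in context, the sketch below shows how a Java filter might be supplied as `--conf` overrides on a `spark-submit` run. The keyspace, table, and threshold values are placeholders rather than defaults, and the individual parameters are described in the table that follows.

[source,bash]
----
# Illustrative only: sample roughly 10% of the tokens in each split and skip
# records written before an assumed cutover time (microseconds since the epoch).
./spark-submit --properties-file cdm.properties \
--conf spark.cdm.schema.origin.keyspaceTable="<keyspace>.<table>" \
--conf spark.cdm.filter.java.token.percent=10 \
--conf spark.cdm.filter.java.writetime.min=1672531200000000 \
--master "local[*]" --driver-memory 25G --executor-memory 25G \
--class com.datastax.cdm.job.Migrate cassandra-data-migrator-x.y.z.jar
----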
+ +[cols="2,1,4"] +|=== +|Property | Default | Notes + +| `spark.cdm.filter.java.token.percent` +| `100` +| Between 1 and 100 percent of the token in each split that is migrated. +This property is used to do a wide and random sampling of the data. +The percentage value is applied to each split. +Invalid percentages are treated as 100. + +| `spark.cdm.filter.java.writetime.min` +| `0` +| The lowest (inclusive) writetime values to be migrated. +Using the `spark.cdm.filter.java.writetime.min` and `spark.cdm.filter.java.writetime.max` thresholds, {cstar-data-migrator} can filter records based on their writetimes. +The maximum writetime of the columns configured at `spark.cdm.schema.origin.column.writetime.names` are compared to the `.min` and `.max` thresholds, which must be in **microseconds since the epoch**. +If the `spark.cdm.schema.origin.column.writetime.names` are not specified or the thresholds are null or otherwise invalid, the filter is ignored. +Note that `spark.cdm.s.perfops.batchSize` is ignored when this filter is in place; a value of 1 is used instead. + +| `spark.cdm.filter.java.writetime.max` +| `9223372036854775807` +| The highest (inclusive) writetime values to be migrated. +The `spark.cdm.schema.origin.column.writetime.names` specifies the maximum timestamp of the columns. +If that property is not specified or is for some reason null, the filter is ignored. + +| `spark.cdm.filter.java.column.name` +| +| Filter rows based on matching a configured value. +With `spark.cdm.filter.java.column.name`, specify the column name against which the `spark.cdm.filter.java.column.value` is compared. +Must be on the column list specified at `spark.cdm.schema.origin.column.names`. +The column value is converted to a string, trimmed of whitespace on both ends, and compared. + +| `spark.cdm.filter.java.column.value` +| +| String value to use as comparison. +The whitespace on the ends of `spark.cdm.filter.java.column.value` is trimmed. +|=== \ No newline at end of file diff --git a/modules/ROOT/partials/origin-schema-parameters.adoc b/modules/ROOT/partials/origin-schema-parameters.adoc new file mode 100644 index 00000000..360a3b2f --- /dev/null +++ b/modules/ROOT/partials/origin-schema-parameters.adoc @@ -0,0 +1,54 @@ +[cols="3,1,5a"] +|=== +|Property | Default | Notes + +| `spark.cdm.schema.origin.keyspaceTable` +| +| Required - the `.` of the table to be migrated. +Table must exist in the origin cluster. + +| `spark.cdm.schema.origin.column.ttl.automatic` +| `true` +| Default is `true`, unless `spark.cdm.schema.origin.column.ttl.names` is specified. +When `true`, determine the Time To Live (TTL) of the target record. +Find the maximum TTL of all origin columns that can have TTL set. This excludes partition key, clustering key, collections/UDT/tuple, and frozen columns. +When `false`, and `spark.cdm.schema.origin.column.ttl.names` is not set, the target record has the target table configuration determine the TTL. + +| `spark.cdm.schema.origin.column.ttl.names` +| +| Default is empty, meaning the names are determined automatically if `spark.cdm.schema.origin.column.ttl.automatic` is set. +Specify a subset of eligible columns that are used to calculate the TTL of the target record. + +| `spark.cdm.schema.origin.column.writetime.automatic` +| `true` +| Default is `true`, unless `spark.cdm.schema.origin.column.writetime.names` is specified. +When `true`, determine the `WRITETIME` of the target record. +Find the maximum `WRITETIME` of all origin columns that can have `WRITETIME` set. 
This excludes partition key, clustering key, collections/UDT/tuple, and frozen columns. +When `false`, and `spark.cdm.schema.origin.column.writetime.names` is not set, the target table configuration determines the target record's `WRITETIME`. + +[NOTE] +==== +The `spark.cdm.transform.custom.writetime` property, if set, overrides `spark.cdm.schema.origin.column.writetime`. +==== + +| `spark.cdm.schema.origin.column.writetime.names` +| +| Default is empty, meaning the names are determined automatically if `spark.cdm.schema.origin.column.writetime.automatic` is set. +Otherwise, specify a subset of eligible columns that are used to calculate the WRITETIME of the target record. +Example: `data_col1,data_col2,...` + +| `spark.cdm.schema.origin.column.names.to.target` +| +| Default is empty. +If column names are changed between the origin and target clusters, then this mapped list provides a mechanism to associate the two. +The format is `:`. +The list is comma separated. +You only need to list renamed columns. + +|=== + +[NOTE] +==== +For optimization reasons, {cstar-data-migrator} does not migrate TTL and writetime at the field level. +Instead, {cstar-data-migrator} finds the field with the highest TTL and the field with the highest writetime within an origin table row, and uses those values on the entire target table row. +==== \ No newline at end of file diff --git a/modules/ROOT/partials/performance-and-operations-parameters.adoc b/modules/ROOT/partials/performance-and-operations-parameters.adoc new file mode 100644 index 00000000..613c2e94 --- /dev/null +++ b/modules/ROOT/partials/performance-and-operations-parameters.adoc @@ -0,0 +1,59 @@ +Performance and operations parameters that can affect migration throughput, error handling, and similar concerns. + +[cols="4,1,3"] +|=== +|Property | Default | Notes + +| `spark.cdm.perfops.numParts` +| `10000` +| In standard operation, the full token range of -2^63 to 2^63-1 is divided into a number of parts, which are parallel processed. +You should aim for each part to comprise a total of ≈1-10GB of data to migrate. +During initial testing, you may want this to be a small number, such as `1`. + +| `spark.cdm.perfops.batchSize` +| `5` +| When writing to the target cluster, this comprises the number of records that are put into an `UNLOGGED` batch. +{cstar-data-migrator} tends to work on the same partition at a time. +If your partition sizes are larger, this number may be increased. +If the `spark.cdm.perfops.batchSize` would mean that more than 1 partition is often contained in a batch, reduce this parameter's value. +Ideally < 1% of batches have more than 1 partition. + +| `spark.cdm.perfops.ratelimit.origin` +| `20000` +| Concurrent number of operations across all parallel threads from the origin cluster. +This value may be adjusted up or down, depending on the amount of data and the processing capacity of the origin cluster. + +| `spark.cdm.perfops.ratelimit.target` +| `40000` +| Concurrent number of operations across all parallel threads from the target cluster. +This may be adjusted up or down, depending on the amount of data and the processing capacity of the target cluster. + +| `spark.cdm.perfops.consistency.read` +| `LOCAL_QUORUM` +| Commented out. +Read consistency from the origin cluster and from the target cluster when records are read for comparison purposes. +The consistency parameters may be one of: `ANY`, `ONE`, `TWO`, `THREE`, `QUORUM`, `LOCAL_ONE`, `EACH_QUORUM`, `LOCAL_QUORUM`, `SERIAL`, `LOCAL_SERIAL`, `ALL`. 
+ +| `spark.cdm.perfops.consistency.write` +| `LOCAL_QUORUM` +| Commented out. +Write consistency to the arget cluster. +The consistency parameters may be one of: `ANY`, `ONE`, `TWO`, `THREE`, `QUORUM`, `LOCAL_ONE`, `EACH_QUORUM`, `LOCAL_QUORUM`, `SERIAL`, `LOCAL_SERIAL`, `ALL`. + +| `spark.cdm.perfops.printStatsAfter` +| `100000` +| Commented out. +Number of rows of processing after which a progress log entry is made. + +| `spark.cdm.perfops.fetchSizeInRows` +| `1000` +| Commented out. +This parameter affects the frequency of reads from the origin cluster and the frequency of flushes to the target cluster. + +| `spark.cdm.perfops.errorLimit` +| `0` +| Commented out. +Controls how many errors a thread may encounter during `MigrateData` and `DiffData` operations before failing. +Recommendation: set this parameter to a non-zero value **only when not doing** a mutation-type operation, such as when you're running `DiffData` without `.autocorrect`. + +|=== \ No newline at end of file diff --git a/modules/ROOT/partials/target-schema-parameters.adoc b/modules/ROOT/partials/target-schema-parameters.adoc new file mode 100644 index 00000000..62a1f610 --- /dev/null +++ b/modules/ROOT/partials/target-schema-parameters.adoc @@ -0,0 +1,11 @@ +[cols="3,1,2"] +|=== +|Property | Default | Notes + +| `spark.cdm.schema.target.keyspaceTable` +| Equals the value of `spark.cdm.schema.origin.keyspaceTable` +| This parameter is commented out. +It's the `.` of the table to be migrated into the target. +Table must exist in the target cluster. + +|=== \ No newline at end of file diff --git a/modules/ROOT/partials/tls-connection-parameters.adoc b/modules/ROOT/partials/tls-connection-parameters.adoc new file mode 100644 index 00000000..985092d9 --- /dev/null +++ b/modules/ROOT/partials/tls-connection-parameters.adoc @@ -0,0 +1,66 @@ +These are TLS (SSL) connection parameters, if configured, for the origin and target clusters. +Note that a secure connect bundle (SCB) embeds these details. + +By default, these parameters are commented out. + +[cols="3,3,3"] +|=== +|Property | Default | Notes + +| `spark.cdm.connect.origin.tls.enabled` +| `false` +| If TLS is used, set to `true`. + +| `spark.cdm.connect.origin.tls.trustStore.path` +| +| Path to the Java truststore file. + +| `spark.cdm.connect.origin.tls.trustStore.password` +| +| Password needed to open the truststore. + +| `spark.cdm.connect.origin.tls.trustStore.type` +| `JKS` +| + +| `spark.cdm.connect.origin.tls.keyStore.path` +| +| Path to the Java keystore file. + +| `spark.cdm.connect.origin.tls.keyStore.password` +| +| Password needed to open the keystore. + +| `spark.cdm.connect.origin.tls.enabledAlgorithms` +| `TLS_RSA_WITH_AES_128_CBC_SHA`,`TLS_RSA_WITH_AES_256_CBC_SHA` +| + +| `spark.cdm.connect.target.tls.enabled` +| `false` +| If TLS is used, set to `true`. + +| `spark.cdm.connect.target.tls.trustStore.path` +| +| Path to the Java truststore file. + +| `spark.cdm.connect.target.tls.trustStore.password` +| +| Password needed to open the truststore. + +| `spark.cdm.connect.target.tls.trustStore.type` +| `JKS` +| + +| `spark.cdm.connect.target.tls.keyStore.path` +| +| Path to the Java keystore file. + +| `spark.cdm.connect.target.tls.keyStore.password` +| +| Password needed to open the keystore. 
+ +| `spark.cdm.connect.target.tls.enabledAlgorithms` +| `TLS_RSA_WITH_AES_128_CBC_SHA`,`TLS_RSA_WITH_AES_256_CBC_SHA` +| + +|=== \ No newline at end of file diff --git a/modules/ROOT/partials/transformation-parameters.adoc b/modules/ROOT/partials/transformation-parameters.adoc new file mode 100644 index 00000000..d8ff2f18 --- /dev/null +++ b/modules/ROOT/partials/transformation-parameters.adoc @@ -0,0 +1,58 @@ +Parameters to perform schema transformations between the origin and target clusters. + +By default, these parameters are commented out. + +[cols="2,1,4a"] +|=== +|Property | Default | Notes + +| `spark.cdm.transform.missing.key.ts.replace.value` +| `1685577600000` +| Timestamp value in milliseconds. +Partition and clustering columns cannot have null values. +If they are added as part of a schema transformation between the origin and target clusters, it is possible that the origin side is null. +In this case, the `Migrate` data operation fails. +This parameter allows a crude constant value to be used in its place that is separate from the constant values feature. + +| `spark.cdm.transform.custom.writetime` +| `0` +| Default is 0 (disabled). +Timestamp value in microseconds to use as the `WRITETIME` for the target record. +This is useful when the `WRITETIME` of the record in the origin cluster cannot be determined. Such an example is when the only non-key columns are collections. +This parameter allows a crude constant value to be used in its place and overrides `spark.cdm.schema.origin.column.writetime.names`. + +| `spark.cdm.transform.custom.writetime.incrementBy` +| `0` +| Default is `0`. +This is useful when you have a list that is not frozen and you are updating this using the autocorrect feature. +Lists are not idempotent, and subsequent UPSERTs add duplicates to the list. + +| `spark.cdm.transform.codecs` +| +| Default is empty. +A comma-separated list of additional codecs to enable. + + * `INT_STRING` : int stored in a string. + * `DOUBLE_STRING` : double stored in a string. + * `BIGINT_STRING` : bigint stored in a string. + * `DECIMAL_STRING` : decimal stored in a string. + * `TIMESTAMP_STRING_MILLIS` : timestamp stored in a string, as Epoch milliseconds. + * `TIMESTAMP_STRING_FORMAT` : timestamp stored in a string with a custom format. + +[NOTE] +==== +Where there are multiple type pair options, such as with `TIMESTAMP_STRING_*`, only one can be configured at a time with the `spark.cdm.transform.codecs` parameter. +==== + +| `spark.cdm.transform.codecs.timestamp.string.format` +| `yyyyMMddHHmmss` +| Configuration for `CQL_TIMESTAMP_TO_STRING_FORMAT` codec. +Default format is `yyyyMMddHHmmss`; `DateTimeFormatter.ofPattern(formatString)` + + +| `spark.cdm.transform.codecs.timestamp.string.zone` +| `UTC` +| Default is `UTC`. +Must be in `ZoneRulesProvider.getAvailableZoneIds()`. + +|=== \ No newline at end of file diff --git a/modules/ROOT/partials/use-cdm-migrator.adoc b/modules/ROOT/partials/use-cdm-migrator.adoc new file mode 100644 index 00000000..686a34c5 --- /dev/null +++ b/modules/ROOT/partials/use-cdm-migrator.adoc @@ -0,0 +1,28 @@ +. Configure for your environment the `cdm*.properties` file that's provided in the {cstar-data-migrator} https://github.com/datastax/cassandra-data-migrator/tree/main/src/resources[GitHub repo]. +The file can have any name. +It does not need to be `cdm.properties` or `cdm-detailed.properties`. +In both versions, the `spark-submit` job processes only the parameters that aren't commented out. 
+Other parameter values use defaults or are ignored. +See the descriptions and defaults in each file. +For more information, see the following: + * The simplified sample properties configuration, https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm.properties[cdm.properties]. + This file contains only those parameters that are commonly configured. + * The complete sample properties configuration, https://github.com/datastax/cassandra-data-migrator/blob/main/src/resources/cdm-detailed.properties[cdm-detailed.properties], for the full set of configurable settings. + +. Place the properties file that you elected to use and customize where it can be accessed while running the job using `spark-submit`. + +. Run the job using `spark-submit` command: + +[source,bash] +---- +./spark-submit --properties-file cdm.properties \ +--conf spark.cdm.schema.origin.keyspaceTable="." \ +--master "local[*]" --driver-memory 25G --executor-memory 25G \ +--class com.datastax.cdm.job.Migrate cassandra-data-migrator-x.y.z.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt +---- + +[TIP] +==== +* The command generates a log file `logfile_name_*.txt` to prevent log output on the console. +* Update the memory options, driver & executor memory, based on your use case. +==== \ No newline at end of file From 44bf6b59c686c8a85c79ad10c110f34b33578a99 Mon Sep 17 00:00:00 2001 From: beajohnson Date: Wed, 12 Jun 2024 19:27:40 -0700 Subject: [PATCH 18/32] checking nav --- modules/ROOT/nav.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index 7a17267d..98d5e2de 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -16,7 +16,7 @@ ** xref:manage-proxy-instances.adoc[] //phase 2 * xref:migrate-and-validate-data.adoc[] -//** xref:cassandra-data-migrator.adoc[] +** xref:cassandra-data-migrator.adoc[] ** xref:dsbulk-migrator.adoc[] //phase 3 * xref:enable-async-dual-reads.adoc[] From 2a5793b63b0ee73075e2c07e796fc66f25760a0a Mon Sep 17 00:00:00 2001 From: beajohnson Date: Thu, 13 Jun 2024 04:49:46 -0700 Subject: [PATCH 19/32] update nav --- modules/ROOT/nav.adoc | 60 ++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index 98d5e2de..d316aef4 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -1,39 +1,45 @@ -.{product} -* xref:introduction.adoc[] -* xref:components.adoc[] -* xref:preliminary-steps.adoc[] -** xref:feasibility-checklists.adoc[] -** xref:deployment-infrastructure.adoc[] -** xref:create-target.adoc[] -** xref:rollback.adoc[] +* Zero Downtime Migration +** xref:introduction.adoc[] +** xref:components.adoc[] +** xref:preliminary-steps.adoc[] +*** xref:feasibility-checklists.adoc[] +*** xref:deployment-infrastructure.adoc[] +*** xref:create-target.adoc[] +*** xref:rollback.adoc[] //phase 1 -* xref:phase1.adoc[] -** xref:setup-ansible-playbooks.adoc[] -** xref:deploy-proxy-monitoring.adoc[] -** xref:tls.adoc[] -** xref:connect-clients-to-proxy.adoc[] -** xref:metrics.adoc[] -** xref:manage-proxy-instances.adoc[] +** xref:phase1.adoc[] +*** xref:setup-ansible-playbooks.adoc[] +*** xref:deploy-proxy-monitoring.adoc[] +*** xref:tls.adoc[] +*** xref:connect-clients-to-proxy.adoc[] +*** xref:metrics.adoc[] +*** xref:manage-proxy-instances.adoc[] //phase 2 -* xref:migrate-and-validate-data.adoc[] -** xref:cassandra-data-migrator.adoc[] -** xref:dsbulk-migrator.adoc[] +** xref:migrate-and-validate-data.adoc[] +*** 
xref:cassandra-data-migrator.adoc[] +*** xref:dsbulk-migrator.adoc[] //phase 3 -* xref:enable-async-dual-reads.adoc[] +** xref:enable-async-dual-reads.adoc[] //phase 4 -* xref:change-read-routing.adoc[] +** xref:change-read-routing.adoc[] //phase 5 -* xref:connect-clients-to-target.adoc[] -* Cassandra Data Migrator -** xref:cassandra-data-migrator.adoc[Overview] -** xref:cdm-prereqs.adoc[Prerequisites] -** xref:cdm-steps.adoc[Migrate data] -** xref:cdm-parameters.adoc[Parameters] +** xref:connect-clients-to-target.adoc[] + * Troubleshooting ** xref:troubleshooting.adoc[] ** xref:troubleshooting-tips.adoc[] ** xref:troubleshooting-scenarios.adoc[] + * xref:faqs.adoc[] + * xref:glossary.adoc[] + * xref:contributions.adoc[] -* xref:release-notes.adoc[] \ No newline at end of file + +* xref:release-notes.adoc[] + +* {cstar-data-migrator} +** xref:cassandra-data-migrator.adoc[Overview] +** xref:cdm-prereqs.adoc[Prerequisites] +** xref:cdm-steps.adoc[Migrate data] +** xref:cdm-parameters.adoc[Parameters] \ No newline at end of file From 80c7fb6075b823903a088642aae89b874c8b0c98 Mon Sep 17 00:00:00 2001 From: beajohnson Date: Thu, 13 Jun 2024 05:27:28 -0700 Subject: [PATCH 20/32] minor updates --- modules/ROOT/pages/cassandra-data-migrator.adoc | 6 +++--- modules/ROOT/pages/cdm-prereqs.adoc | 4 ++-- modules/ROOT/pages/cdm-steps.adoc | 4 ++-- modules/ROOT/partials/cdm-partition-ranges.adoc | 2 +- modules/ROOT/partials/use-cdm-migrator.adoc | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/modules/ROOT/pages/cassandra-data-migrator.adoc b/modules/ROOT/pages/cassandra-data-migrator.adoc index 0e3476e0..a0552eb0 100644 --- a/modules/ROOT/pages/cassandra-data-migrator.adoc +++ b/modules/ROOT/pages/cassandra-data-migrator.adoc @@ -1,4 +1,4 @@ -= {cstar-data-migrator} += Introduction to {cstar-data-migrator} Use {cstar-data-migrator} to migrate and validate tables between origin and target Cassandra clusters, with available logging and reconciliation support. @@ -8,7 +8,7 @@ Use {cstar-data-migrator} to migrate and validate tables between origin and targ Get the latest image that includes all dependencies from https://hub.docker.com/r/datastax/cassandra-data-migrator[DockerHub]. -All migration tools (`cassandra-data-migrator` + `dsbulk` + `cqlsh`) are available in the `/assets/` folder of the container. +All migration tools, `cassandra-data-migrator` + `dsbulk` + `cqlsh`, are available in the `/assets/` folder of the container. [[cdm-install-as-jar]] == Install {cstar-data-migrator} as a JAR file @@ -25,7 +25,7 @@ If you're starting new, use the latest released version if possible. [[cdm-build-jar-local]] == Build {cstar-data-migrator} JAR for local development (optional) -Optionally, you can build the {cstar-data-migrator} JAR for local development. (You'll need https://maven.apache.org/download.cgi[Maven] 3.9.x.) +Optionally, you can build the {cstar-data-migrator} JAR for local development. You'll need https://maven.apache.org/download.cgi[Maven] 3.9.x. Example: diff --git a/modules/ROOT/pages/cdm-prereqs.adoc b/modules/ROOT/pages/cdm-prereqs.adoc index a548a70d..14047565 100644 --- a/modules/ROOT/pages/cdm-prereqs.adoc +++ b/modules/ROOT/pages/cdm-prereqs.adoc @@ -4,8 +4,8 @@ Read the prerequisites below before using the Cassandra Data Migrator. * Install or switch to Java 11. The Spark binaries are compiled with this version of Java. -* Install https://archive.apache.org/dist/spark/spark-3.5.1/[Spark 3.5.1] on a single VM where you want to run this job. 
-No cluster is necessary +* Select a single VM to run this job and install https://archive.apache.org/dist/spark/spark-3.5.1/[Spark 3.5.1] there. +No cluster is necessary. * Optionally, install https://maven.apache.org/download.cgi[Maven] 3.9.x if you want to build the JAR for local development. Run the following commands to install Apache Spark: diff --git a/modules/ROOT/pages/cdm-steps.adoc b/modules/ROOT/pages/cdm-steps.adoc index 02ad1826..001a076d 100644 --- a/modules/ROOT/pages/cdm-steps.adoc +++ b/modules/ROOT/pages/cdm-steps.adoc @@ -7,12 +7,12 @@ Use {cstar-data-migrator} to migrate and validate tables between the origin and include::partial$use-cdm-migrator.adoc[] [[cdm-validation-steps]] -== {cstar-data-migrator} steps in validation mode +== Use {cstar-data-migrator} steps in validation mode include::partial$cdm-validation-steps.adoc[] [[cdm--partition-ranges]] -== Migrating or validating specific partition ranges +== Migrate or validate specific partition ranges include::partial$cdm-partition-ranges.adoc[] diff --git a/modules/ROOT/partials/cdm-partition-ranges.adoc b/modules/ROOT/partials/cdm-partition-ranges.adoc index b86d4e98..9f8c4444 100644 --- a/modules/ROOT/partials/cdm-partition-ranges.adoc +++ b/modules/ROOT/partials/cdm-partition-ranges.adoc @@ -1,4 +1,4 @@ -You can also use {cstar-data-migrator} to migrate or validate specific partition ranges, by using a **partition-file** with the name `./._partitions.csv`. +You can also use {cstar-data-migrator} to migrate or validate specific partition ranges. Use a **partition-file** with the name `./._partitions.csv`. Use the following format in the CSV file, in the current folder as input. Example: diff --git a/modules/ROOT/partials/use-cdm-migrator.adoc b/modules/ROOT/partials/use-cdm-migrator.adoc index 686a34c5..e5513d51 100644 --- a/modules/ROOT/partials/use-cdm-migrator.adoc +++ b/modules/ROOT/partials/use-cdm-migrator.adoc @@ -24,5 +24,5 @@ For more information, see the following: [TIP] ==== * The command generates a log file `logfile_name_*.txt` to prevent log output on the console. -* Update the memory options, driver & executor memory, based on your use case. +* Update the memory options, driver and executor memory, based on your use case. 
==== \ No newline at end of file From ee47b811fe26ea73395ec14631f48ebd72ee2098 Mon Sep 17 00:00:00 2001 From: beajohnson Date: Thu, 13 Jun 2024 13:19:43 -0700 Subject: [PATCH 21/32] more partials --- modules/ROOT/nav.adoc | 29 +++++--- .../ROOT/pages/cassandra-data-migrator.adoc | 67 +++++++++---------- modules/ROOT/pages/cdm-prereqs.adoc | 18 +---- modules/ROOT/pages/cdm-steps.adoc | 1 + .../ROOT/partials/cdm-build-jar-local.adoc | 13 ++++ .../partials/cdm-install-as-container.adoc | 3 + modules/ROOT/partials/cdm-install-as-jar.adoc | 8 +++ modules/ROOT/partials/cdm-overview.adoc | 56 ++++++++++++++++ .../ROOT/partials/cdm-partition-ranges.adoc | 4 +- modules/ROOT/partials/cdm-prerequisites.adoc | 16 +++++ .../ROOT/partials/cdm-validation-steps.adoc | 2 +- .../common-connection-parameters.adoc | 50 ++++++++++++++ .../guardrail-feature-parameters.adoc | 2 +- ...performance-and-operations-parameters.adoc | 2 +- 14 files changed, 205 insertions(+), 66 deletions(-) create mode 100644 modules/ROOT/partials/cdm-build-jar-local.adoc create mode 100644 modules/ROOT/partials/cdm-install-as-container.adoc create mode 100644 modules/ROOT/partials/cdm-install-as-jar.adoc create mode 100644 modules/ROOT/partials/cdm-overview.adoc create mode 100644 modules/ROOT/partials/cdm-prerequisites.adoc create mode 100644 modules/ROOT/partials/common-connection-parameters.adoc diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index d316aef4..74aca74a 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -25,21 +25,30 @@ //phase 5 ** xref:connect-clients-to-target.adoc[] -* Troubleshooting -** xref:troubleshooting.adoc[] -** xref:troubleshooting-tips.adoc[] -** xref:troubleshooting-scenarios.adoc[] +** Troubleshooting +*** xref:troubleshooting.adoc[] +*** xref:troubleshooting-tips.adoc[] +*** xref:troubleshooting-scenarios.adoc[] -* xref:faqs.adoc[] +** xref:faqs.adoc[] -* xref:glossary.adoc[] +** xref:glossary.adoc[] -* xref:contributions.adoc[] +** xref:contributions.adoc[] -* xref:release-notes.adoc[] +** xref:release-notes.adoc[] * {cstar-data-migrator} -** xref:cassandra-data-migrator.adoc[Overview] +** xref:cassandra-data-migrator.adoc[] ** xref:cdm-prereqs.adoc[Prerequisites] ** xref:cdm-steps.adoc[Migrate data] -** xref:cdm-parameters.adoc[Parameters] \ No newline at end of file +** xref:cdm-parameters.adoc[Parameters] + +#include +#include "myheader.h" + +// Function definition +void myFunction(xref) { + // Call another function (include) + anotherFunction(include::partial::$cdm-overview); +} diff --git a/modules/ROOT/pages/cassandra-data-migrator.adoc b/modules/ROOT/pages/cassandra-data-migrator.adoc index a0552eb0..1cfd287c 100644 --- a/modules/ROOT/pages/cassandra-data-migrator.adoc +++ b/modules/ROOT/pages/cassandra-data-migrator.adoc @@ -1,57 +1,56 @@ -= Introduction to {cstar-data-migrator} += {cstar-data-migrator} Use {cstar-data-migrator} to migrate and validate tables between origin and target Cassandra clusters, with available logging and reconciliation support. +include::partial$cdm-prerequisites.adoc[] [[cdm-install-as-container]] == Install {cstar-data-migrator} as a Container -Get the latest image that includes all dependencies from https://hub.docker.com/r/datastax/cassandra-data-migrator[DockerHub]. - -All migration tools, `cassandra-data-migrator` + `dsbulk` + `cqlsh`, are available in the `/assets/` folder of the container. 
+include::partial$cdm-install-as-container.adoc[] [[cdm-install-as-jar]] == Install {cstar-data-migrator} as a JAR file -Download the *latest* JAR file from the {cstar-data-migrator} https://github.com/datastax/cassandra-data-migrator/packages/1832128[GitHub repo]. -image:https://img.shields.io/github/v/release/datastax/cassandra-data-migrator?color=green[Latest release] - -[NOTE] -==== -Version 4.x of {cstar-data-migrator} is not backward-compatible with `*.properties` files created in previous versions, and package names have changed. -If you're starting new, use the latest released version if possible. -==== +include::partial$cdm-install-as-jar.adoc[] [[cdm-build-jar-local]] == Build {cstar-data-migrator} JAR for local development (optional) -Optionally, you can build the {cstar-data-migrator} JAR for local development. You'll need https://maven.apache.org/download.cgi[Maven] 3.9.x. +include::partial$cdm-build-jar-local.adoc[] + +[[cdm-steps]] +== Use {cstar-data-migrator} + +include::partial$use-cdm-migrator.adoc[] + +[[cdm-validation-steps]] +== Use {cstar-data-migrator} steps in validation mode + +include::partial$cdm-validation-steps.adoc[] -Example: +[[cdm--partition-ranges]] +== Migrate or validate specific partition ranges -[source,bash] ----- -cd ~/github -git clone git@github.com:datastax/cassandra-data-migrator.git -cd cassandra-data-migrator -mvn clean package ----- +include::partial$cdm-partition-ranges.adoc[] -The fat jar, or`cassandra-data-migrator-x.y.z.jar`, file should be present now in the `target` folder. +[[cdm-guardrail-checks]] +== Perform large-field guardrail violation checks +include::partial$cdm-guardrail-checks.adoc[] [[cdm-reference]] == {cstar-data-migrator} reference -* xref:cdm-parameters.adoc#cdm-connection-params[Common connection parameters for Origin and Target] -* xref:cdm-parameters.adoc#cdm-origin-schema-params[Origin schema parameters] -* xref:cdm-parameters.adoc#cdm-target-schema-params[Target schema parameter] -* xref:cdm-parameters.adoc#cdm-auto-correction-params[Auto-correction parameters] -* xref:cdm-parameters.adoc#cdm-performance-operations-params[Performance and operations parameters] -* xref:cdm-parameters.adoc#cdm-transformation-params[Transformation parameters] -* xref:cdm-parameters.adoc#cdm-cassandra-filter-params[Cassandra filter parameters] -* xref:cdm-parameters.adoc#cdm-java-filter-params[Java filter parameters] -* xref:cdm-parameters.adoc#cdm-constant-column-feature-params[Constant column feature parameters] -* xref:cdm-parameters.adoc#cdm-explode-map-feature-params[Explode map feature parameters] -* xref:cdm-parameters.adoc#cdm-guardrail-feature-params[Guardrail feature parameters] -* xref:cdm-parameters.adoc#cdm-tls-ssl-connection-params[TLS (SSL) connection parameters] +* include::partial$common-connection-parameters.adoc[Common connection parameters for origin and target] +* include::partial$origin-schema-parameters.adoc[Origin schema parameters] +* include::partial$target-schema-parameters.adoc[Target schema parameter] +* include::partial$auto-correction-parameters.adoc[Auto-correction parameters] +* include::partial$performance-and-operation-parameters.adoc[Performance and operations parameters] +* include::partial$transformation-parameters.adoc[Transformation parameters] +* include::partial$cassandra-filter-parameters.adoc[Cassandra filter parameters] +* include::partial$java-filter-parameters.adoc[Java filter parameters] +* include::partial$constant-column-feature-parameters.adoc[Constant column feature parameters] +* 
include::partial$explode-map-feature-parameters.adoc[Explode map feature parameters] +* include::partial$guardrail-feature-parameters.adoc[Guardrail feature parameters] +* include::partial$tls-ssl-connection-parameters.adoc[TLS (SSL) connection parameters] diff --git a/modules/ROOT/pages/cdm-prereqs.adoc b/modules/ROOT/pages/cdm-prereqs.adoc index 14047565..e8a662f6 100644 --- a/modules/ROOT/pages/cdm-prereqs.adoc +++ b/modules/ROOT/pages/cdm-prereqs.adoc @@ -1,20 +1,4 @@ = {cstar-data-migrator} prerequisites -Read the prerequisites below before using the Cassandra Data Migrator. - -* Install or switch to Java 11. -The Spark binaries are compiled with this version of Java. -* Select a single VM to run this job and install https://archive.apache.org/dist/spark/spark-3.5.1/[Spark 3.5.1] there. -No cluster is necessary. -* Optionally, install https://maven.apache.org/download.cgi[Maven] 3.9.x if you want to build the JAR for local development. - -Run the following commands to install Apache Spark: - -[source,bash] ----- -wget https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3-scala2.13.tgz - -tar -xvzf spark-3.5.1-bin-hadoop3-scala2.13.tgz ----- - +include::partial$cdm-prerequisites.adoc[] diff --git a/modules/ROOT/pages/cdm-steps.adoc b/modules/ROOT/pages/cdm-steps.adoc index 001a076d..6b020eff 100644 --- a/modules/ROOT/pages/cdm-steps.adoc +++ b/modules/ROOT/pages/cdm-steps.adoc @@ -2,6 +2,7 @@ Use {cstar-data-migrator} to migrate and validate tables between the origin and target Cassandra clusters, with available logging and reconciliation support. +[[cdm-steps]] == Use {cstar-data-migrator} include::partial$use-cdm-migrator.adoc[] diff --git a/modules/ROOT/partials/cdm-build-jar-local.adoc b/modules/ROOT/partials/cdm-build-jar-local.adoc new file mode 100644 index 00000000..6b285c9b --- /dev/null +++ b/modules/ROOT/partials/cdm-build-jar-local.adoc @@ -0,0 +1,13 @@ +Optionally, you can build the {cstar-data-migrator} JAR for local development. You'll need https://maven.apache.org/download.cgi[Maven] 3.9.x. + +Example: + +[source,bash] +---- +cd ~/github +git clone git@github.com:datastax/cassandra-data-migrator.git +cd cassandra-data-migrator +mvn clean package +---- + +The fat jar file, or`cassandra-data-migrator-x.y.z.jar`, should be present now in the `target` folder. \ No newline at end of file diff --git a/modules/ROOT/partials/cdm-install-as-container.adoc b/modules/ROOT/partials/cdm-install-as-container.adoc new file mode 100644 index 00000000..27825330 --- /dev/null +++ b/modules/ROOT/partials/cdm-install-as-container.adoc @@ -0,0 +1,3 @@ +Get the latest image that includes all dependencies from https://hub.docker.com/r/datastax/cassandra-data-migrator[DockerHub]. + +All migration tools, `cassandra-data-migrator` and `dsbulk` and `cqlsh`, are available in the `/assets/` folder of the container. \ No newline at end of file diff --git a/modules/ROOT/partials/cdm-install-as-jar.adoc b/modules/ROOT/partials/cdm-install-as-jar.adoc new file mode 100644 index 00000000..eb60f9b2 --- /dev/null +++ b/modules/ROOT/partials/cdm-install-as-jar.adoc @@ -0,0 +1,8 @@ +Download the *latest* JAR file from the {cstar-data-migrator} https://github.com/datastax/cassandra-data-migrator/packages/1832128[GitHub repo]. 
+image:https://img.shields.io/github/v/release/datastax/cassandra-data-migrator?color=green[Latest release] + +[NOTE] +==== +Version 4.x of {cstar-data-migrator} is not backward-compatible with `*.properties` files created in previous versions, and package names have changed. +If you're starting new, use the latest released version if possible. +==== diff --git a/modules/ROOT/partials/cdm-overview.adoc b/modules/ROOT/partials/cdm-overview.adoc new file mode 100644 index 00000000..1cfd287c --- /dev/null +++ b/modules/ROOT/partials/cdm-overview.adoc @@ -0,0 +1,56 @@ += {cstar-data-migrator} + +Use {cstar-data-migrator} to migrate and validate tables between origin and target Cassandra clusters, with available logging and reconciliation support. + +include::partial$cdm-prerequisites.adoc[] + +[[cdm-install-as-container]] +== Install {cstar-data-migrator} as a Container + +include::partial$cdm-install-as-container.adoc[] + +[[cdm-install-as-jar]] +== Install {cstar-data-migrator} as a JAR file + +include::partial$cdm-install-as-jar.adoc[] + +[[cdm-build-jar-local]] +== Build {cstar-data-migrator} JAR for local development (optional) + +include::partial$cdm-build-jar-local.adoc[] + +[[cdm-steps]] +== Use {cstar-data-migrator} + +include::partial$use-cdm-migrator.adoc[] + +[[cdm-validation-steps]] +== Use {cstar-data-migrator} steps in validation mode + +include::partial$cdm-validation-steps.adoc[] + +[[cdm--partition-ranges]] +== Migrate or validate specific partition ranges + +include::partial$cdm-partition-ranges.adoc[] + +[[cdm-guardrail-checks]] +== Perform large-field guardrail violation checks + +include::partial$cdm-guardrail-checks.adoc[] + +[[cdm-reference]] +== {cstar-data-migrator} reference + +* include::partial$common-connection-parameters.adoc[Common connection parameters for origin and target] +* include::partial$origin-schema-parameters.adoc[Origin schema parameters] +* include::partial$target-schema-parameters.adoc[Target schema parameter] +* include::partial$auto-correction-parameters.adoc[Auto-correction parameters] +* include::partial$performance-and-operation-parameters.adoc[Performance and operations parameters] +* include::partial$transformation-parameters.adoc[Transformation parameters] +* include::partial$cassandra-filter-parameters.adoc[Cassandra filter parameters] +* include::partial$java-filter-parameters.adoc[Java filter parameters] +* include::partial$constant-column-feature-parameters.adoc[Constant column feature parameters] +* include::partial$explode-map-feature-parameters.adoc[Explode map feature parameters] +* include::partial$guardrail-feature-parameters.adoc[Guardrail feature parameters] +* include::partial$tls-ssl-connection-parameters.adoc[TLS (SSL) connection parameters] diff --git a/modules/ROOT/partials/cdm-partition-ranges.adoc b/modules/ROOT/partials/cdm-partition-ranges.adoc index 9f8c4444..121f1566 100644 --- a/modules/ROOT/partials/cdm-partition-ranges.adoc +++ b/modules/ROOT/partials/cdm-partition-ranges.adoc @@ -24,11 +24,11 @@ Example: --class com.datastax.cdm.job. cassandra-data-migrator-x.y.z.jar &> logfile_name_$(date +%Y%m%d_%H_%M).txt ---- -This mode is specifically useful to processes a subset of partition-ranges that may have failed during a previous run. +This mode is specifically useful to process a subset of partition-ranges that may have failed during a previous run. [NOTE] ==== -A file named `./._partitions.csv` is autogenerated by the migration and validation jobs, in the format shown above. 
+In the format shown above, the migration and validation jobs autogenerate a file named `./._partitions.csv`. The file contains any failed partition ranges. No file is created if there were no failed partitions. You can use the CSV as input to process any failed partition in a subsequent run. diff --git a/modules/ROOT/partials/cdm-prerequisites.adoc b/modules/ROOT/partials/cdm-prerequisites.adoc new file mode 100644 index 00000000..a8d39bbd --- /dev/null +++ b/modules/ROOT/partials/cdm-prerequisites.adoc @@ -0,0 +1,16 @@ +Read the prerequisites below before using the Cassandra Data Migrator. + +* Install or switch to Java 11. +The Spark binaries are compiled with this version of Java. +* Select a single VM to run this job and install https://archive.apache.org/dist/spark/spark-3.5.1/[Spark 3.5.1] there. +No cluster is necessary. +* Optionally, install https://maven.apache.org/download.cgi[Maven] 3.9.x if you want to build the JAR for local development. + +Run the following commands to install Apache Spark: + +[source,bash] +---- +wget https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3-scala2.13.tgz + +tar -xvzf spark-3.5.1-bin-hadoop3-scala2.13.tgz +---- diff --git a/modules/ROOT/partials/cdm-validation-steps.adoc b/modules/ROOT/partials/cdm-validation-steps.adoc index 46e05376..050ae467 100644 --- a/modules/ROOT/partials/cdm-validation-steps.adoc +++ b/modules/ROOT/partials/cdm-validation-steps.adoc @@ -41,6 +41,6 @@ spark.cdm.autocorrect.mismatch false|true [IMPORTANT] ==== -The {cstar-data-migrator} validation job nevers delete records from the target cluster. +The {cstar-data-migrator} validation job never deletes records from the target cluster. The job only adds or updates data on the target cluster. ==== \ No newline at end of file diff --git a/modules/ROOT/partials/common-connection-parameters.adoc b/modules/ROOT/partials/common-connection-parameters.adoc new file mode 100644 index 00000000..4bba0002 --- /dev/null +++ b/modules/ROOT/partials/common-connection-parameters.adoc @@ -0,0 +1,50 @@ +[cols="5,2,4"] +|=== +|Property | Default | Notes + +| `spark.cdm.connect.origin.host` +| `localhost` +| Hostname/IP address of the cluster. +May be a comma-separated list, and can follow the `:` convention. + +| `spark.cdm.connect.origin.port` +| `9042` +| Port number to use if not specified on `spark.cdm.connect.origin.host`. + +| `spark.cdm.connect.origin.scb` +| (Not set) +| Secure Connect Bundle, used to connect to an Astra DB database. +Example: `file:///aaa/bbb/scb-enterprise.zip`. + +| `spark.cdm.connect.origin.username` +| `cassandra` +| Username (or `client_id` value) used to authenticate. + +| `spark.cdm.connect.origin.password` +| `cassandra` +| Password (or `client_secret` value) used to authenticate. + +| `spark.cdm.connect.target.host` +| `localhost` +| Hostname/IP address of the cluster. +May be a comma-separated list, and can follow the `:` convention. + +| `spark.cdm.connect.target.port` +| `9042` +| Port number to use if not specified on `spark.cdm.connect.origin.host`. + +| `spark.cdm.connect.target.scb` +| (Not set) +| Secure Connect Bundle, used to connect to an Astra DB database. +Default is not set. +Example if set: `file:///aaa/bbb/my-scb.zip`. + +| `spark.cdm.connect.target.username` +| `cassandra` +| Username (or `client_id` value) used to authenticate. + +| `spark.cdm.connect.origin.password` +| `cassandra` +| Password (or `client_secret` value) used to authenticate. 
+ +|=== \ No newline at end of file diff --git a/modules/ROOT/partials/guardrail-feature-parameters.adoc b/modules/ROOT/partials/guardrail-feature-parameters.adoc index 2773c0d9..7c4b31ab 100644 --- a/modules/ROOT/partials/guardrail-feature-parameters.adoc +++ b/modules/ROOT/partials/guardrail-feature-parameters.adoc @@ -1,5 +1,5 @@ The guardrail feature manages records that exceed guardrail checks. -The Guardrail job generates a report; other jobs skip records that exceed the guardrail limit. +The guardrail job generates a report; other jobs skip records that exceed the guardrail limit. By default, these parameters are commented out. diff --git a/modules/ROOT/partials/performance-and-operations-parameters.adoc b/modules/ROOT/partials/performance-and-operations-parameters.adoc index 613c2e94..45277759 100644 --- a/modules/ROOT/partials/performance-and-operations-parameters.adoc +++ b/modules/ROOT/partials/performance-and-operations-parameters.adoc @@ -37,7 +37,7 @@ The consistency parameters may be one of: `ANY`, `ONE`, `TWO`, `THREE`, `QUORUM` | `spark.cdm.perfops.consistency.write` | `LOCAL_QUORUM` | Commented out. -Write consistency to the arget cluster. +Write consistency to the target cluster. The consistency parameters may be one of: `ANY`, `ONE`, `TWO`, `THREE`, `QUORUM`, `LOCAL_ONE`, `EACH_QUORUM`, `LOCAL_QUORUM`, `SERIAL`, `LOCAL_SERIAL`, `ALL`. | `spark.cdm.perfops.printStatsAfter` From 973c86f203cceba261cb28aeb4ac3b3d96907ea9 Mon Sep 17 00:00:00 2001 From: beajohnson Date: Thu, 13 Jun 2024 13:20:27 -0700 Subject: [PATCH 22/32] removed extra code --- modules/ROOT/nav.adoc | 9 --------- 1 file changed, 9 deletions(-) diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index 74aca74a..d3a6ea7b 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -43,12 +43,3 @@ ** xref:cdm-prereqs.adoc[Prerequisites] ** xref:cdm-steps.adoc[Migrate data] ** xref:cdm-parameters.adoc[Parameters] - -#include -#include "myheader.h" - -// Function definition -void myFunction(xref) { - // Call another function (include) - anotherFunction(include::partial::$cdm-overview); -} From 87af21826e32badbd20bf9fbecd910bc5e486523 Mon Sep 17 00:00:00 2001 From: beajohnson Date: Thu, 13 Jun 2024 14:23:59 -0700 Subject: [PATCH 23/32] small changes --- .../ROOT/partials/cdm-build-jar-local.adoc | 2 +- modules/ROOT/partials/cdm-overview.adoc | 47 ++++++++++++++----- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/modules/ROOT/partials/cdm-build-jar-local.adoc b/modules/ROOT/partials/cdm-build-jar-local.adoc index 6b285c9b..a4c7f6f6 100644 --- a/modules/ROOT/partials/cdm-build-jar-local.adoc +++ b/modules/ROOT/partials/cdm-build-jar-local.adoc @@ -10,4 +10,4 @@ cd cassandra-data-migrator mvn clean package ---- -The fat jar file, or`cassandra-data-migrator-x.y.z.jar`, should be present now in the `target` folder. \ No newline at end of file +The fat jar file, `cassandra-data-migrator-x.y.z.jar`, should be present now in the `target` folder. 
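As a usage sketch, the locally built artifact can then be passed to `spark-submit` in place of a released JAR; the properties file name, keyspace, table, and version string below are placeholders.

[source,bash]
----
# Illustrative only: run a migration with the jar produced by `mvn clean package`.
./spark-submit --properties-file cdm.properties \
--conf spark.cdm.schema.origin.keyspaceTable="<keyspace>.<table>" \
--master "local[*]" --driver-memory 25G --executor-memory 25G \
--class com.datastax.cdm.job.Migrate target/cassandra-data-migrator-x.y.z.jar
----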
\ No newline at end of file diff --git a/modules/ROOT/partials/cdm-overview.adoc b/modules/ROOT/partials/cdm-overview.adoc index 1cfd287c..556c507b 100644 --- a/modules/ROOT/partials/cdm-overview.adoc +++ b/modules/ROOT/partials/cdm-overview.adoc @@ -42,15 +42,40 @@ include::partial$cdm-guardrail-checks.adoc[] [[cdm-reference]] == {cstar-data-migrator} reference +[[cdm-reference]] * include::partial$common-connection-parameters.adoc[Common connection parameters for origin and target] -* include::partial$origin-schema-parameters.adoc[Origin schema parameters] -* include::partial$target-schema-parameters.adoc[Target schema parameter] -* include::partial$auto-correction-parameters.adoc[Auto-correction parameters] -* include::partial$performance-and-operation-parameters.adoc[Performance and operations parameters] -* include::partial$transformation-parameters.adoc[Transformation parameters] -* include::partial$cassandra-filter-parameters.adoc[Cassandra filter parameters] -* include::partial$java-filter-parameters.adoc[Java filter parameters] -* include::partial$constant-column-feature-parameters.adoc[Constant column feature parameters] -* include::partial$explode-map-feature-parameters.adoc[Explode map feature parameters] -* include::partial$guardrail-feature-parameters.adoc[Guardrail feature parameters] -* include::partial$tls-ssl-connection-parameters.adoc[TLS (SSL) connection parameters] + +include::partial$origin-schema-parameters.adoc[Origin schema parameters] + +include::partial$target-schema-parameters.adoc[Target schema parameter] + +include::partial$auto-correction-parameters.adoc[Auto-correction parameters] + +include::partial$performance-and-operation-parameters.adoc[Performance and operations parameters] + +include::partial$transformation-parameters.adoc[Transformation parameters] + +include::partial$cassandra-filter-parameters.adoc[Cassandra filter parameters] + +include::partial$java-filter-parameters.adoc[Java filter parameters] + +include::partial$constant-column-feature-parameters.adoc[Constant column feature parameters] + +include::partial$explode-map-feature-parameters.adoc[Explode map feature parameters] + +include::partial$guardrail-feature-parameters.adoc[Guardrail feature parameters] + +include::partial$tls-ssl-connection-parameters.adoc[TLS (SSL) connection parameters] + +* xref:#cdm-connection-params[Common connection parameters for Origin and Target] +* xref:#cdm-origin-schema-params[Origin schema parameters] +* xref:#cdm-target-schema-params[Target schema parameter] +* xref:#cdm-auto-correction-params[Auto-correction parameters] +* xref:#cdm-performance-operations-params[Performance and operations parameters] +* xref:#cdm-transformation-params[Transformation parameters] +* xref:#cdm-cassandra-filter-params[Cassandra filter parameters] +* xref:#cdm-java-filter-params[Java filter parameters] +* xref:#cdm-constant-column-feature-params[Constant column feature parameters] +* xref:#cdm-explode-map-feature-params[Explode map feature parameters] +* xref:#cdm-guardrail-feature-params[Guardrail feature parameters] +* xref:#cdm-tls-ssl-connection-params[TLS (SSL) connection parameters] From b0286f2ac37cd567d6f8838a3c982e172903fb84 Mon Sep 17 00:00:00 2001 From: beajohnson Date: Thu, 13 Jun 2024 15:45:34 -0700 Subject: [PATCH 24/32] minor update --- modules/ROOT/partials/cdm-overview.adoc | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/ROOT/partials/cdm-overview.adoc b/modules/ROOT/partials/cdm-overview.adoc index 556c507b..7c97de62 100644 --- 
a/modules/ROOT/partials/cdm-overview.adoc +++ b/modules/ROOT/partials/cdm-overview.adoc @@ -44,6 +44,7 @@ include::partial$cdm-guardrail-checks.adoc[] [[cdm-reference]] * include::partial$common-connection-parameters.adoc[Common connection parameters for origin and target] +xref:common-connection-parameters.adoc[Common connection parameters for origin and target] include::partial$origin-schema-parameters.adoc[Origin schema parameters] From f1c18c4ebf91a14c75a3d512deaa25fd37d810c5 Mon Sep 17 00:00:00 2001 From: beajohnson Date: Thu, 13 Jun 2024 16:49:32 -0700 Subject: [PATCH 25/32] pages not showing --- .../ROOT/pages/cassandra-data-migrator.adoc | 9 +++-- modules/ROOT/pages/cdm-parameters.adoc | 5 +++ modules/ROOT/partials/cdm-overview.adoc | 35 ------------------- 3 files changed, 12 insertions(+), 37 deletions(-) diff --git a/modules/ROOT/pages/cassandra-data-migrator.adoc b/modules/ROOT/pages/cassandra-data-migrator.adoc index 1cfd287c..eb892e35 100644 --- a/modules/ROOT/pages/cassandra-data-migrator.adoc +++ b/modules/ROOT/pages/cassandra-data-migrator.adoc @@ -2,6 +2,9 @@ Use {cstar-data-migrator} to migrate and validate tables between origin and target Cassandra clusters, with available logging and reconciliation support. +[[cdm-prerequisites]] +== {cstar-data-migrator} prerequisites + include::partial$cdm-prerequisites.adoc[] [[cdm-install-as-container]] @@ -39,10 +42,12 @@ include::partial$cdm-partition-ranges.adoc[] include::partial$cdm-guardrail-checks.adoc[] -[[cdm-reference]] -== {cstar-data-migrator} reference + +[[cdm-references]] +== {cstar-data-migrator} references * include::partial$common-connection-parameters.adoc[Common connection parameters for origin and target] + * include::partial$origin-schema-parameters.adoc[Origin schema parameters] * include::partial$target-schema-parameters.adoc[Target schema parameter] * include::partial$auto-correction-parameters.adoc[Auto-correction parameters] diff --git a/modules/ROOT/pages/cdm-parameters.adoc b/modules/ROOT/pages/cdm-parameters.adoc index de978e60..8934970b 100644 --- a/modules/ROOT/pages/cdm-parameters.adoc +++ b/modules/ROOT/pages/cdm-parameters.adoc @@ -2,6 +2,11 @@ Each parameter below offers a different connection. Review each option to determine what is best for your organization. 
+[{cdm-connection-params}] +== Common connection parameters for origin and target + +include::partial$common-connection-parameters.adoc[] + [[cdm-origin-schema-params]] == Origin schema parameters diff --git a/modules/ROOT/partials/cdm-overview.adoc b/modules/ROOT/partials/cdm-overview.adoc index 7c97de62..23edc3c5 100644 --- a/modules/ROOT/partials/cdm-overview.adoc +++ b/modules/ROOT/partials/cdm-overview.adoc @@ -43,40 +43,5 @@ include::partial$cdm-guardrail-checks.adoc[] == {cstar-data-migrator} reference [[cdm-reference]] -* include::partial$common-connection-parameters.adoc[Common connection parameters for origin and target] -xref:common-connection-parameters.adoc[Common connection parameters for origin and target] -include::partial$origin-schema-parameters.adoc[Origin schema parameters] -include::partial$target-schema-parameters.adoc[Target schema parameter] - -include::partial$auto-correction-parameters.adoc[Auto-correction parameters] - -include::partial$performance-and-operation-parameters.adoc[Performance and operations parameters] - -include::partial$transformation-parameters.adoc[Transformation parameters] - -include::partial$cassandra-filter-parameters.adoc[Cassandra filter parameters] - -include::partial$java-filter-parameters.adoc[Java filter parameters] - -include::partial$constant-column-feature-parameters.adoc[Constant column feature parameters] - -include::partial$explode-map-feature-parameters.adoc[Explode map feature parameters] - -include::partial$guardrail-feature-parameters.adoc[Guardrail feature parameters] - -include::partial$tls-ssl-connection-parameters.adoc[TLS (SSL) connection parameters] - -* xref:#cdm-connection-params[Common connection parameters for Origin and Target] -* xref:#cdm-origin-schema-params[Origin schema parameters] -* xref:#cdm-target-schema-params[Target schema parameter] -* xref:#cdm-auto-correction-params[Auto-correction parameters] -* xref:#cdm-performance-operations-params[Performance and operations parameters] -* xref:#cdm-transformation-params[Transformation parameters] -* xref:#cdm-cassandra-filter-params[Cassandra filter parameters] -* xref:#cdm-java-filter-params[Java filter parameters] -* xref:#cdm-constant-column-feature-params[Constant column feature parameters] -* xref:#cdm-explode-map-feature-params[Explode map feature parameters] -* xref:#cdm-guardrail-feature-params[Guardrail feature parameters] -* xref:#cdm-tls-ssl-connection-params[TLS (SSL) connection parameters] From dd972afc28d5a8ed0f648033bd612c9c3a2587b3 Mon Sep 17 00:00:00 2001 From: beajohnson Date: Thu, 13 Jun 2024 19:10:52 -0700 Subject: [PATCH 26/32] working links --- .../ROOT/pages/cassandra-data-migrator.adoc | 65 ++++++++++++++----- modules/ROOT/pages/cdm-parameters.adoc | 4 +- modules/ROOT/partials/cdm-overview.adoc | 5 +- ...doc => tls-ssl-connection-parameters.adoc} | 0 4 files changed, 56 insertions(+), 18 deletions(-) rename modules/ROOT/partials/{tls-connection-parameters.adoc => tls-ssl-connection-parameters.adoc} (100%) diff --git a/modules/ROOT/pages/cassandra-data-migrator.adoc b/modules/ROOT/pages/cassandra-data-migrator.adoc index eb892e35..5ac7c323 100644 --- a/modules/ROOT/pages/cassandra-data-migrator.adoc +++ b/modules/ROOT/pages/cassandra-data-migrator.adoc @@ -32,7 +32,7 @@ include::partial$use-cdm-migrator.adoc[] include::partial$cdm-validation-steps.adoc[] -[[cdm--partition-ranges]] +[[cdm-partition-ranges]] == Migrate or validate specific partition ranges include::partial$cdm-partition-ranges.adoc[] @@ -43,19 +43,54 @@ 
include::partial$cdm-partition-ranges.adoc[] include::partial$cdm-guardrail-checks.adoc[] -[[cdm-references]] +[[cdm-reference]] == {cstar-data-migrator} references -* include::partial$common-connection-parameters.adoc[Common connection parameters for origin and target] - -* include::partial$origin-schema-parameters.adoc[Origin schema parameters] -* include::partial$target-schema-parameters.adoc[Target schema parameter] -* include::partial$auto-correction-parameters.adoc[Auto-correction parameters] -* include::partial$performance-and-operation-parameters.adoc[Performance and operations parameters] -* include::partial$transformation-parameters.adoc[Transformation parameters] -* include::partial$cassandra-filter-parameters.adoc[Cassandra filter parameters] -* include::partial$java-filter-parameters.adoc[Java filter parameters] -* include::partial$constant-column-feature-parameters.adoc[Constant column feature parameters] -* include::partial$explode-map-feature-parameters.adoc[Explode map feature parameters] -* include::partial$guardrail-feature-parameters.adoc[Guardrail feature parameters] -* include::partial$tls-ssl-connection-parameters.adoc[TLS (SSL) connection parameters] +=== Common connection parameters for Origin and Target + +include::partial$common-connection-parameters.adoc[] + +=== Origin schema parameters + +include::partial$origin-schema-parameters.adoc[] + +=== Target schema parameters + +include::partial$target-schema-parameters.adoc[] + +=== Auto-correction parameters + +include::partial$auto-correction-parameters.adoc[] + +=== Performance and operations parameters + +include::partial$performance-and-operations-parameters.adoc[] + +=== Transformation parameters + +include::partial$transformation-parameters.adoc[] + +=== Cassandra filter parameters + +include::partial$cassandra-filter-parameters.adoc[] + +=== Java filter parameters + +include::partial$java-filter-parameters.adoc[] + +=== Constant column feature parameters + +include::partial$constant-column-feature-parameters.adoc[] + +=== Explode map feature parameters + +include::partial$explode-map-feature-parameters.adoc[] + +=== Guardrail feature parameter + +include::partial$guardrail-feature-parameters.adoc[] + +=== TLS (SSL) connection parameters + +include::partial$tls-ssl-connection-parameters.adoc[] + diff --git a/modules/ROOT/pages/cdm-parameters.adoc b/modules/ROOT/pages/cdm-parameters.adoc index 8934970b..3a1d8e52 100644 --- a/modules/ROOT/pages/cdm-parameters.adoc +++ b/modules/ROOT/pages/cdm-parameters.adoc @@ -2,7 +2,7 @@ Each parameter below offers a different connection. Review each option to determine what is best for your organization. -[{cdm-connection-params}] +[[cdm-connection-params]] == Common connection parameters for origin and target include::partial$common-connection-parameters.adoc[] @@ -67,4 +67,4 @@ include::partial$guardrail-feature-parameters.adoc[] [[cdm-tls-ssl-connection-params]] == TLS (SSL) connection parameters -include::partial$tls-connection-parameters.adoc[] \ No newline at end of file +include::partial$tls-ssl-connection-parameters.adoc[] \ No newline at end of file diff --git a/modules/ROOT/partials/cdm-overview.adoc b/modules/ROOT/partials/cdm-overview.adoc index 23edc3c5..0811a227 100644 --- a/modules/ROOT/partials/cdm-overview.adoc +++ b/modules/ROOT/partials/cdm-overview.adoc @@ -2,6 +2,9 @@ Use {cstar-data-migrator} to migrate and validate tables between origin and target Cassandra clusters, with available logging and reconciliation support. 
+[[cdm-prerequisites]] +== {cstar-data-migrator} prerequisites + include::partial$cdm-prerequisites.adoc[] [[cdm-install-as-container]] @@ -29,7 +32,7 @@ include::partial$use-cdm-migrator.adoc[] include::partial$cdm-validation-steps.adoc[] -[[cdm--partition-ranges]] +[[cdm-partition-ranges]] == Migrate or validate specific partition ranges include::partial$cdm-partition-ranges.adoc[] diff --git a/modules/ROOT/partials/tls-connection-parameters.adoc b/modules/ROOT/partials/tls-ssl-connection-parameters.adoc similarity index 100% rename from modules/ROOT/partials/tls-connection-parameters.adoc rename to modules/ROOT/partials/tls-ssl-connection-parameters.adoc From c6f94965ec2cb52d95f02f0a64cb011378da6e86 Mon Sep 17 00:00:00 2001 From: beajohnson Date: Thu, 13 Jun 2024 19:14:13 -0700 Subject: [PATCH 27/32] final links working --- modules/ROOT/partials/cdm-overview.adoc | 48 +++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/modules/ROOT/partials/cdm-overview.adoc b/modules/ROOT/partials/cdm-overview.adoc index 0811a227..a510aba9 100644 --- a/modules/ROOT/partials/cdm-overview.adoc +++ b/modules/ROOT/partials/cdm-overview.adoc @@ -43,8 +43,52 @@ include::partial$cdm-partition-ranges.adoc[] include::partial$cdm-guardrail-checks.adoc[] [[cdm-reference]] -== {cstar-data-migrator} reference +== {cstar-data-migrator} references -[[cdm-reference]] +=== Common connection parameters for Origin and Target + +include::partial$common-connection-parameters.adoc[] + +=== Origin schema parameters + +include::partial$origin-schema-parameters.adoc[] + +=== Target schema parameters + +include::partial$target-schema-parameters.adoc[] + +=== Auto-correction parameters + +include::partial$auto-correction-parameters.adoc[] + +=== Performance and operations parameters + +include::partial$performance-and-operations-parameters.adoc[] + +=== Transformation parameters + +include::partial$transformation-parameters.adoc[] + +=== Cassandra filter parameters + +include::partial$cassandra-filter-parameters.adoc[] + +=== Java filter parameters + +include::partial$java-filter-parameters.adoc[] + +=== Constant column feature parameters + +include::partial$constant-column-feature-parameters.adoc[] + +=== Explode map feature parameters + +include::partial$explode-map-feature-parameters.adoc[] + +=== Guardrail feature parameter + +include::partial$guardrail-feature-parameters.adoc[] +=== TLS (SSL) connection parameters +include::partial$tls-ssl-connection-parameters.adoc[] From 5b7d71bf6b268a33a45ac40ff76115aeb2c2d262 Mon Sep 17 00:00:00 2001 From: beajohnson Date: Fri, 14 Jun 2024 07:28:53 -0700 Subject: [PATCH 28/32] update antora yml --- antora.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/antora.yml b/antora.yml index e42ef0ee..4a8d8b8b 100644 --- a/antora.yml +++ b/antora.yml @@ -1,7 +1,7 @@ name: data-migration title: Data Migration version: ~ -start_page: introduction.adoc +start_page: index.adoc nav: - modules/ROOT/nav.adoc From 17ddb52f446635fada3948a8928ad86a800183ed Mon Sep 17 00:00:00 2001 From: beajohnson Date: Fri, 14 Jun 2024 07:29:50 -0700 Subject: [PATCH 29/32] back to intro --- antora.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/antora.yml b/antora.yml index 4a8d8b8b..e42ef0ee 100644 --- a/antora.yml +++ b/antora.yml @@ -1,7 +1,7 @@ name: data-migration title: Data Migration version: ~ -start_page: index.adoc +start_page: introduction.adoc nav: - modules/ROOT/nav.adoc From 38c3230634a5ddcde2fe47676667168540d63914 Mon Sep 17 
00:00:00 2001 From: beajohnson Date: Fri, 14 Jun 2024 10:15:28 -0700 Subject: [PATCH 30/32] overview page --- modules/ROOT/nav.adoc | 3 +-- .../{partials => pages}/cdm-overview.adoc | 19 +++++++++++++++---- modules/ROOT/pages/cdm-prereqs.adoc | 4 ---- 3 files changed, 16 insertions(+), 10 deletions(-) rename modules/ROOT/{partials => pages}/cdm-overview.adoc (68%) delete mode 100644 modules/ROOT/pages/cdm-prereqs.adoc diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index d3a6ea7b..3b840350 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -39,7 +39,6 @@ ** xref:release-notes.adoc[] * {cstar-data-migrator} -** xref:cassandra-data-migrator.adoc[] -** xref:cdm-prereqs.adoc[Prerequisites] +** xref:cdm-overview.adoc[] ** xref:cdm-steps.adoc[Migrate data] ** xref:cdm-parameters.adoc[Parameters] diff --git a/modules/ROOT/partials/cdm-overview.adoc b/modules/ROOT/pages/cdm-overview.adoc similarity index 68% rename from modules/ROOT/partials/cdm-overview.adoc rename to modules/ROOT/pages/cdm-overview.adoc index a510aba9..ab5972ac 100644 --- a/modules/ROOT/partials/cdm-overview.adoc +++ b/modules/ROOT/pages/cdm-overview.adoc @@ -1,22 +1,31 @@ -= {cstar-data-migrator} += Overview -Use {cstar-data-migrator} to migrate and validate tables between origin and target Cassandra clusters, with available logging and reconciliation support. +Cassandra Data Migrator (CDM) is a tool for migrating and validating data between origin and target Apache Cassandra-compatible clusters. It transfers data by creating multiple jobs that access the Cassandra clusters concurrently, which makes it well suited to large datasets but requires careful configuration to balance migration speed against performance impact. + +The information below explains how to get started with CDM. Review your prerequisites and decide between the two installation options: as a container or as a JAR file. [[cdm-prerequisites]] == {cstar-data-migrator} prerequisites include::partial$cdm-prerequisites.adoc[] == CDM installation methods + +Both installation methods require attention to version compatibility, especially with the `cdm.properties` files. +Both environments also use `spark-submit` to run the jobs.
+ [[cdm-install-as-container]] -== Install {cstar-data-migrator} as a Container +=== Install {cstar-data-migrator} as a Container include::partial$cdm-install-as-container.adoc[] [[cdm-install-as-jar]] -== Install {cstar-data-migrator} as a JAR file +=== Install {cstar-data-migrator} as a JAR file include::partial$cdm-install-as-jar.adoc[] + +//// [[cdm-build-jar-local]] == Build {cstar-data-migrator} JAR for local development (optional) @@ -45,6 +54,7 @@ include::partial$cdm-guardrail-checks.adoc[] [[cdm-reference]] == {cstar-data-migrator} references + === Common connection parameters for Origin and Target include::partial$common-connection-parameters.adoc[] @@ -92,3 +102,4 @@ include::partial$guardrail-feature-parameters.adoc[] === TLS (SSL) connection parameters include::partial$tls-ssl-connection-parameters.adoc[] +//// \ No newline at end of file diff --git a/modules/ROOT/pages/cdm-prereqs.adoc b/modules/ROOT/pages/cdm-prereqs.adoc deleted file mode 100644 index e8a662f6..00000000 --- a/modules/ROOT/pages/cdm-prereqs.adoc +++ /dev/null @@ -1,4 +0,0 @@ -= {cstar-data-migrator} prerequisites - -include::partial$cdm-prerequisites.adoc[] - From cb9e1046ef2f4d2380af679956fb346a1259a2ec Mon Sep 17 00:00:00 2001 From: beajohnson Date: Fri, 14 Jun 2024 10:39:05 -0700 Subject: [PATCH 31/32] cdm to full name --- modules/ROOT/pages/cdm-overview.adoc | 82 +--------------------------- 1 file changed, 1 insertion(+), 81 deletions(-) diff --git a/modules/ROOT/pages/cdm-overview.adoc b/modules/ROOT/pages/cdm-overview.adoc index ab5972ac..efac866c 100644 --- a/modules/ROOT/pages/cdm-overview.adoc +++ b/modules/ROOT/pages/cdm-overview.adoc @@ -9,7 +9,7 @@ The information below explains how to get started with CDM. Review your prerequi include::partial$cdm-prerequisites.adoc[] -== CDM installation methods +== {cstar-data-migrator} installation methods Both installation methods require attention to version compatibility, especially with the `cdm.properties` files. Both environments also use `spark-submit` to run the jobs. 
@@ -23,83 +23,3 @@ include::partial$cdm-install-as-container.adoc[] === Install {cstar-data-migrator} as a JAR file include::partial$cdm-install-as-jar.adoc[] - - -//// -[[cdm-build-jar-local]] -== Build {cstar-data-migrator} JAR for local development (optional) - -include::partial$cdm-build-jar-local.adoc[] - -[[cdm-steps]] -== Use {cstar-data-migrator} - -include::partial$use-cdm-migrator.adoc[] - -[[cdm-validation-steps]] -== Use {cstar-data-migrator} steps in validation mode - -include::partial$cdm-validation-steps.adoc[] - -[[cdm-partition-ranges]] -== Migrate or validate specific partition ranges - -include::partial$cdm-partition-ranges.adoc[] - -[[cdm-guardrail-checks]] -== Perform large-field guardrail violation checks - -include::partial$cdm-guardrail-checks.adoc[] - -[[cdm-reference]] -== {cstar-data-migrator} references - - -=== Common connection parameters for Origin and Target - -include::partial$common-connection-parameters.adoc[] - -=== Origin schema parameters - -include::partial$origin-schema-parameters.adoc[] - -=== Target schema parameters - -include::partial$target-schema-parameters.adoc[] - -=== Auto-correction parameters - -include::partial$auto-correction-parameters.adoc[] - -=== Performance and operations parameters - -include::partial$performance-and-operations-parameters.adoc[] - -=== Transformation parameters - -include::partial$transformation-parameters.adoc[] - -=== Cassandra filter parameters - -include::partial$cassandra-filter-parameters.adoc[] - -=== Java filter parameters - -include::partial$java-filter-parameters.adoc[] - -=== Constant column feature parameters - -include::partial$constant-column-feature-parameters.adoc[] - -=== Explode map feature parameters - -include::partial$explode-map-feature-parameters.adoc[] - -=== Guardrail feature parameter - -include::partial$guardrail-feature-parameters.adoc[] - -=== TLS (SSL) connection parameters - -include::partial$tls-ssl-connection-parameters.adoc[] -//// \ No newline at end of file From 3a44b7c2a1d26f8d0b577dfe4d9549e5118b1a6d Mon Sep 17 00:00:00 2001 From: beajohnson Date: Fri, 14 Jun 2024 11:54:47 -0700 Subject: [PATCH 32/32] updated nav --- modules/ROOT/nav.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index 3b840350..a522b3b2 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -17,7 +17,7 @@ //phase 2 ** xref:migrate-and-validate-data.adoc[] *** xref:cassandra-data-migrator.adoc[] -*** xref:dsbulk-migrator.adoc[] +*** https://docs.datastax.com/en/dsbulk/overview/dsbulk-about.html[DSBulk Loader] //phase 3 ** xref:enable-async-dual-reads.adoc[] //phase 4
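
The pages touched by these patches note that both installation methods run {cstar-data-migrator} jobs through `spark-submit`, driven by a `cdm.properties` file and the `cassandra-data-migrator-x.y.z.jar` built with `mvn clean package`. As a rough sketch only — the job class name and the exact flag layout are assumptions for illustration, not values taken from these patches — a migration job might be launched along these lines:

[source,bash]
----
# Sketch only: the job class, JAR version, and keyspace.table value are placeholders.
spark-submit --properties-file cdm.properties \
  --conf spark.cdm.schema.origin.keyspaceTable="<keyspace>.<table>" \
  --master "local[*]" \
  --class com.datastax.cdm.job.Migrate \
  cassandra-data-migrator-x.y.z.jar
----

A validation run would typically reuse the same properties file and follow the same pattern, with only the job class changed.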