From 913b9487179884849449f6403794a4ad30839684 Mon Sep 17 00:00:00 2001 From: beajohnson Date: Tue, 23 Apr 2024 12:12:31 -0700 Subject: [PATCH 1/5] dsbulk set up --- modules/ROOT/nav.adoc | 5 + modules/ROOT/pages/bulk-command-line.adoc | 468 ++++++++++++++++++ modules/ROOT/pages/bulk-migrate.adoc | 64 +++ modules/ROOT/pages/dsbulk-examples.adoc | 89 ++++ modules/ROOT/pages/dsbulk-migrator.adoc | 20 +- modules/ROOT/pages/dsbulk-prereqs.adoc | 10 + .../ROOT/pages/migrate-and-validate-data.adoc | 4 +- 7 files changed, 648 insertions(+), 12 deletions(-) create mode 100644 modules/ROOT/pages/bulk-command-line.adoc create mode 100644 modules/ROOT/pages/bulk-migrate.adoc create mode 100644 modules/ROOT/pages/dsbulk-examples.adoc create mode 100644 modules/ROOT/pages/dsbulk-prereqs.adoc diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index f6cb01fd..e99ee75e 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -24,6 +24,11 @@ * xref:change-read-routing.adoc[] //phase 5 * xref:connect-clients-to-target.adoc[] +* DSBulk Migrator +** xref:dsbulk-prereqs.adoc[] +** xref:bulk-migrate.adoc[] +** xref:bulk-command-line.adoc[] +** xref:dsbulk-examples.adoc[] * Troubleshooting ** xref:troubleshooting.adoc[] ** xref:troubleshooting-tips.adoc[] diff --git a/modules/ROOT/pages/bulk-command-line.adoc b/modules/ROOT/pages/bulk-command-line.adoc new file mode 100644 index 00000000..4858efe4 --- /dev/null +++ b/modules/ROOT/pages/bulk-command-line.adoc @@ -0,0 +1,468 @@ += Command-line options + +* xref:#dsbulk-live[Live migration command-line options] +* xref:#dsbulk-script[Script generation command-line options] +* xref:#dsbulk-ddl[DDL generation command-line options] +* xref:#getting-help-with-dsbulk-migrator[Getting {dsbulk-migrator} help] +* xref:#dsbulk-examples[{dsbulk-migrator} examples] + + +[[dsbulk-live]] +== Live migration command-line options + +The following options are available for the `migrate-live` command. 
+Most options have sensible default values and do not need to be specified, unless you want to override the default value. + +[cols="2,8,14"] +|=== + +| `-c` +| `--dsbulk-cmd=CMD` +| The external DSBulk command to use. +Ignored if the embedded DSBulk is being used. +The default is simply `dsbulk`, assuming that the command is available through the `PATH` variable contents. + +| `-d` +| `--data-dir=PATH` +| The directory where data will be exported to and imported from. +The default is a `data` subdirectory in the current working directory. +The data directory will be created if it does not exist. +Tables will be exported and imported in subdirectories of the data directory specified here. +There will be one subdirectory per keyspace in the data directory, then one subdirectory per table in each keyspace directory. + +| `-e` +| `--dsbulk-use-embedded` +| Use the embedded DSBulk version instead of an external one. +The default is to use an external DSBulk command. + +| +| `--export-bundle=PATH` +| The path to a secure connect bundle to connect to the Origin cluster, if that cluster is a {company} {astra_db} cluster. +Options `--export-host` and `--export-bundle` are mutually exclusive. + +| +| `--export-consistency=CONSISTENCY` +| The consistency level to use when exporting data. +The default is `LOCAL_QUORUM`. + +| +| `--export-dsbulk-option=OPT=VALUE` +| An extra DSBulk option to use when exporting. +Any valid DSBulk option can be specified here, and it will passed as is to the DSBulk process. +DSBulk options, including driver options, must be passed as `--long.option.name=`. +Short options are not supported. + +| +| `--export-host=HOST[:PORT]` +| The host name or IP and, optionally, the port of a node from the Origin cluster. +If the port is not specified, it will default to `9042`. +This option can be specified multiple times. +Options `--export-host` and `--export-bundle` are mutually exclusive. 
+ +| +| `--export-max-concurrent-files=NUM\|AUTO` +| The maximum number of concurrent files to write to. +Must be a positive number or the special value `AUTO`. +The default is `AUTO`. + +| +| `--export-max-concurrent-queries=NUM\|AUTO` +| The maximum number of concurrent queries to execute. +Must be a positive number or the special value `AUTO`. +The default is `AUTO`. + +| +| `--export-max-records=NUM` +| The maximum number of records to export for each table. +Must be a positive number or `-1`. +The default is `-1` (export the entire table). + +| +| `--export-password` +| The password to use to authenticate against the Origin cluster. +Options `--export-username` and `--export-password` must be provided together, or not at all. +Omit the parameter value to be prompted for the password interactively. + +| +| `--export-splits=NUM\|NC` +| The maximum number of token range queries to generate. +Use the `NC` syntax to specify a multiple of the number of available cores. +For example, `8C` = 8 times the number of available cores. +The default is `8C`. +This is an advanced setting; you should rarely need to modify the default value. + +| +| `--export-username=STRING` +| The username to use to authenticate against the Origin cluster. +Options `--export-username` and `--export-password` must be provided together, or not at all. + +| `-h` +| `--help` +| Displays this help text. + +| +| `--import-bundle=PATH` +| The path to a secure connect bundle to connect to the Target cluster, if it's a {company} {astra_db} cluster. +Options `--import-host` and `--import-bundle` are mutually exclusive. + +| +| `--import-consistency=CONSISTENCY` +| The consistency level to use when importing data. +The default is `LOCAL_QUORUM`. + +| +| `--import-default-timestamp=` +| The default timestamp to use when importing data. +Must be a valid instant in ISO-8601 syntax. +The default is `1970-01-01T00:00:00Z`. 
+ +| +| `--import-dsbulk-option=OPT=VALUE` +| An extra DSBulk option to use when importing. +Any valid DSBulk option can be specified here, and it will passed as is to the DSBulk process. +DSBulk options, including driver options, must be passed as `--long.option.name=`. +Short options are not supported. + +| +| `--import-host=HOST[:PORT]` +| The host name or IP and, optionally, the port of a node from the Target cluster. +If the port is not specified, it will default to `9042`. +This option can be specified multiple times. +Options `--import-host` and `--import-bundle` are mutually exclusive. + +| +| `--import-max-concurrent-files=NUM\|AUTO` +| The maximum number of concurrent files to read from. +Must be a positive number or the special value `AUTO`. +The default is `AUTO`. + +| +| `--import-max-concurrent-queries=NUM\|AUTO` +| The maximum number of concurrent queries to execute. +Must be a positive number or the special value `AUTO`. +The default is `AUTO`. + +| +| `--import-max-errors=NUM` +| The maximum number of failed records to tolerate when importing data. +The default is `1000`. +Failed records will appear in a `load.bad` file in the DSBulk operation directory. + +| +| `--import-password` +| The password to use to authenticate against the Target cluster. +Options `--import-username` and `--import-password` must be provided together, or not at all. +Omit the parameter value to be prompted for the password interactively. + +| +| `--import-username=STRING` +| The username to use to authenticate against the Target cluster. Options `--import-username` and `--import-password` must be provided together, or not at all. + +| `-k` +| `--keyspaces=REGEX` +| A regular expression to select keyspaces to migrate. +The default is to migrate all keyspaces except system keyspaces, DSE-specific keyspaces, and the OpsCenter keyspace. +Case-sensitive keyspace names must be entered in their exact case. 
+ +| `-l` +| `--dsbulk-log-dir=PATH` +| The directory where DSBulk should store its logs. +The default is a `logs` subdirectory in the current working directory. +This subdirectory will be created if it does not exist. +Each DSBulk operation will create a subdirectory in the log directory specified here. + +| +| `--max-concurrent-ops=NUM` +| The maximum number of concurrent operations (exports and imports) to carry. +The default is `1`. +Set this to higher values to allow exports and imports to occur concurrently. +For example, with a value of `2`, each table will be imported as soon as it is exported, while the next table is being exported. + +| +| `--skip-truncate-confirmation` +| Skip truncate confirmation before actually truncating tables. +Only applicable when migrating counter tables, ignored otherwise. + +| `-t` +| `--tables=REGEX` +| A regular expression to select tables to migrate. +The default is to migrate all tables in the keyspaces that were selected for migration with `--keyspaces`. +Case-sensitive table names must be entered in their exact case. + +| +| `--table-types=regular\|counter\|all` +| The table types to migrate. +The default is `all`. + +| +| `--truncate-before-export` +| Truncate tables before the export instead of after. +The default is to truncate after the export. +Only applicable when migrating counter tables, ignored otherwise. + +| `-w` +| `--dsbulk-working-dir=PATH` +| The directory where DSBulk should be executed. +Ignored if the embedded DSBulk is being used. +If unspecified, it defaults to the current working directory. + +|=== + + +[[dsbulk-script]] +== Script generation command-line options + +The following options are available for the `generate-script` command. +Most options have sensible default values and do not need to be specified, unless you want to override the default value. + + +[cols="2,8,14"] +|=== + +| `-c` +| `--dsbulk-cmd=CMD` +| The DSBulk command to use. 
+The default is simply `dsbulk`, assuming that the command is available through the `PATH` variable contents. + +| `-d` +| `--data-dir=PATH` +| The directory where data will be exported to and imported from. +The default is a `data` subdirectory in the current working directory. +The data directory will be created if it does not exist. + +| +| `--export-bundle=PATH` +| The path to a secure connect bundle to connect to the Origin cluster, if that cluster is a {company} {astra_db} cluster. +Options `--export-host` and `--export-bundle` are mutually exclusive. + +| +| `--export-consistency=CONSISTENCY` +| The consistency level to use when exporting data. +The default is `LOCAL_QUORUM`. + +| +| `--export-dsbulk-option=OPT=VALUE` +| An extra DSBulk option to use when exporting. +Any valid DSBulk option can be specified here, and it will passed as is to the DSBulk process. +DSBulk options, including driver options, must be passed as `--long.option.name=`. +Short options are not supported. + +| +| `--export-host=HOST[:PORT]` +| The host name or IP and, optionally, the port of a node from the Origin cluster. +If the port is not specified, it will default to `9042`. +This option can be specified multiple times. +Options `--export-host` and `--export-bundle` are mutually exclusive. + +| +| `--export-max-concurrent-files=NUM\|AUTO` +| The maximum number of concurrent files to write to. +Must be a positive number or the special value `AUTO`. +The default is `AUTO`. + +| +| `--export-max-concurrent-queries=NUM\|AUTO` +| The maximum number of concurrent queries to execute. +Must be a positive number or the special value `AUTO`. +The default is `AUTO`. + +| +| `--export-max-records=NUM` +| The maximum number of records to export for each table. +Must be a positive number or `-1`. +The default is `-1` (export the entire table). + +| +| `--export-password` +| The password to use to authenticate against the Origin cluster. 
+Options `--export-username` and `--export-password` must be provided together, or not at all. +Omit the parameter value to be prompted for the password interactively. + +| +| `--export-splits=NUM\|NC` +| The maximum number of token range queries to generate. +Use the `NC` syntax to specify a multiple of the number of available cores. +For example, `8C` = 8 times the number of available cores. +The default is `8C`. +This is an advanced setting. +You should rarely need to modify the default value. + +| +| `--export-username=STRING` +| The username to use to authenticate against the Origin cluster. +Options `--export-username` and `--export-password` must be provided together, or not at all. + +| `-h` +| `--help` +| Displays this help text. + +| +| `--import-bundle=PATH` +| The path to a secure connect bundle to connect to the Target cluster, if it's a {company} {astra_db} cluster. +Options `--import-host` and `--import-bundle` are mutually exclusive. + +| +| `--import-consistency=CONSISTENCY` +| The consistency level to use when importing data. +The default is `LOCAL_QUORUM`. + +| +| `--import-default-timestamp=` +| The default timestamp to use when importing data. +Must be a valid instant in ISO-8601 syntax. +The default is `1970-01-01T00:00:00Z`. + +| +| `--import-dsbulk-option=OPT=VALUE` +| An extra DSBulk option to use when importing. +Any valid DSBulk option can be specified here, and it will passed as is to the DSBulk process. +DSBulk options, including driver options, must be passed as `--long.option.name=`. +Short options are not supported. + +| +| `--import-host=HOST[:PORT]` +| The host name or IP and, optionally, the port of a node from the Target cluster. +If the port is not specified, it will default to `9042`. +This option can be specified multiple times. +Options `--import-host` and `--import-bundle` are mutually exclusive. + +| +| `--import-max-concurrent-files=NUM\|AUTO` +| The maximum number of concurrent files to read from. 
+Must be a positive number or the special value `AUTO`. +The default is `AUTO`. + +| +| `--import-max-concurrent-queries=NUM\|AUTO` +| The maximum number of concurrent queries to execute. +Must be a positive number or the special value `AUTO`. +The default is `AUTO`. + +| +| `--import-max-errors=NUM` +| The maximum number of failed records to tolerate when importing data. +The default is `1000`. +Failed records will appear in a `load.bad` file in the DSBulk operation directory. + +| +| `--import-password` +| The password to use to authenticate against the Target cluster. +Options `--import-username` and `--import-password` must be provided together, or not at all. +Omit the parameter value to be prompted for the password interactively. + +| +| `--import-username=STRING` +| The username to use to authenticate against the Target cluster. +Options `--import-username` and `--import-password` must be provided together, or not at all. + +| `-k` +| `--keyspaces=REGEX` +| A regular expression to select keyspaces to migrate. +The default is to migrate all keyspaces except system keyspaces, DSE-specific keyspaces, and the OpsCenter keyspace. +Case-sensitive keyspace names must be entered in their exact case. + +| `-l` +| `--dsbulk-log-dir=PATH` +| The directory where DSBulk should store its logs. +The default is a `logs` subdirectory in the current working directory. +This subdirectory will be created if it does not exist. +Each DSBulk operation will create a subdirectory in the log directory specified here. + +| `-t` +| `--tables=REGEX` +| A regular expression to select tables to migrate. +The default is to migrate all tables in the keyspaces that were selected for migration with `--keyspaces`. +Case-sensitive table names must be entered in their exact case. + +| +| `--table-types=regular\|counter\|all` +| The table types to migrate. The default is `all`. 
+ +|=== + + + +[[dsbulk-ddl]] +== DDL generation command-line options + +The following options are available for the `generate-ddl` command. +Most options have sensible default values and do not need to be specified, unless you want to override the default value. + +[cols="2,8,14"] +|=== + +| `-a` +| `--optimize-for-astra` +| Produce CQL scripts optimized for {company} {astra_db}. +{astra_db} does not allow some options in DDL statements. +Using this {dsbulk-migrator} command option, forbidden {astra_db} options will be omitted from the generated CQL files. + +| `-d` +| `--data-dir=PATH` +| The directory where data will be exported to and imported from. +The default is a `data` subdirectory in the current working directory. +The data directory will be created if it does not exist. + +| +| `--export-bundle=PATH` +| The path to a secure connect bundle to connect to the Origin cluster, if that cluster is a {company} {astra_db} cluster. +Options `--export-host` and `--export-bundle` are mutually exclusive. + +| +| `--export-host=HOST[:PORT]` +| The host name or IP and, optionally, the port of a node from the Origin cluster. +If the port is not specified, it will default to `9042`. +This option can be specified multiple times. +Options `--export-host` and `--export-bundle` are mutually exclusive. + +| +| `--export-password` +| The password to use to authenticate against the Origin cluster. +Options `--export-username` and `--export-password` must be provided together, or not at all. +Omit the parameter value to be prompted for the password interactively. + +| +| `--export-username=STRING` +| The username to use to authenticate against the Origin cluster. +Options `--export-username` and `--export-password` must be provided together, or not at all. + +| `-h` +| `--help` +| Displays this help text. + +| `-k` +| `--keyspaces=REGEX` +| A regular expression to select keyspaces to migrate. 
+The default is to migrate all keyspaces except system keyspaces, DSE-specific keyspaces, and the OpsCenter keyspace. +Case-sensitive keyspace names must be entered in their exact case. + +| `-t` +| `--tables=REGEX` +| A regular expression to select tables to migrate. +The default is to migrate all tables in the keyspaces that were selected for migration with `--keyspaces`. +Case-sensitive table names must be entered in their exact case. + +| +| `--table-types=regular\|counter\|all` +| The table types to migrate. +The default is `all`. + +|=== + + +[[getting-help-with-dsbulk-migrator]] +== Getting help with {dsbulk-migrator} + +Use the following command to display the available {dsbulk-migrator} commands: + +[source,bash] +---- +java -jar /path/to/dsbulk-migrator-embedded-dsbulk.jar --help +---- + +For individual command help and each one's options: + +[source,bash] +---- +java -jar /path/to/dsbulk-migrator-embedded-dsbulk.jar COMMAND --help +---- \ No newline at end of file diff --git a/modules/ROOT/pages/bulk-migrate.adoc b/modules/ROOT/pages/bulk-migrate.adoc new file mode 100644 index 00000000..75f4989f --- /dev/null +++ b/modules/ROOT/pages/bulk-migrate.adoc @@ -0,0 +1,64 @@ += Migrate data + +With xref:dsbulk-prereqs.adoc[prerequisites installed], it's time to build, test, and run the DSBulk Migrator. + +Building {dsbulk-migrator} is accomplished with Maven. First, clone the git repo to your local machine. +Example: + +[source,bash] +---- +cd ~/github +git clone git@github.com:datastax/dsbulk-migrator.git +cd dsbulk-migrator +---- + +Then run: + +[source,bash] +---- +mvn clean package +---- + +The build produces two distributable fat jars: + +* `dsbulk-migrator--embedded-driver.jar` : contains an embedded Java driver; suitable for live migrations using an external DSBulk or for script generation. +This jar is NOT suitable for live migrations using an embedded DSBulk since no DSBulk classes are present. 
+* `dsbulk-migrator--embedded-dsbulk.jar`: contains an embedded DSBulk and an embedded Java driver and is suitable for all operations. +This jar is much bigger than the previous one due to the presence of DSBulk classes. + +[[testing-dsbulk-migrator]] +== Testing {dsbulk-migrator} + +The project contains a few integration tests. +Run them with: + +[source,bash] +---- +mvn clean verify +---- + +[NOTE] +==== +The integration tests require https://github.com/datastax/simulacron[Simulacron]. +Be sure to meet all the https://github.com/datastax/simulacron#prerequisites[Simulacron prerequisites] before running the +tests. +==== + +[[running-dsbulk-migrator]] +== Running {dsbulk-migrator} + +Launch the {dsbulk-migrator} tool: + +[source,bash] +---- +java -jar /path/to/dsbulk-migrator.jar { migrate-live | generate-script | generate-ddl } [OPTIONS] +---- + +In a live migration, the options effectively configure DSBulk and to connect to the clusters. + +When generating a migration script, most options serve as default values in the generated scripts. +Even when generating scripts, this tool still needs to access the Origin cluster to gather metadata about the tables to migrate. + +When generating a DDL file, only a few options are meaningful. +Because standard DSBulk is not used, and the import cluster is never contacted, import options and DSBulk-related options are ignored. +The tool still needs to access the Origin cluster to gather metadata about the keyspaces and tables for which to generate DDL statements. diff --git a/modules/ROOT/pages/dsbulk-examples.adoc b/modules/ROOT/pages/dsbulk-examples.adoc new file mode 100644 index 00000000..d4990b91 --- /dev/null +++ b/modules/ROOT/pages/dsbulk-examples.adoc @@ -0,0 +1,89 @@ += Case examples + +[NOTE] +==== +These examples show sample `username` and `password` values that are for demonstration purposes only. +Do not use these values in your environment. 
+==== + +== Generate migration script + +Generate a migration script to migrate from an existing Origin cluster to a Target {astra_db} cluster: + +[source,bash] +---- + java -jar target/dsbulk-migrator--embedded-driver.jar migrate-live \ + --data-dir=/path/to/data/dir \ + --dsbulk-cmd=${DSBULK_ROOT}/bin/dsbulk \ + --dsbulk-log-dir=/path/to/log/dir \ + --export-host=my-origin-cluster.com \ + --export-username=user1 \ + --export-password=s3cr3t \ + --import-bundle=/path/to/bundle \ + --import-username=user1 \ + --import-password=s3cr3t +---- + +== Migrate live using external DSBulk install + +Migrate live from an existing Origin cluster to a Target {astra_db} cluster using an external DSBulk installation. +Passwords are prompted interactively: + +[source,bash] +---- + java -jar target/dsbulk-migrator--embedded-driver.jar migrate-live \ + --data-dir=/path/to/data/dir \ + --dsbulk-cmd=${DSBULK_ROOT}/bin/dsbulk \ + --dsbulk-log-dir=/path/to/log/dir \ + --export-host=my-origin-cluster.com \ + --export-username=user1 \ + --export-password # password will be prompted \ + --import-bundle=/path/to/bundle \ + --import-username=user1 \ + --import-password # password will be prompted +---- + +== Migrate live using embedded DSBulk install + +Migrate live from an existing Origin cluster to a Target {astra_db} cluster using the embedded DSBulk installation. +Passwords are prompted interactively. +In this example, additional DSBulk options are passed. + +[NOTE] +==== +You must use the `dsbulk-migrator--embedded-dsbulk.jar` fat jar here. +Otherwise, an error is raised because no embedded DSBulk can be found. 
+==== + +[source,bash] +---- + java -jar target/dsbulk-migrator--embedded-dsbulk.jar migrate-live \ + --data-dir=/path/to/data/dir \ + --dsbulk-use-embedded \ + --dsbulk-log-dir=/path/to/log/dir \ + --export-host=my-origin-cluster.com \ + --export-username=user1 \ + --export-password # password will be prompted \ + --export-dsbulk-option "--connector.csv.maxCharsPerColumn=65536" \ + --export-dsbulk-option "--executor.maxPerSecond=1000" \ + --import-bundle=/path/to/bundle \ + --import-username=user1 \ + --import-password # password will be prompted \ + --import-dsbulk-option "--connector.csv.maxCharsPerColumn=65536" \ + --import-dsbulk-option "--executor.maxPerSecond=1000" +---- + + +== Generate DDL to recreate Origin schema in Target + +Generate DDL files to recreate the Origin schema in a Target {astra_db} cluster: + +[source,bash] +---- + java -jar target/dsbulk-migrator--embedded-driver.jar generate-ddl \ + --data-dir=/path/to/data/dir \ + --export-host=my-origin-cluster.com \ + --export-username=user1 \ + --export-password=s3cr3t \ + --optimize-for-astra +---- diff --git a/modules/ROOT/pages/dsbulk-migrator.adoc b/modules/ROOT/pages/dsbulk-migrator.adoc index b3406605..495da716 100644 --- a/modules/ROOT/pages/dsbulk-migrator.adoc +++ b/modules/ROOT/pages/dsbulk-migrator.adoc @@ -62,16 +62,15 @@ Launch the {dsbulk-migrator} tool: java -jar /path/to/dsbulk-migrator.jar { migrate-live | generate-script | generate-ddl } [OPTIONS] ---- -When doing a live migration, the options are used to effectively configure DSBulk and to connect to +In live migration, the options effectively configure DSBulk and to connect to the clusters. When generating a migration script, most options serve as default values in the generated scripts. -Note however that, even when generating scripts, this tool still needs to access the Origin cluster -in order to gather metadata about the tables to migrate. 
+Even when generating scripts, this tool still needs to access the Origin cluster to gather metadata about the tables to migrate. When generating a DDL file, only a few options are meaningful. Because standard DSBulk is not used, and the import cluster is never contacted, import options and DSBulk-related options are ignored. -The tool still needs to access the Origin cluster in order to gather metadata about the keyspaces and tables for which to generate DDL statements. +The tool still needs to access the Origin cluster to gather metadata about the keyspaces and tables for which to generate DDL statements. [[dsbulk-migrator-reference]] == {dsbulk-migrator} reference @@ -595,6 +594,12 @@ Migrate live from an existing Origin cluster to a Target {astra_db} cluster usin Passwords will be prompted interactively. In this example, additional DSBulk options are passed. +[NOTE] +==== +In this example, you must use the `dsbulk-migrator--embedded-dsbulk.jar` fat jar. +Otherwise, an error is raised because no embedded DSBulk can be found. +==== + [source,bash] ---- java -jar target/dsbulk-migrator--embedded-dsbulk.jar migrate-live \ @@ -613,11 +618,6 @@ In this example, additional DSBulk options are passed. --import-dsbulk-option "--executor.maxPerSecond=1000" ---- -[NOTE] -==== -In the example above, you must use the `dsbulk-migrator--embedded-dsbulk.jar` fat jar. -Otherwise, an error will be raised because no embedded DSBulk can be found. 
-====

=== Generate DDL to recreate Origin schema in Target

Generate DDL files to recreate the Origin schema in a Target {astra_db} cluster:

[source,bash]
----
 java -jar target/dsbulk-migrator--embedded-driver.jar generate-ddl \
 --data-dir=/path/to/data/dir \
 --export-host=my-origin-cluster.com \
 --export-username=user1 \
 --export-password=s3cr3t \
 --optimize-for-astra
-----
+----
\ No newline at end of file
diff --git a/modules/ROOT/pages/dsbulk-prereqs.adoc b/modules/ROOT/pages/dsbulk-prereqs.adoc
new file mode 100644
index 00000000..594a4356
--- /dev/null
+++ b/modules/ROOT/pages/dsbulk-prereqs.adoc
@@ -0,0 +1,10 @@
+= Prerequisites
+
+Use {dsbulk-migrator} to perform simple migration of smaller data quantities, where data validation (other than post-migration row counts) is not necessary. Ensure you have the following prerequisites:
+
+* Install or switch to Java 11.
+* Install https://maven.apache.org/download.cgi[Maven] 3.9.x.
+* Optionally install https://docs.datastax.com/en/dsbulk/docs/installing/install.html[DSBulk Loader, window="_blank"], if you elect to reference your own external installation of DSBulk, instead of the embedded DSBulk that's in {dsbulk-migrator}.
+* Install https://github.com/datastax/simulacron#prerequisites[Simulacron] 0.12.x and its prerequisites, for integration tests.
+
+With prerequisites installed, you can begin to xref:bulk-migrate.adoc[migrate data], use the xref:bulk-command-line.adoc[command-line options], or use the xref:dsbulk-examples.adoc[case examples]!
\ No newline at end of file diff --git a/modules/ROOT/pages/migrate-and-validate-data.adoc b/modules/ROOT/pages/migrate-and-validate-data.adoc index 558cf889..7e339965 100644 --- a/modules/ROOT/pages/migrate-and-validate-data.adoc +++ b/modules/ROOT/pages/migrate-and-validate-data.adoc @@ -8,7 +8,7 @@ This topic introduces two open-source data migration tools that you can use duri For full details, see these topics: * xref:cassandra-data-migrator.adoc[{cstar-data-migrator}] -* xref:dsbulk-migrator.adoc[{dsbulk-migrator}] +* xref:dsbulk-prereqs.adoc[{dsbulk-migrator}] These tools provide sophisticated features that help you migrate your data from any Cassandra **Origin** (Apache Cassandra®, {company} Enterprise (DSE), {company} {astra_db}) to any Cassandra **Target** (Apache Cassandra, DSE, {company} {astra_db}). @@ -87,4 +87,4 @@ For extensive usage and reference details, see xref:cassandra-data-migrator.adoc * `generate-ddl` reads the schema from Origin and generates CQL files to recreate it in an {astra_db} cluster used as Target. -For extensive usage and reference details, see xref:dsbulk-migrator.adoc[{dsbulk-migrator}]. +For extensive usage and reference details, see xref:bulk-command-line.adoc[{dsbulk-migrator} command-line options]. 
From 0246917d130df1d12fef571f972d7406597ef82b Mon Sep 17 00:00:00 2001 From: beajohnson Date: Thu, 25 Apr 2024 07:10:32 -0700 Subject: [PATCH 2/5] getting updates to appear in correct location --- modules/ROOT/nav.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index e99ee75e..5f043914 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -25,7 +25,7 @@ //phase 5 * xref:connect-clients-to-target.adoc[] * DSBulk Migrator -** xref:dsbulk-prereqs.adoc[] +** xref:dsbulk-prereqs.adoc[] ** xref:bulk-migrate.adoc[] ** xref:bulk-command-line.adoc[] ** xref:dsbulk-examples.adoc[] From 50132ef2f9cc8839505dce630d63bebdf76ef1ee Mon Sep 17 00:00:00 2001 From: beajohnson Date: Tue, 4 Jun 2024 12:50:02 -0700 Subject: [PATCH 3/5] pushing changes so they will not be stuck --- modules/ROOT/pages/bulk-command-line.adoc | 48 +++++++++++------------ modules/ROOT/pages/bulk-migrate.adoc | 2 +- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/modules/ROOT/pages/bulk-command-line.adoc b/modules/ROOT/pages/bulk-command-line.adoc index 4858efe4..45e59fe4 100644 --- a/modules/ROOT/pages/bulk-command-line.adoc +++ b/modules/ROOT/pages/bulk-command-line.adoc @@ -24,11 +24,11 @@ The default is simply `dsbulk`, assuming that the command is available through t | `-d` | `--data-dir=PATH` -| The directory where data will be exported to and imported from. +| The directory where data is exported to and imported from. The default is a `data` subdirectory in the current working directory. -The data directory will be created if it does not exist. -Tables will be exported and imported in subdirectories of the data directory specified here. -There will be one subdirectory per keyspace in the data directory, then one subdirectory per table in each keyspace directory. +The data directory is created if it does not exist. +Tables are exported and imported in subdirectories of the data directory specified here. 
+There is one subdirectory per keyspace in the data directory, then one subdirectory per table in each keyspace directory.

| `-e`
| `--dsbulk-use-embedded`
| Use the embedded DSBulk version instead of an external one.
The default is to use an external DSBulk command.
@@ -48,14 +48,14 @@ The default is `LOCAL_QUORUM`.
|
| `--export-dsbulk-option=OPT=VALUE`
| An extra DSBulk option to use when exporting.
-Any valid DSBulk option can be specified here, and it will passed as is to the DSBulk process.
+Any valid DSBulk option can be specified here and is passed unchanged to the DSBulk process.
DSBulk options, including driver options, must be passed as `--long.option.name=`.
Short options are not supported.

|
| `--export-host=HOST[:PORT]`
| The host name or IP and, optionally, the port of a node from the Origin cluster.
-If the port is not specified, it will default to `9042`.
+If the port is not specified, it defaults to `9042`.
This option can be specified multiple times.
Options `--export-host` and `--export-bundle` are mutually exclusive.
@@ -119,14 +119,14 @@ The default is `1970-01-01T00:00:00Z`.
|
| `--import-dsbulk-option=OPT=VALUE`
| An extra DSBulk option to use when importing.
-Any valid DSBulk option can be specified here, and it will passed as is to the DSBulk process.
+Any valid DSBulk option can be specified here and is passed unchanged to the DSBulk process.
DSBulk options, including driver options, must be passed as `--long.option.name=`.
Short options are not supported.

|
| `--import-host=HOST[:PORT]`
| The host name or IP and, optionally, the port of a node from the Target cluster.
-If the port is not specified, it will default to `9042`.
+If the port is not specified, it defaults to `9042`.
This option can be specified multiple times.
Options `--import-host` and `--import-bundle` are mutually exclusive.
@@ -168,15 +168,15 @@ Case-sensitive keyspace names must be entered in their exact case.
| `--dsbulk-log-dir=PATH`
| The directory where DSBulk should store its logs.
The default is a `logs` subdirectory in the current working directory.
-This subdirectory will be created if it does not exist. -Each DSBulk operation will create a subdirectory in the log directory specified here. +This subdirectory is created if it does not exist. +Each DSBulk operation creates a subdirectory in the log directory specified here. | | `--max-concurrent-ops=NUM` | The maximum number of concurrent operations (exports and imports) to carry. The default is `1`. Set this to higher values to allow exports and imports to occur concurrently. -For example, with a value of `2`, each table will be imported as soon as it is exported, while the next table is being exported. +For example, with a value of `2`, each table is imported as soon as it is exported while the next table is being exported. | | `--skip-truncate-confirmation` @@ -226,9 +226,9 @@ The default is simply `dsbulk`, assuming that the command is available through t | `-d` | `--data-dir=PATH` -| The directory where data will be exported to and imported from. +| The directory where data is exported to and imported from. The default is a `data` subdirectory in the current working directory. -The data directory will be created if it does not exist. +The data directory is created if it does not exist. | | `--export-bundle=PATH` @@ -243,14 +243,14 @@ The default is `LOCAL_QUORUM`. | | `--export-dsbulk-option=OPT=VALUE` | An extra DSBulk option to use when exporting. -Any valid DSBulk option can be specified here, and it will passed as is to the DSBulk process. +Any valid DSBulk option can be specified here and is passed unchanged to the DSBulk process. DSBulk options, including driver options, must be passed as `--long.option.name=`. Short options are not supported. | | `--export-host=HOST[:PORT]` | The host name or IP and, optionally, the port of a node from the Origin cluster. -If the port is not specified, it will default to `9042`. +If the port is not specified, it defaults to `9042`. This option can be specified multiple times.
Options `--export-host` and `--export-bundle` are mutually exclusive. @@ -315,14 +315,14 @@ The default is `1970-01-01T00:00:00Z`. | | `--import-dsbulk-option=OPT=VALUE` | An extra DSBulk option to use when importing. -Any valid DSBulk option can be specified here, and it will passed as is to the DSBulk process. +Any valid DSBulk option can be specified here and is passed unchanged to the DSBulk process. DSBulk options, including driver options, must be passed as `--long.option.name=`. Short options are not supported. | | `--import-host=HOST[:PORT]` | The host name or IP and, optionally, the port of a node from the Target cluster. -If the port is not specified, it will default to `9042`. +If the port is not specified, it defaults to `9042`. This option can be specified multiple times. Options `--import-host` and `--import-bundle` are mutually exclusive. @@ -342,7 +342,7 @@ The default is `AUTO`. | `--import-max-errors=NUM` | The maximum number of failed records to tolerate when importing data. The default is `1000`. -Failed records will appear in a `load.bad` file in the DSBulk operation directory. +Failed records appear in a `load.bad` file in the DSBulk operation directory. | | `--import-password` @@ -365,8 +365,8 @@ Case-sensitive keyspace names must be entered in their exact case. | `--dsbulk-log-dir=PATH` | The directory where DSBulk should store its logs. The default is a `logs` subdirectory in the current working directory. -This subdirectory will be created if it does not exist. -Each DSBulk operation will create a subdirectory in the log directory specified here. +This subdirectory is created if it does not exist. +Each DSBulk operation creates a subdirectory in the log directory specified here. | `-t` | `--tables=REGEX` @@ -395,13 +395,13 @@ Most options have sensible default values and do not need to be specified, unles | `--optimize-for-astra` | Produce CQL scripts optimized for {company} {astra_db}.
{astra_db} does not allow some options in DDL statements. -Using this {dsbulk-migrator} command option, forbidden {astra_db} options will be omitted from the generated CQL files. +Using this {dsbulk-migrator} command option, forbidden {astra_db} options are omitted from the generated CQL files. | `-d` | `--data-dir=PATH` -| The directory where data will be exported to and imported from. +| The directory where data is exported to and imported from. The default is a `data` subdirectory in the current working directory. -The data directory will be created if it does not exist. +The data directory is created if it does not exist. | | `--export-bundle=PATH` @@ -411,7 +411,7 @@ Options `--export-host` and `--export-bundle` are mutually exclusive. | | `--export-host=HOST[:PORT]` | The host name or IP and, optionally, the port of a node from the Origin cluster. -If the port is not specified, it will default to `9042`. +If the port is not specified, it defaults to `9042`. This option can be specified multiple times. Options `--export-host` and `--export-bundle` are mutually exclusive. diff --git a/modules/ROOT/pages/bulk-migrate.adoc b/modules/ROOT/pages/bulk-migrate.adoc index 75f4989f..1a4f16b9 100644 --- a/modules/ROOT/pages/bulk-migrate.adoc +++ b/modules/ROOT/pages/bulk-migrate.adoc @@ -60,5 +60,5 @@ When generating a migration script, most options serve as default values in the Even when generating scripts, this tool still needs to access the Origin cluster to gather metadata about the tables to migrate. When generating a DDL file, only a few options are meaningful. -Because standard DSBulk is not used, and the import cluster is never contacted, import options and DSBulk-related options are ignored. +Because standard DSBulk is not used and the import cluster is never contacted, import options and DSBulk-related options are ignored. The tool still needs to access the Origin cluster to gather metadata about the keyspaces and tables for which to generate DDL statements. 
From b139c67fb505b52accec3f8c3ce4b044be145675 Mon Sep 17 00:00:00 2001 From: beajohnson Date: Wed, 5 Jun 2024 08:44:30 -0700 Subject: [PATCH 4/5] added dsbulk pages --- modules/ROOT/nav.adoc | 17 ++++++++++++----- modules/ROOT/pages/dsbulk-prereqs.adoc | 4 +++- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index 5f043914..c67e8ba5 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -24,11 +24,18 @@ * xref:change-read-routing.adoc[] //phase 5 * xref:connect-clients-to-target.adoc[] -* DSBulk Migrator -** xref:dsbulk-prereqs.adoc[] -** xref:bulk-migrate.adoc[] -** xref:bulk-command-line.adoc[] -** xref:dsbulk-examples.adoc[] +* DSBulk Loader +** https://docs.datastax.com/en/dsbulk/installing/install.html[Install DSBulk Loader] +** Migrating Data +*** https://docs.datastax.com/en/dsbulk/getting-started/simple-load.html[Loading data] +*** https://docs.datastax.com/en/dsbulk/getting-started/simple-unload.html[Unloading data] +*** https://docs.datastax.com/en/dsbulk/developing/loading-unloading-vector-data.html[Loading and unloading vector data] +** Examples +*** https://docs.datastax.com/en/dsbulk/reference/load.html[Loading data examples] +*** https://docs.datastax.com/en/dsbulk/reference/unload.html[Unloading data examples] +*** https://docs.datastax.com/en/dsbulk/reference/count-examples.html[Counting data example] + +*** xref:dsbulk-examples.adoc[] * Troubleshooting ** xref:troubleshooting.adoc[] ** xref:troubleshooting-tips.adoc[] diff --git a/modules/ROOT/pages/dsbulk-prereqs.adoc b/modules/ROOT/pages/dsbulk-prereqs.adoc index 594a4356..edd4cc20 100644 --- a/modules/ROOT/pages/dsbulk-prereqs.adoc +++ b/modules/ROOT/pages/dsbulk-prereqs.adoc @@ -1,4 +1,6 @@ -= Prerequisites += Installing DataStax Bulk Loader + +https://docs.datastax.com/en/dsbulk/installing/install.html Use {dsbulk-migrator} to perform simple migration of smaller data quantities, where data validation (other than
post-migration row counts) is not necessary. Enure you have the prerequisites to From abdfd7d4ec9ce5d83f7bdf78ef62c16313778173 Mon Sep 17 00:00:00 2001 From: beajohnson Date: Thu, 6 Jun 2024 12:08:11 -0700 Subject: [PATCH 5/5] updated nav --- modules/ROOT/nav.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index c67e8ba5..7a914f20 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -17,7 +17,7 @@ //phase 2 * xref:migrate-and-validate-data.adoc[] ** xref:cassandra-data-migrator.adoc[] -** xref:dsbulk-migrator.adoc[] +** https://docs.datastax.com/en/dsbulk/installing/install.html[Install DSBulk Loader] //phase 3 * xref:enable-async-dual-reads.adoc[] //phase 4