sparkConf.properties
# Origin cluster credentials (use "host + port" OR "secure-connect-bundle" but not both)
spark.origin.host localhost
spark.origin.port 9042
#spark.origin.scb file:///aaa/bbb/secure-connect-enterprise.zip
spark.origin.username some-username
spark.origin.password some-secret-password
spark.origin.keyspaceTable test.a1
# Target cluster credentials (use "host + port" OR "secure-connect-bundle" but not both)
#spark.target.host localhost
#spark.target.port 9042
spark.target.scb file:///aaa/bbb/secure-connect-enterprise.zip
spark.target.username client-id
spark.target.password client-secret
spark.target.keyspaceTable test.a2
# Add 'missing' rows (during 'Validation') in 'Target' from 'Origin'. N/A for 'Migration'
spark.target.autocorrect.missing false
# Update 'mismatched' rows (during 'Validation') in 'Target' to match 'Origin'. N/A for 'Migration'
spark.target.autocorrect.mismatch false
# Read & write rate limits (rows/second). Higher values improve performance but put more load on the cluster
spark.readRateLimit 20000
spark.writeRateLimit 20000
# Used to split the Cassandra token range into slices; slices are migrated one at a time in random order
# 10K splits usually work for tables up to 100GB (uncompressed) with a balanced token distribution
# For larger tables, test on 1% of the volume (using the coveragePercent param) and increase the number of splits as needed
spark.numSplits 10000
# Use a value of 1 (disables batching) when the primary key and partition key are the same
# For tables with a high average number of rows per partition, use a higher value to improve performance
spark.batchSize 10
# The 'query' properties below are set based on the table schema (see the worked example after the data-type legend)
spark.query.origin comma-separated-partition-key,comma-separated-clustering-key,comma-separated-other-columns
spark.query.origin.partitionKey comma-separated-partition-key
spark.query.target.id comma-separated-partition-key,comma-separated-clustering-key
# Comma-separated numeric data-type mapping (e.g. 'text' maps to '0') for all columns listed in "spark.query.origin"
spark.query.types 9,1,4,3
#############################################################################################################
# The following are the supported type codes and the Cassandra data types they map to
# 0: ascii, text, varchar
# 1: int
# 2: bigint, counter
# 3: double
# 4: timestamp
# 5: map (separate type by %) - Example: 5%1%0 for map<int, text>
# 6: list (separate type by %) - Example: 6%0 for list<text>
# 7: blob
# 8: set (separate type by %) - Example: 8%0 for set<text>
# 9: uuid, timeuuid
# 10: boolean
# 11: tuple
# 12: float
# 13: tinyint
# 14: decimal
# 15: date
# 16: UDT [any user-defined-type created using 'CREATE TYPE']
# 17: varint
# 18: time
# 19: smallint
# Note: Ignore "Frozen" while mapping Collections (Map/List/Set) - Example: 5%1%0 for frozen<map<int, text>>
#############################################################################################################
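# Worked example (hypothetical, for illustration only - the column names below are not from this repo):
# given an origin table  CREATE TABLE test.a1 (id uuid, seq int, created timestamp, amount double, PRIMARY KEY (id, seq))
# the query properties above would be set as
#   spark.query.origin              id,seq,created,amount
#   spark.query.origin.partitionKey id
#   spark.query.target.id           id,seq
#   spark.query.types               9,1,4,3   (uuid, int, timestamp, double per the legend above)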
# ENABLE ONLY IF COLUMN NAMES ON TARGET ARE DIFFERENT FROM ORIGIN (SCHEMA & DATA-TYPES MUST BE THE SAME)
#spark.query.target comma-separated-partition-key,comma-separated-clustering-key,comma-separated-other-columns
# The tool adds TTL & Writetime at row-level (not field-level).
# The largest TTL & Writetime values are used if multiple indexes are listed (comma separated)
# Comma-separated column indexes from "spark.query.origin" used to find the largest TTL or Writetime
spark.query.ttl.cols 2,3
spark.query.writetime.cols 2,3
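# For the hypothetical test.a1 example above, indexes 2,3 would (assuming 0-based positions in
# "spark.query.origin") point at the regular columns 'created' and 'amount'; primary-key columns
# carry no TTL or Writetime in Cassandra, so only regular columns should be listed here.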
# ENABLE ONLY IF YOU WANT TO MIGRATE/VALIDATE ROWS BASED ON CQL FILTER
#spark.query.condition
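# Hypothetical example only - the exact fragment expected (a CQL clause appended to the origin
# read query) may vary by tool version, so verify against the documentation before using:
#spark.query.condition AND created > '2023-01-01 00:00:00+0000'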
# ENABLE ONLY IF IT IS A COUNTER TABLE
#spark.counterTable false
#spark.counterTable.cql
#spark.counterTable.cql.index 0
# ENABLE ONLY IF YOU WANT TO FILTER BASED ON WRITE-TIME (values must be in microseconds)
#spark.origin.writeTimeStampFilter false
#spark.origin.minWriteTimeStampFilter 0
#spark.origin.maxWriteTimeStampFilter 4102444800000000
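# Writetime values are microseconds since epoch: the max above (4102444800000000) is
# 2100-01-01 00:00:00 UTC. As a hypothetical example, keeping only rows written on or after
# 2023-01-01 00:00:00 UTC (epoch 1672531200 seconds) would use:
#spark.origin.writeTimeStampFilter true
#spark.origin.minWriteTimeStampFilter 1672531200000000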
# ENABLE ONLY IF retries are needed (retries a token-range slice if an exception occurs)
#spark.maxRetries 0
# ENABLE ONLY IF YOU WANT TO MIGRATE/VALIDATE SOME % OF ROWS (NOT 100%)
#spark.coveragePercent 100
# ENABLE ONLY IF YOU WANT TO LOG STATS MORE OR LESS FREQUENTLY THAN THE DEFAULT
#spark.printStatsAfter 100000
# ENABLE ONLY IF YOU WANT TO USE READ AND/OR WRITE CONSISTENCY OTHER THAN LOCAL_QUORUM
#spark.consistency.read LOCAL_QUORUM
#spark.consistency.write LOCAL_QUORUM
# ENABLE ONLY IF YOU WANT TO REDUCE FETCH-SIZE TO AVOID FrameTooLongException
#spark.read.fetch.sizeInRows 1000
# ENABLE ONLY IF YOU WANT TO USE CUSTOM FIXED WRITETIME VALUE ON TARGET
#spark.target.custom.writeTime 0
# ENABLE ONLY TO SKIP records larger than 10MB from Origin (to avoid the Astra guardrail error)
#spark.fieldGuardraillimitMB 10
# ENABLE ONLY IF a count of records larger than 10MB in Origin is needed
#spark.origin.checkTableforColSize false
#spark.origin.checkTableforColSize.cols partition-key,clustering-key
#spark.origin.checkTableforColSize.cols.types 9,1
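# If enabled, the two properties above list the key columns and their type codes from the legend;
# e.g. for the hypothetical test.a1 schema sketched earlier: cols 'id,seq' with types '9,1' (uuid, int).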
# ENABLE ONLY IF you want to filter data from Origin
#spark.origin.FilterData false
#spark.origin.FilterColumn test
#spark.origin.FilterColumnIndex 2
#spark.origin.FilterColumnType 6%16
#spark.origin.FilterColumnValue test
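# The sample values above are placeholders: FilterColumnIndex is the filter column's position
# (its index in "spark.query.origin"), and FilterColumnType uses the numeric codes from the
# legend, e.g. 6%16 denotes a list of a user-defined type. Verify against your tool version.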
# ONLY USE if SSL is enabled on origin Cassandra/DSE (e.g. Azure Cosmos Cassandra DB)
#spark.origin.ssl.enabled true
# ONLY USE if SSL clientAuth is enabled on origin Cassandra/DSE
#spark.origin.trustStore.path
#spark.origin.trustStore.password
#spark.origin.trustStore.type JKS
#spark.origin.keyStore.path
#spark.origin.keyStore.password
#spark.origin.enabledAlgorithms TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA
# ONLY USE if SSL is enabled on target Cassandra/DSE
#spark.target.ssl.enabled true
# ONLY USE if SSL clientAuth is enabled on target Cassandra/DSE
#spark.target.trustStore.path
#spark.target.trustStore.password
#spark.target.trustStore.type JKS
#spark.target.keyStore.path
#spark.target.keyStore.password
#spark.target.enabledAlgorithms TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA
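# Hypothetical filled-in example (paths and passwords below are placeholders, not defaults):
#spark.target.trustStore.path /path/to/client.truststore.jks
#spark.target.trustStore.password some-truststore-password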