diff --git a/gen3/bin/kube-setup-data-replicate.sh b/gen3/bin/kube-setup-data-replicate.sh
index ae03d13d4..c31317f23 100644
--- a/gen3/bin/kube-setup-data-replicate.sh
+++ b/gen3/bin/kube-setup-data-replicate.sh
@@ -10,47 +10,31 @@ gen3_load "gen3/lib/kube-setup-init"
 secret_folder="$(gen3_secrets_folder)"
 if [[ -d ${secret_folder}/apis_configs/dcf_dataservice ]]; then
-  if g3kubectl get secret dcf-aws-creds-secret; then
-    g3kubectl delete secret dcf-aws-creds-secret
+  if g3kubectl get secret aws-creds-secret; then
+    g3kubectl delete secret aws-creds-secret
   fi
   if g3kubectl get secret google-creds-secret; then
     g3kubectl delete secret google-creds-secret
   fi
-  if g3kubectl get secret dcf-dataservice-json-secret; then
-    g3kubectl delete secret dcf-dataservice-json-secret
+  if g3kubectl get secret dataservice-settings-secrets; then
+    g3kubectl delete secret dataservice-settings-secrets
   fi
   if g3kubectl get secret dcf-dataservice-settings-secrets; then
     g3kubectl delete secret dcf-dataservice-settings-secrets
   fi
-  if g3kubectl get configmap project-map-manifest; then
-    g3kubectl delete configmap project-map-manifest
-  fi
   if ! hostname="$(gen3 api hostname)"; then
     gen3_log_err "could not determine hostname from manifest-global - bailing out of data refresh setup"
     return 1
   fi
-  cp ${GEN3_MANIFEST_HOME}/${hostname}/manifests/datarefresh/GDC_project_map.json ${secret_folder}/apis_configs/dcf_dataservice/GDC_project_map.json
-
-  GDC_TOKEN=$(cat ${secret_folder}/apis_configs/dcf_dataservice/creds.json | jq '.GDC_TOKEN')
-  INDEXD_CRED=$(cat ${secret_folder}/apis_configs/dcf_dataservice/creds.json | jq '.INDEXD')
-
-  cat >${secret_folder}/apis_configs/dcf_dataservice/dcf_dataservice_settings << EOL
-TOKEN = ${GDC_TOKEN}
-
-INDEXD = ${INDEXD_CRED}
+  cat >${secret_folder}/apis_configs/dcf_dataservice/dcf_dataservice_settings.json << EOL
-DATA_ENDPT = "https://api.gdc.cancer.gov/data/"
-
-PROJECT_ACL = $(cat ${secret_folder}/apis_configs/dcf_dataservice/GDC_project_map.json)
-
-IGNORED_FILES = "/dcf-dataservice/ignored_files_manifest.csv"
 EOL
-  g3kubectl create secret generic dcf-aws-creds-secret --from-file=credentials=${secret_folder}/apis_configs/dcf_dataservice/aws_creds_secret
+  g3kubectl create secret generic aws-creds-secret --from-file=credentials=${secret_folder}/apis_configs/dcf_dataservice/aws_creds_secret
   g3kubectl create secret generic google-creds-secret --from-file=google_service_account_creds=${secret_folder}/apis_configs/dcf_dataservice/gcloud-creds-secret
-  g3kubectl create secret generic dcf-dataservice-json-secret --from-file=dcf_dataservice_credentials.json=${secret_folder}/apis_configs/dcf_dataservice/creds.json
-  g3kubectl create secret generic dcf-dataservice-settings-secrets --from-file=dcf_dataservice_settings=${secret_folder}/apis_configs/dcf_dataservice/dcf_dataservice_settings
-  g3kubectl create configmap project-map-manifest --from-file=GDC_project_map.json=${secret_folder}/apis_configs/dcf_dataservice/GDC_project_map.json
-fi
\ No newline at end of file
+  g3kubectl create secret generic dataservice-settings-secrets --from-file=dataservice_settings.json=${secret_folder}/apis_configs/dcf_dataservice/dataservice_settings.json
+  g3kubectl create secret generic dcf-dataservice-settings-secrets --from-file=dcf_dataservice_settings.json=${secret_folder}/apis_configs/dcf_dataservice/dcf_dataservice_settings.json
+
+fi
diff --git a/kube/services/jobs/aws-bucket-replicate-job.yaml b/kube/services/jobs/aws-bucket-replicate-job.yaml
index d9f0f08ad..09510e307 100644
--- a/kube/services/jobs/aws-bucket-replicate-job.yaml
+++ b/kube/services/jobs/aws-bucket-replicate-job.yaml
@@ -1,4 +1,4 @@
-# Ex. gen3 job run aws-bucket-replicate RELEASE DR16 GDC_BUCKET_NAME mybucket20018 MANIFEST_S3 s3://giang816test/GDC_full_sync_legacy_manifest_20190326_post_DR16.0.tsv THREAD_NUM 5 LOG_BUCKET xssxs CHUNK_SIZE 1
+# Documentation: https://github.com/uc-cdis/cdis-wiki/blob/cf5e68f417993676f6e8c8d8c744e29b984ed7a0/ops/Data-refresh.md
 apiVersion: batch/v1
 kind: Job
 metadata:
@@ -11,73 +11,53 @@ spec:
         app: gen3job
     spec:
       volumes:
-        - name: cred-volume
+        - name: aws-cred-volume
           secret:
-            secretName: "dcf-aws-creds-secret"
+            secretName: "aws-creds-secret"
         - name: setting-volume
           secret:
-            secretName: "dcf-dataservice-settings-secrets"
-        - name: project-map-volume
-          configMap:
-            name: project-map-manifest
-        - name: creds-json-volume
+            secretName: "dataservice-settings-secrets"
+        - name: dcf-setting-volume
           secret:
-            secretName: "dcf-dataservice-json-secret"
+            secretName: "dcf-dataservice-settings-secrets"
       containers:
         - name: datareplicate
          GEN3_DATAREPLICATE_IMAGE
+         imagePullPolicy: Always
          resources:
            limits:
              memory: "32Gi"
            requests:
              cpu: "14"
              memory: "24Gi"
-         imagePullPolicy: Always
          env:
-            - name: GDC_BUCKET_NAME
-              GEN3_GDC_BUCKET_NAME
            - name: RELEASE
              GEN3_RELEASE
-            - name: LOG_BUCKET
-              GEN3_LOG_BUCKET
-            - name: CHUNK_SIZE
-              GEN3_CHUNK_SIZE
            - name: THREAD_NUM
              GEN3_THREAD_NUM
-            - name: MANIFEST_S3
-              GEN3_MANIFEST_S3
-            - name: QUICK_TEST
-              GEN3_QUICK_TEST|-value: "False"-|
-            - name: AUTH_NAMESPACE
-              valueFrom:
-                configMapKeyRef:
-                  name: manifest-global
-                  key: auth_namespace
-                  optional: true
+            - name: SKIP_TO
+              GEN3_SKIP_TO
+            - name: DRY_RUN
+              GEN3_DRY_RUN|-value: "False"-|
          volumeMounts:
-            - name: cred-volume
+            - name: aws-cred-volume
              mountPath: "/root/.aws/credentials"
              subPath: credentials
            - name: "setting-volume"
-              mountPath: "/secrets/dcf_dataservice_settings"
-              subPath: "dcf_dataservice_settings"
-            - name: "project-map-volume"
-              mountPath: "/dcf-dataservice/GDC_project_map.json"
-              subPath: "GDC_project_map.json"
-            - name: "creds-json-volume"
-              mountPath: "/secrets/dcf_dataservice_credentials.json"
-              subPath: "dcf_dataservice_credentials.json"
+              mountPath: "/secrets/dataservice_settings.json"
+              subPath: "dataservice_settings.json"
+            - name: "dcf-setting-volume"
+              mountPath: "/secrets/dcf_dataservice_settings.json"
+              subPath: "dcf_dataservice_settings.json"
          command: ["/bin/bash" ]
          args:
            - "-c"
            - |
-              cat /secrets/dcf_dataservice_settings > ./scripts/settings.py
              echo """
              [default]
              region: us-east-1
              """ > ~/.aws/config
              aws configure set default.s3.max_concurrent_requests 1000
              aws configure set default.s3.max_queue_size 10000
-              python replicate.py aws_replicate --release $RELEASE --quick_test $QUICK_TEST --bucket $GDC_BUCKET_NAME --thread_num $THREAD_NUM --manifest_file $MANIFEST_S3 --global_config "{\"chunk_size\": $CHUNK_SIZE, \"log_bucket\": \"$LOG_BUCKET\"}"
-              # python scripts/replicate.py indexing --thread_num 20 --manifest_file $MANIFEST_S3 --global_config "{\"chunk_size\": 3, \"log_bucket\": \"$LOG_BUCKET\"}"
+              python run.py aws_replicate --release $RELEASE --thread_num $THREAD_NUM --skip_to $SKIP_TO --dry_run $DRY_RUN --global_config "{\"release\": \"$RELEASE\", \"thread_num\": \"$THREAD_NUM\", \"skip_to\": \"$SKIP_TO\", \"dry_run\": \"$DRY_RUN\"}"
      restartPolicy: Never
diff --git a/kube/services/jobs/google-bucket-replicate-job.yaml b/kube/services/jobs/google-bucket-replicate-job.yaml
index f61a47868..8f9cd948b 100644
--- a/kube/services/jobs/google-bucket-replicate-job.yaml
+++ b/kube/services/jobs/google-bucket-replicate-job.yaml
@@ -1,6 +1,4 @@
-# run with
-# Ex. gen3 job run jobs/google-bucket-replicate-job.yaml PROJECT cdis-test-188416 MAX_WORKERS 4 RELEASE DR16 MANIFEST_FILE gs://data-flow-code/input/GDC_full_sync_legacy_manifest_20190326_post_DR16.0.tsv IGNORED_FILE gs://data-flow-code/ignored/ignored_files_manifest.csv LOG_BUCKET data-flow-code
-# Ex. gen3 runjob jobs/google-bucket-replicate-job.yaml PROJECT dcf-prod-buckets MAX_WORKERS 80 RELEASE DR16 MANIFEST_FILE gs://replication-input/GDC_full_sync_active_manifest_20190326_post_DR16.0.tsv IGNORED_FILE gs://replication-input/ignored_files_manifest.csv LOG_BUCKET datarefresh-log
+# Documentation: https://github.com/uc-cdis/cdis-wiki/blob/cf5e68f417993676f6e8c8d8c744e29b984ed7a0/ops/Data-refresh.md
 apiVersion: batch/v1
 kind: Job
 metadata:
@@ -13,75 +11,55 @@ spec:
         app: gen3job
     spec:
       volumes:
-        - name: cred-volume
+        - name: gs-cred-volume
           secret:
             secretName: "google-creds-secret"
         - name: setting-volume
           secret:
-            secretName: "dcf-dataservice-settings-secrets"
-        - name: creds-json-volume
+            secretName: "dataservice-settings-secrets"
+        - name: dcf-setting-volume
           secret:
-            secretName: "dcf-dataservice-json-secret"
+            secretName: "dcf-dataservice-settings-secrets"
       containers:
         - name: datareplicate
          GEN3_DATAREPLICATE_IMAGE
+         imagePullPolicy: Always
          resources:
            limits:
              memory: "32Gi"
            requests:
              cpu: "14"
              memory: "24Gi"
-         imagePullPolicy: Always
          env:
-            - name: PROJECT
-              GEN3_PROJECT
-            - name: MAX_WORKERS
-              GEN3_MAX_WORKERS
            - name: RELEASE
              GEN3_RELEASE
-            - name: MANIFEST_FILE
-              GEN3_MANIFEST_FILE
-            - name: IGNORED_FILE
-              GEN3_IGNORED_FILE
-            - name: LOG_BUCKET
-              GEN3_LOG_BUCKET
-            - name: AUTH_NAMESPACE
-              valueFrom:
-                configMapKeyRef:
-                  name: manifest-global
-                  key: auth_namespace
-                  optional: true
+            - name: THREAD_NUM
+              GEN3_THREAD_NUM
+            - name: SKIP_TO
+              GEN3_SKIP_TO
+            - name: DRY_RUN
+              GEN3_DRY_RUN|-value: "False"-|
          volumeMounts:
-            - name: cred-volume
+            - name: gs-cred-volume
              mountPath: "/secrets/google_service_account_creds"
              subPath: google_service_account_creds
            - name: "setting-volume"
-              mountPath: "/secrets/dcf_dataservice_settings"
-              subPath: "dcf_dataservice_settings"
-            - name: "creds-json-volume"
-              mountPath: "/secrets/dcf_dataservice_credentials.json"
-              subPath: "dcf_dataservice_credentials.json"
+              mountPath: "/secrets/dataservice_settings.json"
+              subPath: "dataservice_settings.json"
+            - name: "dcf-setting-volume"
+              mountPath: "/secrets/dcf_dataservice_settings.json"
+              subPath: "dcf_dataservice_settings.json"
          command: ["/bin/bash" ]
-          args: 
+          args:
            - "-c"
            - |
-              cat /secrets/dcf_dataservice_settings > ./scripts/settings.py
              gcloud auth activate-service-account --key-file=/secrets/google_service_account_creds
              export GOOGLE_APPLICATION_CREDENTIALS=/secrets/google_service_account_creds
              export http_proxy='http://cloud-proxy.internal.io:3128'
              export https_proxy='http://cloud-proxy.internal.io:3128/'
              gsutil cp $IGNORED_FILE /dcf-dataservice/ignored_files_manifest.csv
-              if [[ "$MANIFEST_FILE" == *"active"* ]]; then
-                type="active"
-              elif [[ "$MANIFEST_FILE" == *"legacy"* ]]; then
-                type="legacy"
-              else
-                type="unknown"
-              fi
-              if [[ "$type" == "active" || "$type" == "legacy" ]]; then
+              rand_str="$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 5 | head -n 1)"
-                python dataflow_pipeline.py --runner DataflowRunner --project $PROJECT --job_name dcf-dataservice --autoscaling_algorithm NONE --num_worker $MAX_WORKERS --maxNumWorkers $MAX_WORKERS --staging_location gs://$LOG_BUCKET/$RELEASE/staging --temp_location gs://$LOG_BUCKET/$RELEASE/temp --output gs://$LOG_BUCKET/$RELEASE/$type/output_$rand_str --setup_file ./setup.py --input $MANIFEST_FILE --global_config "{\"release\": \"$RELEASE/$type\", \"log_bucket\": \"$LOG_BUCKET\"}" --requirements_file requirements.txt --extra_package indexclient-1.6.0.zip --requirements_file scripts/requirements.txt
-              else
-                echo "Neither active nor legacy manifest is provided. Please check the manifest name!!!"
-              fi
+              python dataflow_pipeline.py --runner DataflowRunner --project $PROJECT --job_name dcf-dataservice --autoscaling_algorithm NONE --num_worker $MAX_WORKERS --maxNumWorkers $MAX_WORKERS --staging_location gs://$LOG_BUCKET/$RELEASE/staging --temp_location gs://$LOG_BUCKET/$RELEASE/temp --output gs://$LOG_BUCKET/$RELEASE/$type/output_$rand_str --setup_file ./setup.py --input $MANIFEST_FILE --global_config "{\"release\": \"$RELEASE/$type\"}" --requirements_file requirements.txt --requirements_file scripts/requirements.txt
+
      restartPolicy: Never
diff --git a/kube/services/jobs/remove-objects-from-clouds-job.yaml b/kube/services/jobs/remove-objects-from-clouds-job.yaml
index 46aa3d43f..7294e5197 100644
--- a/kube/services/jobs/remove-objects-from-clouds-job.yaml
+++ b/kube/services/jobs/remove-objects-from-clouds-job.yaml
@@ -1,5 +1,4 @@
-# run with
-# gen3 job run jobs/remove-objects-from-clouds-job.yaml RELEASE DR16 MANIFEST_S3 s3://giang816test/GDC_sample_redact_manifest.tsv LOG_BUCKET log_bucket IGNORED_FILE_S3 s3://giang816test/ignored_files_manifest.csv
+# Documentation: https://github.com/uc-cdis/cdis-wiki/blob/cf5e68f417993676f6e8c8d8c744e29b984ed7a0/ops/Data-refresh.md
 apiVersion: batch/v1
 kind: Job
 metadata:
@@ -12,18 +11,18 @@ spec:
         app: gen3job
     spec:
       volumes:
-        - name: cred-volume
+        - name: aws-cred-volume
           secret:
-            secretName: "dcf-aws-creds-secret"
-        - name: google-cred-volume
+            secretName: "aws-creds-secret"
+        - name: gs-cred-volume
           secret:
             secretName: "google-creds-secret"
-        - name: project-map-volume
-          configMap:
-            name: project-map-manifest
-        - name: creds-json-volume
+        - name: setting-volume
           secret:
-            secretName: "dcf-dataservice-json-secret"
+            secretName: "dataservice-settings-secrets"
+        - name: dcf-setting-volume
+          secret:
+            secretName: "dcf-dataservice-settings-secrets"
       containers:
         - name: datareplicate
          GEN3_DATAREPLICATE_IMAGE
@@ -31,27 +30,25 @@ spec:
          env:
            - name: RELEASE
              GEN3_RELEASE
-            - name: LOG_BUCKET
-              GEN3_LOG_BUCKET
-            - name: MANIFEST_S3
-              GEN3_MANIFEST_S3
-            - name: IGNORED_FILE_S3
-              GEN3_IGNORED_FILE_S3
+            - name: THREAD_NUM
+              GEN3_THREAD_NUM
+            - name: SKIP_TO
+              GEN3_SKIP_TO
            - name: DRY_RUN
-              GEN3_DRY_RUN|-value: "True"-|
+              GEN3_DRY_RUN|-value: "False"-|
          volumeMounts:
-            - name: cred-volume
+            - name: aws-cred-volume
              mountPath: "/root/.aws/credentials"
              subPath: credentials
-            - name: google-cred-volume
+            - name: gs-cred-volume
              mountPath: "/secrets/google_service_account_creds"
              subPath: google_service_account_creds
-            - name: "project-map-volume"
-              mountPath: "/dcf-dataservice/GDC_project_map.json"
-              subPath: "GDC_project_map.json"
-            - name: "creds-json-volume"
-              mountPath: "/secrets/dcf_dataservice_credentials.json"
-              subPath: "dcf_dataservice_credentials.json"
+            - name: "setting-volume"
+              mountPath: "/secrets/dataservice_settings.json"
+              subPath: "dataservice_settings.json"
+            - name: "dcf-setting-volume"
+              mountPath: "/secrets/dcf_dataservice_settings.json"
+              subPath: "dcf_dataservice_settings.json"
          command: ["/bin/bash" ]
          args:
            - "-c"
@@ -63,5 +60,5 @@ spec:
              gcloud auth activate-service-account --key-file=/secrets/google_service_account_creds
              export GOOGLE_APPLICATION_CREDENTIALS=/secrets/google_service_account_creds
              aws s3 cp $IGNORED_FILE_S3 /dcf-dataservice/ignored_files_manifest.csv
-              python replicate.py redact --dry_run $DRY_RUN --release $RELEASE --redact_file $MANIFEST_S3 --log_bucket $LOG_BUCKET
+              python run.py redact --release $RELEASE --thread_num $THREAD_NUM --skip_to $SKIP_TO --dry_run $DRY_RUN --global_config "{\"release\": \"$RELEASE\", \"thread_num\": \"$THREAD_NUM\", \"skip_to\": \"$SKIP_TO\", \"dry_run\": \"$DRY_RUN\"}"
      restartPolicy: Never
\ No newline at end of file
diff --git a/kube/services/jobs/replicate-indexing-job.yaml b/kube/services/jobs/replicate-indexing-job.yaml
new file mode 100644
index 000000000..fdb08d95e
--- /dev/null
+++ b/kube/services/jobs/replicate-indexing-job.yaml
@@ -0,0 +1,64 @@
+# Documentation: https://github.com/uc-cdis/cdis-wiki/blob/cf5e68f417993676f6e8c8d8c744e29b984ed7a0/ops/Data-refresh.md
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: replicate-indexing
+spec:
+  # not yet supported - backOffLimit: 3
+  template:
+    metadata:
+      labels:
+        app: gen3job
+    spec:
+      volumes:
+        - name: aws-cred-volume
+          secret:
+            secretName: "aws-creds-secret"
+        - name: gs-cred-volume
+          secret:
+            secretName: "google-creds-secret"
+        - name: setting-volume
+          secret:
+            secretName: "dataservice-settings-secrets"
+        - name: dcf-setting-volume
+          secret:
+            secretName: "dcf-dataservice-settings-secrets"
+      containers:
+        - name: datareplicate
+          GEN3_DATAREPLICATE_IMAGE
+          imagePullPolicy: Always
+          env:
+            - name: RELEASE
+              GEN3_RELEASE
+            - name: THREAD_NUM
+              GEN3_THREAD_NUM
+            - name: SKIP_TO
+              GEN3_SKIP_TO
+            - name: DRY_RUN
+              GEN3_DRY_RUN|-value: "False"-|
+          volumeMounts:
+            - name: aws-cred-volume
+              mountPath: "/root/.aws/credentials"
+              subPath: credentials
+            - name: gs-cred-volume
+              mountPath: "/secrets/google_service_account_creds"
+              subPath: google_service_account_creds
+            - name: "setting-volume"
+              mountPath: "/secrets/dataservice_settings.json"
+              subPath: "dataservice_settings.json"
+            - name: "dcf-setting-volume"
+              mountPath: "/secrets/dcf_dataservice_settings.json"
+              subPath: "dcf_dataservice_settings.json"
+          command: ["/bin/bash" ]
+          args:
+            - "-c"
+            - |
+              echo """
+              [default]
+              region: us-east-1
+              """ > ~/.aws/config
+              gcloud auth activate-service-account --key-file=/secrets/google_service_account_creds
+              export GOOGLE_APPLICATION_CREDENTIALS=/secrets/google_service_account_creds
+              aws s3 cp $IGNORED_FILE_S3 /dcf-dataservice/ignored_files_manifest.csv
+              python run.py index --release $RELEASE --thread_num $THREAD_NUM --skip_to $SKIP_TO --dry_run $DRY_RUN --global_config "{\"release\": \"$RELEASE\", \"thread_num\": \"$THREAD_NUM\", \"skip_to\": \"$SKIP_TO\", \"dry_run\": \"$DRY_RUN\"}"
+      restartPolicy: Never
\ No newline at end of file
diff --git a/kube/services/jobs/replicate-validation-job.yaml b/kube/services/jobs/replicate-validation-job.yaml
index 13f767d69..18efd363b 100644
--- a/kube/services/jobs/replicate-validation-job.yaml
+++ b/kube/services/jobs/replicate-validation-job.yaml
@@ -1,5 +1,4 @@
-# run with
-# Ex. gen3 job run jobs/replicate-validation-job.yaml RELEASE DR16 IGNORED_FILE gs://data-flow-code/ignored/ignored_files_manifest.csv MANIFEST_FILES 's3://giang816test/GDC_full_sync_legacy_manifest_20190326_post_DR16.0.tsv' OUT_FILES 'GDC_full_sync_legacy_manifest_20190326_post_DR16_DCF.tsv' LOG_BUCKET 'xssxs'
+# Documentation: https://github.com/uc-cdis/cdis-wiki/blob/cf5e68f417993676f6e8c8d8c744e29b984ed7a0/ops/Data-refresh.md
 apiVersion: batch/v1
 kind: Job
 metadata:
@@ -14,16 +13,16 @@ spec:
       volumes:
         - name: aws-cred-volume
           secret:
-            secretName: "dcf-aws-creds-secret"
-        - name: cred-volume
+            secretName: "aws-creds-secret"
+        - name: gs-cred-volume
           secret:
             secretName: "google-creds-secret"
         - name: setting-volume
           secret:
-            secretName: "dcf-dataservice-settings-secrets"
-        - name: creds-json-volume
+            secretName: "dataservice-settings-secrets"
+        - name: dcf-setting-volume
           secret:
-            secretName: "dcf-dataservice-json-secret"
+            secretName: "dcf-dataservice-settings-secrets"
       containers:
         - name: datareplicate
          GEN3_DATAREPLICATE_IMAGE
@@ -37,29 +36,23 @@ spec:
          env:
            - name: RELEASE
              GEN3_RELEASE
-            - name: IGNORED_FILE
-              GEN3_IGNORED_FILE
-            - name: MANIFEST_FILES
-              GEN3_MANIFEST_FILES
-            - name: OUT_FILES
-              GEN3_OUT_FILES
-            - name: FORCE_CREATE_MANIFEST
-              GEN3_FORCE_CREATE_MANIFEST
-            - name: LOG_BUCKET
-              GEN3_LOG_BUCKET
+            - name: SKIP_TO
+              GEN3_SKIP_TO
+            - name: DRY_RUN
+              GEN3_DRY_RUN|-value: "False"-|
          volumeMounts:
            - name: aws-cred-volume
              mountPath: "/root/.aws/credentials"
              subPath: credentials
-            - name: cred-volume
+            - name: gs-cred-volume
              mountPath: "/secrets/google_service_account_creds"
              subPath: google_service_account_creds
            - name: "setting-volume"
-              mountPath: "/secrets/dcf_dataservice_settings"
-              subPath: "dcf_dataservice_settings"
-            - name: "creds-json-volume"
-              mountPath: "/secrets/dcf_dataservice_credentials.json"
-              subPath: "dcf_dataservice_credentials.json"
+              mountPath: "/secrets/dataservice_settings.json"
+              subPath: "dataservice_settings.json"
+            - name: "dcf-setting-volume"
+              mountPath: "/secrets/dcf_dataservice_settings.json"
+              subPath: "dcf_dataservice_settings.json"
          command: ["/bin/bash" ]
          args:
            - "-c"
@@ -68,9 +61,8 @@ spec:
              [default]
              region: us-east-1
              """ > ~/.aws/config
-              cat /secrets/dcf_dataservice_settings > ./scripts/settings.py
              gcloud auth activate-service-account --key-file=/secrets/google_service_account_creds
              export GOOGLE_APPLICATION_CREDENTIALS=/secrets/google_service_account_creds
              gsutil cp $IGNORED_FILE /dcf-dataservice/ignored_files_manifest.csv
-              python replicate.py validate --global_config "{\"release\": \"$RELEASE\", \"manifest_files\":\"$MANIFEST_FILES\", \"out_manifests\": \"$OUT_FILES\", \"FORCE_CREATE_MANIFEST\": \"$FORCE_CREATE_MANIFEST\", \"log_bucket\": \"$LOG_BUCKET\", \"save_copied_objects\": 1}"
+              python run.py validate --release $RELEASE --skip_to $SKIP_TO --dry_run $DRY_RUN --global_config "{\"release\": \"$RELEASE\", \"skip_to\": \"$SKIP_TO\", \"dry_run\": \"$DRY_RUN\"}"
      restartPolicy: Never
\ No newline at end of file