Skip to content

Commit

Permalink
modified job setup for dataservice
Browse files Browse the repository at this point in the history
add indexing job yaml
  • Loading branch information
MaribelleHGomez committed Oct 12, 2023
1 parent 15824f7 commit 395da07
Show file tree
Hide file tree
Showing 6 changed files with 154 additions and 159 deletions.
36 changes: 10 additions & 26 deletions gen3/bin/kube-setup-data-replicate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,47 +10,31 @@ gen3_load "gen3/lib/kube-setup-init"
secret_folder="$(gen3_secrets_folder)"

if [[ -d ${secret_folder}/apis_configs/dcf_dataservice ]]; then
if g3kubectl get secret dcf-aws-creds-secret; then
g3kubectl delete secret dcf-aws-creds-secret
if g3kubectl get secret aws-creds-secret; then
g3kubectl delete secret aws-creds-secret
fi
if g3kubectl get secret google-creds-secret; then
g3kubectl delete secret google-creds-secret
fi
if g3kubectl get secret dcf-dataservice-json-secret; then
g3kubectl delete secret dcf-dataservice-json-secret
if g3kubectl get secret dataservice-settings-secrets; then
g3kubectl delete secret dataservice-settings-secrets
fi
if g3kubectl get secret dcf-dataservice-settings-secrets; then
g3kubectl delete secret dcf-dataservice-settings-secrets
fi
if g3kubectl get configmap project-map-manifest; then
g3kubectl delete configmap project-map-manifest
fi

if ! hostname="$(gen3 api hostname)"; then
gen3_log_err "could not determine hostname from manifest-global - bailing out of data refresh setup"
return 1
fi

cp ${GEN3_MANIFEST_HOME}/${hostname}/manifests/datarefresh/GDC_project_map.json ${secret_folder}/apis_configs/dcf_dataservice/GDC_project_map.json

GDC_TOKEN=$(cat ${secret_folder}/apis_configs/dcf_dataservice/creds.json | jq '.GDC_TOKEN')
INDEXD_CRED=$(cat ${secret_folder}/apis_configs/dcf_dataservice/creds.json | jq '.INDEXD')

cat >${secret_folder}/apis_configs/dcf_dataservice/dcf_dataservice_settings << EOL
TOKEN = ${GDC_TOKEN}
INDEXD = ${INDEXD_CRED}
cat >${secret_folder}/apis_configs/dcf_dataservice/dcf_dataservice_settings.json << EOL
DATA_ENDPT = "https://api.gdc.cancer.gov/data/"
PROJECT_ACL = $(cat ${secret_folder}/apis_configs/dcf_dataservice/GDC_project_map.json)
IGNORED_FILES = "/dcf-dataservice/ignored_files_manifest.csv"
EOL

g3kubectl create secret generic dcf-aws-creds-secret --from-file=credentials=${secret_folder}/apis_configs/dcf_dataservice/aws_creds_secret
g3kubectl create secret generic aws-creds-secret --from-file=credentials=${secret_folder}/apis_configs/dcf_dataservice/aws_creds_secret
g3kubectl create secret generic google-creds-secret --from-file=google_service_account_creds=${secret_folder}/apis_configs/dcf_dataservice/gcloud-creds-secret
g3kubectl create secret generic dcf-dataservice-json-secret --from-file=dcf_dataservice_credentials.json=${secret_folder}/apis_configs/dcf_dataservice/creds.json
g3kubectl create secret generic dcf-dataservice-settings-secrets --from-file=dcf_dataservice_settings=${secret_folder}/apis_configs/dcf_dataservice/dcf_dataservice_settings
g3kubectl create configmap project-map-manifest --from-file=GDC_project_map.json=${secret_folder}/apis_configs/dcf_dataservice/GDC_project_map.json
fi
g3kubectl create secret generic dataservice-settings-secrets --from-file=dataservice_settings.json=${secret_folder}/apis_configs/dcf_dataservice/dataservice_settings.json
g3kubectl create secret generic dcf-dataservice-settings-secrets --from-file=dcf_dataservice_settings.json=${secret_folder}/apis_configs/dcf_dataservice/dcf_dataservice_settings.json

fi
56 changes: 18 additions & 38 deletions kube/services/jobs/aws-bucket-replicate-job.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Ex. gen3 job run aws-bucket-replicate RELEASE DR16 GDC_BUCKET_NAME mybucket20018 MANIFEST_S3 s3://giang816test/GDC_full_sync_legacy_manifest_20190326_post_DR16.0.tsv THREAD_NUM 5 LOG_BUCKET xssxs CHUNK_SIZE 1
# Documentation: https://github.com/uc-cdis/cdis-wiki/blob/cf5e68f417993676f6e8c8d8c744e29b984ed7a0/ops/Data-refresh.md
apiVersion: batch/v1
kind: Job
metadata:
Expand All @@ -11,73 +11,53 @@ spec:
app: gen3job
spec:
volumes:
- name: cred-volume
- name: aws-cred-volume
secret:
secretName: "dcf-aws-creds-secret"
secretName: "aws-creds-secret"
- name: setting-volume
secret:
secretName: "dcf-dataservice-settings-secrets"
- name: project-map-volume
configMap:
name: project-map-manifest
- name: creds-json-volume
secretName: "dataservice-settings-secrets"
- name: dcf-setting-volume
secret:
secretName: "dcf-dataservice-json-secret"
secretName: "dcf-dataservice-settings-secrets"
containers:
- name: datareplicate
GEN3_DATAREPLICATE_IMAGE
imagePullPolicy: Always
resources:
limits:
memory: "32Gi"
requests:
cpu: "14"
memory: "24Gi"
imagePullPolicy: Always
env:
- name: GDC_BUCKET_NAME
GEN3_GDC_BUCKET_NAME
- name: RELEASE
GEN3_RELEASE
- name: LOG_BUCKET
GEN3_LOG_BUCKET
- name: CHUNK_SIZE
GEN3_CHUNK_SIZE
- name: THREAD_NUM
GEN3_THREAD_NUM
- name: MANIFEST_S3
GEN3_MANIFEST_S3
- name: QUICK_TEST
GEN3_QUICK_TEST|-value: "False"-|
- name: AUTH_NAMESPACE
valueFrom:
configMapKeyRef:
name: manifest-global
key: auth_namespace
optional: true
- name: SKIP_TO
GEN3_SKIP_TO
- name: DRY_RUN
GEN3_DRY_RUN|-value: "False"-|
volumeMounts:
- name: cred-volume
- name: aws-cred-volume
mountPath: "/root/.aws/credentials"
subPath: credentials
- name: "setting-volume"
mountPath: "/secrets/dcf_dataservice_settings"
subPath: "dcf_dataservice_settings"
- name: "project-map-volume"
mountPath: "/dcf-dataservice/GDC_project_map.json"
subPath: "GDC_project_map.json"
- name: "creds-json-volume"
mountPath: "/secrets/dcf_dataservice_credentials.json"
subPath: "dcf_dataservice_credentials.json"
mountPath: "/secrets/dataservice_settings.json"
subPath: "dataservice_settings.json"
- name: "dcf-setting-volume"
mountPath: "/secrets/dcf_dataservice_settings.json"
subPath: "dcf_dataservice_settings.json"
command: ["/bin/bash" ]
args:
- "-c"
- |
cat /secrets/dcf_dataservice_settings > ./scripts/settings.py
echo """
[default]
region: us-east-1
""" > ~/.aws/config
aws configure set default.s3.max_concurrent_requests 1000
aws configure set default.s3.max_queue_size 10000
python replicate.py aws_replicate --release $RELEASE --quick_test $QUICK_TEST --bucket $GDC_BUCKET_NAME --thread_num $THREAD_NUM --manifest_file $MANIFEST_S3 --global_config "{\"chunk_size\": $CHUNK_SIZE, \"log_bucket\": \"$LOG_BUCKET\"}"
# python scripts/replicate.py indexing --thread_num 20 --manifest_file $MANIFEST_S3 --global_config "{\"chunk_size\": 3, \"log_bucket\": \"$LOG_BUCKET\"}"
# Run the AWS replication. Every flag is fed from the container env vars declared for this job;
# --dry_run must read $DRY_RUN (shell variable names are case-sensitive, and lowercase $dry_run is never set).
python run.py aws_replicate --release $RELEASE --thread_num $THREAD_NUM --skip_to $SKIP_TO --dry_run $DRY_RUN --global_config "{\"release\": \"$RELEASE\", \"thread_num\": \"$THREAD_NUM\", \"skip_to\": \"$SKIP_TO\", \"dry_run\": \"$DRY_RUN\"}"
restartPolicy: Never
66 changes: 22 additions & 44 deletions kube/services/jobs/google-bucket-replicate-job.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
# run with
# Ex. gen3 job run jobs/google-bucket-replicate-job.yaml PROJECT cdis-test-188416 MAX_WORKERS 4 RELEASE DR16 MANIFEST_FILE gs://data-flow-code/input/GDC_full_sync_legacy_manifest_20190326_post_DR16.0.tsv IGNORED_FILE gs://data-flow-code/ignored/ignored_files_manifest.csv LOG_BUCKET data-flow-code
# Ex. gen3 runjob jobs/google-bucket-replicate-job.yaml PROJECT dcf-prod-buckets MAX_WORKERS 80 RELEASE DR16 MANIFEST_FILE gs://replication-input/GDC_full_sync_active_manifest_20190326_post_DR16.0.tsv IGNORED_FILE gs://replication-input/ignored_files_manifest.csv LOG_BUCKET datarefresh-log
# Documentation: https://github.com/uc-cdis/cdis-wiki/blob/cf5e68f417993676f6e8c8d8c744e29b984ed7a0/ops/Data-refresh.md
apiVersion: batch/v1
kind: Job
metadata:
Expand All @@ -13,75 +11,55 @@ spec:
app: gen3job
spec:
volumes:
- name: cred-volume
- name: gs-cred-volume
secret:
secretName: "google-creds-secret"
- name: setting-volume
secret:
secretName: "dcf-dataservice-settings-secrets"
- name: creds-json-volume
secretName: "dataservice-settings-secrets"
- name: dcf-setting-volume
secret:
secretName: "dcf-dataservice-json-secret"
secretName: "dcf-dataservice-settings-secrets"
containers:
- name: datareplicate
GEN3_DATAREPLICATE_IMAGE
imagePullPolicy: Always
resources:
limits:
memory: "32Gi"
requests:
cpu: "14"
memory: "24Gi"
imagePullPolicy: Always
env:
- name: PROJECT
GEN3_PROJECT
- name: MAX_WORKERS
GEN3_MAX_WORKERS
- name: RELEASE
GEN3_RELEASE
- name: MANIFEST_FILE
GEN3_MANIFEST_FILE
- name: IGNORED_FILE
GEN3_IGNORED_FILE
- name: LOG_BUCKET
GEN3_LOG_BUCKET
- name: AUTH_NAMESPACE
valueFrom:
configMapKeyRef:
name: manifest-global
key: auth_namespace
optional: true
- name: THREAD_NUM
GEN3_THREAD_NUM
- name: SKIP_TO
GEN3_SKIP_TO
- name: DRY_RUN
GEN3_DRY_RUN|-value: "False"-|
volumeMounts:
- name: cred-volume
- name: gs-cred-volume
mountPath: "/secrets/google_service_account_creds"
subPath: google_service_account_creds
- name: "setting-volume"
mountPath: "/secrets/dcf_dataservice_settings"
subPath: "dcf_dataservice_settings"
- name: "creds-json-volume"
mountPath: "/secrets/dcf_dataservice_credentials.json"
subPath: "dcf_dataservice_credentials.json"
mountPath: "/secrets/dataservice_settings.json"
subPath: "dataservice_settings.json"
- name: "dcf-setting-volume"
mountPath: "/secrets/dcf_dataservice_settings.json"
subPath: "dcf_dataservice_settings.json"
command: ["/bin/bash" ]
args:
args:
- "-c"
- |
cat /secrets/dcf_dataservice_settings > ./scripts/settings.py
gcloud auth activate-service-account --key-file=/secrets/google_service_account_creds
export GOOGLE_APPLICATION_CREDENTIALS=/secrets/google_service_account_creds
export http_proxy='http://cloud-proxy.internal.io:3128'
export https_proxy='http://cloud-proxy.internal.io:3128/'
gsutil cp $IGNORED_FILE /dcf-dataservice/ignored_files_manifest.csv
if [[ "$MANIFEST_FILE" == *"active"* ]]; then
type="active"
elif [[ "$MANIFEST_FILE" == *"legacy"* ]]; then
type="legacy"
else
type="unknown"
fi
if [[ "$type" == "active" || "$type" == "legacy" ]]; then
rand_str="$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 5 | head -n 1)"
python dataflow_pipeline.py --runner DataflowRunner --project $PROJECT --job_name dcf-dataservice --autoscaling_algorithm NONE --num_worker $MAX_WORKERS --maxNumWorkers $MAX_WORKERS --staging_location gs://$LOG_BUCKET/$RELEASE/staging --temp_location gs://$LOG_BUCKET/$RELEASE/temp --output gs://$LOG_BUCKET/$RELEASE/$type/output_$rand_str --setup_file ./setup.py --input $MANIFEST_FILE --global_config "{\"release\": \"$RELEASE/$type\", \"log_bucket\": \"$LOG_BUCKET\"}" --requirements_file requirements.txt --extra_package indexclient-1.6.0.zip --requirements_file scripts/requirements.txt
else
echo "Neither active nor legacy manifest is provided. Please check the manifest name!!!"
fi
# Launch the Dataflow replication pipeline.
# NOTE(review): this command expands $PROJECT, $MAX_WORKERS, $LOG_BUCKET, $MANIFEST_FILE, $type and $rand_str;
# confirm each is still defined (env entry or earlier assignment in this script) after this change.
# NOTE(review): --requirements_file is passed twice; confirm which of the two files is intended to take effect.
python dataflow_pipeline.py --runner DataflowRunner --project $PROJECT --job_name dcf-dataservice --autoscaling_algorithm NONE --num_worker $MAX_WORKERS --maxNumWorkers $MAX_WORKERS --staging_location gs://$LOG_BUCKET/$RELEASE/staging --temp_location gs://$LOG_BUCKET/$RELEASE/temp --output gs://$LOG_BUCKET/$RELEASE/$type/output_$rand_str --setup_file ./setup.py --input $MANIFEST_FILE --global_config "{\"release\": \"$RELEASE/$type\"}" --requirements_file requirements.txt --requirements_file scripts/requirements.txt
restartPolicy: Never
49 changes: 23 additions & 26 deletions kube/services/jobs/remove-objects-from-clouds-job.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
# run with
# gen3 job run jobs/remove-objects-from-clouds-job.yaml RELEASE DR16 MANIFEST_S3 s3://giang816test/GDC_sample_redact_manifest.tsv LOG_BUCKET log_bucket IGNORED_FILE_S3 s3://giang816test/ignored_files_manifest.csv
# Documentation: https://github.com/uc-cdis/cdis-wiki/blob/cf5e68f417993676f6e8c8d8c744e29b984ed7a0/ops/Data-refresh.md
apiVersion: batch/v1
kind: Job
metadata:
Expand All @@ -12,46 +11,44 @@ spec:
app: gen3job
spec:
volumes:
- name: cred-volume
- name: aws-cred-volume
secret:
secretName: "dcf-aws-creds-secret"
- name: google-cred-volume
secretName: "aws-creds-secret"
- name: gs-cred-volume
secret:
secretName: "google-creds-secret"
- name: project-map-volume
configMap:
name: project-map-manifest
- name: creds-json-volume
- name: setting-volume
secret:
secretName: "dcf-dataservice-json-secret"
secretName: "dataservice-settings-secrets"
- name: dcf-setting-volume
secret:
secretName: "dcf-dataservice-settings-secrets"
containers:
- name: datareplicate
GEN3_DATAREPLICATE_IMAGE
imagePullPolicy: Always
env:
- name: RELEASE
GEN3_RELEASE
- name: LOG_BUCKET
GEN3_LOG_BUCKET
- name: MANIFEST_S3
GEN3_MANIFEST_S3
- name: IGNORED_FILE_S3
GEN3_IGNORED_FILE_S3
- name: THREAD_NUM
GEN3_THREAD_NUM
- name: SKIP_TO
GEN3_SKIP_TO
- name: DRY_RUN
GEN3_DRY_RUN|-value: "True"-|
GEN3_DRY_RUN|-value: "False"-|
volumeMounts:
- name: cred-volume
- name: aws-cred-volume
mountPath: "/root/.aws/credentials"
subPath: credentials
- name: google-cred-volume
- name: gs-cred-volume
mountPath: "/secrets/google_service_account_creds"
subPath: google_service_account_creds
- name: "project-map-volume"
mountPath: "/dcf-dataservice/GDC_project_map.json"
subPath: "GDC_project_map.json"
- name: "creds-json-volume"
mountPath: "/secrets/dcf_dataservice_credentials.json"
subPath: "dcf_dataservice_credentials.json"
- name: "setting-volume"
mountPath: "/secrets/dataservice_settings.json"
subPath: "dataservice_settings.json"
- name: "dcf-setting-volume"
mountPath: "/secrets/dcf_dataservice_settings.json"
subPath: "dcf_dataservice_settings.json"
command: ["/bin/bash" ]
args:
- "-c"
Expand All @@ -63,5 +60,5 @@ spec:
gcloud auth activate-service-account --key-file=/secrets/google_service_account_creds
export GOOGLE_APPLICATION_CREDENTIALS=/secrets/google_service_account_creds
aws s3 cp $IGNORED_FILE_S3 /dcf-dataservice/ignored_files_manifest.csv
python replicate.py redact --dry_run $DRY_RUN --release $RELEASE --redact_file $MANIFEST_S3 --log_bucket $LOG_BUCKET
# Run the redaction. Every flag is fed from the container env vars declared for this job;
# --dry_run must read $DRY_RUN (shell variable names are case-sensitive, and lowercase $dry_run is never set).
python run.py redact --release $RELEASE --thread_num $THREAD_NUM --skip_to $SKIP_TO --dry_run $DRY_RUN --global_config "{\"release\": \"$RELEASE\", \"thread_num\": \"$THREAD_NUM\", \"skip_to\": \"$SKIP_TO\", \"dry_run\": \"$DRY_RUN\"}"
restartPolicy: Never
Loading

0 comments on commit 395da07

Please sign in to comment.