From a668567acae9910aff71e7b73665931f08c5084c Mon Sep 17 00:00:00 2001
From: vighnesh-wednesday
Date: Tue, 2 Jan 2024 17:00:18 +0530
Subject: [PATCH] added automation for deployment

---
 .github/workflows/cd.yml        | 17 +++----
 automation/create_glue_job.json | 32 ++++++++++++
 automation/deploy_glue_job.sh   | 87 +++++++++++++++++++++++++++++++++
 automation/update_glue_job.json | 32 ++++++++++++
 jobs/__init__.py                |  0
 jobs/demo.py                    |  0
 scripts/update-job.sh           | 57 ---------------------
 scripts/update-parameters.sh    | 16 ------
 8 files changed, 159 insertions(+), 82 deletions(-)
 create mode 100644 automation/create_glue_job.json
 create mode 100755 automation/deploy_glue_job.sh
 create mode 100644 automation/update_glue_job.json
 create mode 100644 jobs/__init__.py
 create mode 100644 jobs/demo.py
 delete mode 100755 scripts/update-job.sh
 delete mode 100755 scripts/update-parameters.sh

diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml
index 2c82aad..ffb1f3c 100644
--- a/.github/workflows/cd.yml
+++ b/.github/workflows/cd.yml
@@ -22,12 +22,12 @@ jobs:
         with:
           python-version: 3.9
 
-      - run: |
+      - name: Build App Wheel
+        run: |
           pip install setuptools wheel
           python3 setup.py bdist_wheel
 
-      # Step 1: Copy script to S3 bucket
-      - name: Copy script to S3 bucket
+      - name: Setup AWS cli & upload App Wheel to S3
         uses: jakejarvis/s3-sync-action@v0.5.0
         with:
           args: --follow-symlinks
@@ -36,11 +36,10 @@ jobs:
           DEST_DIR: $S3_SCRIPTS_PATH
           AWS_S3_BUCKET: $S3_BUCKET_NAME
 
+      - name: Upload Scripts to S3
+        run: aws s3 cp jobs "s3://$S3_BUCKET_NAME/$S3_SCRIPTS_PATH/" --recursive --region ap-south-1
 
-      - name: Upload Script file to S3
-        run: aws s3 cp ./main.py "s3://$S3_BUCKET_NAME/$S3_SCRIPTS_PATH/" --region ap-south-1
-
-      - name: Update parameters for the job
+      - name: Deploy Jobs on Glue
         run: |
-          scripts/update-parameters.sh
-          scripts/update-job.sh
+          # NOTE(review): confirm AWS_GLUE_ROLE, KAGGLE_KEY and KAGGLE_USERNAME are defined in this workflow's env.
+          automation/deploy_glue_job.sh "$S3_BUCKET_NAME" "$AWS_GLUE_ROLE" "$KAGGLE_KEY" "$KAGGLE_USERNAME"
diff --git a/automation/create_glue_job.json b/automation/create_glue_job.json
new file mode 100644
index 0000000..17e8ea4
--- /dev/null
+++ b/automation/create_glue_job.json
@@ -0,0 +1,32 @@
+{
+    "Name": "samplename",
+    "Description": "",
+    "LogUri": "",
+    "Role": "samplerole",
+    "ExecutionProperty": {
+        "MaxConcurrentRuns": 1
+    },
+    "Command": {
+        "Name": "glueetl",
+        "ScriptLocation": "sample-location",
+        "PythonVersion": "3"
+    },
+    "DefaultArguments": {
+        "--enable-glue-datacatalog": "true",
+        "--job-bookmark-option": "job-bookmark-disable",
+        "--TempDir": "sample-bucket/Logs/temp/",
+        "--enable-metrics": "true",
+        "--extra-py-files": "sample-bucket/scripts/sample-wheel",
+        "--spark-event-logs-path": "sample-bucket/Logs/UILogs/",
+        "--enable-job-insights": "false",
+        "--additional-python-modules": "python-dotenv,kaggle",
+        "--enable-observability-metrics": "true",
+        "--enable-continuous-cloudwatch-log": "true",
+        "--job-language": "python"
+    },
+    "MaxRetries": 0,
+    "Timeout": 10,
+    "WorkerType": "G.1X",
+    "NumberOfWorkers": 2,
+    "GlueVersion": "4.0"
+}
\ No newline at end of file
diff --git a/automation/deploy_glue_job.sh b/automation/deploy_glue_job.sh
new file mode 100755
index 0000000..dbe84e8
--- /dev/null
+++ b/automation/deploy_glue_job.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+#
+# Create or update one AWS Glue job per Python script found in jobs/.
+#
+# Usage (from the repository root):
+#   automation/deploy_glue_job.sh S3_BUCKET ROLE KAGGLE_KEY KAGGLE_USERNAME
+#
+# A job named after the script (without .py) is created from
+# automation/create_glue_job.json when Glue has no job with exactly that name,
+# and updated from automation/update_glue_job.json otherwise.
+# GLUE_READ_PATH, GLUE_WRITE_PATH and KAGGLE_PATH come from ./app/.custom-env.
+
+# Stop at the first failing command so a broken deployment fails the pipeline
+# instead of carrying on with the remaining jobs.
+set -eo pipefail
+
+if [ "$#" -lt 4 ]; then
+    echo "usage: $0 S3_BUCKET ROLE KAGGLE_KEY KAGGLE_USERNAME" >&2
+    exit 1
+fi
+
+s3_bucket="$1"
+role="$2"
+kaggle_key="$3"
+kaggle_username="$4"
+
+source ./app/.custom-env
+
+# Names of the jobs that already exist in Glue, one per line.
+job_names=$(aws glue get-jobs | jq -r '.Jobs | map(.Name)[]')
+
+# jq helper shared by both templates; it fills in every field except the job
+# name. The $NAMES inside are jq variables bound with --arg, not shell variables.
+fill_job='def fill:
+    .Role = $ROLE
+    | .Command.ScriptLocation = $SCRIPT_LOCATION
+    | .DefaultArguments += {
+        "--TempDir": $TEMP_DIR,
+        "--spark-event-logs-path": $EVENT_LOG,
+        "--extra-py-files": $WHEEL,
+        "--KAGGLE_KEY": $KAGGLE_KEY,
+        "--KAGGLE_USERNAME": $KAGGLE_USERNAME,
+        "--GLUE_READ_PATH": $GLUE_READ_PATH,
+        "--GLUE_WRITE_PATH": $GLUE_WRITE_PATH,
+        "--KAGGLE_PATH": $KAGGLE_PATH
+    };'
+
+# Without nullglob an empty jobs/ directory would yield the literal "jobs/*.py".
+shopt -s nullglob
+
+for file in jobs/*.py; do
+    filename=$(basename "$file" .py)
+
+    # jobs/__init__.py is not a job script, so no Glue job is deployed for it.
+    if [ "$filename" = "__init__" ]; then
+        continue
+    fi
+
+    output="automation/output_$filename.json"
+
+    # Values substituted into whichever template is rendered.
+    jq_args=(
+        --arg NAME "$filename"
+        --arg SCRIPT_LOCATION "s3://$s3_bucket/scripts/$filename.py"
+        --arg ROLE "$role"
+        --arg TEMP_DIR "s3://$s3_bucket/Logs/temp/"
+        --arg EVENT_LOG "s3://$s3_bucket/Logs/UILogs/"
+        --arg WHEEL "s3://$s3_bucket/scripts/app-0.9-py3-none-any.whl"
+        --arg KAGGLE_KEY "$kaggle_key"
+        --arg KAGGLE_USERNAME "$kaggle_username"
+        --arg GLUE_READ_PATH "$GLUE_READ_PATH"
+        --arg GLUE_WRITE_PATH "$GLUE_WRITE_PATH"
+        --arg KAGGLE_PATH "$KAGGLE_PATH"
+    )
+
+    # Whole-line, fixed-string match: a substring test would treat "demo" as
+    # already deployed as soon as a job such as "demo_daily" exists.
+    if grep -Fxq -- "$filename" <<< "$job_names"; then
+        jq "${jq_args[@]}" "$fill_job"' .JobName = $NAME | .JobUpdate |= fill' \
+            automation/update_glue_job.json > "$output"
+        aws glue update-job --cli-input-json "file://$output"
+    else
+        jq "${jq_args[@]}" "$fill_job"' .Name = $NAME | fill' \
+            automation/create_glue_job.json > "$output"
+        aws glue create-job --cli-input-json "file://$output"
+    fi
+done
diff --git a/automation/update_glue_job.json b/automation/update_glue_job.json
new file mode 100644
index 0000000..8bff748
--- /dev/null
+++ b/automation/update_glue_job.json
@@ -0,0 +1,32 @@
+{
+    "JobName": "sample-name",
+    "JobUpdate": {
+        "Description": "",
+        "Role": "sample-role",
+        "ExecutionProperty": {
+            "MaxConcurrentRuns": 1
+        },
+        "Command": {
+            "Name": "glueetl",
+            "ScriptLocation": "sample-location",
+            "PythonVersion": "3"
+        },
+        "DefaultArguments": {
+            "--enable-glue-datacatalog": "true",
+            "--job-bookmark-option": "job-bookmark-enable",
+            "--TempDir": "s3://sample-bucket/scripts/temp/",
+            "--enable-metrics": "true",
+            "--enable-spark-ui": "true",
+            "--spark-event-logs-path": "s3://sample-bucket/Logs/UILogs/",
+            "--enable-job-insights": "true",
+            "--additional-python-modules": "python-dotenv,kaggle",
+            "--enable-continuous-cloudwatch-log": "true",
+            "--job-language": "python"
+        },
+        "MaxRetries": 0,
+        "Timeout": 10,
+        "WorkerType": "G.1X",
+        "NumberOfWorkers": 2,
+        "GlueVersion": "4.0"
+    }
+}
\ No newline at end of file
diff --git a/jobs/__init__.py b/jobs/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/jobs/demo.py b/jobs/demo.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/update-job.sh b/scripts/update-job.sh
deleted file mode 100755
index cf7ceee..0000000
--- a/scripts/update-job.sh
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/bin/bash
-
-# Getting job-skeleton.json
-aws glue update-job --generate-cli-skeleton > scripts/job-skeleton.json
-
-# Extract values from updated_job_details.json
-jobName=$(jq -r '.Job.Name' scripts/updated_job_details.json)
-description=$(jq -r '.Job.Description' scripts/updated_job_details.json)
-role=$(jq -r '.Job.Role' scripts/updated_job_details.json)
-maxConcurrentRuns=$(jq -r '.Job.ExecutionProperty.MaxConcurrentRuns' scripts/updated_job_details.json)
-commandName=$(jq -r '.Job.Command.Name' scripts/updated_job_details.json)
-scriptLocation=$(jq -r '.Job.Command.ScriptLocation' scripts/updated_job_details.json)
-pythonVersion=$(jq -r '.Job.Command.PythonVersion' scripts/updated_job_details.json)
-workerType=$(jq -r '.Job.WorkerType' scripts/updated_job_details.json)
-numberOfWorkers=$(jq -r '.Job.NumberOfWorkers' scripts/updated_job_details.json)
-maxRetries=$(jq -r '.Job.MaxRetries' scripts/updated_job_details.json)
-timeout=$(jq -r '.Job.Timeout' scripts/updated_job_details.json)
-maxCapacity=$(jq -r '.Job.MaxCapacity' scripts/updated_job_details.json)
-glueVersion=$(jq -r '.Job.GlueVersion' scripts/updated_job_details.json)
-defaultArguments=$(jq -r '.Job.DefaultArguments' scripts/updated_job_details.json)
-
-# Update update-job-skeleton.json with extracted values (excluding specified keys)
-jq --arg jobName "$jobName" \
-   --arg description "$description" \
-   --arg role "$role" \
-   --argjson maxConcurrentRuns "$maxConcurrentRuns" \
-   --arg commandName "$commandName" \
-   --arg scriptLocation "$scriptLocation" \
-   --arg pythonVersion "$pythonVersion" \
-   --arg workerType "$workerType" \
-   --argjson numberOfWorkers "$numberOfWorkers" \
-   --argjson maxRetries "$maxRetries" \
-   --argjson timeout "$timeout" \
-   --argjson maxCapacity "$maxCapacity" \
-   --arg glueVersion "$glueVersion" \
-   --argjson defaultArguments "$defaultArguments" \
-   '.JobName = $jobName |
-    .JobUpdate.Description = $description |
-    .JobUpdate.Role = $role |
-    .JobUpdate.ExecutionProperty.MaxConcurrentRuns = $maxConcurrentRuns |
-    .JobUpdate.Command.Name = $commandName |
-    .JobUpdate.Command.ScriptLocation = $scriptLocation |
-    .JobUpdate.Command.PythonVersion = $pythonVersion |
-    .JobUpdate.DefaultArguments = $defaultArguments |
-    .JobUpdate.Timeout = $timeout |
-    .JobUpdate.GlueVersion = $glueVersion |
-    .JobUpdate.NumberOfWorkers = $numberOfWorkers |
-    del(.JobUpdate.NonOverridableArguments) |
-    del(.JobUpdate.Connections) |
-    del(.JobUpdate.AllocatedCapacity) |
-    del(.JobUpdate.MaxCapacity) |
-    del(.JobUpdate.SecurityConfiguration) |
-    del(.JobUpdate.NotificationProperty)' scripts/job-skeleton.json > scripts/updated-skeleton.json
-
-
-# Updating the details using updated-skeleton
-aws glue update-job --cli-input-json file://scripts/updated-skeleton.json
\ No newline at end of file
diff --git a/scripts/update-parameters.sh b/scripts/update-parameters.sh
deleted file mode 100755
index 81078ff..0000000
--- a/scripts/update-parameters.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/bash
-
-# Load environment variables from .custom-env file
-source ./app/.custom-env
-
-aws glue get-job --job-name main > scripts/job_details.json
-
-
-# Update JSON file using jq
-jq --arg GLUE_READ_PATH "$GLUE_READ_PATH" \
-   --arg GLUE_WRITE_PATH "$GLUE_WRITE_PATH" \
-   --arg KAGGLE_PATH "$KAGGLE_PATH" \
-   '.Job.DefaultArguments["--GLUE_READ_PATH"] = $GLUE_READ_PATH |
-    .Job.DefaultArguments["--GLUE_WRITE_PATH"] = $GLUE_WRITE_PATH |
-    .Job.DefaultArguments["--KAGGLE_PATH"] = $KAGGLE_PATH' ./scripts/job_details.json > ./scripts/updated_job_details.json
-