From 1316fded1781cd608dcb5b0ed7fc56c54220a6b1 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Fri, 5 Jan 2024 11:28:16 +0530 Subject: [PATCH 01/12] added init docker example file --- README.md | 2 +- automation/init_docker_image_example.sh | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 automation/init_docker_image_example.sh diff --git a/README.md b/README.md index 02750bc..8dc214e 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ To run the same ETL code in multiple cloud services based on your preference, th 2. Give your s3, adlas & kaggle (optional) paths in the ```app/.custom-env``` file. -3. Just run a Glue 4 docker conatiner & write your transformations in ```jobs``` folder. Refer ```demo.py``` file. Install dependancies using ```pip install -r requirements.txt``` +3. Just run a Glue 4 docker conatiner & write your transformations in ```jobs``` folder. Refer ```demo.py``` file. Install dependancies using ```pip install -r requirements.txt```. Refer ```automation/init_docker_image.sh``` write your own path for the docker image. 4. Run your scirpts in the docker container locally using ```spark-sumbit jobs/main.py``` diff --git a/automation/init_docker_image_example.sh b/automation/init_docker_image_example.sh new file mode 100644 index 0000000..16b11b9 --- /dev/null +++ b/automation/init_docker_image_example.sh @@ -0,0 +1,5 @@ +docker run -it -v :/home/glue_user/.aws -v :/home/glue_user/workspace/ -e AWS_PROFILE=default -e DISABLE_SSL=true --rm -p 4040:4040 -p 18080:18080 --name glue_pyspark amazon/aws-glue-libs:glue_libs_4.0.0_image_01 + +export KAGGLE_KEY=MOCKKEY +export KAGGLE_USERNAME=MOCKEUSERNAME +pip3 install -r requirements.txt From 929d7597262925b59e6cf6c19f85b7fab2217265 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Sat, 6 Jan 2024 15:35:30 +0530 Subject: [PATCH 02/12] added glue online parameter --- automation/create_glue_job.json | 3 ++- automation/update_glue_job.json | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/automation/create_glue_job.json b/automation/create_glue_job.json index 17e8ea4..1588727 100644 --- a/automation/create_glue_job.json +++ b/automation/create_glue_job.json @@ -22,7 +22,8 @@ "--additional-python-modules": "python-dotenv,kaggle", "--enable-observability-metrics": "true", "--enable-continuous-cloudwatch-log": "true", - "--job-language": "python" + "--job-language": "python", + "--JOB_NAME": "online" }, "MaxRetries": 0, "Timeout": 10, diff --git a/automation/update_glue_job.json b/automation/update_glue_job.json index 8bff748..99e5916 100644 --- a/automation/update_glue_job.json +++ b/automation/update_glue_job.json @@ -20,7 +20,8 @@ "--spark-event-logs-path": "s3://sample-bucket/Logs/UILogs/", "--enable-job-insights": "true", "--enable-continuous-cloudwatch-log": "true", - "--job-language": "python" + "--job-language": "python", + "--JOB_NAME": "online" }, "MaxRetries": 0, "Timeout": 10, From b51acd888d405503fad887ff42ee3e58fbbcf7c9 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Mon, 8 Jan 2024 11:09:14 +0530 Subject: [PATCH 03/12] removed job name param --- automation/create_glue_job.json | 3 +-- automation/update_glue_job.json | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/automation/create_glue_job.json b/automation/create_glue_job.json index 1588727..17e8ea4 100644 --- a/automation/create_glue_job.json +++ b/automation/create_glue_job.json @@ -22,8 +22,7 @@ "--additional-python-modules": "python-dotenv,kaggle", 
"--enable-observability-metrics": "true", "--enable-continuous-cloudwatch-log": "true", - "--job-language": "python", - "--JOB_NAME": "online" + "--job-language": "python" }, "MaxRetries": 0, "Timeout": 10, diff --git a/automation/update_glue_job.json b/automation/update_glue_job.json index 99e5916..8bff748 100644 --- a/automation/update_glue_job.json +++ b/automation/update_glue_job.json @@ -20,8 +20,7 @@ "--spark-event-logs-path": "s3://sample-bucket/Logs/UILogs/", "--enable-job-insights": "true", "--enable-continuous-cloudwatch-log": "true", - "--job-language": "python", - "--JOB_NAME": "online" + "--job-language": "python" }, "MaxRetries": 0, "Timeout": 10, From 7cc2fe5f2067be2b67590a4b31bea4932261a064 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Wed, 10 Jan 2024 12:20:08 +0530 Subject: [PATCH 04/12] changed .env path --- jobs/demo.py | 2 +- jobs/main.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jobs/demo.py b/jobs/demo.py index 618e959..a3e4469 100644 --- a/jobs/demo.py +++ b/jobs/demo.py @@ -2,7 +2,7 @@ from dotenv import load_dotenv import app.environment as env -load_dotenv("app/.custom-env") +load_dotenv("../app/.custom-env") # COMMAND ---------- diff --git a/jobs/main.py b/jobs/main.py index 5aef691..7cd9bf4 100644 --- a/jobs/main.py +++ b/jobs/main.py @@ -9,7 +9,7 @@ import app.environment as env import app.spark_wrapper as sw -load_dotenv("app/.custom_env") +load_dotenv("../app/.custom_env") # COMMAND ---------- From 1ae7e49ece6e9e09a716ae67e75679a67166dc8f Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Wed, 10 Jan 2024 13:06:00 +0530 Subject: [PATCH 05/12] removed kaggle keys --- automation/init_docker_image_example.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/automation/init_docker_image_example.sh b/automation/init_docker_image_example.sh index 16b11b9..96a2c2d 100644 --- a/automation/init_docker_image_example.sh +++ b/automation/init_docker_image_example.sh @@ -1,5 +1,5 @@ docker run -it -v :/home/glue_user/.aws -v :/home/glue_user/workspace/ -e AWS_PROFILE=default -e DISABLE_SSL=true --rm -p 4040:4040 -p 18080:18080 --name glue_pyspark amazon/aws-glue-libs:glue_libs_4.0.0_image_01 -export KAGGLE_KEY=MOCKKEY -export KAGGLE_USERNAME=MOCKEUSERNAME +export PYTHONPATH=$PYTHONPATH:/home/glue_user/workspace + pip3 install -r requirements.txt From cf0c00762765bcc19343a2bbc634338018e61de5 Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Thu, 11 Jan 2024 11:23:52 +0530 Subject: [PATCH 06/12] added new folder --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 5f4ce98..2f6683d 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ *__pycache__ temp htmlcov +.vscode From ba5595438bb5b80c8ab2b97a41fd2d02f4ea9acc Mon Sep 17 00:00:00 2001 From: vighnesh_wednesday Date: Thu, 11 Jan 2024 11:24:53 +0530 Subject: [PATCH 07/12] fixed typo --- README.md | 16 ++++++++-------- app/connect_databricks.py | 2 +- jobs/main.py | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 8dc214e..4465de4 100644 --- a/README.md +++ b/README.md @@ -12,11 +12,11 @@ To run the same ETL code in multiple cloud services based on your preference, th ## Requirements for Azure Databricks (for local connect only) - [Unity Catalog](https://learn.microsoft.com/en-us/azure/databricks/data-governance/unity-catalog/enable-workspaces) enabled workspace. 
-- [Databricks Connect](https://learn.microsoft.com/en-us/azure/databricks/dev-tools/databricks-connect/python/install) configured on local machine. Runing cluster.
+- [Databricks Connect](https://learn.microsoft.com/en-us/azure/databricks/dev-tools/databricks-connect/python/install) configured on local machine. Running cluster.

## Requirements for AWS Glue (local setup)

-- For Unix-based systems you can refer: [Data Enginnering Onboarding Starter Setup](https://github.com/wednesday-solutions/Data-Engineering-Onboarding-Starter#setup)
+- For Unix-based systems you can refer: [Data Engineering Onboarding Starter Setup](https://github.com/wednesday-solutions/Data-Engineering-Onboarding-Starter#setup)

- For Windows-based systems you can refer: [AWS Glue Developing using a Docker image](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-libraries.html#develop-local-docker-image)

@@ -24,13 +24,13 @@ To run the same ETL code in multiple cloud services based on your preference, th

1. Clone this repo in your own repo.

-2. Give your s3, adlas & kaggle (optional) paths in the ```app/.custom-env``` file.
+2. Give your S3, ADLS & Kaggle (optional) paths in the ```app/.custom-env``` file.

-3. Just run a Glue 4 docker conatiner & write your transformations in ```jobs``` folder. Refer ```demo.py``` file. Install dependancies using ```pip install -r requirements.txt```. Refer ```automation/init_docker_image.sh``` write your own path for the docker image.
+3. Just run a Glue 4 docker container & write your transformations in ```jobs``` folder. Refer ```demo.py``` file. Install dependencies using ```pip install -r requirements.txt```. Refer ```automation/init_docker_image.sh``` write your own path for the docker image.

-4. Run your scirpts in the docker container locally using ```spark-sumbit jobs/main.py```
+4. Run your scripts in the docker container locally using ```spark-submit jobs/main.py```

-## Deployemnt
+## Deployment

1. In your GitHub Actions Secrets, set up the following keys with their values:
```
@@ -54,7 +54,7 @@ To run the same ETL code in multiple cloud services based on your preference, th

## Documentation

-[Multi-cloud Pipeline Documnentation](https://docs.google.com/document/d/1npCpT_FIpw7ZuxAzQrEH3IsPKCDt7behmF-6VjrSFoQ/edit?usp=sharing)
+[Multi-cloud Pipeline Documentation](https://docs.google.com/document/d/1npCpT_FIpw7ZuxAzQrEH3IsPKCDt7behmF-6VjrSFoQ/edit?usp=sharing)

## References

@@ -67,4 +67,4 @@ To run the same ETL code in multiple cloud services based on your preference, th
coverage run --source=app -m unittest discover -s tests
coverage report

-Note that awsglue libraries are not availabe to download, so use AWS Glue 4 Docker container.
+Note that AWS Glue libraries are not available to download, so use AWS Glue 4 Docker container.
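As a concrete illustration of the test commands above (a sketch only, assuming the Glue container is running under the name ```glue_pyspark``` from ```automation/init_docker_image_example.sh```, with the project mounted at ```/home/glue_user/workspace```):

```
# From a second terminal on the host, open a shell in the running Glue container.
docker exec -it glue_pyspark bash

# Inside the container, run the suite from the mounted project root.
cd /home/glue_user/workspace
coverage run --source=app -m unittest discover -s tests
coverage report
```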
diff --git a/app/connect_databricks.py b/app/connect_databricks.py
index 93b2c26..62c05e6 100644
--- a/app/connect_databricks.py
+++ b/app/connect_databricks.py
@@ -14,7 +14,7 @@ def create_mount(dbutils, container_name, mount_path):
f"fs.azure.account.key.{storage_name}.blob.core.windows.net": storage_key
},
)
- print(f"{mount_path} Mount Successfull")
+ print(f"{mount_path} Mount Successful")
else:
dbutils.fs.refreshMounts()
print(f"{mount_path} Already mounted")
diff --git a/jobs/main.py b/jobs/main.py
index 7cd9bf4..a731bd8 100644
--- a/jobs/main.py
+++ b/jobs/main.py
@@ -196,7 +196,7 @@ def get_cond(type1, type2):
# COMMAND ----------

-# finally writting the data in transformed container
+# finally writing the data in transformed container
df.coalesce(1).write.csv(write_path + "final_data.csv", header=True, mode="overwrite")

print("Execution Complete")

From 27f0621678017eb3f479817627aad3e81c4487f4 Mon Sep 17 00:00:00 2001
From: vighnesh_wednesday
Date: Thu, 11 Jan 2024 13:07:07 +0530
Subject: [PATCH 08/12] simplified steps

---
README.md | 12 +++++++-----
1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 4465de4..1bfc44d 100644
--- a/README.md
+++ b/README.md
@@ -22,13 +22,15 @@ To run the same ETL code in multiple cloud services based on your preference, th

## Steps

-1. Clone this repo in your own repo.
+1. Clone this repo in your own repo. For Windows, we recommend using WSL.

-2. Give your S3, ADLS & Kaggle (optional) paths in the ```app/.custom-env``` file.
+2. Give your S3, ADLS & Kaggle (optional) paths in the ```app/.custom_env``` file for Databricks. Make a ```.env``` file in the root folder for local Docker Glue to use.

-3. Just run a Glue 4 docker container & write your transformations in ```jobs``` folder. Refer ```demo.py``` file. Install dependencies using ```pip install -r requirements.txt```. Refer ```automation/init_docker_image.sh``` write your own path for the docker image.
+3. Run ```automation/init_docker_image.sh``` passing your AWS credentials location & project root location. If you are using Windows PowerShell or Command Prompt, run the commands manually by copy-pasting.

-4. Run your scripts in the docker container locally using ```spark-submit jobs/main.py```
+4. Write your jobs in the ```jobs``` folder. Refer to the ```demo.py``` file. One example is the ```jobs/main.py``` file.
+
+5. Check that your setup is correct by running scripts in the docker container locally using ```spark-submit jobs/demo.py```. Make sure you see the "Execution Complete" statement printed.

## Deployment

@@ -41,7 +43,7 @@ To run the same ETL code in multiple cloud services based on your preference, th
AWS_REGION
AWS_GLUE_ROLE
```
- Rest all the key-value pairs in the ```app/.custom-env``` file are passed using aws cli using ```cd.yml``` file, so no need to pass them manually in the job.
+ Make sure you pass all the remaining key-value pairs that you wrote in your .env file using the ```automation/deploy_glue_jobs.sh``` file.

2. For Azure Databricks, make a workflow with the link of your repo & main file.
Pass the following parameters with their correct values:

From cfacd305ec391d49f13a66ca716a72f5695b3dd2 Mon Sep 17 00:00:00 2001
From: vighnesh_wednesday
Date: Thu, 11 Jan 2024 13:09:29 +0530
Subject: [PATCH 09/12] added detail on kaggle

---
README.md | 1 +
1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 1bfc44d..3d2e721 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,7 @@ To run the same ETL code in multiple cloud services based on your preference, th
1. Clone this repo in your own repo. For Windows, we recommend using WSL.

2. Give your S3, ADLS & Kaggle (optional) paths in the ```app/.custom_env``` file for Databricks. Make a ```.env``` file in the root folder for local Docker Glue to use.
+Make sure to pass KAGGLE_KEY & KAGGLE_USERNAME values if you are going to use Kaggle. Otherwise, set the kaggle_extraction flag to False.

3. Run ```automation/init_docker_image.sh``` passing your AWS credentials location & project root location. If you are using Windows PowerShell or Command Prompt, run the commands manually by copy-pasting.

From f993433e0aab1b31d5009e382a9c29bec25d3e2f Mon Sep 17 00:00:00 2001
From: vighnesh_wednesday
Date: Thu, 11 Jan 2024 13:10:35 +0530
Subject: [PATCH 10/12] kaggle extraction false

---
jobs/demo.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/jobs/demo.py b/jobs/demo.py
index a3e4469..8a2ce6f 100644
--- a/jobs/demo.py
+++ b/jobs/demo.py
@@ -18,7 +18,7 @@
# fmt: off

# Keep this flag True if you want to extract data from kaggle, else False
-kaggle_extraction = True
+kaggle_extraction = False

[employee, insurance, vendor] = env.get_data(databricks, kaggle_extraction, dbutils, spark) #pylint: disable=unbalanced-tuple-unpacking
@@ -28,3 +28,6 @@
# COMMAND ----------

# Write all your transformations below:
+
+
+print("\nExecution Complete\n")
\ No newline at end of file

From 7858e9a83023cb0f933942ed9bf43dab53775d46 Mon Sep 17 00:00:00 2001
From: vighnesh_wednesday
Date: Thu, 11 Jan 2024 13:11:36 +0530
Subject: [PATCH 11/12] made executable

---
automation/init_docker.sh | 8 ++++++++
automation/init_docker_image_example.sh | 5 -----
2 files changed, 8 insertions(+), 5 deletions(-)
create mode 100644 automation/init_docker.sh
delete mode 100644 automation/init_docker_image_example.sh

diff --git a/automation/init_docker.sh b/automation/init_docker.sh
new file mode 100644
index 0000000..c039af9
--- /dev/null
+++ b/automation/init_docker.sh
@@ -0,0 +1,8 @@
+aws_credentials="$1"
+project_root_location="$2"
+
+docker run -it -v $aws_credentials:/home/glue_user/.aws -v $project_root_location:/home/glue_user/workspace/ -e AWS_PROFILE=default -e DISABLE_SSL=true --rm -p 4040:4040 -p 18080:18080 --name glue_pyspark amazon/aws-glue-libs:glue_libs_4.0.0_image_01
+
+export PYTHONPATH=$PYTHONPATH:/home/glue_user/workspace
+
+pip3 install -r requirements.txt
diff --git a/automation/init_docker_image_example.sh b/automation/init_docker_image_example.sh
deleted file mode 100644
index 96a2c2d..0000000
--- a/automation/init_docker_image_example.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-docker run -it -v :/home/glue_user/.aws -v :/home/glue_user/workspace/ -e AWS_PROFILE=default -e DISABLE_SSL=true --rm -p 4040:4040 -p 18080:18080 --name glue_pyspark amazon/aws-glue-libs:glue_libs_4.0.0_image_01
-
-export PYTHONPATH=$PYTHONPATH:/home/glue_user/workspace
-
-pip3 install -r requirements.txt

From 84b505e42e534aa2eb914dc99337cc6ac7034edf Mon Sep 17 00:00:00 2001
From: vighnesh_wednesday
Date: Fri, 12 Jan 2024 12:12:37 +0530
Subject: [PATCH 12/12] fixed typo

---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3d2e721..00ffb3f 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ To run the same ETL code in multiple cloud services based on your preference, th
2. Give your S3, ADLS & Kaggle (optional) paths in the ```app/.custom_env``` file for Databricks. Make a ```.env``` file in the root folder for local Docker Glue to use.
Make sure to pass KAGGLE_KEY & KAGGLE_USERNAME values if you are going to use Kaggle. Otherwise, set the kaggle_extraction flag to False.

-3. Run ```automation/init_docker_image.sh``` passing your AWS credentials location & project root location. If you are using Windows PowerShell or Command Prompt, run the commands manually by copy-pasting.
+3. Run ```automation/init_docker.sh``` passing your AWS credentials location & project root location. If you are using Windows PowerShell or Command Prompt, run the commands manually by copy-pasting.

4. Write your jobs in the ```jobs``` folder. Refer to the ```demo.py``` file. One example is the ```jobs/main.py``` file.
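For reference, a minimal end-to-end local run with these scripts might look like the sketch below. It is illustrative only: the ```~/.aws``` and ```$(pwd)``` arguments are placeholders, and because ```automation/init_docker.sh``` starts an interactive container via ```docker run -it```, its trailing ```export PYTHONPATH``` and ```pip3 install``` lines only execute on the host after the container exits, so in practice you may need to repeat them inside the container shell.

```
# On the host: start the Glue 4 container (credentials directory first,
# project root second, matching the positional arguments in init_docker.sh).
bash automation/init_docker.sh ~/.aws "$(pwd)"

# Inside the container shell that docker run opens:
export PYTHONPATH=$PYTHONPATH:/home/glue_user/workspace
pip3 install -r requirements.txt
spark-submit jobs/demo.py   # success check: prints "Execution Complete"
```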