diff --git a/README.md b/README.md
index f2835b2..12dd150 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,15 @@
-# cookiecutter-pyspark-aws-emr
+# cookiecutter-pyspark-cloud :cloud:
 
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
 
-Get started on AWS EMR with this [cookiecutter](https://cookiecutter.readthedocs.io/en/1.7.2/usage.html) template!
+Run PySpark code in the 'cloud' with the Amazon Web Services (AWS) Elastic MapReduce (EMR) service in a few simple steps with
+this [cookiecutter](https://cookiecutter.readthedocs.io/en/1.7.2/usage.html) project template!
 
 ## Quickstart
 
 ```
 pip install -U "cookiecutter>=1.7"
-cookiecutter https://github.com/daniel-cortez-stevenson/cookiecutter-pyspark-aws-emr.git
+cookiecutter https://github.com/daniel-cortez-stevenson/cookiecutter-pyspark-cloud.git
 cd *your-repo_name*
 make install
 ```
@@ -18,8 +19,8 @@ make install
 1. Clone this repo:
 
 ```
-git clone https://github.com/daniel-cortez-stevenson/cookiecutter-pyspark-aws-emr.git
-cd cookiecutter-pyspark-aws-emr
+git clone https://github.com/daniel-cortez-stevenson/cookiecutter-pyspark-cloud.git
+cd cookiecutter-pyspark-cloud
 ```
 
 2. Create a Python environment with dependencies installed:
@@ -37,7 +38,7 @@ conda activate cookiecutter
 
 ```
 cd ..
-cookiecutter ./cookiecutter-pyspark-aws-emr
+cookiecutter ./cookiecutter-pyspark-cloud
 ```
 
 5. Initialize git:
@@ -88,6 +89,6 @@ make install-dev
 
 Contributions are welcome!
 
-- [Submit an Issue](https://github.com/daniel-cortez-stevenson/cookiecutter-pyspark-aws-emr/issues/new)
+- [Submit an Issue](https://github.com/daniel-cortez-stevenson/cookiecutter-pyspark-cloud/issues/new)
 
-- [Submit a Pull Request](https://github.com/daniel-cortez-stevenson/cookiecutter-pyspark-aws-emr/compare)
\ No newline at end of file
+- [Submit a Pull Request](https://github.com/daniel-cortez-stevenson/cookiecutter-pyspark-cloud/compare)
\ No newline at end of file
diff --git a/cookiecutter.json b/cookiecutter.json
index 33c70ff..54752b3 100644
--- a/cookiecutter.json
+++ b/cookiecutter.json
@@ -1,8 +1,8 @@
 {
-    "project_name": "pyspark-aws-emr",
-    "repo_name": "{{ cookiecutter.project_name | slugify }}",
-    "package_name": "{{ cookiecutter.repo_name | slugify(separator='_') }}",
-    "s3_bucket": "{{ cookiecutter.repo_name | slugify }}-{{ random_ascii_string(8) | lower }}",
+    "project_name": "PySpark Cloud",
+    "repo_name": "{{cookiecutter.project_name | slugify}}",
+    "package_name": "{{cookiecutter.repo_name | slugify(separator='_')}}",
+    "s3_bucket": "{{cookiecutter.repo_name | slugify}}-{{random_ascii_string(8) | lower}}",
     "_extensions": [
         "cookiecutter.extensions.RandomStringExtension",
         "cookiecutter.extensions.SlugifyExtension"
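For reference (not part of the patch above), a minimal sketch of how the renamed `cookiecutter.json` defaults render, assuming cookiecutter's `SlugifyExtension` delegates to python-slugify and that `random_ascii_string(8)` draws eight ASCII letters; both are assumptions about the extensions listed in `_extensions`:

```python
# Sketch of the rendered template variables for the default project_name
# "PySpark Cloud". Assumes SlugifyExtension wraps python-slugify and
# approximates random_ascii_string(8) with random.choices().
import random
import string

from slugify import slugify  # pip install python-slugify

project_name = "PySpark Cloud"                    # default "project_name"
repo_name = slugify(project_name)                 # -> "pyspark-cloud"
package_name = slugify(repo_name, separator="_")  # -> "pyspark_cloud"

suffix = "".join(random.choices(string.ascii_letters, k=8)).lower()
s3_bucket = f"{slugify(repo_name)}-{suffix}"      # e.g. "pyspark-cloud-qzjmwkpt"

print(repo_name, package_name, s3_bucket)
```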
diff --git a/{{cookiecutter.repo_name}}/README.md b/{{cookiecutter.repo_name}}/README.md
index 4cb38a6..9e4e9a4 100644
--- a/{{cookiecutter.repo_name}}/README.md
+++ b/{{cookiecutter.repo_name}}/README.md
@@ -1,43 +1,75 @@
 # {{cookiecutter.repo_name}}
 
-A PySpark on EMR stater kit - from infrastructure to spark-submit call.
+A 'starter-kit' for PySpark in the cloud :cloud: - from infrastructure to spark-submit call.
 
-## How to use this repo
+## Quickstart
 
-### Install Python libs
+```
+conda create -n {{cookiecutter.repo_name}} -y "python=3.6"
+conda activate {{cookiecutter.repo_name}}
+
+make install
+
+{{cookiecutter.package_name}}
+```
+
+## Usage
+
+### Install a Python 3.6 Environment
 
 ```
 conda create -n {{cookiecutter.repo_name}} -y "python=3.6"
-source activate {{cookiecutter.repo_name}}
+conda activate {{cookiecutter.repo_name}}
+```
+
+### Install {{cookiecutter.package_name}} for Development
+
+```
 make install-dev
 ```
 
-### Create your S3 bucket
+### AWS
+
+#### Store Data and Assets in S3
 
 ```
 aws s3 mb s3://{{cookiecutter.s3_bucket}}
 ```
 
-### Deploy the AWS Cloud :cloud: Infrastrucutre as Code with AWS Cloudformation
+#### Deploy Infrastructure as Code with AWS CloudFormation
+
+Distribute code:
 
-Distribute your EMR bootstrap / step scripts and Python package via S3:
+*make the cluster bootstrap & EMR Step API bash scripts and the PySpark code available via S3*
 
 ```
 make s3dist
 ```
 
+Make keys:
+
+*create the necessary AWS EC2 Key Pairs for the bastion server and master node via the AWS Console*
+
+Example Key Pair names:
+
+- test-{{cookiecutter.repo_name}}-bastion
+
+- test-{{cookiecutter.repo_name}}-emr
+
+Deploy infrastructure:
+
 ```
 aws cloudformation create-stack \
-  --stack-name MyEmrTestXyz123 \
+  --stack-name "{{cookiecutter.project_name | slugify(separator='')}}-{{random_ascii_string(6) | lower}}" \
   --template-body file://./cloudformation/emr-template.yaml \
-  --tags Key=Environment,Value=Test Key=Project,Value=MyPySparkProject \
+  --tags Key=Environment,Value=Test Key=Project,Value={{cookiecutter.project_name | slugify(separator='')}} \
   --parameters \
-    ParameterKey=BastionKeyName,ParameterValue=test-pyspark-aws-emr-bastion \
-    ParameterKey=EmrKeyName,ParameterValue=test-pyspark-aws-emr-emr \
+    ParameterKey=BastionKeyName,ParameterValue=test-{{cookiecutter.repo_name}}-bastion \
+    ParameterKey=EmrKeyName,ParameterValue=test-{{cookiecutter.repo_name}}-emr \
 # for debugging the stack use `--disable-rollback`
 ```
 
-### Submit PySpark code as AWS EMR Steps using the `{{cookiecutter.package_name}}` CLI
+#### Submit PySpark code as AWS EMR Steps using the `{{cookiecutter.package_name}}` CLI
 
 Get help:
 
diff --git a/{{cookiecutter.repo_name}}/setup.py b/{{cookiecutter.repo_name}}/setup.py
index 1312fee..b924f00 100644
--- a/{{cookiecutter.repo_name}}/setup.py
+++ b/{{cookiecutter.repo_name}}/setup.py
@@ -19,7 +19,7 @@ setup(
     name='{{cookiecutter.package_name}}',
     version='0.0.1',
-    description='A PySpark ETL on AWS EMR cookiecutter project generated from cookiecutter-pyspark-aws-emr',
+    description='A PySpark Cloud project generated from cookiecutter-pyspark-cloud.',
     classifiers=[
         'Development Status :: 3 - Alpha',
         'License :: OSI Approved :: Apache Software License',
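A note on the new "Deploy infrastructure" step: `aws cloudformation create-stack` returns before the stack is actually built. Below is a hedged sketch (not part of the patch) of one way to block until the stack finishes and print its outputs using boto3; the stack name is a placeholder for whatever the templated `--stack-name` above renders to.

```python
# Wait for the EMR CloudFormation stack to reach CREATE_COMPLETE, then print
# its outputs. Assumes boto3 is installed and AWS credentials are configured;
# the stack name below is a hypothetical rendered value, not one the template
# guarantees.
import boto3

stack_name = "pysparkcloud-abcdef"  # placeholder for the rendered --stack-name

cfn = boto3.client("cloudformation")

# Blocks until the stack is created; raises a WaiterError on rollback/failure.
cfn.get_waiter("stack_create_complete").wait(StackName=stack_name)

stack = cfn.describe_stacks(StackName=stack_name)["Stacks"][0]
for output in stack.get("Outputs", []):
    print(f"{output['OutputKey']} = {output['OutputValue']}")
```

The same check can be done from the shell with `aws cloudformation wait stack-create-complete --stack-name <name>` followed by `aws cloudformation describe-stacks --stack-name <name>`.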