From 112d2649eedabe4cc31c5e7625ec84f62b5004d2 Mon Sep 17 00:00:00 2001 From: Carlos Giraldo Date: Mon, 8 Oct 2018 13:36:51 +0200 Subject: [PATCH] Added Dockerfiles to build pnda docker images --- docker/deploy.sh | 2 +- docker/docker-compose.yml | 43 +- docker/dockerfiles/jupyter/Dockerfile | 60 ++ docker/dockerfiles/jupyter/build-docker.sh | 4 + docker/dockerfiles/jupyter/data_generator.py | 114 +++ .../jupyter/docker/create_notebook_dir.sh | 19 + .../dockerfiles/jupyter/docker/entrypoint.sh | 4 + .../jupyter/docker/hdfs_root_uri_conf.diff | 16 + .../PNDA minimal SqlMagic notebook.ipynb | 57 ++ .../notebooks/PNDA minimal notebook.ipynb | 98 +++ ...le Platform-library PySpark Notebook.ipynb | 659 ++++++++++++++++++ .../docker/platform-libraries-env-conf.diff | 23 + .../jupyter/docker/platformlibs.ini.tpl | 6 + .../jupyter/docker/pyspark2_kernel.json.tpl | 21 + .../app-packages-requirements.txt | 14 + .../requirements-jupyter-extensions.txt | 3 + .../requirements/requirements-jupyter.txt | 44 ++ .../requirements/requirements-jupyterhub.txt | 15 + docker/dockerfiles/platform-console-backend | 1 + docker/dockerfiles/platform-console-frontend | 1 + docker/dockerfiles/platform-data-mgmnt | 1 + .../dockerfiles/platform-deployment-manager | 1 + docker/dockerfiles/platform-gobblin-modules | 1 + .../dockerfiles/platform-package-repository | 1 + .../dockerfiles/platform-testing/Dockerfile | 33 + .../platform-testing/build-docker.sh | 4 + .../platform-testing/entrypoint.sh.tpl | 16 + .../platform-testing/hbase_spark_metric.py | 26 + .../platform-testing/jinja_entrypoint.sh | 4 + 29 files changed, 1288 insertions(+), 3 deletions(-) create mode 100644 docker/dockerfiles/jupyter/Dockerfile create mode 100755 docker/dockerfiles/jupyter/build-docker.sh create mode 100644 docker/dockerfiles/jupyter/data_generator.py create mode 100755 docker/dockerfiles/jupyter/docker/create_notebook_dir.sh create mode 100755 docker/dockerfiles/jupyter/docker/entrypoint.sh create mode 100644 docker/dockerfiles/jupyter/docker/hdfs_root_uri_conf.diff create mode 100644 docker/dockerfiles/jupyter/docker/notebooks/PNDA minimal SqlMagic notebook.ipynb create mode 100644 docker/dockerfiles/jupyter/docker/notebooks/PNDA minimal notebook.ipynb create mode 100644 docker/dockerfiles/jupyter/docker/notebooks/tutorial/Example Platform-library PySpark Notebook.ipynb create mode 100644 docker/dockerfiles/jupyter/docker/platform-libraries-env-conf.diff create mode 100644 docker/dockerfiles/jupyter/docker/platformlibs.ini.tpl create mode 100644 docker/dockerfiles/jupyter/docker/pyspark2_kernel.json.tpl create mode 100644 docker/dockerfiles/jupyter/docker/requirements/app-packages-requirements.txt create mode 100644 docker/dockerfiles/jupyter/docker/requirements/requirements-jupyter-extensions.txt create mode 100644 docker/dockerfiles/jupyter/docker/requirements/requirements-jupyter.txt create mode 100644 docker/dockerfiles/jupyter/docker/requirements/requirements-jupyterhub.txt create mode 160000 docker/dockerfiles/platform-console-backend create mode 160000 docker/dockerfiles/platform-console-frontend create mode 160000 docker/dockerfiles/platform-data-mgmnt create mode 160000 docker/dockerfiles/platform-deployment-manager create mode 160000 docker/dockerfiles/platform-gobblin-modules create mode 160000 docker/dockerfiles/platform-package-repository create mode 100644 docker/dockerfiles/platform-testing/Dockerfile create mode 100755 docker/dockerfiles/platform-testing/build-docker.sh create mode 100644 
docker/dockerfiles/platform-testing/entrypoint.sh.tpl create mode 100644 docker/dockerfiles/platform-testing/hbase_spark_metric.py create mode 100755 docker/dockerfiles/platform-testing/jinja_entrypoint.sh diff --git a/docker/deploy.sh b/docker/deploy.sh index 163c5a3..638b780 100755 --- a/docker/deploy.sh +++ b/docker/deploy.sh @@ -4,11 +4,11 @@ echo "---------------- STARTING HDFS and HBASE ----------------" docker-compose up -d zookeeper docker-compose up -d hdfs-namenode docker-compose up -d hdfs-datanode -docker-compose up -d hbase-master while ! docker exec -ti hdfs-namenode nc -vz hdfs-namenode:8020 ; do echo "waiting for hdfs-namenode to start" sleep 2 done +docker-compose up -d hbase-master docker-compose up -d hbase-region echo "---------------- ADDING users to HDFS ----------------" diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index c91591a..619b99f 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -1,9 +1,13 @@ -version: '3' +version: '3.4' services: gobblin: container_name: gobblin hostname: gobblin image: pnda/gobblin:0.11.0-0.1.0 + build: + context: ./dockerfiles/platform-gobblin-modules + args: + version: 0.1.0 environment: - HDFS_URL=hdfs://hdfs-namenode:8020 - MASTER_DATASET_DIRECTORY=/user/pnda/PNDA_datasets/datasets @@ -47,6 +51,10 @@ services: container_name: jupyter hostname: jupyter image: pnda/jupyter:4.4.0 + build: + context: ./dockerfiles/jupyter + args: + version: 4.4.0 volumes: - jupyter-home:/home environment: @@ -76,6 +84,10 @@ services: container_name: deployment-manager hostname: deployment-manager image: pnda/deployment-manager:1.0.0 + build: + context: ./dockerfiles/platform-deployment-manager + args: + version: 1.0.0 environment: - JUPYTER_HOST=jupyter - DATA_LOGGER_URL=http://console-backend:3001 #data-logger uses the data-manager network stack @@ -101,6 +113,10 @@ services: container_name: package-repository hostname: package-repository image: pnda/package-repository:0.3.2 + build: + context: ./dockerfiles/platform-package-repository + args: + version: 0.3.2 environment: - FS_LOCATION_PATH=/mnt/packages - DATA_LOGGER_URL=http://console-backend:3001 #data-logger uses the data-manager network stack @@ -199,7 +215,11 @@ services: platform-testing: container_name: platform-testing hostname: platform-testing - image: pnda/testing:0.5.0 + image: pnda/platform-testing:0.5.0 + build: + context: ./dockerfiles/platform-testing + args: + version: 0.5.0 environment: - CONSOLE_HOSTS=console-backend:3001 - ZOOKEEPERS=zookeeper:2181 @@ -214,6 +234,10 @@ services: container_name: console-frontend hostname: console-frontend image: pnda/console-frontend:1.0.0 + build: + context: ./dockerfiles/platform-console-frontend + args: + version: 1.0.0 environment: - DATA_MANAGER_HOST=console-backend - DATA_MANAGER_PORT=3123 @@ -226,6 +250,11 @@ services: container_name: console-backend hostname: console-backend image: pnda/console-backend-data-manager:1.0.0 + build: + context: ./dockerfiles/platform-console-backend + args: + version: 1.0.0 + target: console-backend-data-manager environment: - CONSOLE_FRONTEND_HOSTS_CSV=console-frontend - DATASET_MANAGER_URL=http://data-service:7000 @@ -234,6 +263,11 @@ services: container_name: console-backend-data-logger network_mode: service:console-backend image: pnda/console-backend-data-logger:1.0.0 + build: + context: ./dockerfiles/platform-console-backend + args: + version: 1.0.0 + target: console-backend-data-logger redis: container_name: redis network_mode: service:console-backend @@ -242,6 
+276,11 @@ services: container_name: data-service hostname: data-service image: pnda/data-service:0.2.2 + build: + context: ./dockerfiles/platform-data-mgmnt + args: + version: 0.2.2 + target: data-service environment: - LOCATION=/user/pnda/PNDA_datasets/datasets - HADOOP_DISTRO=env diff --git a/docker/dockerfiles/jupyter/Dockerfile b/docker/dockerfiles/jupyter/Dockerfile new file mode 100644 index 0000000..54618f9 --- /dev/null +++ b/docker/dockerfiles/jupyter/Dockerfile @@ -0,0 +1,60 @@ +FROM alpine:3.7 as platformlibs + +LABEL maintainer="cgiraldo@gradiant.org" +LABEL organization="gradiant.org" + +COPY docker/hdfs_root_uri_conf.diff / +RUN apk add --no-cache git bash python py2-pip && pip install setuptools +RUN git clone https://github.com/pndaproject/platform-libraries.git +RUN cd platform-libraries && git checkout tags/release/4.0 && \ + export VERSION=$(git describe --tags) && \ + git apply /hdfs_root_uri_conf.diff && \ + python setup.py bdist_egg + +FROM alpine:3.7 + +COPY --from=platformlibs /platform-libraries/dist/platformlibs-0.1.5-py2.7.egg / +COPY docker / +ENV SPARK_HOME=/opt/spark + +RUN apk add --no-cache bash python2 py2-pip postgresql-dev libpng-dev freetype-dev ca-certificates build-base python2-dev krb5-dev libffi-dev cyrus-sasl-dev nodejs shadow python3 python3-dev openjdk8-jre && \ + echo 'Installing python2 requirements' && \ + pip2 install -r /requirements/requirements-jupyter.txt && \ + pip2 install -r /requirements/app-packages-requirements.txt && pip2 install j2cli && \ + /usr/bin/python2 -m ipykernel.kernelspec --name python2 --display-name "Python 2" && \ + echo 'Instaling python3 requirements' && \ + pip3 install -r /requirements/requirements-jupyter.txt && \ + /usr/bin/python3 -m ipykernel.kernelspec --name python3 --display-name "Python 3" && \ + echo 'Adding pyspark2 support' && \ + mkdir -p /usr/local/share/jupyter/kernels/pyspark2 && mkdir -p /opt && \ + wget -O- https://archive.apache.org/dist/spark/spark-2.3.0/spark-2.3.0-bin-hadoop2.7.tgz | tar -xvz -C /tmp && \ + mv /tmp/spark-2.3.0-bin-hadoop2.7 /opt/spark && \ + echo 'Adding jupyter-scala_extension_spark' && \ + jupyter nbextension enable --py widgetsnbextension --system && \ + jupyter-kernelspec install /usr/lib/python3.6/site-packages/sparkmagic/kernels/sparkkernel && \ + jupyter serverextension enable --py sparkmagic && \ + echo 'Adding jupyter-extensions' && \ + apk add --no-cache libxml2-dev libxslt-dev && \ + pip3 install -r /requirements/requirements-jupyter-extensions.txt && \ + jupyter serverextension enable --py jupyter_spark --system && \ + jupyter nbextension install --py jupyter_spark --system && \ + jupyter nbextension enable --py jupyter_spark --system && \ + jupyter nbextension enable --py widgetsnbextension --system && \ + echo 'Adding jupyterhub' && \ + pip3 install -r /requirements/requirements-jupyterhub.txt && \ + npm install -g configurable-http-proxy && mkdir -p /var/log/pnda && \ + echo 'auth required pam_exec.so debug log=/var/log/pnda/login.log /create_notebook_dir.sh' >> /etc/pam.d/login +RUN echo 'Adding pnda platform-libraries' && \ + mkdir /etc/platformlibs && /usr/bin/python2 -m easy_install /platformlibs-0.1.5-py2.7.egg && \ + adduser -D pnda && echo "pnda:pnda" | chpasswd && \ + mkdir -p /opt/pnda && mv /notebooks /opt/pnda/jupyter_notebooks && \ + echo 'auth required pam_listfile.so item=user sense=deny file=/etc/login.deny onerr=succeed' >> /etc/pam.d/login && \ + echo 'root' >> /etc/login.deny + +RUN wget 
http://central.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_2.11/2.3.0/spark-sql-kafka-0-10_2.11-2.3.0.jar \ +-O /opt/spark/jars/spark-sql-kafka-0-10_2.11-2.3.0.jar && \ +wget http://central.maven.org/maven2/org/apache/kafka/kafka-clients/1.0.0/kafka-clients-1.0.0.jar \ +-O /opt/spark/jars/kafka-clients-1.0.0.jar + +ENTRYPOINT /entrypoint.sh + diff --git a/docker/dockerfiles/jupyter/build-docker.sh b/docker/dockerfiles/jupyter/build-docker.sh new file mode 100755 index 0000000..f1acc33 --- /dev/null +++ b/docker/dockerfiles/jupyter/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +VERSION=4.4.0 +docker build -t pnda/jupyter:$VERSION . diff --git a/docker/dockerfiles/jupyter/data_generator.py b/docker/dockerfiles/jupyter/data_generator.py new file mode 100644 index 0000000..c428a86 --- /dev/null +++ b/docker/dockerfiles/jupyter/data_generator.py @@ -0,0 +1,114 @@ +#!/usr/bin/python + +import argparse +import subprocess +import json +import avro.schema +import avro.io +import io +import datetime +import uuid +import time +import sys + +from random import randint +from avro.datafile import DataFileWriter +from avro.io import DatumWriter +from argparse import RawTextHelpFormatter + +def generate_sample_datasets (host_ips, metric_ids, year, month, day, hour): + avro_schema = '' + #load data from hdfs + cat = subprocess.Popen(['sudo', '-u', 'hdfs', 'hadoop', 'fs', '-cat', '/user/pnda/PNDA_datasets/datasets/.metadata/schema.avsc'], stdout=subprocess.PIPE) + for line in cat.stdout: + avro_schema = avro_schema + line + schema = avro.schema.parse(avro_schema) + bytes_writer = io.BytesIO() + encoder = avro.io.BinaryEncoder(bytes_writer) + #create hdfs folder structure + dir = create_hdfs_dirs (year, month, day, hour) + filename = str(uuid.uuid4()) + '.avro' + filepath = dir + filename + tmp_file = '/tmp/' + filename + + writer = DataFileWriter(open(tmp_file, "w"), DatumWriter(), schema) + + start_dt = datetime.datetime(year, month, day, hour, 0, 0) + start_ts = int(time.mktime(start_dt.timetuple())) + end_dt = start_dt.replace(hour=hour+1) + end_ts = int(time.mktime(end_dt.timetuple())) + + for ts in xrange(start_ts, end_ts, 1): + #generate random pnda record on per host ip basis + for host_ip in host_ips: + record = {} + record['timestamp'] = (ts * 1000) + record['src'] = 'test' + record['host_ip'] = host_ip + record['rawdata'] = generate_random_metrics(metric_ids) + #encode avro + writer.append(record) + writer.close() + subprocess.Popen(['sudo', '-u', 'hdfs', 'hadoop', 'fs', '-copyFromLocal', tmp_file, dir]) + return filepath + +def generate_random_metrics (metric_ids): + ''' + generate random raw_data elementTon + ''' + raw_data = {} + for id in metric_ids: + raw_data[id] = str(randint(0, 100)) + return json.dumps(raw_data).encode('utf-8') + +def create_hdfs_dirs (year, month, day, hour): + dir = "/user/pnda/PNDA_datasets/datasets/source=test/year=%0d/month=%02d/day=%02d/hour=%02d/" % (year, month, day, hour) + subprocess.Popen(['sudo', '-u', 'hdfs', 'hadoop', 'fs', '-mkdir', '-p', dir]) + return dir + +def get_args(): + epilog = """ example: + - create sample data sets + data_generator.py --hosts '10.0.0.1, 10.0.0.2' --metrics 'a, b, c' --year 2016 --month 4 --day 27 --hour 14 + - create sample data sets using system datetime + data_generator.py --hosts '10.0.0.1, 10.0.0.2' --metrics 'a, b, c' + """ + + dt = datetime.datetime.now() + parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description='Sample datasets generator', epilog=epilog) + 
parser.add_argument('--hosts', help='list of sample host ips separated by comma', default='') + parser.add_argument('--metrics', help='list of metrics ids', default='') + parser.add_argument('--year', type=int, help='year', default=dt.year) + parser.add_argument('--month', type=int, help='month', default=dt.month) + parser.add_argument('--day', type=int, help='day of the month', default=dt.day) + parser.add_argument('--hour', help='hour of the day', default=dt.hour) + args = parser.parse_args() + return args + +def main(): + args = get_args() + hosts = args.hosts.strip() + if not hosts: + print 'mandatory arg --hosts missing (aborting).' + sys.exit() + + host_ips = [x.strip() for x in hosts.split(",")] + + metrics = args.metrics.strip() + if not metrics: + print 'mandatory arg --metrics missing (aborting).' + sys.exit() + metric_ids = [x.strip() for x in metrics.split(",")] + + year = int(args.year) + month = int(args.month) + day = int(args.day) + hour = int(args.hour) + filepath = generate_sample_datasets(host_ips, metric_ids, year, month, day, hour) + print "Success: generated file path at " + filepath + +if __name__ == "__main__": + main() + + + \ No newline at end of file diff --git a/docker/dockerfiles/jupyter/docker/create_notebook_dir.sh b/docker/dockerfiles/jupyter/docker/create_notebook_dir.sh new file mode 100755 index 0000000..fb501c5 --- /dev/null +++ b/docker/dockerfiles/jupyter/docker/create_notebook_dir.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +set -x + +DIR=/home/$PAM_USER +if [ ! -d $DIR ]; then + mkdir $DIR + chmod 0755 $DIR + chown $PAM_USER: $DIR +fi + +DIR=$DIR/jupyter_notebooks +if [ ! -d $DIR ]; then + mkdir $DIR + cp -r /opt/pnda/jupyter_notebooks $DIR/examples + chmod -R 0755 $DIR + chown -R $PAM_USER: $DIR +fi + diff --git a/docker/dockerfiles/jupyter/docker/entrypoint.sh b/docker/dockerfiles/jupyter/docker/entrypoint.sh new file mode 100755 index 0000000..e9108dd --- /dev/null +++ b/docker/dockerfiles/jupyter/docker/entrypoint.sh @@ -0,0 +1,4 @@ +#/bin/sh +j2 /pyspark2_kernel.json.tpl > /usr/local/share/jupyter/kernels/pyspark2/kernel.json +j2 /platformlibs.ini.tpl > /etc/platformlibs/platformlibs.ini +/usr/bin/jupyterhub diff --git a/docker/dockerfiles/jupyter/docker/hdfs_root_uri_conf.diff b/docker/dockerfiles/jupyter/docker/hdfs_root_uri_conf.diff new file mode 100644 index 0000000..9c83a5c --- /dev/null +++ b/docker/dockerfiles/jupyter/docker/hdfs_root_uri_conf.diff @@ -0,0 +1,16 @@ +diff --git a/platformlibs/data_handler.py b/platformlibs/data_handler.py +index 27a2ea5..7bc1ae3 100644 +--- a/platformlibs/data_handler.py ++++ b/platformlibs/data_handler.py +@@ -63,7 +63,10 @@ class DataHandler(object): + if self._hdfs_root_uri: + return self._hdfs_root_uri + cm_conf = read_config('/etc/platformlibs/platformlibs.ini') +- self._hdfs_root_uri = get_hdfs_uri(cm_conf['cm_host'], cm_conf['cm_user'], cm_conf['cm_pass'], cm_conf['hadoop_distro']) ++ if 'hdfs_root_uri' in cm_conf: ++ self._hdfs_root_uri = cm_conf['hdfs_root_uri'] ++ else: ++ self._hdfs_root_uri = get_hdfs_uri(cm_conf['cm_host'], cm_conf['cm_user'], cm_conf['cm_pass'], cm_conf['hadoop_distro']) + return self._hdfs_root_uri + + @property diff --git a/docker/dockerfiles/jupyter/docker/notebooks/PNDA minimal SqlMagic notebook.ipynb b/docker/dockerfiles/jupyter/docker/notebooks/PNDA minimal SqlMagic notebook.ipynb new file mode 100644 index 0000000..75c74d5 --- /dev/null +++ b/docker/dockerfiles/jupyter/docker/notebooks/PNDA minimal SqlMagic notebook.ipynb @@ -0,0 +1,57 @@ +{ + "cells": [ + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "# Minimal PNDA Jupyter SqlMagic notebook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "Use following connection string to connect to MySQL DB. Enter valid username/password and hostname/IP of mysql server. \n", + "%load_ext sql\n", + "%sql mysql+pymysql://username:password@hostname/dbname\n", + "\n", + "\n", + "Use following connection string to connect to Postregsql. Enter valid username/password and hostname/IP of postgresql server.\n", + "%load_ext sql\n", + "%sql postgresql://username:password@localhost/dbname\n", + "\n", + "Use following connection string to connect to Impala (CDH distribution only). Enter valid username/password and hostname/IP of impala server.\n", + "Note : Impala connection through impyla requires to disable autocommit. Use %config SqlMagic to check various configurations available.\n", + "%load_ext sql\n", + "%config SqlMagic.autocommit=False\n", + "%sql impala://hostname:port/dbname\n", + "'''\n", + "%load_ext sql" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/docker/dockerfiles/jupyter/docker/notebooks/PNDA minimal notebook.ipynb b/docker/dockerfiles/jupyter/docker/notebooks/PNDA minimal notebook.ipynb new file mode 100644 index 0000000..61fd532 --- /dev/null +++ b/docker/dockerfiles/jupyter/docker/notebooks/PNDA minimal notebook.ipynb @@ -0,0 +1,98 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Minimal PNDA Jupyter notebook\n", + "\n", + "`%matplotlib notebook` must be set before `import matplotlib.pyplot as plt` or plotting with matplotlib will fail " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%matplotlib notebook\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import sys\n", + "import pandas as pd\n", + "import matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "print(u'▶ Python version ' + sys.version)\n", + "print(u'▶ Pandas version ' + pd.__version__)\n", + "print(u'▶ Matplotlib version ' + matplotlib.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "values = np.random.rand(100)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df = pd.DataFrame(data=values, columns=['RandomValue'])\n", + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.plot()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "PySpark2/Python2", + "language": "python", + "name": "pyspark2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/docker/dockerfiles/jupyter/docker/notebooks/tutorial/Example Platform-library PySpark Notebook.ipynb b/docker/dockerfiles/jupyter/docker/notebooks/tutorial/Example Platform-library PySpark Notebook.ipynb new file mode 100644 index 0000000..7dda082 --- /dev/null +++ b/docker/dockerfiles/jupyter/docker/notebooks/tutorial/Example Platform-library PySpark Notebook.ipynb @@ -0,0 +1,659 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "
\n", + "\n", + "\n", + "
\n", + "\n", + "# Welcome to Example Platform-library PySpark Notebook\n", + "\n", + "It's a shared Jupyter server for you to learn and try out Jupyter notebook and perform interactive data analytics using PNDA platform libraries.\n", + "\n", + "In this example notebook, **JsonDataHandler**, a data handler implementation based on the assumption that the 'rawdata' field wrapped in Pnda avro record is a well-formatted in JSON.

\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Instructions\n", + "\n", + "To run the codes below:\n", + "\n", + "1. Click on the cell to select it.\n", + "2. Press `SHIFT+ENTER` on your keyboard or press the play button () in the toolbar above." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generate sample datasets ###\n", + "\n", + "If you don't have existed datasets, there are two ways to generate sample datasets:\n", + " * use data generation tool\n", + " * use embedded cell in this notebook\n", + "\n", + "Data generation tool is pre-installed on this node at `/home/cloud-user/data_generator.py`. \n", + "\n", + "
\n", + "

** Usage **

\n", + "```\n", + "./data_generator.py --hosts ','\\\n", + " --metrics ','\\\n", + " --year \\\n", + " --month \\\n", + " --day \\\n", + " --hour \n", + "```\n", + "

\n", + "[NOTE: if year|month|day|hour option is ignored, the script will extract values from current system time.]\n", + "

\n", + "
\n", + "\n", + "Alternative, you can simply run the cell below to generate sample network usage datesets." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Example 1: ** Generate sample network usage datasets " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'/user/pnda/PNDA_datasets/datasets/source=test/year=2016/month=04/day=26/hour=16/f8236764-222c-4fd1-a873-61a71c28f6a3.avro'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import subprocess\n", + "import json\n", + "import avro.schema\n", + "import avro.io\n", + "import io\n", + "import datetime\n", + "import uuid\n", + "import time\n", + "import sys\n", + "import pyhdfs\n", + "\n", + "from random import randint\n", + "from avro.datafile import DataFileWriter\n", + "from avro.io import DatumWriter\n", + "from argparse import RawTextHelpFormatter\n", + "\n", + "fs = pyhdfs.HdfsClient(hosts='hdfs-namenode:50070', user_name='pnda')\n", + "\n", + "def generate_sample_datasets (host_ips, metric_ids, year, month, day, hour):\n", + " avro_schema = ''\n", + " #load data from hdfs\n", + " with fs.open('/user/pnda/PNDA_datasets/datasets/.metadata/schema.avsc') as f:\n", + " avro_schema = f.read()\n", + " schema = avro.schema.parse(avro_schema)\n", + " bytes_writer = io.BytesIO()\n", + " encoder = avro.io.BinaryEncoder(bytes_writer)\n", + " #create hdfs folder structure\n", + " dir = create_hdfs_dirs (year, month, day, hour)\n", + " filename = str(uuid.uuid4()) + '.avro'\n", + " filepath = dir + filename\n", + " tmp_file = '/tmp/' + filename\n", + " writer = DataFileWriter(open(tmp_file, \"w\"), DatumWriter(), schema)\n", + " start_dt = datetime.datetime(year, month, day, hour, 0, 0) \n", + " start_ts = int(time.mktime(start_dt.timetuple()))\n", + " end_dt = start_dt.replace(hour=hour+1)\n", + " end_ts = int(time.mktime(end_dt.timetuple()))\n", + "\n", + " for ts in xrange(start_ts, end_ts, 1):\n", + " #generate random pnda record on per host ip basis\n", + " for host_ip in host_ips:\n", + " record = {}\n", + " record['timestamp'] = (ts * 1000)\n", + " record['src'] = 'test'\n", + " record['host_ip'] = host_ip\n", + " record['rawdata'] = generate_random_metrics(metric_ids)\n", + " #encode avro\n", + " writer.append(record)\n", + " writer.close()\n", + " fs.copy_from_local(tmp_file, dir+filename)\n", + " return filepath\n", + "\n", + "def generate_random_metrics (metric_ids):\n", + " raw_data = {}\n", + " for id in metric_ids:\n", + " raw_data[id] = str(randint(0, 100))\n", + " return json.dumps(raw_data).encode('utf-8')\n", + "\n", + "def create_hdfs_dirs (year, month, day, hour):\n", + " dir = \"/user/pnda/PNDA_datasets/datasets/source=test/year=%0d/month=%02d/day=%02d/hour=%02d/\" % (year, month, day, hour)\n", + " fs.mkdirs(dir)\n", + " return dir \n", + "\n", + "#example host ips (update as you wish)\n", + "host_ips = ['10.0.0.1', '10.0.0.2', '10.0.0.3']\n", + "#example metric list (update as you wish)\n", + "metrics=['in_bytes', 'out_bytes', 'in_pks', 'out_pks']\n", + "#generate example datasets (update year, month, day, and hour as you wish)\n", + "generate_sample_datasets(host_ips, metrics, 2016, 4, 26, 16)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Play with RDD ###\n", + "RDD can be created automatically using PNDA platform libary. 
This allows data exploration using low-level RDD APIs.\n", + "\n", + "** Example 2: ** Create an instance of JsonDataHandler" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'cm_host'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m\u001b[0m", + "\u001b[0;31mKeyError\u001b[0mTraceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mplatformlibs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson_data_handler\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mJsonDataHandler\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mhandler\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mJsonDataHandler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"test\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"year=2016/month=04/day=26/hour=16\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/usr/lib/python2.7/site-packages/platformlibs-0.6.8-py2.7.egg/platformlibs/json_data_handler.pyc\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, spark_context, datasource, path)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0mspark_context\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0mdatasource\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 38\u001b[0;31m path)\n\u001b[0m\u001b[1;32m 39\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mstaticmethod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/lib/python2.7/site-packages/platformlibs-0.6.8-py2.7.egg/platformlibs/data_handler.pyc\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, spark_context, datasource, path)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_rdd\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_hdfs_root_uri\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 38\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mschema\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_load_schema\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 39\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mstaticmethod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/lib/python2.7/site-packages/platformlibs-0.6.8-py2.7.egg/platformlibs/data_handler.pyc\u001b[0m in \u001b[0;36m_load_schema\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 106\u001b[0m \"\"\"\n\u001b[1;32m 107\u001b[0m \u001b[0mschema_path\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'{}/user/pnda/PNDA_datasets/datasets/'\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 108\u001b[0;31m \u001b[0;34m'.metadata/schema.avsc'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhdfs_root_uri\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 109\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspark_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtextFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mschema_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcollect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/lib/python2.7/site-packages/platformlibs-0.6.8-py2.7.egg/platformlibs/data_handler.pyc\u001b[0m in \u001b[0;36mhdfs_root_uri\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_hdfs_root_uri\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0mcm_conf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mread_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'/etc/platformlibs/platformlibs.ini'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 66\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_hdfs_root_uri\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_hdfs_uri\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcm_conf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'cm_host'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcm_conf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'cm_user'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcm_conf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'cm_pass'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcm_conf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'hadoop_distro'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 67\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_hdfs_root_uri\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'cm_host'" + ] + } + ], + "source": [ + "from platformlibs.json_data_handler import JsonDataHandler\n", + "handler = JsonDataHandler(sc, \"test\", \"year=2016/month=04/day=26/hour=16\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Example 3: ** Simple RDD operations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import pprint\n", + "\n", + "rdd = handler.rdd\n", + "# print total nubmer of records\n", + "print rdd.count()\n", + "\n", + "# print one record\n", + "pprint.pprint(rdd.take(1))\n", + "\n", + "# use MapR function to print list of unique router ips\n", + "host_ips = rdd.map(lambda x: x['host_ip']).distinct().collect()\n", + "pprint.pprint(host_ips)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Challenge 1: ** How many unique metrics of all routers have been collected? What are they?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Speculate your anwser here\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Visualize high-level statistics ###\n", + "PNDA platform library provide functions to return high-level statistics on per host basis using `list_host_ips()` and on per metric basis using `list_metric_ids()`.\n", + "\n", + "** Example 4: ** Plot a bar chart to show the total number of records per host" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# plot simple bar chart\n", + "%matplotlib inline \n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "# query host IPs\n", + "host_stats = handler.list_host_ips()\n", + "host_ips = []\n", + "counts = []\n", + "for stat in host_stats:\n", + " host_ips.append(stat[0])\n", + " counts.append(stat[1])\n", + "\n", + "fig, ax = plt.subplots(figsize=(15, 8))\n", + "x = np.arange(len(host_ips))\n", + "rects = ax.bar(x, counts, color='y')\n", + "plt.xticks(x+0.5, host_ips, rotation=45) \n", + "\n", + "def autolabel(rects):\n", + " # attach 'counts' labels\n", + " for rect in rects:\n", + " height = rect.get_height()\n", + " ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,\n", + " '%d' % int(height),\n", + " ha='center', va='bottom')\n", + "autolabel(rects) # add label on bar\n", + "plt.ylabel('counts')\n", + "plt.title('Statistics of hosts')\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Challenge 2: ** Generate a bar chart to show total number of records per metric of host 10.0.0.1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Speculate your anwser here\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Introducing interactive UI ###\n", + "Interactivity introduction to your notebook can be done by adding widgets provided in the `ipywidgets` package. Each widget consists of two parts: the UI element (e.g. Text Input, sliding bar, etc.) and an event handler. 
\n", + "\n", + "** Example 5: ** Interactive visualization of total number of records per metric of a particular host" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "from ipywidgets import *\n", + "from IPython.display import display\n", + "import matplotlib.patches as mpatches\n", + "\n", + "options= ['--select--'] + sorted(host_ips)\n", + "selected_host = \"--select--\"\n", + "\n", + "host_ip_widget = widgets.Dropdown(description='Host IP:', width=100, options=options)\n", + "display(host_ip_widget)\n", + "# diplaying the limits input widget:\n", + "limits_input = widgets.Text(description=\"limits :\", width=200)\n", + "display(limits_input)\n", + "# preparing a container to put in the created checkbox per host ip\n", + "checkboxes = []\n", + "cb_container=widgets.HBox()\n", + "display(cb_container)\n", + "\n", + "# add button that updates the graph based on the checkboxes\n", + "button = widgets.Button(description=\"submit\")\n", + "display(button)\n", + "\n", + "def on_button_clicked(b):\n", + " selected_host = host_ip_widget.value\n", + " \n", + " def autolabel(rects):\n", + " # attach 'counts' labels\n", + " for rect in rects:\n", + " height = rect.get_height()\n", + " ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,\n", + " '%d' % int(height),\n", + " ha='center', va='bottom')\n", + " limit = -1\n", + " if limits_input.value:\n", + " limit = int(limits_input.value)\n", + " \n", + " filters={}\n", + " metrics = None\n", + " if selected_host != \"--select--\":\n", + " filters['host_ips']=[selected_host] \n", + " metrics = handler.list_metric_ids(limit=limit, filters=filters)\n", + " if len(metrics) > 0:\n", + " host_ip = metrics[0][0]\n", + " metric_stats = metrics[0][1]\n", + "\n", + " metric_ids=[]\n", + " metric_counts=[]\n", + " for stat in metric_stats:\n", + " metric_ids.append(stat[0])\n", + " metric_counts.append(stat[1])\n", + " x = np.arange(len(metric_ids))\n", + " fig, ax = plt.subplots(figsize=(15, 8))\n", + " metric_rects = ax.bar(x, metric_counts, color='y')\n", + " plt.xticks(x+0.5, metric_ids, rotation='vertical') \n", + " plt.ylabel ('counts')\n", + " patch = mpatches.Patch(color='y', label=host_ip)\n", + " plt.legend(handles=[patch])\n", + " autolabel(metric_rects)\n", + " plt.draw()\n", + " else:\n", + " print \"Please select a host ip from dropdown list.\"\n", + " \n", + "button.on_click(on_button_clicked)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Example 6: ** Interactive time series visualization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import datetime\n", + "from ipywidgets import *\n", + "from operator import add\n", + "from IPython.display import display\n", + "import calendar\n", + "import time\n", + "\n", + "dateFormatString = '%Y-%m-%d %H:%M:%S'\n", + "\n", + "colors=['b', 'c', 'y', 'm', 'r']\n", + "\n", + "# displaying the metric id input widget\n", + "metric_id_input = widgets.Text(description=\"metric id:\", width=200)\n", + "display(metric_id_input)\n", + "\n", + "host_ip_input = widgets.Text(description=\"host ip:\", width=200, value='edit and hit to add')\n", + "display(host_ip_input)\n", + "\n", + "#preparing the plot \n", + "plots = dict() \n", + "\n", + "#preparing a container to put in 
created checkbox per host ip\n", + "checkboxes = [] \n", + "cb_container = widgets.HBox() \n", + "display(cb_container)\n", + "\n", + "#preparing update button\n", + "update_button = widgets.Button(description=\"Update\")\n", + "\n", + "#normalise data with 5-min interval\n", + "def post_process(data):\n", + " def f(x): \n", + " sum_val = 0\n", + " for val in x:\n", + " sum_val = sum_val + x[0][1]\n", + " return sum_val\n", + " data_rdd = sc.parallelize(data).map(lambda x: (x[0], int(x[1]))).foldByKey(0, add).sortBy(lambda x: x[0]).groupBy(lambda x : (calendar.timegm(time.strptime(datetime.datetime.fromtimestamp(x[0]/1000).strftime('%Y-%m-%d %H:%M:%S'), dateFormatString))/(5*60))).map(lambda x : (x[0],list(x[1]))).mapValues(f).map(lambda x: (datetime.datetime.fromtimestamp(x[0] * 6*50), x[1]))\n", + " return data_rdd.keys().collect(), data_rdd.values().collect()\n", + "\n", + "#function to deal with the added host ip\n", + "def handle_submit(sender): \n", + " exists = False\n", + " for cb in checkboxes:\n", + " if cb.description is host_ip_input.value:\n", + " exists = True\n", + " if not exists and len(checkboxes)<5:\n", + " #add a new checkbox for the new host ip\n", + " checkboxes.append(widgets.Checkbox(description = host_ip_input.value, value=True, width=90))\n", + " cb_container.children=[i for i in checkboxes]\n", + " if len(checkboxes) == 1:\n", + " display(update_button)\n", + "\n", + "#function to deal with the checkbox update button \n", + "def on_button_clicked(b): \n", + " filters = {}\n", + " filters['metrics']=[metric_id_input.value]\n", + " host_ips = []\n", + " for c in cb_container.children:\n", + " if c.value:\n", + " host_ips.append(c.description) \n", + " filters['host_ips'] = host_ips\n", + "\n", + " results = handler.execute_query(filters=filters)\n", + "\n", + " i=0\n", + " if len(results) > 0:\n", + " # Plot things...\n", + " fig = plt.figure(figsize=(15, 8))\n", + " ax=fig.add_subplot(111)\n", + " for result in results:\n", + " label = result[0][1]\n", + " timestamps, values = post_process(result[1])\n", + " ax.plot_date(timestamps, values, c=colors[i], label=label)\n", + " i=i+1\n", + " ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter(\"%H:%M:%S\"))\n", + " plt.ylabel(metric_id_input.value)\n", + " plt.xlabel(\"time of the day\")\n", + " plt.legend(loc='upper right')\n", + " plt.gray() \n", + " plt.show()\n", + " \n", + "update_button.on_click(on_button_clicked) \n", + "host_ip_input.on_submit(handle_submit) \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Challenge 3: ** generate scatter plots to show packet/bytes drops (e.g. 
use in_byte metric) of a partiular host" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Speculate your anwser here" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "PySpark2/Python2", + "language": "python", + "name": "pyspark2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.14" + }, + "widgets": { + "state": { + "1557e450223e4dde8c2413a5ee83e705": { + "views": [ + { + "cell_index": 20 + } + ] + }, + "1afdbacd1b0b478aa01978edac6c190a": { + "views": [ + { + "cell_index": 20 + } + ] + }, + "1ba2b95d0e4d4a26a7aa819dc8534c6a": { + "views": [] + }, + "2db1cd0c66dd4e8196a900db826d64c8": { + "views": [] + }, + "38670bab911f412191a9b3deb84babc7": { + "views": [] + }, + "461bb60055dd4397b778a407640d1d99": { + "views": [] + }, + "533f372af51e4cb79e2122e986d701df": { + "views": [] + }, + "561a2c0073014a1d8cad39649bc96573": { + "views": [] + }, + "58ce38b80de34d6e82a4d160febff999": { + "views": [ + { + "cell_index": 18 + } + ] + }, + "633ca203e85541b59f9131e83f6b12bb": { + "views": [] + }, + "634e89cc27f14c5eb125d8294ab0e262": { + "views": [] + }, + "63bf2949f1ab4cdbb73d1d0d6d20dbd0": { + "views": [] + }, + "657e67c80c174b04bbd1f8cee14674ef": { + "views": [] + }, + "6b7ff28cf96145abbcb000a6d784b58c": { + "views": [] + }, + "7cf1b6bb25d841b0b83bf337300df6fd": { + "views": [] + }, + "810c709e9cc84968a1c20d1d13db8acf": { + "views": [] + }, + "a2439b82434d41a3b63efdbf3eee6519": { + "views": [ + { + "cell_index": 16 + } + ] + }, + "a4848088511d42379c3daf05b7150343": { + "views": [] + }, + "ac64a66ddb0e4cf9bd3cd37cf4275b28": { + "views": [] + }, + "ade75b3524bc42d39979108791ac27e0": { + "views": [] + }, + "aeac437c13634b02be44b86ce14a16d7": { + "views": [ + { + "cell_index": 18 + } + ] + }, + "b2d5229dfa0c4bc093922e6fd9afbd7a": { + "views": [] + }, + "b81aa794d7d74402bb68519be24bb444": { + "views": [] + }, + "bd62dae252ac444fbc377bec6458eb6b": { + "views": [] + }, + "c6d9fa2d6f2a4f0fb0b662d6bd89c6a4": { + "views": [] + }, + "cc93c8dda54d4f139f41b42c3dba67ad": { + "views": [ + { + "cell_index": 16 + } + ] + }, + "ce3a67a85dcf4cefbe5fa3e9e27644d8": { + "views": [] + }, + "df0a9835ec7c49ffaa7992c919a6ddcf": { + "views": [] + }, + "e61981d0808e4ce1ac91ad16d8f3b55f": { + "views": [] + }, + "e87b1311dbde4cd0bc020eb484d72f5b": { + "views": [] + }, + "e9353bafbf6a4a9b8b9de0747078720f": { + "views": [] + }, + "f0874b40c3da49fbb514e3f9f2047df3": { + "views": [] + }, + "f2bb960e4fc2454fa1be3b679c7fd078": { + "views": [] + }, + "fe575e9009c54084bd0a27bdd4c03382": { + "views": [] + } + }, + "version": "1.1.2" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/docker/dockerfiles/jupyter/docker/platform-libraries-env-conf.diff b/docker/dockerfiles/jupyter/docker/platform-libraries-env-conf.diff new file mode 100644 index 0000000..c8a1b39 --- /dev/null +++ b/docker/dockerfiles/jupyter/docker/platform-libraries-env-conf.diff @@ -0,0 +1,23 @@ +--- common_helpers.py.orig 2018-04-20 10:37:55.033647186 +0200 ++++ common_helpers.py 2018-04-20 10:50:57.434402052 +0200 +@@ -14,6 +14,7 @@ + Purpose: Utility library that defines common helper functions + """ + import requests ++import os + from cm_api.api_client import ApiResource + + def flatten_dict(input_d, result=None): +@@ -118,8 +119,10 @@ + - hadoop_distro: 'CDH' or 'HDP' + 
''' + hdfs_uri = '' +- +- if hadoop_distro == 'CDH': ++ ++ if hadoop_distro == 'env': ++ hdfs_uri = os.getenv('HDFS_ROOT_URI') ++ elif hadoop_distro == 'CDH': + api = connect_cm(cm_host, cm_user, cm_pass) + + for cluster_detail in api.get_all_clusters(): diff --git a/docker/dockerfiles/jupyter/docker/platformlibs.ini.tpl b/docker/dockerfiles/jupyter/docker/platformlibs.ini.tpl new file mode 100644 index 0000000..7206110 --- /dev/null +++ b/docker/dockerfiles/jupyter/docker/platformlibs.ini.tpl @@ -0,0 +1,6 @@ +[cm] +hadoop_distro={{ HADOOP_DISTRO | default('HDP') }} +hdfs_root_uri={{ HDFS_ROOT_URI | default('hdfs://hdfs-namenode:8020') }} +cm_host={{ CM_HOST | default('cm') }} +cm_user={{ CM_USER | default('scm') }} +cm_pass={{ CM_PASSWORD | default('scm') }} diff --git a/docker/dockerfiles/jupyter/docker/pyspark2_kernel.json.tpl b/docker/dockerfiles/jupyter/docker/pyspark2_kernel.json.tpl new file mode 100644 index 0000000..5428176 --- /dev/null +++ b/docker/dockerfiles/jupyter/docker/pyspark2_kernel.json.tpl @@ -0,0 +1,21 @@ +{ + "display_name": "PySpark2/Python2", + "language": "python", + "argv": [ + "/usr/bin/python2", + "-m", + "ipykernel", + "-f", + "{connection_file}" + ], + "env": { + "HADOOP_CONF_DIR":"{{HADOOP_CONF_DIR | default('/')}}", + "PYSPARK_PYTHON":"/usr/bin/python2", + "SPARK_MAJOR_VERSION":"2", + "SPARK_HOME": "/opt/spark", + "WRAPPED_SPARK_HOME": "/usr/", + "PYTHONPATH": "/usr/lib/python2.7/site-packages:/opt/spark/python:/opt/spark/python/lib/py4j-0.10.6-src.zip", + "PYTHONSTARTUP": "/opt/spark/python/pyspark/shell.py", + "PYSPARK_SUBMIT_ARGS": "--master {{SPARK_MASTER_URL | default('spark://spark-master:7077')}} --jars /opt/spark/examples/jars/spark-examples_2.11-2.3.0.jar pyspark-shell" + } +} diff --git a/docker/dockerfiles/jupyter/docker/requirements/app-packages-requirements.txt b/docker/dockerfiles/jupyter/docker/requirements/app-packages-requirements.txt new file mode 100644 index 0000000..9c0a454 --- /dev/null +++ b/docker/dockerfiles/jupyter/docker/requirements/app-packages-requirements.txt @@ -0,0 +1,14 @@ +# App dependency packages to be installed for use in pyspark and Jupyter +avro==1.8.1 +cm-api==14.0.0 +fastavro==0.17.9 +happybase==1.0.0 +kafka-python==1.3.5 +PyHDFS==0.1.2 +pykafka==2.7.0 +pywebhdfs==0.4.1 +PyYAML==3.12 +thrift==0.9.3 +thrift_sasl==0.2.1 +thriftpy==0.3.9 +xmltodict==0.11.0 diff --git a/docker/dockerfiles/jupyter/docker/requirements/requirements-jupyter-extensions.txt b/docker/dockerfiles/jupyter/docker/requirements/requirements-jupyter-extensions.txt new file mode 100644 index 0000000..735db94 --- /dev/null +++ b/docker/dockerfiles/jupyter/docker/requirements/requirements-jupyter-extensions.txt @@ -0,0 +1,3 @@ +lxml==3.6.4 +ipywidgets==6.0.0 +widgetsnbextension==2.0.0 diff --git a/docker/dockerfiles/jupyter/docker/requirements/requirements-jupyter.txt b/docker/dockerfiles/jupyter/docker/requirements/requirements-jupyter.txt new file mode 100644 index 0000000..7323cf8 --- /dev/null +++ b/docker/dockerfiles/jupyter/docker/requirements/requirements-jupyter.txt @@ -0,0 +1,44 @@ +backports-abc==0.5 +bleach==1.5.0 +decorator==4.0.10 +entrypoints==0.2.2 +https://github.com/klyr/jupyter-spark/releases/download/0.3.0-patch/jupyter-spark-0.3.0-patch.tar.gz +html5lib==0.9999999 +impyla==0.14.0 +ipykernel==4.5.2 +ipython==5.1.0 +ipython-genutils==0.1.0 +ipython-sql==0.3.8 +Jinja2==2.8 +jsonschema==2.5.1 +jupyter==1.0.0 +jupyter-client==4.4.0 +jupyter-console==5.0.0 +jupyter-core==4.4.0 +MarkupSafe==0.23 +mistune==0.7.3 +nbconvert==5.0.0 
+nbformat==4.2.0 +notebook==4.3.1 +pandocfilters==1.4.1 +pexpect==4.2.1 +pickleshare==0.7.4 +prompt-toolkit==1.0.9 +ptyprocess==0.5.1 +Pygments==2.1.3 +pymysql==0.7.11 +psycopg2==2.7.3.2 +pyzmq==16.0.2 +qtconsole==4.2.1 +simplegeneric==0.8.1 +six==1.10.0 +sparkmagic==0.12.4 +sql-magic==0.0.3 +terminado==0.6 +testpath==0.3 +thrift==0.9.3 +tornado==4.4.2 +traitlets==4.3.1 +wcwidth==0.1.7 +widgetsnbextension==2.0.0 +matplotlib==2.1.2 diff --git a/docker/dockerfiles/jupyter/docker/requirements/requirements-jupyterhub.txt b/docker/dockerfiles/jupyter/docker/requirements/requirements-jupyterhub.txt new file mode 100644 index 0000000..10113a8 --- /dev/null +++ b/docker/dockerfiles/jupyter/docker/requirements/requirements-jupyterhub.txt @@ -0,0 +1,15 @@ +alembic==0.8.9 +backports-abc==0.5 +decorator==4.0.10 +ipython-genutils==0.1.0 +Jinja2==2.8 +jupyterhub==0.7.0 +Mako==1.0.6 +MarkupSafe==0.23 +pamela==0.3.0 +python-editor==1.0.3 +requests==2.12.4 +six==1.10.0 +SQLAlchemy==1.1.4 +tornado==4.4.2 +traitlets==4.3.1 diff --git a/docker/dockerfiles/platform-console-backend b/docker/dockerfiles/platform-console-backend new file mode 160000 index 0000000..67961d9 --- /dev/null +++ b/docker/dockerfiles/platform-console-backend @@ -0,0 +1 @@ +Subproject commit 67961d942d4a69347129594212f63debdb02ac02 diff --git a/docker/dockerfiles/platform-console-frontend b/docker/dockerfiles/platform-console-frontend new file mode 160000 index 0000000..9c1c6d6 --- /dev/null +++ b/docker/dockerfiles/platform-console-frontend @@ -0,0 +1 @@ +Subproject commit 9c1c6d665a41afc96801241e786ef1b6f0a3714d diff --git a/docker/dockerfiles/platform-data-mgmnt b/docker/dockerfiles/platform-data-mgmnt new file mode 160000 index 0000000..862f049 --- /dev/null +++ b/docker/dockerfiles/platform-data-mgmnt @@ -0,0 +1 @@ +Subproject commit 862f0491e122f4ca78b8a7ef88f93d47ab8f9e20 diff --git a/docker/dockerfiles/platform-deployment-manager b/docker/dockerfiles/platform-deployment-manager new file mode 160000 index 0000000..ed81386 --- /dev/null +++ b/docker/dockerfiles/platform-deployment-manager @@ -0,0 +1 @@ +Subproject commit ed81386e35a983c559f6a0e2d79f90dd6d8ac222 diff --git a/docker/dockerfiles/platform-gobblin-modules b/docker/dockerfiles/platform-gobblin-modules new file mode 160000 index 0000000..6195093 --- /dev/null +++ b/docker/dockerfiles/platform-gobblin-modules @@ -0,0 +1 @@ +Subproject commit 61950938cd6dd86bb098c1910169d78fe560c899 diff --git a/docker/dockerfiles/platform-package-repository b/docker/dockerfiles/platform-package-repository new file mode 160000 index 0000000..2e06083 --- /dev/null +++ b/docker/dockerfiles/platform-package-repository @@ -0,0 +1 @@ +Subproject commit 2e0608356a246539a464ab3af6653415b6155594 diff --git a/docker/dockerfiles/platform-testing/Dockerfile b/docker/dockerfiles/platform-testing/Dockerfile new file mode 100644 index 0000000..e3903e7 --- /dev/null +++ b/docker/dockerfiles/platform-testing/Dockerfile @@ -0,0 +1,33 @@ +FROM alpine:3.7 as builder +LABEL maintainer="cgiraldo@gradiant.org" +LABEL organization="gradiant.org" +ARG version +ENV VERSION $version +RUN apk add --no-cache git bash python build-base linux-pam-dev maven=3.5.2-r0 bc grep python2-dev py2-nose py2-pip cyrus-sasl-dev ca-certificates wget\ +&& pip install spur==0.3.12 starbase==0.3.2 happybase==1.0.0 pyhs2==0.6.0 pywebhdfs==0.4.0 PyHDFS==0.1.2 cm-api==8.0.0 shyaml==0.4.1 \ +nose==1.3.7 mock==2.0.0 pylint==1.6.4 python-swiftclient==3.1.0 tornado==4.4.2 tornado-cors==0.6.0 Tornado-JSON==1.2.2 boto==2.40.0 \ 
+setuptools==28.8.0 --upgrade impyla==0.13.8 eventlet==0.19.0 kazoo==2.2.1 avro==1.8.1 kafka-python==1.3.5 prettytable==0.7.2 \ +pyhive==0.2.1 thrift_sasl==0.2.1 JayDeBeApi==1.1.1 \ +&& ln -s /usr/bin/nosetests-2.7 /usr/bin/nosetests + +RUN wget -qO- https://github.com/pndaproject/platform-testing/archive/$VERSION.tar.gz | tar -xvz && \ + mv platform-testing-$VERSION src + +#pnda.io platform-testing search for Maven 3.0.5. We patch this to use Maven 3.5 +RUN sed -i 's/Apache Maven 3.0.5/Apache Maven 3.5/g' /src/build.sh +RUN cd src && ./build.sh $VERSION + + +FROM alpine:3.7 as platform-testing +LABEL maintainer="cgiraldo@gradiant.org" +LABEL organization="gradiant.org" +ARG version +ENV VERSION $version +COPY --from=builder /src/pnda-build / +COPY jinja_entrypoint.sh entrypoint.sh.tpl hbase_spark_metric.py / +ENTRYPOINT /jinja_entrypoint.sh +RUN apk add --no-cache bash py2-pip tar && tar -xzf /platform-testing-general-${VERSION}.tar.gz \ +&& mv /platform-testing-general-${VERSION} /platform-testing-general \ +&& pip install j2cli \ +&& find /platform-testing-general -name requirements.txt -exec pip install -r '{}' \; + diff --git a/docker/dockerfiles/platform-testing/build-docker.sh b/docker/dockerfiles/platform-testing/build-docker.sh new file mode 100755 index 0000000..9f2af26 --- /dev/null +++ b/docker/dockerfiles/platform-testing/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +VERSION=0.5.0 +docker build --build-arg version=$VERSION -t pnda/testing:$VERSION . diff --git a/docker/dockerfiles/platform-testing/entrypoint.sh.tpl b/docker/dockerfiles/platform-testing/entrypoint.sh.tpl new file mode 100644 index 0000000..b747886 --- /dev/null +++ b/docker/dockerfiles/platform-testing/entrypoint.sh.tpl @@ -0,0 +1,16 @@ +#/bin/sh + +while true +do +python /platform-testing-general/monitor.py --plugin zookeeper \ +--postjson http://{{ CONSOLE_HOSTS | default('console-backend-data-logger:3001') }}/metrics \ +--extra "--zconnect {{ ZOOKEEPERS | default('zookeeper:2181') }}" + +python /platform-testing-general/monitor.py --plugin kafka \ +--postjson http://{{ CONSOLE_HOSTS | default('console-backend-data-logger:3001') }}/metrics \ +--extra "--brokerlist {{ KAFKA_BROKERS | default('kafka:9092') }} \ +--zkconnect {{ ZOOKEEPERS | default('zookeeper:2181') }} --prod2cons" + +python /hbase_spark_metric.py http://{{ CONSOLE_HOSTS | default('console-backend-data-logger:3001') }}/metrics +sleep 60 +done diff --git a/docker/dockerfiles/platform-testing/hbase_spark_metric.py b/docker/dockerfiles/platform-testing/hbase_spark_metric.py new file mode 100644 index 0000000..2aceec8 --- /dev/null +++ b/docker/dockerfiles/platform-testing/hbase_spark_metric.py @@ -0,0 +1,26 @@ +import requests +import json +import time +import sys + +# a python script to return OK status for hbase and spark components. 
Hard coded to return OK on every run - this needs to change + +TIMESTAMP_MILLIS = lambda: int(time.time() * 1000) +components = {'hbase01': 'hadoop.HBASE.health', 'spark_on_yarn': 'hadoop.SPARK_ON_YARN.health'} +if len(sys.argv) > 2 : + print ("usage: hbase_spark_metric.py [console_backend_data_manager_metrics_endpoint]") + sys.exit() +elif len(sys.argv) == 2: + host = sys.argv[1] +else: + host = "http://127.0.0.1:3001/metrics" + +for key, value in components.iteritems(): + json_data = {"data": [{"source": key, "metric": value, "value": "OK", "causes": "[]", "timestamp": TIMESTAMP_MILLIS()}], "timestamp": TIMESTAMP_MILLIS()} + try: + headers = {'Content-Type': 'application/json', 'Connection':'close'} + response = requests.post(host, data=json.dumps(json_data), headers=headers) + if response.status_code != 200: + print "_send failed: %s" % response.status_code + except requests.exceptions.RequestException as ex: + print "_send failed: %s" % ex diff --git a/docker/dockerfiles/platform-testing/jinja_entrypoint.sh b/docker/dockerfiles/platform-testing/jinja_entrypoint.sh new file mode 100755 index 0000000..b8474b2 --- /dev/null +++ b/docker/dockerfiles/platform-testing/jinja_entrypoint.sh @@ -0,0 +1,4 @@ +#!/bin/sh +j2 /entrypoint.sh.tpl > /entrypoint.sh +chmod +x /entrypoint.sh +/entrypoint.sh
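
Usage note (a minimal sketch, not part of the patch itself): with the submodules referenced above initialised, and a docker-compose release that understands compose file format 3.4 (needed for the new build `target` entries), the images can be built and the stack started roughly as follows. The service names are assumed to match the container names shown in docker-compose.yml.

    # fetch the platform-* submodules that the Dockerfiles build from
    git submodule update --init
    cd docker
    # build the images that now carry a build: section, e.g. Jupyter and platform-testing
    docker-compose build jupyter platform-testing
    # start HDFS/HBase and then the remaining PNDA services in the order scripted in deploy.sh
    ./deploy.sh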