diff --git a/docker/deploy.sh b/docker/deploy.sh
index 163c5a3..638b780 100755
--- a/docker/deploy.sh
+++ b/docker/deploy.sh
@@ -4,11 +4,11 @@ echo "---------------- STARTING HDFS and HBASE ----------------"
docker-compose up -d zookeeper
docker-compose up -d hdfs-namenode
docker-compose up -d hdfs-datanode
-docker-compose up -d hbase-master
while ! docker exec -ti hdfs-namenode nc -vz hdfs-namenode:8020 ; do
echo "waiting for hdfs-namenode to start"
sleep 2
done
+docker-compose up -d hbase-master
docker-compose up -d hbase-region
echo "---------------- ADDING users to HDFS ----------------"
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index c91591a..619b99f 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -1,9 +1,13 @@
-version: '3'
+version: '3.4'
services:
gobblin:
container_name: gobblin
hostname: gobblin
image: pnda/gobblin:0.11.0-0.1.0
+ build:
+ context: ./dockerfiles/platform-gobblin-modules
+ args:
+ version: 0.1.0
environment:
- HDFS_URL=hdfs://hdfs-namenode:8020
- MASTER_DATASET_DIRECTORY=/user/pnda/PNDA_datasets/datasets
@@ -47,6 +51,10 @@ services:
container_name: jupyter
hostname: jupyter
image: pnda/jupyter:4.4.0
+ build:
+ context: ./dockerfiles/jupyter
+ args:
+ version: 4.4.0
volumes:
- jupyter-home:/home
environment:
@@ -76,6 +84,10 @@ services:
container_name: deployment-manager
hostname: deployment-manager
image: pnda/deployment-manager:1.0.0
+ build:
+ context: ./dockerfiles/platform-deployment-manager
+ args:
+ version: 1.0.0
environment:
- JUPYTER_HOST=jupyter
- DATA_LOGGER_URL=http://console-backend:3001 #data-logger uses the data-manager network stack
@@ -101,6 +113,10 @@ services:
container_name: package-repository
hostname: package-repository
image: pnda/package-repository:0.3.2
+ build:
+ context: ./dockerfiles/platform-package-repository
+ args:
+ version: 0.3.2
environment:
- FS_LOCATION_PATH=/mnt/packages
- DATA_LOGGER_URL=http://console-backend:3001 #data-logger uses the data-manager network stack
@@ -199,7 +215,11 @@ services:
platform-testing:
container_name: platform-testing
hostname: platform-testing
- image: pnda/testing:0.5.0
+ image: pnda/platform-testing:0.5.0
+ build:
+ context: ./dockerfiles/platform-testing
+ args:
+ version: 0.5.0
environment:
- CONSOLE_HOSTS=console-backend:3001
- ZOOKEEPERS=zookeeper:2181
@@ -214,6 +234,10 @@ services:
container_name: console-frontend
hostname: console-frontend
image: pnda/console-frontend:1.0.0
+ build:
+ context: ./dockerfiles/platform-console-frontend
+ args:
+ version: 1.0.0
environment:
- DATA_MANAGER_HOST=console-backend
- DATA_MANAGER_PORT=3123
@@ -226,6 +250,11 @@ services:
container_name: console-backend
hostname: console-backend
image: pnda/console-backend-data-manager:1.0.0
+ build:
+ context: ./dockerfiles/platform-console-backend
+ args:
+ version: 1.0.0
+ target: console-backend-data-manager
environment:
- CONSOLE_FRONTEND_HOSTS_CSV=console-frontend
- DATASET_MANAGER_URL=http://data-service:7000
@@ -234,6 +263,11 @@ services:
container_name: console-backend-data-logger
network_mode: service:console-backend
image: pnda/console-backend-data-logger:1.0.0
+ build:
+ context: ./dockerfiles/platform-console-backend
+ args:
+ version: 1.0.0
+ target: console-backend-data-logger
redis:
container_name: redis
network_mode: service:console-backend
@@ -242,6 +276,11 @@ services:
container_name: data-service
hostname: data-service
image: pnda/data-service:0.2.2
+ build:
+ context: ./dockerfiles/platform-data-mgmnt
+ args:
+ version: 0.2.2
+ target: data-service
environment:
- LOCATION=/user/pnda/PNDA_datasets/datasets
- HADOOP_DISTRO=env
diff --git a/docker/dockerfiles/jupyter/Dockerfile b/docker/dockerfiles/jupyter/Dockerfile
new file mode 100644
index 0000000..54618f9
--- /dev/null
+++ b/docker/dockerfiles/jupyter/Dockerfile
@@ -0,0 +1,60 @@
+FROM alpine:3.7 as platformlibs
+
+LABEL maintainer="cgiraldo@gradiant.org"
+LABEL organization="gradiant.org"
+
+COPY docker/hdfs_root_uri_conf.diff /
+RUN apk add --no-cache git bash python py2-pip && pip install setuptools
+RUN git clone https://github.com/pndaproject/platform-libraries.git
+RUN cd platform-libraries && git checkout tags/release/4.0 && \
+ export VERSION=$(git describe --tags) && \
+ git apply /hdfs_root_uri_conf.diff && \
+ python setup.py bdist_egg
+
+FROM alpine:3.7
+
+COPY --from=platformlibs /platform-libraries/dist/platformlibs-0.1.5-py2.7.egg /
+COPY docker /
+ENV SPARK_HOME=/opt/spark
+
+RUN apk add --no-cache bash python2 py2-pip postgresql-dev libpng-dev freetype-dev ca-certificates build-base python2-dev krb5-dev libffi-dev cyrus-sasl-dev nodejs shadow python3 python3-dev openjdk8-jre && \
+ echo 'Installing python2 requirements' && \
+ pip2 install -r /requirements/requirements-jupyter.txt && \
+ pip2 install -r /requirements/app-packages-requirements.txt && pip2 install j2cli && \
+ /usr/bin/python2 -m ipykernel.kernelspec --name python2 --display-name "Python 2" && \
+ echo 'Instaling python3 requirements' && \
+ pip3 install -r /requirements/requirements-jupyter.txt && \
+ /usr/bin/python3 -m ipykernel.kernelspec --name python3 --display-name "Python 3" && \
+ echo 'Adding pyspark2 support' && \
+ mkdir -p /usr/local/share/jupyter/kernels/pyspark2 && mkdir -p /opt && \
+ wget -O- https://archive.apache.org/dist/spark/spark-2.3.0/spark-2.3.0-bin-hadoop2.7.tgz | tar -xvz -C /tmp && \
+ mv /tmp/spark-2.3.0-bin-hadoop2.7 /opt/spark && \
+ echo 'Adding jupyter-scala_extension_spark' && \
+ jupyter nbextension enable --py widgetsnbextension --system && \
+ jupyter-kernelspec install /usr/lib/python3.6/site-packages/sparkmagic/kernels/sparkkernel && \
+ jupyter serverextension enable --py sparkmagic && \
+ echo 'Adding jupyter-extensions' && \
+ apk add --no-cache libxml2-dev libxslt-dev && \
+ pip3 install -r /requirements/requirements-jupyter-extensions.txt && \
+ jupyter serverextension enable --py jupyter_spark --system && \
+ jupyter nbextension install --py jupyter_spark --system && \
+ jupyter nbextension enable --py jupyter_spark --system && \
+ jupyter nbextension enable --py widgetsnbextension --system && \
+ echo 'Adding jupyterhub' && \
+ pip3 install -r /requirements/requirements-jupyterhub.txt && \
+ npm install -g configurable-http-proxy && mkdir -p /var/log/pnda && \
+ echo 'auth required pam_exec.so debug log=/var/log/pnda/login.log /create_notebook_dir.sh' >> /etc/pam.d/login
+RUN echo 'Adding pnda platform-libraries' && \
+ mkdir /etc/platformlibs && /usr/bin/python2 -m easy_install /platformlibs-0.1.5-py2.7.egg && \
+ adduser -D pnda && echo "pnda:pnda" | chpasswd && \
+ mkdir -p /opt/pnda && mv /notebooks /opt/pnda/jupyter_notebooks && \
+ echo 'auth required pam_listfile.so item=user sense=deny file=/etc/login.deny onerr=succeed' >> /etc/pam.d/login && \
+ echo 'root' >> /etc/login.deny
+
+RUN wget http://central.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_2.11/2.3.0/spark-sql-kafka-0-10_2.11-2.3.0.jar \
+-O /opt/spark/jars/spark-sql-kafka-0-10_2.11-2.3.0.jar && \
+wget http://central.maven.org/maven2/org/apache/kafka/kafka-clients/1.0.0/kafka-clients-1.0.0.jar \
+-O /opt/spark/jars/kafka-clients-1.0.0.jar
+
+ENTRYPOINT /entrypoint.sh
+
diff --git a/docker/dockerfiles/jupyter/build-docker.sh b/docker/dockerfiles/jupyter/build-docker.sh
new file mode 100755
index 0000000..f1acc33
--- /dev/null
+++ b/docker/dockerfiles/jupyter/build-docker.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+VERSION=4.4.0
+docker build -t pnda/jupyter:$VERSION .
diff --git a/docker/dockerfiles/jupyter/data_generator.py b/docker/dockerfiles/jupyter/data_generator.py
new file mode 100644
index 0000000..c428a86
--- /dev/null
+++ b/docker/dockerfiles/jupyter/data_generator.py
@@ -0,0 +1,114 @@
+#!/usr/bin/python
+
+import argparse
+import subprocess
+import json
+import avro.schema
+import avro.io
+import io
+import datetime
+import uuid
+import time
+import sys
+
+from random import randint
+from avro.datafile import DataFileWriter
+from avro.io import DatumWriter
+from argparse import RawTextHelpFormatter
+
+def generate_sample_datasets (host_ips, metric_ids, year, month, day, hour):
+ avro_schema = ''
+ #load data from hdfs
+ cat = subprocess.Popen(['sudo', '-u', 'hdfs', 'hadoop', 'fs', '-cat', '/user/pnda/PNDA_datasets/datasets/.metadata/schema.avsc'], stdout=subprocess.PIPE)
+ for line in cat.stdout:
+ avro_schema = avro_schema + line
+ schema = avro.schema.parse(avro_schema)
+ bytes_writer = io.BytesIO()
+ encoder = avro.io.BinaryEncoder(bytes_writer)
+ #create hdfs folder structure
+ dir = create_hdfs_dirs (year, month, day, hour)
+ filename = str(uuid.uuid4()) + '.avro'
+ filepath = dir + filename
+ tmp_file = '/tmp/' + filename
+
+ writer = DataFileWriter(open(tmp_file, "w"), DatumWriter(), schema)
+
+ start_dt = datetime.datetime(year, month, day, hour, 0, 0)
+ start_ts = int(time.mktime(start_dt.timetuple()))
+ end_dt = start_dt.replace(hour=hour+1)
+ end_ts = int(time.mktime(end_dt.timetuple()))
+
+ for ts in xrange(start_ts, end_ts, 1):
+ #generate random pnda record on per host ip basis
+ for host_ip in host_ips:
+ record = {}
+ record['timestamp'] = (ts * 1000)
+ record['src'] = 'test'
+ record['host_ip'] = host_ip
+ record['rawdata'] = generate_random_metrics(metric_ids)
+ #encode avro
+ writer.append(record)
+ writer.close()
+ subprocess.Popen(['sudo', '-u', 'hdfs', 'hadoop', 'fs', '-copyFromLocal', tmp_file, dir])
+ return filepath
+
+def generate_random_metrics (metric_ids):
+ '''
+ generate random raw_data elementTon
+ '''
+ raw_data = {}
+ for id in metric_ids:
+ raw_data[id] = str(randint(0, 100))
+ return json.dumps(raw_data).encode('utf-8')
+
+def create_hdfs_dirs (year, month, day, hour):
+ dir = "/user/pnda/PNDA_datasets/datasets/source=test/year=%0d/month=%02d/day=%02d/hour=%02d/" % (year, month, day, hour)
+ subprocess.Popen(['sudo', '-u', 'hdfs', 'hadoop', 'fs', '-mkdir', '-p', dir])
+ return dir
+
+def get_args():
+ epilog = """ example:
+ - create sample data sets
+ data_generator.py --hosts '10.0.0.1, 10.0.0.2' --metrics 'a, b, c' --year 2016 --month 4 --day 27 --hour 14
+ - create sample data sets using system datetime
+ data_generator.py --hosts '10.0.0.1, 10.0.0.2' --metrics 'a, b, c'
+ """
+
+ dt = datetime.datetime.now()
+ parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description='Sample datasets generator', epilog=epilog)
+ parser.add_argument('--hosts', help='list of sample host ips separated by comma', default='')
+ parser.add_argument('--metrics', help='list of metrics ids', default='')
+ parser.add_argument('--year', type=int, help='year', default=dt.year)
+ parser.add_argument('--month', type=int, help='month', default=dt.month)
+ parser.add_argument('--day', type=int, help='day of the month', default=dt.day)
+ parser.add_argument('--hour', help='hour of the day', default=dt.hour)
+ args = parser.parse_args()
+ return args
+
+def main():
+ args = get_args()
+ hosts = args.hosts.strip()
+ if not hosts:
+ print 'mandatory arg --hosts missing (aborting).'
+ sys.exit()
+
+ host_ips = [x.strip() for x in hosts.split(",")]
+
+ metrics = args.metrics.strip()
+ if not metrics:
+ print 'mandatory arg --metrics missing (aborting).'
+ sys.exit()
+ metric_ids = [x.strip() for x in metrics.split(",")]
+
+ year = int(args.year)
+ month = int(args.month)
+ day = int(args.day)
+ hour = int(args.hour)
+ filepath = generate_sample_datasets(host_ips, metric_ids, year, month, day, hour)
+ print "Success: generated file path at " + filepath
+
+if __name__ == "__main__":
+ main()
+
+
+
\ No newline at end of file
diff --git a/docker/dockerfiles/jupyter/docker/create_notebook_dir.sh b/docker/dockerfiles/jupyter/docker/create_notebook_dir.sh
new file mode 100755
index 0000000..fb501c5
--- /dev/null
+++ b/docker/dockerfiles/jupyter/docker/create_notebook_dir.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+set -x
+
+DIR=/home/$PAM_USER
+if [ ! -d $DIR ]; then
+ mkdir $DIR
+ chmod 0755 $DIR
+ chown $PAM_USER: $DIR
+fi
+
+DIR=$DIR/jupyter_notebooks
+if [ ! -d $DIR ]; then
+ mkdir $DIR
+ cp -r /opt/pnda/jupyter_notebooks $DIR/examples
+ chmod -R 0755 $DIR
+ chown -R $PAM_USER: $DIR
+fi
+
diff --git a/docker/dockerfiles/jupyter/docker/entrypoint.sh b/docker/dockerfiles/jupyter/docker/entrypoint.sh
new file mode 100755
index 0000000..e9108dd
--- /dev/null
+++ b/docker/dockerfiles/jupyter/docker/entrypoint.sh
@@ -0,0 +1,4 @@
+#/bin/sh
+j2 /pyspark2_kernel.json.tpl > /usr/local/share/jupyter/kernels/pyspark2/kernel.json
+j2 /platformlibs.ini.tpl > /etc/platformlibs/platformlibs.ini
+/usr/bin/jupyterhub
diff --git a/docker/dockerfiles/jupyter/docker/hdfs_root_uri_conf.diff b/docker/dockerfiles/jupyter/docker/hdfs_root_uri_conf.diff
new file mode 100644
index 0000000..9c83a5c
--- /dev/null
+++ b/docker/dockerfiles/jupyter/docker/hdfs_root_uri_conf.diff
@@ -0,0 +1,16 @@
+diff --git a/platformlibs/data_handler.py b/platformlibs/data_handler.py
+index 27a2ea5..7bc1ae3 100644
+--- a/platformlibs/data_handler.py
++++ b/platformlibs/data_handler.py
+@@ -63,7 +63,10 @@ class DataHandler(object):
+ if self._hdfs_root_uri:
+ return self._hdfs_root_uri
+ cm_conf = read_config('/etc/platformlibs/platformlibs.ini')
+- self._hdfs_root_uri = get_hdfs_uri(cm_conf['cm_host'], cm_conf['cm_user'], cm_conf['cm_pass'], cm_conf['hadoop_distro'])
++ if 'hdfs_root_uri' in cm_conf:
++ self._hdfs_root_uri = cm_conf['hdfs_root_uri']
++ else:
++ self._hdfs_root_uri = get_hdfs_uri(cm_conf['cm_host'], cm_conf['cm_user'], cm_conf['cm_pass'], cm_conf['hadoop_distro'])
+ return self._hdfs_root_uri
+
+ @property
diff --git a/docker/dockerfiles/jupyter/docker/notebooks/PNDA minimal SqlMagic notebook.ipynb b/docker/dockerfiles/jupyter/docker/notebooks/PNDA minimal SqlMagic notebook.ipynb
new file mode 100644
index 0000000..75c74d5
--- /dev/null
+++ b/docker/dockerfiles/jupyter/docker/notebooks/PNDA minimal SqlMagic notebook.ipynb
@@ -0,0 +1,57 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Minimal PNDA Jupyter SqlMagic notebook"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "'''\n",
+ "Use following connection string to connect to MySQL DB. Enter valid username/password and hostname/IP of mysql server. \n",
+ "%load_ext sql\n",
+ "%sql mysql+pymysql://username:password@hostname/dbname\n",
+ "\n",
+ "\n",
+ "Use following connection string to connect to Postregsql. Enter valid username/password and hostname/IP of postgresql server.\n",
+ "%load_ext sql\n",
+ "%sql postgresql://username:password@localhost/dbname\n",
+ "\n",
+ "Use following connection string to connect to Impala (CDH distribution only). Enter valid username/password and hostname/IP of impala server.\n",
+ "Note : Impala connection through impyla requires to disable autocommit. Use %config SqlMagic to check various configurations available.\n",
+ "%load_ext sql\n",
+ "%config SqlMagic.autocommit=False\n",
+ "%sql impala://hostname:port/dbname\n",
+ "'''\n",
+ "%load_ext sql"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/docker/dockerfiles/jupyter/docker/notebooks/PNDA minimal notebook.ipynb b/docker/dockerfiles/jupyter/docker/notebooks/PNDA minimal notebook.ipynb
new file mode 100644
index 0000000..61fd532
--- /dev/null
+++ b/docker/dockerfiles/jupyter/docker/notebooks/PNDA minimal notebook.ipynb
@@ -0,0 +1,98 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Minimal PNDA Jupyter notebook\n",
+ "\n",
+ "`%matplotlib notebook` must be set before `import matplotlib.pyplot as plt` or plotting with matplotlib will fail "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "%matplotlib notebook\n",
+ "\n",
+ "import matplotlib.pyplot as plt\n",
+ "import sys\n",
+ "import pandas as pd\n",
+ "import matplotlib"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "print(u'▶ Python version ' + sys.version)\n",
+ "print(u'▶ Pandas version ' + pd.__version__)\n",
+ "print(u'▶ Matplotlib version ' + matplotlib.__version__)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "values = np.random.rand(100)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "df = pd.DataFrame(data=values, columns=['RandomValue'])\n",
+ "df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "df.plot()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "PySpark2/Python2",
+ "language": "python",
+ "name": "pyspark2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/docker/dockerfiles/jupyter/docker/notebooks/tutorial/Example Platform-library PySpark Notebook.ipynb b/docker/dockerfiles/jupyter/docker/notebooks/tutorial/Example Platform-library PySpark Notebook.ipynb
new file mode 100644
index 0000000..7dda082
--- /dev/null
+++ b/docker/dockerfiles/jupyter/docker/notebooks/tutorial/Example Platform-library PySpark Notebook.ipynb
@@ -0,0 +1,659 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": true
+ },
+ "source": [
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "# Welcome to Example Platform-library PySpark Notebook\n",
+ "\n",
+ "It's a shared Jupyter server for you to learn and try out Jupyter notebook and perform interactive data analytics using PNDA platform libraries.\n",
+ "\n",
+ "In this example notebook, **JsonDataHandler**, a data handler implementation based on the assumption that the 'rawdata' field wrapped in Pnda avro record is a well-formatted in JSON.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Instructions\n",
+ "\n",
+ "To run the codes below:\n",
+ "\n",
+ "1. Click on the cell to select it.\n",
+ "2. Press `SHIFT+ENTER` on your keyboard or press the play button ( ) in the toolbar above."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Generate sample datasets ###\n",
+ "\n",
+ "If you don't have existed datasets, there are two ways to generate sample datasets:\n",
+ " * use data generation tool\n",
+ " * use embedded cell in this notebook\n",
+ "\n",
+ "Data generation tool is pre-installed on this node at `/home/cloud-user/data_generator.py`. \n",
+ "\n",
+ "\n",
+ "
** Usage **
\n",
+ "```\n",
+ "./data_generator.py --hosts '
,'\\\n",
+ " --metrics ','\\\n",
+ " --year \\\n",
+ " --month \\\n",
+ " --day \\\n",
+ " --hour \n",
+ "```\n",
+ "\n",
+ "[NOTE: if year|month|day|hour option is ignored, the script will extract values from current system time.]\n",
+ "
\n",
+ " \n",
+ "\n",
+ "Alternative, you can simply run the cell below to generate sample network usage datesets."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "** Example 1: ** Generate sample network usage datasets "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'/user/pnda/PNDA_datasets/datasets/source=test/year=2016/month=04/day=26/hour=16/f8236764-222c-4fd1-a873-61a71c28f6a3.avro'"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import subprocess\n",
+ "import json\n",
+ "import avro.schema\n",
+ "import avro.io\n",
+ "import io\n",
+ "import datetime\n",
+ "import uuid\n",
+ "import time\n",
+ "import sys\n",
+ "import pyhdfs\n",
+ "\n",
+ "from random import randint\n",
+ "from avro.datafile import DataFileWriter\n",
+ "from avro.io import DatumWriter\n",
+ "from argparse import RawTextHelpFormatter\n",
+ "\n",
+ "fs = pyhdfs.HdfsClient(hosts='hdfs-namenode:50070', user_name='pnda')\n",
+ "\n",
+ "def generate_sample_datasets (host_ips, metric_ids, year, month, day, hour):\n",
+ " avro_schema = ''\n",
+ " #load data from hdfs\n",
+ " with fs.open('/user/pnda/PNDA_datasets/datasets/.metadata/schema.avsc') as f:\n",
+ " avro_schema = f.read()\n",
+ " schema = avro.schema.parse(avro_schema)\n",
+ " bytes_writer = io.BytesIO()\n",
+ " encoder = avro.io.BinaryEncoder(bytes_writer)\n",
+ " #create hdfs folder structure\n",
+ " dir = create_hdfs_dirs (year, month, day, hour)\n",
+ " filename = str(uuid.uuid4()) + '.avro'\n",
+ " filepath = dir + filename\n",
+ " tmp_file = '/tmp/' + filename\n",
+ " writer = DataFileWriter(open(tmp_file, \"w\"), DatumWriter(), schema)\n",
+ " start_dt = datetime.datetime(year, month, day, hour, 0, 0) \n",
+ " start_ts = int(time.mktime(start_dt.timetuple()))\n",
+ " end_dt = start_dt.replace(hour=hour+1)\n",
+ " end_ts = int(time.mktime(end_dt.timetuple()))\n",
+ "\n",
+ " for ts in xrange(start_ts, end_ts, 1):\n",
+ " #generate random pnda record on per host ip basis\n",
+ " for host_ip in host_ips:\n",
+ " record = {}\n",
+ " record['timestamp'] = (ts * 1000)\n",
+ " record['src'] = 'test'\n",
+ " record['host_ip'] = host_ip\n",
+ " record['rawdata'] = generate_random_metrics(metric_ids)\n",
+ " #encode avro\n",
+ " writer.append(record)\n",
+ " writer.close()\n",
+ " fs.copy_from_local(tmp_file, dir+filename)\n",
+ " return filepath\n",
+ "\n",
+ "def generate_random_metrics (metric_ids):\n",
+ " raw_data = {}\n",
+ " for id in metric_ids:\n",
+ " raw_data[id] = str(randint(0, 100))\n",
+ " return json.dumps(raw_data).encode('utf-8')\n",
+ "\n",
+ "def create_hdfs_dirs (year, month, day, hour):\n",
+ " dir = \"/user/pnda/PNDA_datasets/datasets/source=test/year=%0d/month=%02d/day=%02d/hour=%02d/\" % (year, month, day, hour)\n",
+ " fs.mkdirs(dir)\n",
+ " return dir \n",
+ "\n",
+ "#example host ips (update as you wish)\n",
+ "host_ips = ['10.0.0.1', '10.0.0.2', '10.0.0.3']\n",
+ "#example metric list (update as you wish)\n",
+ "metrics=['in_bytes', 'out_bytes', 'in_pks', 'out_pks']\n",
+ "#generate example datasets (update year, month, day, and hour as you wish)\n",
+ "generate_sample_datasets(host_ips, metrics, 2016, 4, 26, 16)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Play with RDD ###\n",
+ "RDD can be created automatically using PNDA platform libary. This allows data exploration using low-level RDD APIs.\n",
+ "\n",
+ "** Example 2: ** Create an instance of JsonDataHandler"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "ename": "KeyError",
+ "evalue": "'cm_host'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m\u001b[0m",
+ "\u001b[0;31mKeyError\u001b[0mTraceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mplatformlibs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson_data_handler\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mJsonDataHandler\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mhandler\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mJsonDataHandler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"test\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"year=2016/month=04/day=26/hour=16\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;32m/usr/lib/python2.7/site-packages/platformlibs-0.6.8-py2.7.egg/platformlibs/json_data_handler.pyc\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, spark_context, datasource, path)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0mspark_context\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0mdatasource\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 38\u001b[0;31m path)\n\u001b[0m\u001b[1;32m 39\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mstaticmethod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/usr/lib/python2.7/site-packages/platformlibs-0.6.8-py2.7.egg/platformlibs/data_handler.pyc\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, spark_context, datasource, path)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_rdd\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_hdfs_root_uri\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 38\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mschema\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_load_schema\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 39\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mstaticmethod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/usr/lib/python2.7/site-packages/platformlibs-0.6.8-py2.7.egg/platformlibs/data_handler.pyc\u001b[0m in \u001b[0;36m_load_schema\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 106\u001b[0m \"\"\"\n\u001b[1;32m 107\u001b[0m \u001b[0mschema_path\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'{}/user/pnda/PNDA_datasets/datasets/'\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 108\u001b[0;31m \u001b[0;34m'.metadata/schema.avsc'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhdfs_root_uri\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 109\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspark_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtextFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mschema_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcollect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/usr/lib/python2.7/site-packages/platformlibs-0.6.8-py2.7.egg/platformlibs/data_handler.pyc\u001b[0m in \u001b[0;36mhdfs_root_uri\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_hdfs_root_uri\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0mcm_conf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mread_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'/etc/platformlibs/platformlibs.ini'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 66\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_hdfs_root_uri\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_hdfs_uri\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcm_conf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'cm_host'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcm_conf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'cm_user'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcm_conf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'cm_pass'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcm_conf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'hadoop_distro'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 67\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_hdfs_root_uri\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mKeyError\u001b[0m: 'cm_host'"
+ ]
+ }
+ ],
+ "source": [
+ "from platformlibs.json_data_handler import JsonDataHandler\n",
+ "handler = JsonDataHandler(sc, \"test\", \"year=2016/month=04/day=26/hour=16\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "** Example 3: ** Simple RDD operations"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "import pprint\n",
+ "\n",
+ "rdd = handler.rdd\n",
+ "# print total nubmer of records\n",
+ "print rdd.count()\n",
+ "\n",
+ "# print one record\n",
+ "pprint.pprint(rdd.take(1))\n",
+ "\n",
+ "# use MapR function to print list of unique router ips\n",
+ "host_ips = rdd.map(lambda x: x['host_ip']).distinct().collect()\n",
+ "pprint.pprint(host_ips)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "** Challenge 1: ** How many unique metrics of all routers have been collected? What are they?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Speculate your anwser here\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Visualize high-level statistics ###\n",
+ "PNDA platform library provide functions to return high-level statistics on per host basis using `list_host_ips()` and on per metric basis using `list_metric_ids()`.\n",
+ "\n",
+ "** Example 4: ** Plot a bar chart to show the total number of records per host"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# plot simple bar chart\n",
+ "%matplotlib inline \n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "\n",
+ "# query host IPs\n",
+ "host_stats = handler.list_host_ips()\n",
+ "host_ips = []\n",
+ "counts = []\n",
+ "for stat in host_stats:\n",
+ " host_ips.append(stat[0])\n",
+ " counts.append(stat[1])\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(15, 8))\n",
+ "x = np.arange(len(host_ips))\n",
+ "rects = ax.bar(x, counts, color='y')\n",
+ "plt.xticks(x+0.5, host_ips, rotation=45) \n",
+ "\n",
+ "def autolabel(rects):\n",
+ " # attach 'counts' labels\n",
+ " for rect in rects:\n",
+ " height = rect.get_height()\n",
+ " ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,\n",
+ " '%d' % int(height),\n",
+ " ha='center', va='bottom')\n",
+ "autolabel(rects) # add label on bar\n",
+ "plt.ylabel('counts')\n",
+ "plt.title('Statistics of hosts')\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "** Challenge 2: ** Generate a bar chart to show total number of records per metric of host 10.0.0.1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Speculate your anwser here\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Introducing interactive UI ###\n",
+ "Interactivity introduction to your notebook can be done by adding widgets provided in the `ipywidgets` package. Each widget consists of two parts: the UI element (e.g. Text Input, sliding bar, etc.) and an event handler. \n",
+ "\n",
+ "** Example 5: ** Interactive visualization of total number of records per metric of a particular host"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%matplotlib inline\n",
+ "from ipywidgets import *\n",
+ "from IPython.display import display\n",
+ "import matplotlib.patches as mpatches\n",
+ "\n",
+ "options= ['--select--'] + sorted(host_ips)\n",
+ "selected_host = \"--select--\"\n",
+ "\n",
+ "host_ip_widget = widgets.Dropdown(description='Host IP:', width=100, options=options)\n",
+ "display(host_ip_widget)\n",
+ "# diplaying the limits input widget:\n",
+ "limits_input = widgets.Text(description=\"limits :\", width=200)\n",
+ "display(limits_input)\n",
+ "# preparing a container to put in the created checkbox per host ip\n",
+ "checkboxes = []\n",
+ "cb_container=widgets.HBox()\n",
+ "display(cb_container)\n",
+ "\n",
+ "# add button that updates the graph based on the checkboxes\n",
+ "button = widgets.Button(description=\"submit\")\n",
+ "display(button)\n",
+ "\n",
+ "def on_button_clicked(b):\n",
+ " selected_host = host_ip_widget.value\n",
+ " \n",
+ " def autolabel(rects):\n",
+ " # attach 'counts' labels\n",
+ " for rect in rects:\n",
+ " height = rect.get_height()\n",
+ " ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,\n",
+ " '%d' % int(height),\n",
+ " ha='center', va='bottom')\n",
+ " limit = -1\n",
+ " if limits_input.value:\n",
+ " limit = int(limits_input.value)\n",
+ " \n",
+ " filters={}\n",
+ " metrics = None\n",
+ " if selected_host != \"--select--\":\n",
+ " filters['host_ips']=[selected_host] \n",
+ " metrics = handler.list_metric_ids(limit=limit, filters=filters)\n",
+ " if len(metrics) > 0:\n",
+ " host_ip = metrics[0][0]\n",
+ " metric_stats = metrics[0][1]\n",
+ "\n",
+ " metric_ids=[]\n",
+ " metric_counts=[]\n",
+ " for stat in metric_stats:\n",
+ " metric_ids.append(stat[0])\n",
+ " metric_counts.append(stat[1])\n",
+ " x = np.arange(len(metric_ids))\n",
+ " fig, ax = plt.subplots(figsize=(15, 8))\n",
+ " metric_rects = ax.bar(x, metric_counts, color='y')\n",
+ " plt.xticks(x+0.5, metric_ids, rotation='vertical') \n",
+ " plt.ylabel ('counts')\n",
+ " patch = mpatches.Patch(color='y', label=host_ip)\n",
+ " plt.legend(handles=[patch])\n",
+ " autolabel(metric_rects)\n",
+ " plt.draw()\n",
+ " else:\n",
+ " print \"Please select a host ip from dropdown list.\"\n",
+ " \n",
+ "button.on_click(on_button_clicked)\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "** Example 6: ** Interactive time series visualization"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "%matplotlib inline\n",
+ "import matplotlib\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "import datetime\n",
+ "from ipywidgets import *\n",
+ "from operator import add\n",
+ "from IPython.display import display\n",
+ "import calendar\n",
+ "import time\n",
+ "\n",
+ "dateFormatString = '%Y-%m-%d %H:%M:%S'\n",
+ "\n",
+ "colors=['b', 'c', 'y', 'm', 'r']\n",
+ "\n",
+ "# displaying the metric id input widget\n",
+ "metric_id_input = widgets.Text(description=\"metric id:\", width=200)\n",
+ "display(metric_id_input)\n",
+ "\n",
+ "host_ip_input = widgets.Text(description=\"host ip:\", width=200, value='edit and hit to add')\n",
+ "display(host_ip_input)\n",
+ "\n",
+ "#preparing the plot \n",
+ "plots = dict() \n",
+ "\n",
+ "#preparing a container to put in created checkbox per host ip\n",
+ "checkboxes = [] \n",
+ "cb_container = widgets.HBox() \n",
+ "display(cb_container)\n",
+ "\n",
+ "#preparing update button\n",
+ "update_button = widgets.Button(description=\"Update\")\n",
+ "\n",
+ "#normalise data with 5-min interval\n",
+ "def post_process(data):\n",
+ " def f(x): \n",
+ " sum_val = 0\n",
+ " for val in x:\n",
+ " sum_val = sum_val + x[0][1]\n",
+ " return sum_val\n",
+ " data_rdd = sc.parallelize(data).map(lambda x: (x[0], int(x[1]))).foldByKey(0, add).sortBy(lambda x: x[0]).groupBy(lambda x : (calendar.timegm(time.strptime(datetime.datetime.fromtimestamp(x[0]/1000).strftime('%Y-%m-%d %H:%M:%S'), dateFormatString))/(5*60))).map(lambda x : (x[0],list(x[1]))).mapValues(f).map(lambda x: (datetime.datetime.fromtimestamp(x[0] * 6*50), x[1]))\n",
+ " return data_rdd.keys().collect(), data_rdd.values().collect()\n",
+ "\n",
+ "#function to deal with the added host ip\n",
+ "def handle_submit(sender): \n",
+ " exists = False\n",
+ " for cb in checkboxes:\n",
+ " if cb.description is host_ip_input.value:\n",
+ " exists = True\n",
+ " if not exists and len(checkboxes)<5:\n",
+ " #add a new checkbox for the new host ip\n",
+ " checkboxes.append(widgets.Checkbox(description = host_ip_input.value, value=True, width=90))\n",
+ " cb_container.children=[i for i in checkboxes]\n",
+ " if len(checkboxes) == 1:\n",
+ " display(update_button)\n",
+ "\n",
+ "#function to deal with the checkbox update button \n",
+ "def on_button_clicked(b): \n",
+ " filters = {}\n",
+ " filters['metrics']=[metric_id_input.value]\n",
+ " host_ips = []\n",
+ " for c in cb_container.children:\n",
+ " if c.value:\n",
+ " host_ips.append(c.description) \n",
+ " filters['host_ips'] = host_ips\n",
+ "\n",
+ " results = handler.execute_query(filters=filters)\n",
+ "\n",
+ " i=0\n",
+ " if len(results) > 0:\n",
+ " # Plot things...\n",
+ " fig = plt.figure(figsize=(15, 8))\n",
+ " ax=fig.add_subplot(111)\n",
+ " for result in results:\n",
+ " label = result[0][1]\n",
+ " timestamps, values = post_process(result[1])\n",
+ " ax.plot_date(timestamps, values, c=colors[i], label=label)\n",
+ " i=i+1\n",
+ " ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter(\"%H:%M:%S\"))\n",
+ " plt.ylabel(metric_id_input.value)\n",
+ " plt.xlabel(\"time of the day\")\n",
+ " plt.legend(loc='upper right')\n",
+ " plt.gray() \n",
+ " plt.show()\n",
+ " \n",
+ "update_button.on_click(on_button_clicked) \n",
+ "host_ip_input.on_submit(handle_submit) \n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "** Challenge 3: ** generate scatter plots to show packet/bytes drops (e.g. use in_byte metric) of a partiular host"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Speculate your anwser here"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "PySpark2/Python2",
+ "language": "python",
+ "name": "pyspark2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.14"
+ },
+ "widgets": {
+ "state": {
+ "1557e450223e4dde8c2413a5ee83e705": {
+ "views": [
+ {
+ "cell_index": 20
+ }
+ ]
+ },
+ "1afdbacd1b0b478aa01978edac6c190a": {
+ "views": [
+ {
+ "cell_index": 20
+ }
+ ]
+ },
+ "1ba2b95d0e4d4a26a7aa819dc8534c6a": {
+ "views": []
+ },
+ "2db1cd0c66dd4e8196a900db826d64c8": {
+ "views": []
+ },
+ "38670bab911f412191a9b3deb84babc7": {
+ "views": []
+ },
+ "461bb60055dd4397b778a407640d1d99": {
+ "views": []
+ },
+ "533f372af51e4cb79e2122e986d701df": {
+ "views": []
+ },
+ "561a2c0073014a1d8cad39649bc96573": {
+ "views": []
+ },
+ "58ce38b80de34d6e82a4d160febff999": {
+ "views": [
+ {
+ "cell_index": 18
+ }
+ ]
+ },
+ "633ca203e85541b59f9131e83f6b12bb": {
+ "views": []
+ },
+ "634e89cc27f14c5eb125d8294ab0e262": {
+ "views": []
+ },
+ "63bf2949f1ab4cdbb73d1d0d6d20dbd0": {
+ "views": []
+ },
+ "657e67c80c174b04bbd1f8cee14674ef": {
+ "views": []
+ },
+ "6b7ff28cf96145abbcb000a6d784b58c": {
+ "views": []
+ },
+ "7cf1b6bb25d841b0b83bf337300df6fd": {
+ "views": []
+ },
+ "810c709e9cc84968a1c20d1d13db8acf": {
+ "views": []
+ },
+ "a2439b82434d41a3b63efdbf3eee6519": {
+ "views": [
+ {
+ "cell_index": 16
+ }
+ ]
+ },
+ "a4848088511d42379c3daf05b7150343": {
+ "views": []
+ },
+ "ac64a66ddb0e4cf9bd3cd37cf4275b28": {
+ "views": []
+ },
+ "ade75b3524bc42d39979108791ac27e0": {
+ "views": []
+ },
+ "aeac437c13634b02be44b86ce14a16d7": {
+ "views": [
+ {
+ "cell_index": 18
+ }
+ ]
+ },
+ "b2d5229dfa0c4bc093922e6fd9afbd7a": {
+ "views": []
+ },
+ "b81aa794d7d74402bb68519be24bb444": {
+ "views": []
+ },
+ "bd62dae252ac444fbc377bec6458eb6b": {
+ "views": []
+ },
+ "c6d9fa2d6f2a4f0fb0b662d6bd89c6a4": {
+ "views": []
+ },
+ "cc93c8dda54d4f139f41b42c3dba67ad": {
+ "views": [
+ {
+ "cell_index": 16
+ }
+ ]
+ },
+ "ce3a67a85dcf4cefbe5fa3e9e27644d8": {
+ "views": []
+ },
+ "df0a9835ec7c49ffaa7992c919a6ddcf": {
+ "views": []
+ },
+ "e61981d0808e4ce1ac91ad16d8f3b55f": {
+ "views": []
+ },
+ "e87b1311dbde4cd0bc020eb484d72f5b": {
+ "views": []
+ },
+ "e9353bafbf6a4a9b8b9de0747078720f": {
+ "views": []
+ },
+ "f0874b40c3da49fbb514e3f9f2047df3": {
+ "views": []
+ },
+ "f2bb960e4fc2454fa1be3b679c7fd078": {
+ "views": []
+ },
+ "fe575e9009c54084bd0a27bdd4c03382": {
+ "views": []
+ }
+ },
+ "version": "1.1.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/docker/dockerfiles/jupyter/docker/platform-libraries-env-conf.diff b/docker/dockerfiles/jupyter/docker/platform-libraries-env-conf.diff
new file mode 100644
index 0000000..c8a1b39
--- /dev/null
+++ b/docker/dockerfiles/jupyter/docker/platform-libraries-env-conf.diff
@@ -0,0 +1,23 @@
+--- common_helpers.py.orig 2018-04-20 10:37:55.033647186 +0200
++++ common_helpers.py 2018-04-20 10:50:57.434402052 +0200
+@@ -14,6 +14,7 @@
+ Purpose: Utility library that defines common helper functions
+ """
+ import requests
++import os
+ from cm_api.api_client import ApiResource
+
+ def flatten_dict(input_d, result=None):
+@@ -118,8 +119,10 @@
+ - hadoop_distro: 'CDH' or 'HDP'
+ '''
+ hdfs_uri = ''
+-
+- if hadoop_distro == 'CDH':
++
++ if hadoop_distro == 'env':
++ hdfs_uri = os.getenv('HDFS_ROOT_URI')
++ elif hadoop_distro == 'CDH':
+ api = connect_cm(cm_host, cm_user, cm_pass)
+
+ for cluster_detail in api.get_all_clusters():
diff --git a/docker/dockerfiles/jupyter/docker/platformlibs.ini.tpl b/docker/dockerfiles/jupyter/docker/platformlibs.ini.tpl
new file mode 100644
index 0000000..7206110
--- /dev/null
+++ b/docker/dockerfiles/jupyter/docker/platformlibs.ini.tpl
@@ -0,0 +1,6 @@
+[cm]
+hadoop_distro={{ HADOOP_DISTRO | default('HDP') }}
+hdfs_root_uri={{ HDFS_ROOT_URI | default('hdfs://hdfs-namenode:8020') }}
+cm_host={{ CM_HOST | default('cm') }}
+cm_user={{ CM_USER | default('scm') }}
+cm_pass={{ CM_PASSWORD | default('scm') }}
diff --git a/docker/dockerfiles/jupyter/docker/pyspark2_kernel.json.tpl b/docker/dockerfiles/jupyter/docker/pyspark2_kernel.json.tpl
new file mode 100644
index 0000000..5428176
--- /dev/null
+++ b/docker/dockerfiles/jupyter/docker/pyspark2_kernel.json.tpl
@@ -0,0 +1,21 @@
+{
+ "display_name": "PySpark2/Python2",
+ "language": "python",
+ "argv": [
+ "/usr/bin/python2",
+ "-m",
+ "ipykernel",
+ "-f",
+ "{connection_file}"
+ ],
+ "env": {
+ "HADOOP_CONF_DIR":"{{HADOOP_CONF_DIR | default('/')}}",
+ "PYSPARK_PYTHON":"/usr/bin/python2",
+ "SPARK_MAJOR_VERSION":"2",
+ "SPARK_HOME": "/opt/spark",
+ "WRAPPED_SPARK_HOME": "/usr/",
+ "PYTHONPATH": "/usr/lib/python2.7/site-packages:/opt/spark/python:/opt/spark/python/lib/py4j-0.10.6-src.zip",
+ "PYTHONSTARTUP": "/opt/spark/python/pyspark/shell.py",
+ "PYSPARK_SUBMIT_ARGS": "--master {{SPARK_MASTER_URL | default('spark://spark-master:7077')}} --jars /opt/spark/examples/jars/spark-examples_2.11-2.3.0.jar pyspark-shell"
+ }
+}
diff --git a/docker/dockerfiles/jupyter/docker/requirements/app-packages-requirements.txt b/docker/dockerfiles/jupyter/docker/requirements/app-packages-requirements.txt
new file mode 100644
index 0000000..9c0a454
--- /dev/null
+++ b/docker/dockerfiles/jupyter/docker/requirements/app-packages-requirements.txt
@@ -0,0 +1,14 @@
+# App dependency packages to be installed for use in pyspark and Jupyter
+avro==1.8.1
+cm-api==14.0.0
+fastavro==0.17.9
+happybase==1.0.0
+kafka-python==1.3.5
+PyHDFS==0.1.2
+pykafka==2.7.0
+pywebhdfs==0.4.1
+PyYAML==3.12
+thrift==0.9.3
+thrift_sasl==0.2.1
+thriftpy==0.3.9
+xmltodict==0.11.0
diff --git a/docker/dockerfiles/jupyter/docker/requirements/requirements-jupyter-extensions.txt b/docker/dockerfiles/jupyter/docker/requirements/requirements-jupyter-extensions.txt
new file mode 100644
index 0000000..735db94
--- /dev/null
+++ b/docker/dockerfiles/jupyter/docker/requirements/requirements-jupyter-extensions.txt
@@ -0,0 +1,3 @@
+lxml==3.6.4
+ipywidgets==6.0.0
+widgetsnbextension==2.0.0
diff --git a/docker/dockerfiles/jupyter/docker/requirements/requirements-jupyter.txt b/docker/dockerfiles/jupyter/docker/requirements/requirements-jupyter.txt
new file mode 100644
index 0000000..7323cf8
--- /dev/null
+++ b/docker/dockerfiles/jupyter/docker/requirements/requirements-jupyter.txt
@@ -0,0 +1,44 @@
+backports-abc==0.5
+bleach==1.5.0
+decorator==4.0.10
+entrypoints==0.2.2
+https://github.com/klyr/jupyter-spark/releases/download/0.3.0-patch/jupyter-spark-0.3.0-patch.tar.gz
+html5lib==0.9999999
+impyla==0.14.0
+ipykernel==4.5.2
+ipython==5.1.0
+ipython-genutils==0.1.0
+ipython-sql==0.3.8
+Jinja2==2.8
+jsonschema==2.5.1
+jupyter==1.0.0
+jupyter-client==4.4.0
+jupyter-console==5.0.0
+jupyter-core==4.4.0
+MarkupSafe==0.23
+mistune==0.7.3
+nbconvert==5.0.0
+nbformat==4.2.0
+notebook==4.3.1
+pandocfilters==1.4.1
+pexpect==4.2.1
+pickleshare==0.7.4
+prompt-toolkit==1.0.9
+ptyprocess==0.5.1
+Pygments==2.1.3
+pymysql==0.7.11
+psycopg2==2.7.3.2
+pyzmq==16.0.2
+qtconsole==4.2.1
+simplegeneric==0.8.1
+six==1.10.0
+sparkmagic==0.12.4
+sql-magic==0.0.3
+terminado==0.6
+testpath==0.3
+thrift==0.9.3
+tornado==4.4.2
+traitlets==4.3.1
+wcwidth==0.1.7
+widgetsnbextension==2.0.0
+matplotlib==2.1.2
diff --git a/docker/dockerfiles/jupyter/docker/requirements/requirements-jupyterhub.txt b/docker/dockerfiles/jupyter/docker/requirements/requirements-jupyterhub.txt
new file mode 100644
index 0000000..10113a8
--- /dev/null
+++ b/docker/dockerfiles/jupyter/docker/requirements/requirements-jupyterhub.txt
@@ -0,0 +1,15 @@
+alembic==0.8.9
+backports-abc==0.5
+decorator==4.0.10
+ipython-genutils==0.1.0
+Jinja2==2.8
+jupyterhub==0.7.0
+Mako==1.0.6
+MarkupSafe==0.23
+pamela==0.3.0
+python-editor==1.0.3
+requests==2.12.4
+six==1.10.0
+SQLAlchemy==1.1.4
+tornado==4.4.2
+traitlets==4.3.1
diff --git a/docker/dockerfiles/platform-console-backend b/docker/dockerfiles/platform-console-backend
new file mode 160000
index 0000000..67961d9
--- /dev/null
+++ b/docker/dockerfiles/platform-console-backend
@@ -0,0 +1 @@
+Subproject commit 67961d942d4a69347129594212f63debdb02ac02
diff --git a/docker/dockerfiles/platform-console-frontend b/docker/dockerfiles/platform-console-frontend
new file mode 160000
index 0000000..9c1c6d6
--- /dev/null
+++ b/docker/dockerfiles/platform-console-frontend
@@ -0,0 +1 @@
+Subproject commit 9c1c6d665a41afc96801241e786ef1b6f0a3714d
diff --git a/docker/dockerfiles/platform-data-mgmnt b/docker/dockerfiles/platform-data-mgmnt
new file mode 160000
index 0000000..862f049
--- /dev/null
+++ b/docker/dockerfiles/platform-data-mgmnt
@@ -0,0 +1 @@
+Subproject commit 862f0491e122f4ca78b8a7ef88f93d47ab8f9e20
diff --git a/docker/dockerfiles/platform-deployment-manager b/docker/dockerfiles/platform-deployment-manager
new file mode 160000
index 0000000..ed81386
--- /dev/null
+++ b/docker/dockerfiles/platform-deployment-manager
@@ -0,0 +1 @@
+Subproject commit ed81386e35a983c559f6a0e2d79f90dd6d8ac222
diff --git a/docker/dockerfiles/platform-gobblin-modules b/docker/dockerfiles/platform-gobblin-modules
new file mode 160000
index 0000000..6195093
--- /dev/null
+++ b/docker/dockerfiles/platform-gobblin-modules
@@ -0,0 +1 @@
+Subproject commit 61950938cd6dd86bb098c1910169d78fe560c899
diff --git a/docker/dockerfiles/platform-package-repository b/docker/dockerfiles/platform-package-repository
new file mode 160000
index 0000000..2e06083
--- /dev/null
+++ b/docker/dockerfiles/platform-package-repository
@@ -0,0 +1 @@
+Subproject commit 2e0608356a246539a464ab3af6653415b6155594
diff --git a/docker/dockerfiles/platform-testing/Dockerfile b/docker/dockerfiles/platform-testing/Dockerfile
new file mode 100644
index 0000000..e3903e7
--- /dev/null
+++ b/docker/dockerfiles/platform-testing/Dockerfile
@@ -0,0 +1,33 @@
+FROM alpine:3.7 as builder
+LABEL maintainer="cgiraldo@gradiant.org"
+LABEL organization="gradiant.org"
+ARG version
+ENV VERSION $version
+RUN apk add --no-cache git bash python build-base linux-pam-dev maven=3.5.2-r0 bc grep python2-dev py2-nose py2-pip cyrus-sasl-dev ca-certificates wget\
+&& pip install spur==0.3.12 starbase==0.3.2 happybase==1.0.0 pyhs2==0.6.0 pywebhdfs==0.4.0 PyHDFS==0.1.2 cm-api==8.0.0 shyaml==0.4.1 \
+nose==1.3.7 mock==2.0.0 pylint==1.6.4 python-swiftclient==3.1.0 tornado==4.4.2 tornado-cors==0.6.0 Tornado-JSON==1.2.2 boto==2.40.0 \
+setuptools==28.8.0 --upgrade impyla==0.13.8 eventlet==0.19.0 kazoo==2.2.1 avro==1.8.1 kafka-python==1.3.5 prettytable==0.7.2 \
+pyhive==0.2.1 thrift_sasl==0.2.1 JayDeBeApi==1.1.1 \
+&& ln -s /usr/bin/nosetests-2.7 /usr/bin/nosetests
+
+RUN wget -qO- https://github.com/pndaproject/platform-testing/archive/$VERSION.tar.gz | tar -xvz && \
+ mv platform-testing-$VERSION src
+
+#pnda.io platform-testing search for Maven 3.0.5. We patch this to use Maven 3.5
+RUN sed -i 's/Apache Maven 3.0.5/Apache Maven 3.5/g' /src/build.sh
+RUN cd src && ./build.sh $VERSION
+
+
+FROM alpine:3.7 as platform-testing
+LABEL maintainer="cgiraldo@gradiant.org"
+LABEL organization="gradiant.org"
+ARG version
+ENV VERSION $version
+COPY --from=builder /src/pnda-build /
+COPY jinja_entrypoint.sh entrypoint.sh.tpl hbase_spark_metric.py /
+ENTRYPOINT /jinja_entrypoint.sh
+RUN apk add --no-cache bash py2-pip tar && tar -xzf /platform-testing-general-${VERSION}.tar.gz \
+&& mv /platform-testing-general-${VERSION} /platform-testing-general \
+&& pip install j2cli \
+&& find /platform-testing-general -name requirements.txt -exec pip install -r '{}' \;
+
diff --git a/docker/dockerfiles/platform-testing/build-docker.sh b/docker/dockerfiles/platform-testing/build-docker.sh
new file mode 100755
index 0000000..9f2af26
--- /dev/null
+++ b/docker/dockerfiles/platform-testing/build-docker.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+VERSION=0.5.0
+docker build --build-arg version=$VERSION -t pnda/testing:$VERSION .
diff --git a/docker/dockerfiles/platform-testing/entrypoint.sh.tpl b/docker/dockerfiles/platform-testing/entrypoint.sh.tpl
new file mode 100644
index 0000000..b747886
--- /dev/null
+++ b/docker/dockerfiles/platform-testing/entrypoint.sh.tpl
@@ -0,0 +1,16 @@
+#/bin/sh
+
+while true
+do
+python /platform-testing-general/monitor.py --plugin zookeeper \
+--postjson http://{{ CONSOLE_HOSTS | default('console-backend-data-logger:3001') }}/metrics \
+--extra "--zconnect {{ ZOOKEEPERS | default('zookeeper:2181') }}"
+
+python /platform-testing-general/monitor.py --plugin kafka \
+--postjson http://{{ CONSOLE_HOSTS | default('console-backend-data-logger:3001') }}/metrics \
+--extra "--brokerlist {{ KAFKA_BROKERS | default('kafka:9092') }} \
+--zkconnect {{ ZOOKEEPERS | default('zookeeper:2181') }} --prod2cons"
+
+python /hbase_spark_metric.py http://{{ CONSOLE_HOSTS | default('console-backend-data-logger:3001') }}/metrics
+sleep 60
+done
diff --git a/docker/dockerfiles/platform-testing/hbase_spark_metric.py b/docker/dockerfiles/platform-testing/hbase_spark_metric.py
new file mode 100644
index 0000000..2aceec8
--- /dev/null
+++ b/docker/dockerfiles/platform-testing/hbase_spark_metric.py
@@ -0,0 +1,26 @@
+import requests
+import json
+import time
+import sys
+
+# a python script to return OK status for hbase and spark components. Hard coded to return OK on every run - need to change this
+
+TIMESTAMP_MILLIS = lambda: int(time.time() * 1000)
+components = {'hbase01': 'hadoop.HBASE.health', 'spark_on_yarn': 'hadoop.SPARK_ON_YARN.health'}
+if len(sys.argv) > 2 :
+ print ("usage: hbase_spark_metric.py [console_backend_data_manager_metrics_endpoint]")
+ sys.exit()
+elif len(sys.argv) == 2:
+ host = sys.argv[1]
+else:
+ host = "http://127.0.0.1:3001/metrics"
+
+for key, value in components.iteritems():
+ json_data = {"data": [{"source": key, "metric": value, "value": "OK", "causes": "[]", "timestamp": TIMESTAMP_MILLIS()}], "timestamp": TIMESTAMP_MILLIS()}
+ try:
+ headers = {'Content-Type': 'application/json', 'Connection':'close'}
+ response = requests.post(host, data=json.dumps(json_data), headers=headers)
+ if response.status_code != 200:
+ print "_send failed: %s", response.status_code
+ except requests.exceptions.RequestException as ex:
+ print "_send failed: %s", ex
diff --git a/docker/dockerfiles/platform-testing/jinja_entrypoint.sh b/docker/dockerfiles/platform-testing/jinja_entrypoint.sh
new file mode 100755
index 0000000..b8474b2
--- /dev/null
+++ b/docker/dockerfiles/platform-testing/jinja_entrypoint.sh
@@ -0,0 +1,4 @@
+#/bin/sh
+j2 /entrypoint.sh.tpl > /entrypoint.sh
+chmod +x /entrypoint.sh
+/entrypoint.sh