Skip to content

Commit

Permalink
Adding all dockerfiles for PNDA services
Browse files Browse the repository at this point in the history
  • Loading branch information
cgiraldo committed Oct 16, 2018
1 parent 41c92d6 commit ca2c8df
Show file tree
Hide file tree
Showing 83 changed files with 2,147 additions and 0 deletions.
60 changes: 60 additions & 0 deletions docker/dockerfiles/jupyter/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Build stage: checks out pndaproject/platform-libraries at release/4.0,
# applies the local hdfs_root_uri patch and builds the platformlibs egg that
# the final stage copies from /platform-libraries/dist.
FROM alpine:3.7 AS platformlibs

# NOTE(review): these labels are attached to this intermediate stage only;
# the final stage below declares no labels of its own.
LABEL maintainer="[email protected]"
LABEL organization="gradiant.org"

COPY docker/hdfs_root_uri_conf.diff /
RUN apk add --no-cache git bash python py2-pip && \
    pip install --no-cache-dir setuptools
RUN git clone https://github.com/pndaproject/platform-libraries.git /platform-libraries

# WORKDIR replaces the former "RUN cd platform-libraries && ..." chain.
WORKDIR /platform-libraries

# VERSION is exported for the build; presumably setup.py reads it -- confirm.
RUN git checkout tags/release/4.0 && \
    export VERSION=$(git describe --tags) && \
    git apply /hdfs_root_uri_conf.diff && \
    python setup.py bdist_egg

# Runtime stage: JupyterHub with Python 2 / Python 3 kernels, Spark 2.3.0 and
# the platformlibs egg produced by the "platformlibs" build stage.
FROM alpine:3.7

# The egg file name embeds the platformlibs version and must match the
# artifact produced by the build stage.
COPY --from=platformlibs /platform-libraries/dist/platformlibs-0.1.5-py2.7.egg /
# Copies the contents of the local docker/ directory into the image root.
# Later steps use /requirements/*, /notebooks, /create_notebook_dir.sh and
# /entrypoint.sh from it.
COPY docker /
ENV SPARK_HOME=/opt/spark

# OS packages, Python 2/3 Jupyter stacks, Spark, notebook extensions and
# JupyterHub. pip runs with --no-cache-dir so download caches do not end up
# in the layer, and the Spark tarball is downloaded to a file (instead of
# being piped into tar) so a failed download fails the build explicitly.
RUN apk add --no-cache \
        bash \
        build-base \
        ca-certificates \
        cyrus-sasl-dev \
        freetype-dev \
        krb5-dev \
        libffi-dev \
        libpng-dev \
        nodejs \
        openjdk8-jre \
        postgresql-dev \
        py2-pip \
        python2 \
        python2-dev \
        python3 \
        python3-dev \
        shadow && \
    echo 'Installing python2 requirements' && \
    pip2 install --no-cache-dir -r /requirements/requirements-jupyter.txt && \
    pip2 install --no-cache-dir -r /requirements/app-packages-requirements.txt && \
    pip2 install --no-cache-dir j2cli && \
    /usr/bin/python2 -m ipykernel.kernelspec --name python2 --display-name "Python 2" && \
    echo 'Instaling python3 requirements' && \
    pip3 install --no-cache-dir -r /requirements/requirements-jupyter.txt && \
    /usr/bin/python3 -m ipykernel.kernelspec --name python3 --display-name "Python 3" && \
    echo 'Adding pyspark2 support' && \
    mkdir -p /usr/local/share/jupyter/kernels/pyspark2 && mkdir -p /opt && \
    wget -O /tmp/spark.tgz https://archive.apache.org/dist/spark/spark-2.3.0/spark-2.3.0-bin-hadoop2.7.tgz && \
    tar -xzf /tmp/spark.tgz -C /tmp && \
    rm /tmp/spark.tgz && \
    mv /tmp/spark-2.3.0-bin-hadoop2.7 /opt/spark && \
    echo 'Adding jupyter-scala_extension_spark' && \
    jupyter nbextension enable --py widgetsnbextension --system && \
    jupyter-kernelspec install /usr/lib/python3.6/site-packages/sparkmagic/kernels/sparkkernel && \
    jupyter serverextension enable --py sparkmagic && \
    echo 'Adding jupyter-extensions' && \
    apk add --no-cache libxml2-dev libxslt-dev && \
    pip3 install --no-cache-dir -r /requirements/requirements-jupyter-extensions.txt && \
    jupyter serverextension enable --py jupyter_spark --system && \
    jupyter nbextension install --py jupyter_spark --system && \
    jupyter nbextension enable --py jupyter_spark --system && \
    jupyter nbextension enable --py widgetsnbextension --system && \
    echo 'Adding jupyterhub' && \
    pip3 install --no-cache-dir -r /requirements/requirements-jupyterhub.txt && \
    npm install -g configurable-http-proxy && mkdir -p /var/log/pnda && \
    echo 'auth required pam_exec.so debug log=/var/log/pnda/login.log /create_notebook_dir.sh' >> /etc/pam.d/login

# Installs the platformlibs egg, creates the default "pnda" account, moves the
# bundled example notebooks into place and denies PAM logins for root.
# NOTE(review): the "pnda" account is created with a fixed password that is
# visible in this file; it should be changed or disabled at deploy time.
RUN echo 'Adding pnda platform-libraries' && \
    mkdir /etc/platformlibs && /usr/bin/python2 -m easy_install /platformlibs-0.1.5-py2.7.egg && \
    adduser -D pnda && echo "pnda:pnda" | chpasswd && \
    mkdir -p /opt/pnda && mv /notebooks /opt/pnda/jupyter_notebooks && \
    echo 'auth required pam_listfile.so item=user sense=deny file=/etc/login.deny onerr=succeed' >> /etc/pam.d/login && \
    echo 'root' >> /etc/login.deny

# Kafka connector jars for Spark Structured Streaming. The former
# http://central.maven.org host has been retired; Maven Central is served
# over HTTPS from repo1.maven.org.
RUN wget https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_2.11/2.3.0/spark-sql-kafka-0-10_2.11-2.3.0.jar \
        -O /opt/spark/jars/spark-sql-kafka-0-10_2.11-2.3.0.jar && \
    wget https://repo1.maven.org/maven2/org/apache/kafka/kafka-clients/1.0.0/kafka-clients-1.0.0.jar \
        -O /opt/spark/jars/kafka-clients-1.0.0.jar

# Exec form, so no extra "sh -c" wrapper sits between Docker and the script.
# The script is run through /bin/sh explicitly so it starts regardless of its
# shebang line or executable bit.
ENTRYPOINT ["/bin/sh", "/entrypoint.sh"]

4 changes: 4 additions & 0 deletions docker/dockerfiles/jupyter/build-docker.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
# Builds the PNDA Jupyter image from the Dockerfile in the current directory
# and tags it with the release version below.

VERSION=4.4.0
docker build --tag "pnda/jupyter:${VERSION}" .
114 changes: 114 additions & 0 deletions docker/dockerfiles/jupyter/data_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#!/usr/bin/python

import argparse
import subprocess
import json
import avro.schema
import avro.io
import io
import datetime
import uuid
import time
import sys

from random import randint
from avro.datafile import DataFileWriter
from avro.io import DatumWriter
from argparse import RawTextHelpFormatter

def generate_sample_datasets (host_ips, metric_ids, year, month, day, hour):
    """Generate one hour of random sample records and upload them to HDFS.

    The Avro schema is read from HDFS, one record per second and per host is
    written to a temporary Avro file under /tmp, and that file is copied into
    the HDFS directory returned by create_hdfs_dirs().

    Args:
        host_ips: iterable of host ip strings; one record per host per second.
        metric_ids: iterable of metric ids passed to generate_random_metrics().
        year, month, day, hour: integers selecting the hour to generate.

    Returns:
        The HDFS path (directory + file name) of the generated Avro file.

    Raises:
        subprocess.CalledProcessError: if reading the schema or copying the
            file with the hadoop CLI fails.
    """
    import os  # local import: only needed here to remove the temporary file

    # Load the Avro schema from HDFS; check_output waits for the command and
    # raises if it exits with a non-zero status.
    avro_schema = subprocess.check_output(
        ['sudo', '-u', 'hdfs', 'hadoop', 'fs', '-cat',
         '/user/pnda/PNDA_datasets/datasets/.metadata/schema.avsc'])
    schema = avro.schema.parse(avro_schema)

    # Create the HDFS folder structure
    dir = create_hdfs_dirs (year, month, day, hour)
    filename = str(uuid.uuid4()) + '.avro'
    filepath = dir + filename
    tmp_file = '/tmp/' + filename

    start_dt = datetime.datetime(year, month, day, hour, 0, 0)
    # timedelta rolls over to the next day/month/year; the previous
    # start_dt.replace(hour=hour+1) raised ValueError for hour == 23.
    end_dt = start_dt + datetime.timedelta(hours=1)
    start_ts = int(time.mktime(start_dt.timetuple()))
    end_ts = int(time.mktime(end_dt.timetuple()))

    try:
        # Avro container files are binary, so the file is opened in 'wb' mode.
        writer = DataFileWriter(open(tmp_file, "wb"), DatumWriter(), schema)
        try:
            for ts in range(start_ts, end_ts):
                # Generate one random pnda record per host ip
                for host_ip in host_ips:
                    record = {
                        'timestamp': ts * 1000,  # milliseconds
                        'src': 'test',
                        'host_ip': host_ip,
                        'rawdata': generate_random_metrics(metric_ids),
                    }
                    writer.append(record)
        finally:
            writer.close()

        # Wait for the upload so the returned path is only reported once the
        # copy has actually finished, and so a failed copy raises.
        subprocess.check_call(['sudo', '-u', 'hdfs', 'hadoop', 'fs', '-copyFromLocal', tmp_file, dir])
    finally:
        # Do not leave the generated file behind in /tmp.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
    return filepath

def generate_random_metrics (metric_ids):
    '''
    Build a random raw_data element.

    Each id in metric_ids is mapped to a random integer between 0 and 100
    (inclusive) rendered as a string; the mapping is returned as a
    UTF-8 encoded JSON document.
    '''
    samples = dict((metric, str(randint(0, 100))) for metric in metric_ids)
    serialized = json.dumps(samples)
    return serialized.encode('utf-8')

def create_hdfs_dirs (year, month, day, hour):
    """Create the HDFS partition directory for the given hour.

    Runs ``hadoop fs -mkdir -p`` as the hdfs user and waits for it to finish,
    so the directory exists before the caller copies files into it.

    Args:
        year, month, day, hour: integers identifying the partition.

    Returns:
        The HDFS directory path, with a trailing slash.

    Raises:
        subprocess.CalledProcessError: if the mkdir command exits non-zero.
    """
    hdfs_dir = "/user/pnda/PNDA_datasets/datasets/source=test/year=%0d/month=%02d/day=%02d/hour=%02d/" % (year, month, day, hour)
    # check_call blocks until the command completes; the previous bare Popen
    # returned immediately and ignored failures.
    subprocess.check_call(['sudo', '-u', 'hdfs', 'hadoop', 'fs', '-mkdir', '-p', hdfs_dir])
    return hdfs_dir

def get_args():
    """Parse the command-line options of the sample dataset generator.

    Returns:
        argparse.Namespace with ``hosts`` and ``metrics`` (comma-separated
        strings, default ``''``) and ``year``, ``month``, ``day`` and ``hour``
        (integers, defaulting to the current local date and hour).
    """
    epilog = """ example:
- create sample data sets
data_generator.py --hosts '10.0.0.1, 10.0.0.2' --metrics 'a, b, c' --year 2016 --month 4 --day 27 --hour 14
- create sample data sets using system datetime
data_generator.py --hosts '10.0.0.1, 10.0.0.2' --metrics 'a, b, c'
"""

    dt = datetime.datetime.now()
    parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description='Sample datasets generator', epilog=epilog)
    parser.add_argument('--hosts', help='list of sample host ips separated by comma', default='')
    parser.add_argument('--metrics', help='list of metrics ids', default='')
    parser.add_argument('--year', type=int, help='year', default=dt.year)
    parser.add_argument('--month', type=int, help='month', default=dt.month)
    parser.add_argument('--day', type=int, help='day of the month', default=dt.day)
    # type=int added for consistency with the other date options, so a
    # non-numeric value is rejected by argparse with a usage error.
    parser.add_argument('--hour', type=int, help='hour of the day', default=dt.hour)
    return parser.parse_args()

def main():
    """Script entry point: validate the arguments and generate the dataset.

    Exits with status 1 when --hosts or --metrics contains no usable entry;
    otherwise prints the HDFS path of the generated file.
    """
    args = get_args()

    # Blank entries (e.g. from a trailing comma) are dropped so they do not
    # become empty host names in the generated records.
    host_ips = [x.strip() for x in args.hosts.split(",") if x.strip()]
    if not host_ips:
        print('mandatory arg --hosts missing (aborting).')
        # Non-zero status so callers can detect the failure.
        sys.exit(1)

    metric_ids = [x.strip() for x in args.metrics.split(",") if x.strip()]
    if not metric_ids:
        print('mandatory arg --metrics missing (aborting).')
        sys.exit(1)

    year = int(args.year)
    month = int(args.month)
    day = int(args.day)
    hour = int(args.hour)
    filepath = generate_sample_datasets(host_ips, metric_ids, year, month, day, hour)
    print("Success: generated file path at " + filepath)

if __name__ == "__main__":
    main()



19 changes: 19 additions & 0 deletions docker/dockerfiles/jupyter/docker/create_notebook_dir.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/sh
# Creates /home/$PAM_USER and /home/$PAM_USER/jupyter_notebooks on first use
# and seeds the latter with the example notebooks from
# /opt/pnda/jupyter_notebooks.
# NOTE(review): intended to be invoked by pam_exec, which is expected to
# provide PAM_USER -- confirm against the PAM configuration in use.

set -x

# PAM_USER comes from the login attempt, so treat it as untrusted: reject
# empty names, names starting with a dot and names containing a slash so the
# paths below cannot escape /home.
case "$PAM_USER" in
    ''|.*|*/*)
        echo "create_notebook_dir.sh: refusing unsafe PAM_USER '$PAM_USER'" >&2
        exit 1
        ;;
esac

# Do not create directories for accounts that do not exist; the chown calls
# below would fail for them anyway and leave root-owned directories behind.
if ! id -u "$PAM_USER" >/dev/null 2>&1; then
    echo "create_notebook_dir.sh: unknown user '$PAM_USER'" >&2
    exit 1
fi

DIR="/home/$PAM_USER"
if [ ! -d "$DIR" ]; then
    mkdir "$DIR"
    chmod 0755 "$DIR"
    chown "$PAM_USER": "$DIR"
fi

DIR="$DIR/jupyter_notebooks"
if [ ! -d "$DIR" ]; then
    mkdir "$DIR"
    cp -r /opt/pnda/jupyter_notebooks "$DIR/examples"
    chmod -R 0755 "$DIR"
    chown -R "$PAM_USER": "$DIR"
fi

4 changes: 4 additions & 0 deletions docker/dockerfiles/jupyter/docker/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Container entrypoint: renders the pyspark2 kernel spec and the platformlibs
# configuration from their j2 templates, then starts JupyterHub.
j2 /pyspark2_kernel.json.tpl > /usr/local/share/jupyter/kernels/pyspark2/kernel.json
j2 /platformlibs.ini.tpl > /etc/platformlibs/platformlibs.ini
# exec replaces this shell with jupyterhub so that signals sent to the
# script's process reach jupyterhub directly.
exec /usr/bin/jupyterhub
16 changes: 16 additions & 0 deletions docker/dockerfiles/jupyter/docker/hdfs_root_uri_conf.diff
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
diff --git a/platformlibs/data_handler.py b/platformlibs/data_handler.py
index 27a2ea5..7bc1ae3 100644
--- a/platformlibs/data_handler.py
+++ b/platformlibs/data_handler.py
@@ -63,7 +63,10 @@ class DataHandler(object):
if self._hdfs_root_uri:
return self._hdfs_root_uri
cm_conf = read_config('/etc/platformlibs/platformlibs.ini')
- self._hdfs_root_uri = get_hdfs_uri(cm_conf['cm_host'], cm_conf['cm_user'], cm_conf['cm_pass'], cm_conf['hadoop_distro'])
+ if 'hdfs_root_uri' in cm_conf:
+ self._hdfs_root_uri = cm_conf['hdfs_root_uri']
+ else:
+ self._hdfs_root_uri = get_hdfs_uri(cm_conf['cm_host'], cm_conf['cm_user'], cm_conf['cm_pass'], cm_conf['hadoop_distro'])
return self._hdfs_root_uri

@property
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Minimal PNDA Jupyter SqlMagic notebook"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"'''\n",
"Use following connection string to connect to MySQL DB. Enter valid username/password and hostname/IP of mysql server. \n",
"%load_ext sql\n",
"%sql mysql+pymysql://username:password@hostname/dbname\n",
"\n",
"\n",
"Use following connection string to connect to PostgreSQL. Enter valid username/password and hostname/IP of postgresql server.\n",
"%load_ext sql\n",
"%sql postgresql://username:password@localhost/dbname\n",
"\n",
"Use following connection string to connect to Impala (CDH distribution only). Enter valid username/password and hostname/IP of impala server.\n",
"Note : Impala connection through impyla requires to disable autocommit. Use %config SqlMagic to check various configurations available.\n",
"%load_ext sql\n",
"%config SqlMagic.autocommit=False\n",
"%sql impala://hostname:port/dbname\n",
"'''\n",
"%load_ext sql"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Minimal PNDA Jupyter notebook\n",
"\n",
"`%matplotlib notebook` must be set before `import matplotlib.pyplot as plt` or plotting with matplotlib will fail "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib notebook\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import sys\n",
"import pandas as pd\n",
"import matplotlib"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(u'▶ Python version ' + sys.version)\n",
"print(u'▶ Pandas version ' + pd.__version__)\n",
"print(u'▶ Matplotlib version ' + matplotlib.__version__)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy as np\n",
"values = np.random.rand(100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = pd.DataFrame(data=values, columns=['RandomValue'])\n",
"df.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df.plot()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "PySpark2/Python2",
"language": "python",
"name": "pyspark2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Loading

0 comments on commit ca2c8df

Please sign in to comment.