Adding all dockerfiles for PNDA services
Showing 83 changed files with 2,147 additions and 0 deletions.
@@ -0,0 +1,60 @@
# Stage 1: build the platform-libraries egg
FROM alpine:3.7 as platformlibs

LABEL maintainer="[email protected]"
LABEL organization="gradiant.org"

COPY docker/hdfs_root_uri_conf.diff /
RUN apk add --no-cache git bash python py2-pip && pip install setuptools
RUN git clone https://github.com/pndaproject/platform-libraries.git
RUN cd platform-libraries && git checkout tags/release/4.0 && \
    export VERSION=$(git describe --tags) && \
    git apply /hdfs_root_uri_conf.diff && \
    python setup.py bdist_egg

# Stage 2: runtime image with Jupyter, JupyterHub and Spark
FROM alpine:3.7

COPY --from=platformlibs /platform-libraries/dist/platformlibs-0.1.5-py2.7.egg /
COPY docker /
ENV SPARK_HOME=/opt/spark

RUN apk add --no-cache bash python2 py2-pip postgresql-dev libpng-dev freetype-dev ca-certificates build-base python2-dev krb5-dev libffi-dev cyrus-sasl-dev nodejs shadow python3 python3-dev openjdk8-jre && \
    echo 'Installing python2 requirements' && \
    pip2 install -r /requirements/requirements-jupyter.txt && \
    pip2 install -r /requirements/app-packages-requirements.txt && pip2 install j2cli && \
    /usr/bin/python2 -m ipykernel.kernelspec --name python2 --display-name "Python 2" && \
    echo 'Installing python3 requirements' && \
    pip3 install -r /requirements/requirements-jupyter.txt && \
    /usr/bin/python3 -m ipykernel.kernelspec --name python3 --display-name "Python 3" && \
    echo 'Adding pyspark2 support' && \
    mkdir -p /usr/local/share/jupyter/kernels/pyspark2 && mkdir -p /opt && \
    wget -O- https://archive.apache.org/dist/spark/spark-2.3.0/spark-2.3.0-bin-hadoop2.7.tgz | tar -xvz -C /tmp && \
    mv /tmp/spark-2.3.0-bin-hadoop2.7 /opt/spark && \
    echo 'Adding jupyter-scala_extension_spark' && \
    jupyter nbextension enable --py widgetsnbextension --system && \
    jupyter-kernelspec install /usr/lib/python3.6/site-packages/sparkmagic/kernels/sparkkernel && \
    jupyter serverextension enable --py sparkmagic && \
    echo 'Adding jupyter-extensions' && \
    apk add --no-cache libxml2-dev libxslt-dev && \
    pip3 install -r /requirements/requirements-jupyter-extensions.txt && \
    jupyter serverextension enable --py jupyter_spark --system && \
    jupyter nbextension install --py jupyter_spark --system && \
    jupyter nbextension enable --py jupyter_spark --system && \
    jupyter nbextension enable --py widgetsnbextension --system && \
    echo 'Adding jupyterhub' && \
    pip3 install -r /requirements/requirements-jupyterhub.txt && \
    npm install -g configurable-http-proxy && mkdir -p /var/log/pnda && \
    echo 'auth required pam_exec.so debug log=/var/log/pnda/login.log /create_notebook_dir.sh' >> /etc/pam.d/login

RUN echo 'Adding pnda platform-libraries' && \
    mkdir /etc/platformlibs && /usr/bin/python2 -m easy_install /platformlibs-0.1.5-py2.7.egg && \
    adduser -D pnda && echo "pnda:pnda" | chpasswd && \
    mkdir -p /opt/pnda && mv /notebooks /opt/pnda/jupyter_notebooks && \
    echo 'auth required pam_listfile.so item=user sense=deny file=/etc/login.deny onerr=succeed' >> /etc/pam.d/login && \
    echo 'root' >> /etc/login.deny

# Fetch the Kafka structured-streaming jars for Spark
RUN wget http://central.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_2.11/2.3.0/spark-sql-kafka-0-10_2.11-2.3.0.jar \
        -O /opt/spark/jars/spark-sql-kafka-0-10_2.11-2.3.0.jar && \
    wget http://central.maven.org/maven2/org/apache/kafka/kafka-clients/1.0.0/kafka-clients-1.0.0.jar \
        -O /opt/spark/jars/kafka-clients-1.0.0.jar

ENTRYPOINT /entrypoint.sh
@@ -0,0 +1,4 @@
#!/bin/bash

VERSION=4.4.0
docker build -t pnda/jupyter:$VERSION .
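For a local smoke test, the resulting image can be run along these lines. This is a sketch rather than part of the commit: the published port is an assumption, since JupyterHub listens on 8000 by default and the Dockerfile declares no EXPOSE.

# Hypothetical local run; port 8000 is JupyterHub's default, not declared in the Dockerfile
docker run -d --name pnda-jupyter -p 8000:8000 pnda/jupyter:4.4.0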
@@ -0,0 +1,114 @@
#!/usr/bin/python

import argparse
import subprocess
import json
import avro.schema
import avro.io
import io
import datetime
import uuid
import time
import sys

from random import randint
from avro.datafile import DataFileWriter
from avro.io import DatumWriter
from argparse import RawTextHelpFormatter

def generate_sample_datasets(host_ips, metric_ids, year, month, day, hour):
    avro_schema = ''
    # load the dataset schema from HDFS
    cat = subprocess.Popen(['sudo', '-u', 'hdfs', 'hadoop', 'fs', '-cat', '/user/pnda/PNDA_datasets/datasets/.metadata/schema.avsc'], stdout=subprocess.PIPE)
    for line in cat.stdout:
        avro_schema = avro_schema + line
    schema = avro.schema.parse(avro_schema)
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    # create the partitioned HDFS folder structure
    dir = create_hdfs_dirs(year, month, day, hour)
    filename = str(uuid.uuid4()) + '.avro'
    filepath = dir + filename
    tmp_file = '/tmp/' + filename

    # Avro container files are binary, so open the temp file in binary mode
    writer = DataFileWriter(open(tmp_file, "wb"), DatumWriter(), schema)

    start_dt = datetime.datetime(year, month, day, hour, 0, 0)
    start_ts = int(time.mktime(start_dt.timetuple()))
    end_dt = start_dt.replace(hour=hour+1)
    end_ts = int(time.mktime(end_dt.timetuple()))

    for ts in xrange(start_ts, end_ts, 1):
        # generate one random pnda record per host ip per second
        for host_ip in host_ips:
            record = {}
            record['timestamp'] = (ts * 1000)
            record['src'] = 'test'
            record['host_ip'] = host_ip
            record['rawdata'] = generate_random_metrics(metric_ids)
            writer.append(record)
    writer.close()
    # wait for the copy to complete so the caller sees a finished file
    subprocess.call(['sudo', '-u', 'hdfs', 'hadoop', 'fs', '-copyFromLocal', tmp_file, dir])
    return filepath

def generate_random_metrics(metric_ids):
    '''
    generate a random raw_data element
    '''
    raw_data = {}
    for metric_id in metric_ids:
        raw_data[metric_id] = str(randint(0, 100))
    return json.dumps(raw_data).encode('utf-8')

def create_hdfs_dirs(year, month, day, hour):
    dir = "/user/pnda/PNDA_datasets/datasets/source=test/year=%0d/month=%02d/day=%02d/hour=%02d/" % (year, month, day, hour)
    # wait for mkdir to complete before writing into the directory
    subprocess.call(['sudo', '-u', 'hdfs', 'hadoop', 'fs', '-mkdir', '-p', dir])
    return dir

def get_args():
    epilog = """ example:
  - create sample data sets
    data_generator.py --hosts '10.0.0.1, 10.0.0.2' --metrics 'a, b, c' --year 2016 --month 4 --day 27 --hour 14
  - create sample data sets using system datetime
    data_generator.py --hosts '10.0.0.1, 10.0.0.2' --metrics 'a, b, c'
"""

    dt = datetime.datetime.now()
    parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description='Sample datasets generator', epilog=epilog)
    parser.add_argument('--hosts', help='list of sample host ips separated by comma', default='')
    parser.add_argument('--metrics', help='list of metric ids separated by comma', default='')
    parser.add_argument('--year', type=int, help='year', default=dt.year)
    parser.add_argument('--month', type=int, help='month', default=dt.month)
    parser.add_argument('--day', type=int, help='day of the month', default=dt.day)
    parser.add_argument('--hour', type=int, help='hour of the day', default=dt.hour)
    args = parser.parse_args()
    return args

def main():
    args = get_args()
    hosts = args.hosts.strip()
    if not hosts:
        print 'mandatory arg --hosts missing (aborting).'
        sys.exit()

    host_ips = [x.strip() for x in hosts.split(",")]

    metrics = args.metrics.strip()
    if not metrics:
        print 'mandatory arg --metrics missing (aborting).'
        sys.exit()
    metric_ids = [x.strip() for x in metrics.split(",")]

    year = int(args.year)
    month = int(args.month)
    day = int(args.day)
    hour = int(args.hour)
    filepath = generate_sample_datasets(host_ips, metric_ids, year, month, day, hour)
    print "Success: generated file path at " + filepath

if __name__ == "__main__":
    main()
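As a sketch of how the generator is exercised end to end (assuming a host with HDFS access and sudo rights to the hdfs user), the invocation from the epilog can be paired with a listing of the partition it creates; note that year= is unpadded because the format string uses %0d:

# Hypothetical smoke test for data_generator.py
python data_generator.py --hosts '10.0.0.1, 10.0.0.2' --metrics 'a, b, c' --year 2016 --month 4 --day 27 --hour 14
sudo -u hdfs hadoop fs -ls /user/pnda/PNDA_datasets/datasets/source=test/year=2016/month=04/day=27/hour=14/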
@@ -0,0 +1,19 @@
#!/bin/sh

set -x

DIR=/home/$PAM_USER
if [ ! -d "$DIR" ]; then
  mkdir "$DIR"
  chmod 0755 "$DIR"
  chown $PAM_USER: "$DIR"
fi

DIR=$DIR/jupyter_notebooks
if [ ! -d "$DIR" ]; then
  mkdir "$DIR"
  cp -r /opt/pnda/jupyter_notebooks "$DIR/examples"
  chmod -R 0755 "$DIR"
  chown -R $PAM_USER: "$DIR"
fi
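Because pam_exec exports the authenticating user to the script via the PAM_USER environment variable, the hook can also be exercised by hand. A minimal sketch, run as root inside the container and reusing the pnda user created in the Dockerfile:

# Hypothetical manual invocation of the PAM login hook
PAM_USER=pnda /create_notebook_dir.sh
ls -ld /home/pnda/jupyter_notebooks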
@@ -0,0 +1,4 @@
#!/bin/sh
j2 /pyspark2_kernel.json.tpl > /usr/local/share/jupyter/kernels/pyspark2/kernel.json
j2 /platformlibs.ini.tpl > /etc/platformlibs/platformlibs.ini
/usr/bin/jupyterhub
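j2 (from the j2cli package installed in the Dockerfile) renders a Jinja2 template against the current environment when no data file is given, which is how these templates pick up container configuration at start-up. A minimal sketch with a hypothetical throwaway template, reusing the SPARK_HOME variable set in the Dockerfile:

# Hypothetical demonstration of j2 environment-based rendering
echo '{{ SPARK_HOME }}' > /tmp/demo.tpl
j2 /tmp/demo.tpl   # prints /opt/spark inside the container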
@@ -0,0 +1,16 @@
diff --git a/platformlibs/data_handler.py b/platformlibs/data_handler.py
index 27a2ea5..7bc1ae3 100644
--- a/platformlibs/data_handler.py
+++ b/platformlibs/data_handler.py
@@ -63,7 +63,10 @@ class DataHandler(object):
         if self._hdfs_root_uri:
             return self._hdfs_root_uri
         cm_conf = read_config('/etc/platformlibs/platformlibs.ini')
-        self._hdfs_root_uri = get_hdfs_uri(cm_conf['cm_host'], cm_conf['cm_user'], cm_conf['cm_pass'], cm_conf['hadoop_distro'])
+        if 'hdfs_root_uri' in cm_conf:
+            self._hdfs_root_uri = cm_conf['hdfs_root_uri']
+        else:
+            self._hdfs_root_uri = get_hdfs_uri(cm_conf['cm_host'], cm_conf['cm_user'], cm_conf['cm_pass'], cm_conf['hadoop_distro'])
         return self._hdfs_root_uri
 
     @property
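With this patch, platformlibs can be pointed at HDFS directly instead of querying a cluster manager for the URI. A hypothetical /etc/platformlibs/platformlibs.ini making use of the override; the key name comes from the patch above, but the file layout expected by read_config and the namenode address are assumptions:

# Sketch only: illustrative config, not part of the commit
cat > /etc/platformlibs/platformlibs.ini <<'EOF'
hdfs_root_uri = hdfs://namenode:8020
EOF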
docker/dockerfiles/jupyter/docker/notebooks/PNDA minimal SqlMagic notebook.ipynb
57 changes: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Minimal PNDA Jupyter SqlMagic notebook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "Use the following connection string to connect to a MySQL DB. Enter a valid username/password and the hostname/IP of the MySQL server.\n",
    "%load_ext sql\n",
    "%sql mysql+pymysql://username:password@hostname/dbname\n",
    "\n",
    "\n",
    "Use the following connection string to connect to PostgreSQL. Enter a valid username/password and the hostname/IP of the PostgreSQL server.\n",
    "%load_ext sql\n",
    "%sql postgresql://username:password@localhost/dbname\n",
    "\n",
    "Use the following connection string to connect to Impala (CDH distribution only). Enter a valid username/password and the hostname/IP of the Impala server.\n",
    "Note: the Impala connection through impyla requires autocommit to be disabled. Use %config SqlMagic to check the available configuration options.\n",
    "%load_ext sql\n",
    "%config SqlMagic.autocommit=False\n",
    "%sql impala://hostname:port/dbname\n",
    "'''\n",
    "%load_ext sql"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
docker/dockerfiles/jupyter/docker/notebooks/PNDA minimal notebook.ipynb
98 changes: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Minimal PNDA Jupyter notebook\n",
    "\n",
    "`%matplotlib notebook` must be set before `import matplotlib.pyplot as plt` or plotting with matplotlib will fail"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%matplotlib notebook\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import sys\n",
    "import pandas as pd\n",
    "import matplotlib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(u'▶ Python version ' + sys.version)\n",
    "print(u'▶ Pandas version ' + pd.__version__)\n",
    "print(u'▶ Matplotlib version ' + matplotlib.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "values = np.random.rand(100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df = pd.DataFrame(data=values, columns=['RandomValue'])\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df.plot()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "PySpark2/Python2",
   "language": "python",
   "name": "pyspark2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}