Added Dockerfiles to build pnda docker images
cgiraldo committed Oct 8, 2018
1 parent 929274d commit 112d264
Showing 29 changed files with 1,288 additions and 3 deletions.
2 changes: 1 addition & 1 deletion docker/deploy.sh
@@ -4,11 +4,11 @@ echo "---------------- STARTING HDFS and HBASE ----------------"
 docker-compose up -d zookeeper
 docker-compose up -d hdfs-namenode
 docker-compose up -d hdfs-datanode
-docker-compose up -d hbase-master
 while ! docker exec -ti hdfs-namenode nc -vz hdfs-namenode:8020 ; do
   echo "waiting for hdfs-namenode to start"
   sleep 2
 done
+docker-compose up -d hbase-master
 docker-compose up -d hbase-region
 
 echo "---------------- ADDING users to HDFS ----------------"
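Moving the hbase-master start below the loop matters: the nc -vz probe gates HBase until the namenode is actually listening on 8020, so the master no longer races HDFS coming up. A reusable form of the same wait pattern (hypothetical helper, not part of this commit):

# block until a TCP port inside a container accepts connections
wait_for_port() {
  local container=$1 host=$2 port=$3
  while ! docker exec "$container" nc -vz "$host" "$port"; do
    echo "waiting for $host:$port ..."
    sleep 2
  done
}

# same ordering as deploy.sh: start HBase only once the namenode RPC port is up
wait_for_port hdfs-namenode hdfs-namenode 8020
docker-compose up -d hbase-master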
43 changes: 41 additions & 2 deletions docker/docker-compose.yml
@@ -1,9 +1,13 @@
-version: '3'
+version: '3.4'
 services:
   gobblin:
     container_name: gobblin
     hostname: gobblin
     image: pnda/gobblin:0.11.0-0.1.0
+    build:
+      context: ./dockerfiles/platform-gobblin-modules
+      args:
+        version: 0.1.0
     environment:
       - HDFS_URL=hdfs://hdfs-namenode:8020
       - MASTER_DATASET_DIRECTORY=/user/pnda/PNDA_datasets/datasets
@@ -47,6 +51,10 @@ services:
     container_name: jupyter
     hostname: jupyter
     image: pnda/jupyter:4.4.0
+    build:
+      context: ./dockerfiles/jupyter
+      args:
+        version: 4.4.0
     volumes:
       - jupyter-home:/home
     environment:
@@ -76,6 +84,10 @@ services:
     container_name: deployment-manager
     hostname: deployment-manager
     image: pnda/deployment-manager:1.0.0
+    build:
+      context: ./dockerfiles/platform-deployment-manager
+      args:
+        version: 1.0.0
     environment:
       - JUPYTER_HOST=jupyter
       - DATA_LOGGER_URL=http://console-backend:3001 #data-logger uses the data-manager network stack
@@ -101,6 +113,10 @@ services:
     container_name: package-repository
     hostname: package-repository
     image: pnda/package-repository:0.3.2
+    build:
+      context: ./dockerfiles/platform-package-repository
+      args:
+        version: 0.3.2
     environment:
       - FS_LOCATION_PATH=/mnt/packages
       - DATA_LOGGER_URL=http://console-backend:3001 #data-logger uses the data-manager network stack
@@ -199,7 +215,11 @@ services:
   platform-testing:
     container_name: platform-testing
     hostname: platform-testing
-    image: pnda/testing:0.5.0
+    image: pnda/platform-testing:0.5.0
+    build:
+      context: ./dockerfiles/platform-testing
+      args:
+        version: 0.5.0
     environment:
       - CONSOLE_HOSTS=console-backend:3001
       - ZOOKEEPERS=zookeeper:2181
@@ -214,6 +234,10 @@ services:
     container_name: console-frontend
     hostname: console-frontend
     image: pnda/console-frontend:1.0.0
+    build:
+      context: ./dockerfiles/platform-console-frontend
+      args:
+        version: 1.0.0
     environment:
       - DATA_MANAGER_HOST=console-backend
       - DATA_MANAGER_PORT=3123
@@ -226,6 +250,11 @@ services:
     container_name: console-backend
     hostname: console-backend
     image: pnda/console-backend-data-manager:1.0.0
+    build:
+      context: ./dockerfiles/platform-console-backend
+      args:
+        version: 1.0.0
+      target: console-backend-data-manager
     environment:
       - CONSOLE_FRONTEND_HOSTS_CSV=console-frontend
       - DATASET_MANAGER_URL=http://data-service:7000
@@ -234,6 +263,11 @@ services:
     container_name: console-backend-data-logger
     network_mode: service:console-backend
     image: pnda/console-backend-data-logger:1.0.0
+    build:
+      context: ./dockerfiles/platform-console-backend
+      args:
+        version: 1.0.0
+      target: console-backend-data-logger
   redis:
     container_name: redis
     network_mode: service:console-backend
@@ -242,6 +276,11 @@ services:
     container_name: data-service
     hostname: data-service
     image: pnda/data-service:0.2.2
+    build:
+      context: ./dockerfiles/platform-data-mgmnt
+      args:
+        version: 0.2.2
+      target: data-service
     environment:
       - LOCATION=/user/pnda/PNDA_datasets/datasets
       - HADOOP_DISTRO=env
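The jump from version: '3' to '3.4' is what makes the build target entries legal: selecting a stage of a multi-stage Dockerfile via target was only added in Compose file format 3.4. With the build: sections in place, the images can be built locally rather than pulled; typical invocations (standard docker-compose commands, not part of this commit):

# build every service that declares a build: section, tagged with the image: names above
docker-compose build
# rebuild a single image
docker-compose build jupyter
# build as needed and start the whole stack
docker-compose up -d --build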
60 changes: 60 additions & 0 deletions docker/dockerfiles/jupyter/Dockerfile
@@ -0,0 +1,60 @@
FROM alpine:3.7 as platformlibs

LABEL maintainer="[email protected]"
LABEL organization="gradiant.org"

COPY docker/hdfs_root_uri_conf.diff /
RUN apk add --no-cache git bash python py2-pip && pip install setuptools
RUN git clone https://github.com/pndaproject/platform-libraries.git
RUN cd platform-libraries && git checkout tags/release/4.0 && \
export VERSION=$(git describe --tags) && \
git apply /hdfs_root_uri_conf.diff && \
python setup.py bdist_egg

FROM alpine:3.7

COPY --from=platformlibs /platform-libraries/dist/platformlibs-0.1.5-py2.7.egg /
COPY docker /
ENV SPARK_HOME=/opt/spark

RUN apk add --no-cache bash python2 py2-pip postgresql-dev libpng-dev freetype-dev ca-certificates build-base python2-dev krb5-dev libffi-dev cyrus-sasl-dev nodejs shadow python3 python3-dev openjdk8-jre && \
echo 'Installing python2 requirements' && \
pip2 install -r /requirements/requirements-jupyter.txt && \
pip2 install -r /requirements/app-packages-requirements.txt && pip2 install j2cli && \
/usr/bin/python2 -m ipykernel.kernelspec --name python2 --display-name "Python 2" && \
echo 'Installing python3 requirements' && \
pip3 install -r /requirements/requirements-jupyter.txt && \
/usr/bin/python3 -m ipykernel.kernelspec --name python3 --display-name "Python 3" && \
echo 'Adding pyspark2 support' && \
mkdir -p /usr/local/share/jupyter/kernels/pyspark2 && mkdir -p /opt && \
wget -O- https://archive.apache.org/dist/spark/spark-2.3.0/spark-2.3.0-bin-hadoop2.7.tgz | tar -xvz -C /tmp && \
mv /tmp/spark-2.3.0-bin-hadoop2.7 /opt/spark && \
echo 'Adding jupyter-scala_extension_spark' && \
jupyter nbextension enable --py widgetsnbextension --system && \
jupyter-kernelspec install /usr/lib/python3.6/site-packages/sparkmagic/kernels/sparkkernel && \
jupyter serverextension enable --py sparkmagic && \
echo 'Adding jupyter-extensions' && \
apk add --no-cache libxml2-dev libxslt-dev && \
pip3 install -r /requirements/requirements-jupyter-extensions.txt && \
jupyter serverextension enable --py jupyter_spark --system && \
jupyter nbextension install --py jupyter_spark --system && \
jupyter nbextension enable --py jupyter_spark --system && \
jupyter nbextension enable --py widgetsnbextension --system && \
echo 'Adding jupyterhub' && \
pip3 install -r /requirements/requirements-jupyterhub.txt && \
npm install -g configurable-http-proxy && mkdir -p /var/log/pnda && \
echo 'auth required pam_exec.so debug log=/var/log/pnda/login.log /create_notebook_dir.sh' >> /etc/pam.d/login
RUN echo 'Adding pnda platform-libraries' && \
mkdir /etc/platformlibs && /usr/bin/python2 -m easy_install /platformlibs-0.1.5-py2.7.egg && \
adduser -D pnda && echo "pnda:pnda" | chpasswd && \
mkdir -p /opt/pnda && mv /notebooks /opt/pnda/jupyter_notebooks && \
echo 'auth required pam_listfile.so item=user sense=deny file=/etc/login.deny onerr=succeed' >> /etc/pam.d/login && \
echo 'root' >> /etc/login.deny

RUN wget http://central.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_2.11/2.3.0/spark-sql-kafka-0-10_2.11-2.3.0.jar \
-O /opt/spark/jars/spark-sql-kafka-0-10_2.11-2.3.0.jar && \
wget http://central.maven.org/maven2/org/apache/kafka/kafka-clients/1.0.0/kafka-clients-1.0.0.jar \
-O /opt/spark/jars/kafka-clients-1.0.0.jar

ENTRYPOINT /entrypoint.sh

4 changes: 4 additions & 0 deletions docker/dockerfiles/jupyter/build-docker.sh
@@ -0,0 +1,4 @@
#!/bin/bash

VERSION=4.4.0
docker build -t pnda/jupyter:$VERSION .
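Note that docker-compose.yml passes a version build arg for this image, while the script above builds without one. Assuming the Dockerfile declares ARG version, the equivalent manual build would be (hypothetical invocation, not part of this commit):

docker build --build-arg version=4.4.0 -t pnda/jupyter:4.4.0 .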
114 changes: 114 additions & 0 deletions docker/dockerfiles/jupyter/data_generator.py
@@ -0,0 +1,114 @@
#!/usr/bin/python

import argparse
import subprocess
import json
import avro.schema
import avro.io
import io
import datetime
import uuid
import time
import sys

from random import randint
from avro.datafile import DataFileWriter
from avro.io import DatumWriter
from argparse import RawTextHelpFormatter

def generate_sample_datasets (host_ips, metric_ids, year, month, day, hour):
    avro_schema = ''
    #load data from hdfs
    cat = subprocess.Popen(['sudo', '-u', 'hdfs', 'hadoop', 'fs', '-cat', '/user/pnda/PNDA_datasets/datasets/.metadata/schema.avsc'], stdout=subprocess.PIPE)
    for line in cat.stdout:
        avro_schema = avro_schema + line
    schema = avro.schema.parse(avro_schema)
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    #create hdfs folder structure
    dir = create_hdfs_dirs (year, month, day, hour)
    filename = str(uuid.uuid4()) + '.avro'
    filepath = dir + filename
    tmp_file = '/tmp/' + filename

    writer = DataFileWriter(open(tmp_file, "w"), DatumWriter(), schema)

    start_dt = datetime.datetime(year, month, day, hour, 0, 0)
    start_ts = int(time.mktime(start_dt.timetuple()))
    end_dt = start_dt.replace(hour=hour+1)
    end_ts = int(time.mktime(end_dt.timetuple()))

    for ts in xrange(start_ts, end_ts, 1):
        #generate random pnda record on per host ip basis
        for host_ip in host_ips:
            record = {}
            record['timestamp'] = (ts * 1000)
            record['src'] = 'test'
            record['host_ip'] = host_ip
            record['rawdata'] = generate_random_metrics(metric_ids)
            #encode avro
            writer.append(record)
    writer.close()
    subprocess.Popen(['sudo', '-u', 'hdfs', 'hadoop', 'fs', '-copyFromLocal', tmp_file, dir])
    return filepath

def generate_random_metrics (metric_ids):
    '''
    generate random raw_data element
    '''
    raw_data = {}
    for id in metric_ids:
        raw_data[id] = str(randint(0, 100))
    return json.dumps(raw_data).encode('utf-8')

def create_hdfs_dirs (year, month, day, hour):
    dir = "/user/pnda/PNDA_datasets/datasets/source=test/year=%0d/month=%02d/day=%02d/hour=%02d/" % (year, month, day, hour)
    subprocess.Popen(['sudo', '-u', 'hdfs', 'hadoop', 'fs', '-mkdir', '-p', dir])
    return dir

def get_args():
    epilog = """ example:
  - create sample data sets
    data_generator.py --hosts '10.0.0.1, 10.0.0.2' --metrics 'a, b, c' --year 2016 --month 4 --day 27 --hour 14
  - create sample data sets using system datetime
    data_generator.py --hosts '10.0.0.1, 10.0.0.2' --metrics 'a, b, c'
    """

    dt = datetime.datetime.now()
    parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description='Sample datasets generator', epilog=epilog)
    parser.add_argument('--hosts', help='list of sample host ips separated by comma', default='')
    parser.add_argument('--metrics', help='list of metrics ids', default='')
    parser.add_argument('--year', type=int, help='year', default=dt.year)
    parser.add_argument('--month', type=int, help='month', default=dt.month)
    parser.add_argument('--day', type=int, help='day of the month', default=dt.day)
    parser.add_argument('--hour', help='hour of the day', default=dt.hour)
    args = parser.parse_args()
    return args

def main():
    args = get_args()
    hosts = args.hosts.strip()
    if not hosts:
        print 'mandatory arg --hosts missing (aborting).'
        sys.exit()

    host_ips = [x.strip() for x in hosts.split(",")]

    metrics = args.metrics.strip()
    if not metrics:
        print 'mandatory arg --metrics missing (aborting).'
        sys.exit()
    metric_ids = [x.strip() for x in metrics.split(",")]

    year = int(args.year)
    month = int(args.month)
    day = int(args.day)
    hour = int(args.hour)
    filepath = generate_sample_datasets(host_ips, metric_ids, year, month, day, hour)
    print "Success: generated file path at " + filepath

if __name__ == "__main__":
    main()
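Because the script writes via hadoop fs -copyFromLocal as the hdfs user, the quickest sanity check is to list the generated partitions (standard HDFS commands, not part of this commit):

# list generated avro files across all test partitions
sudo -u hdfs hadoop fs -ls -R /user/pnda/PNDA_datasets/datasets/source=test
# inspect the dataset schema the generator reads
sudo -u hdfs hadoop fs -cat /user/pnda/PNDA_datasets/datasets/.metadata/schema.avsc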
19 changes: 19 additions & 0 deletions docker/dockerfiles/jupyter/docker/create_notebook_dir.sh
@@ -0,0 +1,19 @@
#!/bin/sh

set -x

DIR=/home/$PAM_USER
if [ ! -d $DIR ]; then
  mkdir $DIR
  chmod 0755 $DIR
  chown $PAM_USER: $DIR
fi

DIR=$DIR/jupyter_notebooks
if [ ! -d $DIR ]; then
  mkdir $DIR
  cp -r /opt/pnda/jupyter_notebooks $DIR/examples
  chmod -R 0755 $DIR
  chown -R $PAM_USER: $DIR
fi

4 changes: 4 additions & 0 deletions docker/dockerfiles/jupyter/docker/entrypoint.sh
@@ -0,0 +1,4 @@
#!/bin/sh
j2 /pyspark2_kernel.json.tpl > /usr/local/share/jupyter/kernels/pyspark2/kernel.json
j2 /platformlibs.ini.tpl > /etc/platformlibs/platformlibs.ini
/usr/bin/jupyterhub
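j2 (from j2cli, installed in the jupyter Dockerfile above) renders a Jinja2 template against the process environment, which is how compose-level variables such as HDFS_URL reach the config files. A minimal sketch of the mechanism (the real .tpl files are not shown in this commit, so the template line is hypothetical):

printf 'hdfs_root_uri = {{ HDFS_URL }}\n' > /tmp/example.tpl
HDFS_URL=hdfs://hdfs-namenode:8020 j2 /tmp/example.tpl
# prints: hdfs_root_uri = hdfs://hdfs-namenode:8020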
16 changes: 16 additions & 0 deletions docker/dockerfiles/jupyter/docker/hdfs_root_uri_conf.diff
@@ -0,0 +1,16 @@
diff --git a/platformlibs/data_handler.py b/platformlibs/data_handler.py
index 27a2ea5..7bc1ae3 100644
--- a/platformlibs/data_handler.py
+++ b/platformlibs/data_handler.py
@@ -63,7 +63,10 @@ class DataHandler(object):
         if self._hdfs_root_uri:
             return self._hdfs_root_uri
         cm_conf = read_config('/etc/platformlibs/platformlibs.ini')
-        self._hdfs_root_uri = get_hdfs_uri(cm_conf['cm_host'], cm_conf['cm_user'], cm_conf['cm_pass'], cm_conf['hadoop_distro'])
+        if 'hdfs_root_uri' in cm_conf:
+            self._hdfs_root_uri = cm_conf['hdfs_root_uri']
+        else:
+            self._hdfs_root_uri = get_hdfs_uri(cm_conf['cm_host'], cm_conf['cm_user'], cm_conf['cm_pass'], cm_conf['hadoop_distro'])
         return self._hdfs_root_uri
 
     @property
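The patch lets a fixed HDFS URI be set in platformlibs.ini, so the dockerized platform-libraries can skip querying Cloudera Manager via get_hdfs_uri. A sketch of what the rendered ini might then contain (the section name and layout are assumptions; only the keys appear in this commit):

# illustrative only: section name and layout are assumptions
cat > /etc/platformlibs/platformlibs.ini <<'EOF'
[platform]
hdfs_root_uri = hdfs://hdfs-namenode:8020
EOF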
