diff --git a/tools/helm/livy/Dockerfile b/tools/helm/livy/Dockerfile
index 7fe535f479..8aa5157fa1 100644
--- a/tools/helm/livy/Dockerfile
+++ b/tools/helm/livy/Dockerfile
@@ -1,5 +1,55 @@
-FROM java:openjdk-8-jdk
-MAINTAINER Dalitso Banda <dalitsohb@gmail.com>
+FROM openjdk:8-jdk-slim-buster
+LABEL maintainer="Dalitso Banda dalitsohb@gmail.com"
+
+# Get Spark from US Apache mirror.
+ENV APACHE_SPARK_VERSION 2.4.5
+ENV HADOOP_VERSION 3.2.1
+
+RUN echo "$LOG_TAG Getting SPARK_HOME" && \
+    apt-get update && \
+    # build deps and deps for c bindings for cntk
+    apt-get install -y build-essential && \
+    apt-get install -y autoconf automake libtool curl make unzip && \
+    mkdir -p /opt && \
+    cd /opt && \
+    curl http://apache.claz.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-without-hadoop.tgz | \
+        tar -xz && \
+    ln -s spark-${APACHE_SPARK_VERSION}-bin-without-hadoop spark && \
+    echo Spark ${APACHE_SPARK_VERSION} installed in /opt/spark && \
+    export SPARK_HOME=/opt/spark
+
+RUN echo "downloading hadoop" && \
+    apt-get install -y wget && \
+    cd /tmp && \
+    wget http://apache.claz.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz -O - | \
+        tar -xz && \
+    mv /tmp/hadoop-${HADOOP_VERSION} /opt/hadoop && \
+    echo "export HADOOP_CLASSPATH=/opt/hadoop/share/hadoop/tools/lib/*" >> /opt/hadoop/etc/hadoop/hadoop-env.sh && \
+    echo Hadoop ${HADOOP_VERSION} installed in /opt/hadoop && \
+    rm -rf /opt/hadoop/share/doc
+
+RUN echo "\nSPARK_DIST_CLASSPATH=/jars:/jars/*:$(/opt/hadoop/bin/hadoop classpath)" >> /opt/spark/conf/spark-env.sh
+ENV HADOOP_HOME=/opt/hadoop
+ADD jars /jars
+
+# if numpy is installed on a driver it needs to be installed on all
+# workers, so install it everywhere
+RUN apt-get update && \
+    apt install -y python3-pip && \
+    pip3 install numpy && \
+    pip3 install matplotlib && \
+    pip3 install pandas==0.24.1 && \
+    pip3 install scikit-learn && \
+    pip3 install pyarrow==0.11.1 && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Final config
+ADD spark-config/log4j.properties /opt/spark/conf/log4j.properties
+ADD spark-config/start-common.sh spark-config/start-worker spark-config/start-master /
+ADD spark-config/core-site.xml /opt/spark/conf/core-site.xml
+ADD spark-config/spark-defaults.conf /opt/spark/conf/spark-defaults.conf
+ENV PATH $PATH:/opt/spark/bin
 
 ENV LIVY_VERSION="git_master"
 ENV LIVY_COMMIT="02550f7919b7348b6a7270cf806e031670037b2f"
@@ -9,91 +59,43 @@ ENV LOG_TAG="[LIVY_${LIVY_VERSION}]:" \
     LC_ALL=en_US.UTF-8
 
 RUN echo "$LOG_TAG Install essentials" && \
-    apt-get -y update && \
-    apt-get install -y locales && \
-    locale-gen $LANG && \
-    apt-get install -y git wget grep curl sed && \
-    apt-get install -y python-setuptools && \
-    apt-get autoclean && apt-get autoremove && \
-    echo "$LOG_TAG Install python dependencies" && \
-    curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
-    python get-pip.py && \
-    rm get-pip.py && \
-    apt-get install -y python-dev libpython3-dev build-essential pkg-config gfortran && \
-    pip install -U pip setuptools wheel && \
+    apt-get update && \
+    apt-get install -y git wget curl && \
+    echo "$LOG_TAG setting python dependencies" && \
+    ln -s /usr/bin/python3 /usr/bin/python && \
     echo "$LOG_TAG Getting maven" && \
     wget http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz && \
     tar -zxf apache-maven-3.3.9-bin.tar.gz -C /usr/local/ && \
+    rm -rf apache-maven-3.3.9-bin.tar.gz && \
     ln -s /usr/local/apache-maven-3.3.9/bin/mvn /usr/local/bin/mvn && \
     echo "$LOG_TAG Download and build Livy source" && \
     git clone https://github.com/apache/incubator-livy.git ${LIVY_HOME}_src && \
     cd ${LIVY_HOME}_src && \
     git checkout ${LIVY_COMMIT} && \
     mvn package -DskipTests && \
-    ls ${LIVY_HOME}_src && \
+    mv ${LIVY_HOME}_src ${LIVY_HOME} && \
     echo "$LOG_TAG Cleanup" && \
-    apt-get purge -y --auto-remove build-essential pkg-config gfortran libpython3-dev && \
-    apt-get autoremove && \
-    apt-get autoclean && \
-    apt-get clean && \
+    rm -rf /usr/local/apache-maven-3.3.9 && \
     rm -rf /root/.ivy2 && \
     rm -rf /root/.npm && \
     rm -rf /root/.m2 && \
     rm -rf /root/.cache && \
     rm -rf /tmp/*
 
-# Get Spark from US Apache mirror.
-ENV APACHE_SPARK_VERSION 2.4.0
-ENV HADOOP_VERSION 3.2.0
-ENV HADOOP_GIT_COMMIT="release-3.2.0-RC1"
-
-RUN echo "$LOG_TAG Getting SPARK_HOME" && \
-    mkdir -p /opt && \
-    cd /opt && \
-    curl http://apache.claz.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-without-hadoop.tgz | \
-        tar -xz && \
-    ln -s spark-${APACHE_SPARK_VERSION}-bin-without-hadoop spark && \
-    echo Spark ${APACHE_SPARK_VERSION} installed in /opt/spark && \
-    export SPARK_HOME=/opt/spark
-
-RUN echo "$LOG_TAG building hadoop" && \
-    apt-get update && \
-    apt-get install -y make autoconf automake libtool g++ unzip && \
-    cd / && \
-    git clone https://github.com/apache/hadoop.git hadoop_src&& \
-    mkdir /hadoop_deps && cd /hadoop_deps && \
-    wget https://github.com/protocolbuffers/protobuf/releases/download/v2.5.0/protobuf-2.5.0.tar.bz2 && \
-    tar xvf protobuf-2.5.0.tar.bz2 && \
-    cd protobuf-2.5.0 && \
-    ./configure && make && make install && ldconfig && \
-    cd /hadoop_src && git checkout ${HADOOP_GIT_COMMIT} && mvn package -Pdist -DskipTests -Dtar && \
-    mv hadoop-dist/target/hadoop-${HADOOP_VERSION} /opt/hadoop && \
-    rm -r /hadoop_src && \
-    rm -rf /root/.ivy2 && \
-    rm -rf /root/.m2 && \
-    export HADOOP_HOME=/opt/hadoop && \
-    echo "\nexport HADOOP_CLASSPATH=/opt/hadoop/share/hadoop/tools/lib/*" >> /opt/hadoop/etc/hadoop/hadoop-env.sh && \
-    echo Hadoop ${HADOOP_VERSION} installed in /opt/hadoop && \
-    apt-get purge -y --auto-remove g++ make build-essential autoconf automake && \
-    cd / && rm -rf /hadoop_deps
-
-RUN echo "\nSPARK_DIST_CLASSPATH=/jars:/jars/*:$(/opt/hadoop/bin/hadoop classpath)" >> /opt/spark/conf/spark-env.sh
-ENV HADOOP_HOME=/opt/hadoop
 ADD jars /jars
 ENV HADOOP_CONF_DIR /opt/hadoop/conf
 ENV CONF_DIR /livy/conf
 ENV SPARK_CONF_DIR /opt/spark/conf
-RUN mv ${LIVY_HOME}_src ${LIVY_HOME}
 ADD livy.conf ${LIVY_HOME}/conf
 
 EXPOSE 8998
 
 WORKDIR ${LIVY_HOME}
-RUN mkdir logs && export SPARK_HOME=/opt/spark && export HADOOP_HOME=/opt/hadoop && export SPARK_CONF_DIR=/opt/spark/conf
+RUN mkdir logs
 
 #hive needed for livy pyspark
-RUN wget http://central.maven.org/maven2/org/apache/spark/spark-hive_2.11/2.4.0/spark-hive_2.11-2.4.0.jar -P /opt/spark/jars
+RUN wget https://repo1.maven.org/maven2/org/apache/spark/spark-hive_2.11/2.4.5/spark-hive_2.11-2.4.5.jar -P /opt/spark/jars
 
-CMD ["sh", "-c", "echo '\nspark.driver.host' $(hostname -i) >> /opt/spark/conf/spark-defaults.conf && echo '\nlivy.spark.master' $SPARK_MASTER >> /livy/conf/livy.conf && bin/livy-server"]
+CMD ["sh", "-c", "echo '\nspark.driver.host' $(hostname -i) >> /opt/spark/conf/spark-defaults.conf && echo '\nlivy.spark.master' $SPARK_MASTER >> /livy/conf/livy.conf && bin/livy-server"]
\ No newline at end of file
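The rewritten Livy image no longer builds Spark and Hadoop from source; it pulls release tarballs and resolves its master at container start, since the CMD appends `livy.spark.master` from `$SPARK_MASTER` into livy.conf. A minimal run sketch — the image tag and master URL below are hypothetical, not part of this diff:

```bash
# Build from the livy context (tag is hypothetical).
docker build -t mmlspark/livy:dev tools/helm/livy

# The CMD writes spark.driver.host and livy.spark.master before starting Livy.
docker run -d -p 8998:8998 \
    -e SPARK_MASTER=spark://spark-master:7077 \
    mmlspark/livy:dev

# Livy's REST endpoint answers on 8998 once the server is up.
curl http://localhost:8998/sessions
```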
diff --git a/tools/helm/livy/spark-config/core-site.xml b/tools/helm/livy/spark-config/core-site.xml
new file mode 100644
index 0000000000..2fecabedc8
--- /dev/null
+++ b/tools/helm/livy/spark-config/core-site.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<configuration>
+  <property>
+    <name>fs.gs.impl</name>
+    <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem</value>
+    <description>The FileSystem for gs: (GCS) uris.</description>
+  </property>
+  <property>
+    <name>fs.AbstractFileSystem.gs.impl</name>
+    <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS</value>
+    <description>The AbstractFileSystem for gs: (GCS) uris. Only necessary for use with Hadoop 2.</description>
+  </property>
+  <property>
+    <name>fs.gs.project.id</name>
+    <value>NOT_RUNNING_INSIDE_GCE</value>
+  </property>
+</configuration>
diff --git a/tools/helm/livy/spark-config/log4j.properties b/tools/helm/livy/spark-config/log4j.properties
new file mode 100644
index 0000000000..3a2a882198
--- /dev/null
+++ b/tools/helm/livy/spark-config/log4j.properties
@@ -0,0 +1,12 @@
+# Set everything to be logged to the console
+log4j.rootCategory=INFO, console
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
+
+# Settings to quiet third party logs that are too verbose
+log4j.logger.org.spark-project.jetty=WARN
+log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
+log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
+log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
diff --git a/tools/helm/livy/spark-config/spark-defaults.conf b/tools/helm/livy/spark-config/spark-defaults.conf
new file mode 100644
index 0000000000..5b3e62b9f4
--- /dev/null
+++ b/tools/helm/livy/spark-config/spark-defaults.conf
@@ -0,0 +1 @@
+spark.app.id KubernetesSpark
diff --git a/tools/helm/livy/spark-config/start-common.sh b/tools/helm/livy/spark-config/start-common.sh
new file mode 100644
index 0000000000..ac8d505838
--- /dev/null
+++ b/tools/helm/livy/spark-config/start-common.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Copyright 2015 The Kubernetes Authors All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+PROJECT_ID=$(curl -s -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/project/project-id)
+
+if [[ -n "${PROJECT_ID}" ]]; then
+  sed -i "s/NOT_RUNNING_INSIDE_GCE/${PROJECT_ID}/" /opt/spark/conf/core-site.xml
+fi
+
+# We don't want any of the incoming service variables, we'd rather use
+# DNS. But this one interferes directly with Spark.
+unset SPARK_MASTER_PORT
+
+# spark.{executor,driver}.extraLibraryPath don't actually seem to
+# work, this seems to be the only reliable way to get the native libs
+# picked up.
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/hadoop/lib/native
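core-site.xml registers the GCS connector classes for `gs:` URIs and leaves a placeholder project id, which start-common.sh rewrites from the GCE metadata server at pod start. A sketch of checking the wiring inside a running container — the bucket name is a placeholder, and this assumes the GCS connector jar was among those shipped into /jars:

```bash
# Off GCE the metadata probe returns nothing and the placeholder survives;
# on GCE start-common.sh substitutes the real project id.
grep -A1 'fs.gs.project.id' /opt/spark/conf/core-site.xml

# Prove the gs: scheme resolves; requires the GCS connector jar from /jars
# on the Hadoop classpath (bucket name is a placeholder).
export HADOOP_CLASSPATH="/jars/*:${HADOOP_CLASSPATH}"
/opt/hadoop/bin/hadoop fs -ls gs://some-bucket/
```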
diff --git a/tools/helm/livy/spark-config/start-master b/tools/helm/livy/spark-config/start-master
new file mode 100644
index 0000000000..f5e83a3074
--- /dev/null
+++ b/tools/helm/livy/spark-config/start-master
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Copyright 2015 The Kubernetes Authors All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+. start-common.sh
+
+echo "$(hostname -i) spark-master" >> /etc/hosts
+
+# Run spark-class directly so that when it exits (or crashes), the pod restarts.
+/opt/spark/bin/spark-class org.apache.spark.deploy.master.Master --ip spark-master --port 7077 --webui-port 8080
diff --git a/tools/helm/livy/spark-config/start-worker b/tools/helm/livy/spark-config/start-worker
new file mode 100644
index 0000000000..5b9ccaebce
--- /dev/null
+++ b/tools/helm/livy/spark-config/start-worker
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# Copyright 2015 The Kubernetes Authors All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+. start-common.sh
+
+if ! getent hosts spark-master; then
+  echo "=== Cannot resolve the DNS entry for spark-master. Has the service been created yet, and is SkyDNS functional?"
+  echo "=== See http://kubernetes.io/v1.1/docs/admin/dns.html for more details on DNS integration."
+  echo "=== Sleeping 10s before pod exit."
+  sleep 10
+  exit 0
+fi
+
+# Run spark-class directly so that when it exits (or crashes), the pod restarts.
+/opt/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077 --webui-port 8081
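start-master publishes the pod IP under the `spark-master` hostname, and start-worker refuses to start until that name resolves, exiting cleanly so the kubelet restarts the pod. Because the scripts are ADDed to `/` in the image, they can be smoke-tested with plain Docker; the tag and network name below are hypothetical, and this assumes the scripts kept their executable bit:

```bash
# Give the master container the DNS name the scripts expect.
docker network create spark-net
docker run -d --name spark-master --network spark-net --network-alias spark-master \
    -p 8080:8080 mmlspark/livy:dev /start-master
docker run -d --name spark-worker --network spark-net mmlspark/livy:dev /start-worker

# The master web UI should report one ALIVE worker once registration completes.
curl -s http://localhost:8080 | grep -i alive
```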
diff --git a/tools/helm/spark/Dockerfile b/tools/helm/spark/Dockerfile
index a04d5e62e6..7c89e2d448 100644
--- a/tools/helm/spark/Dockerfile
+++ b/tools/helm/spark/Dockerfile
@@ -1,11 +1,15 @@
-FROM java:openjdk-8-jdk
+FROM openjdk:8-jdk-slim-buster
+LABEL maintainer="Dalitso Banda dalitsohb@gmail.com"
 
 # Get Spark from US Apache mirror.
-ENV APACHE_SPARK_VERSION 2.4.0
-ENV HADOOP_VERSION 3.2.0
-ENV HADOOP_GIT_COMMIT="release-3.2.0-RC1"
+ENV APACHE_SPARK_VERSION 2.4.5
+ENV HADOOP_VERSION 3.2.1
 
 RUN echo "$LOG_TAG Getting SPARK_HOME" && \
+    apt-get update && \
+    # build deps and deps for c bindings for cntk
+    apt-get install -y build-essential && \
+    apt-get install -y autoconf automake libtool curl make unzip && \
     mkdir -p /opt && \
     cd /opt && \
     curl http://apache.claz.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-without-hadoop.tgz | \
@@ -14,55 +18,35 @@ RUN echo "$LOG_TAG Getting SPARK_HOME" && \
     echo Spark ${APACHE_SPARK_VERSION} installed in /opt/spark && \
     export SPARK_HOME=/opt/spark
 
-RUN echo "$LOG_TAG Getting maven" && \
-    wget http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz && \
-    tar -zxf apache-maven-3.3.9-bin.tar.gz -C /usr/local/ && \
-    ln -s /usr/local/apache-maven-3.3.9/bin/mvn /usr/local/bin/mvn
-
-RUN echo "$LOG_TAG building hadoop" && \
-    echo "deb http://deb.debian.org/debian stretch main" >> /etc/apt/sources.list && \
-    apt-get update && \
-    # build deps and deps for c bindings for cntk
-    apt-get install -y g++ gcc-6 libstdc++-6-dev make build-essential && \
-    apt-get install -y autoconf automake libtool curl make unzip && \
-    cd / && \
-    git clone https://github.com/apache/hadoop.git hadoop_src&& \
-    mkdir /hadoop_deps && cd /hadoop_deps && \
-    wget https://github.com/protocolbuffers/protobuf/releases/download/v2.5.0/protobuf-2.5.0.tar.bz2 && \
-    tar xvf protobuf-2.5.0.tar.bz2 && \
-    cd protobuf-2.5.0 && \
-    ./configure && make && make install && ldconfig && \
-    cd /hadoop_src && git checkout ${HADOOP_GIT_COMMIT} && mvn package -Pdist -DskipTests -Dtar && \
-    mv hadoop-dist/target/hadoop-${HADOOP_VERSION} /opt/hadoop && \
-    rm -r /hadoop_src && \
-    export HADOOP_HOME=/opt/hadoop && \
-    echo "\nexport HADOOP_CLASSPATH=/opt/hadoop/share/hadoop/tools/lib/*" >> /opt/hadoop/etc/hadoop/hadoop-env.sh && \
+RUN echo "downloading hadoop" && \
+    apt-get install -y wget && \
+    cd /tmp && \
+    wget http://apache.claz.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz -O - | \
+        tar -xz && \
+    mv /tmp/hadoop-${HADOOP_VERSION} /opt/hadoop && \
+    echo "export HADOOP_CLASSPATH=/opt/hadoop/share/hadoop/tools/lib/*" >> /opt/hadoop/etc/hadoop/hadoop-env.sh && \
     echo Hadoop ${HADOOP_VERSION} installed in /opt/hadoop && \
-    apt-get purge -y --auto-remove g++ make build-essential autoconf automake && \
-    cd / && rm -rf /hadoop_deps
+    rm -rf /opt/hadoop/share/doc
 
 RUN echo "\nSPARK_DIST_CLASSPATH=/jars:/jars/*:$(/opt/hadoop/bin/hadoop classpath)" >> /opt/spark/conf/spark-env.sh
 ENV HADOOP_HOME=/opt/hadoop
 ADD jars /jars
-
 # if numpy is installed on a driver it needs to be installed on all
 # workers, so install it everywhere
 RUN apt-get update && \
-    apt-get install -y g++ python-dev build-essential python3-dev && \
-    curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
-    python get-pip.py && \
-    rm get-pip.py && \
-    pip install -U pip setuptools wheel && \
-    pip install numpy && \
-    pip install matplotlib && \
-    pip install pandas && \
-    apt-get purge -y --auto-remove python-dev build-essential python3-dev && \
+    apt install -y python3-pip && \
+    pip3 install numpy && \
+    pip3 install matplotlib && \
+    pip3 install pandas==0.24.1 && \
+    pip3 install scikit-learn && \
+    pip3 install pyarrow==0.11.1 && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
+# Final config
 ADD log4j.properties /opt/spark/conf/log4j.properties
 ADD start-common.sh start-worker start-master /
 ADD core-site.xml /opt/spark/conf/core-site.xml
 ADD spark-defaults.conf /opt/spark/conf/spark-defaults.conf
-ENV PATH $PATH:/opt/spark/bin
+ENV PATH $PATH:/opt/spark/bin
\ No newline at end of file
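The image consumes a "without-hadoop" Spark build and grafts Hadoop 3.2.1 on via SPARK_DIST_CLASSPATH in spark-env.sh, with the Python stack pinned to versions whose Arrow format works with Spark 2.4's pandas UDFs (pyarrow 0.11.x). Two sanity-check sketches, assuming a hypothetical tag mmlspark/spark:dev:

```bash
# spark-env.sh should carry /jars plus the `hadoop classpath` expanded at build time.
docker run --rm mmlspark/spark:dev cat /opt/spark/conf/spark-env.sh

# The pinned Python stack imports cleanly on every node.
docker run --rm mmlspark/spark:dev \
    python3 -c "import numpy, pandas, pyarrow; print(pandas.__version__, pyarrow.__version__)"
```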
 ADD start-common.sh start-worker start-master /
 ADD core-site.xml /opt/spark/conf/core-site.xml
 ADD spark-defaults.conf /opt/spark/conf/spark-defaults.conf
-ENV PATH $PATH:/opt/spark/bin
+ENV PATH $PATH:/opt/spark/bin
\ No newline at end of file
diff --git a/tools/helm/zepplin/Dockerfile b/tools/helm/zeppelin/Dockerfile
similarity index 51%
rename from tools/helm/zepplin/Dockerfile
rename to tools/helm/zeppelin/Dockerfile
index 870765e38a..686101540e 100644
--- a/tools/helm/zepplin/Dockerfile
+++ b/tools/helm/zeppelin/Dockerfile
@@ -1,7 +1,58 @@
-FROM java:openjdk-8-jdk
-MAINTAINER Dalitso Banda <dalitsohb@gmail.com>
+FROM openjdk:8-jdk-slim-buster
+LABEL maintainer="Dalitso Banda dalitsohb@gmail.com"
+
+# Get Spark from US Apache mirror.
+ENV APACHE_SPARK_VERSION 2.4.5
+ENV HADOOP_VERSION 3.2.1
+
+RUN echo "$LOG_TAG Getting SPARK_HOME" && \
+    apt-get update && \
+    # build deps and deps for c bindings for cntk
+    apt-get install -y build-essential && \
+    apt-get install -y autoconf automake libtool curl make unzip && \
+    mkdir -p /opt && \
+    cd /opt && \
+    curl http://apache.claz.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-without-hadoop.tgz | \
+        tar -xz && \
+    ln -s spark-${APACHE_SPARK_VERSION}-bin-without-hadoop spark && \
+    echo Spark ${APACHE_SPARK_VERSION} installed in /opt/spark && \
+    export SPARK_HOME=/opt/spark
+
+RUN echo "downloading hadoop" && \
+    apt-get install -y wget && \
+    cd /tmp && \
+    wget http://apache.claz.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz -O - | \
+        tar -xz && \
+    mv /tmp/hadoop-${HADOOP_VERSION} /opt/hadoop && \
+    echo "export HADOOP_CLASSPATH=/opt/hadoop/share/hadoop/tools/lib/*" >> /opt/hadoop/etc/hadoop/hadoop-env.sh && \
+    echo Hadoop ${HADOOP_VERSION} installed in /opt/hadoop && \
+    rm -rf /opt/hadoop/share/doc
+
+RUN echo "\nSPARK_DIST_CLASSPATH=/jars:/jars/*:$(/opt/hadoop/bin/hadoop classpath)" >> /opt/spark/conf/spark-env.sh
+ENV HADOOP_HOME=/opt/hadoop
+ADD jars /jars
+
+# if numpy is installed on a driver it needs to be installed on all
+# workers, so install it everywhere
+RUN apt-get update && \
+    apt install -y python3-pip && \
+    pip3 install numpy && \
+    pip3 install matplotlib && \
+    pip3 install pandas==0.24.1 && \
+    pip3 install scikit-learn && \
+    pip3 install pyarrow==0.11.1 && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Final config
+ADD spark-config/log4j.properties /opt/spark/conf/log4j.properties
+ADD spark-config/start-common.sh spark-config/start-worker spark-config/start-master /
+ADD spark-config/core-site.xml /opt/spark/conf/core-site.xml
+ADD spark-config/spark-defaults.conf /opt/spark/conf/spark-defaults.conf
+ENV PATH $PATH:/opt/spark/bin
+
+ADD patch_beam.patch /tmp/patch_beam.patch
 
-# `Z_VERSION` will be updated by `dev/change_zeppelin_version.sh`
 ENV Z_VERSION="git_master"
 ENV Z_COMMIT="2ea945f548a4e41312026d5ee1070714c155a11e"
 ENV LOG_TAG="[ZEPPELIN_${Z_VERSION}]:" \
@@ -21,10 +72,8 @@ RUN echo "$LOG_TAG Getting maven" && \
     tar -zxf apache-maven-3.3.9-bin.tar.gz -C /usr/local/ && \
     ln -s /usr/local/apache-maven-3.3.9/bin/mvn /usr/local/bin/mvn
 
-ADD patch_beam.patch /tmp/patch_beam.patch
-
 RUN echo "$LOG_TAG install nodejs" && \
-    curl -sL https://deb.nodesource.com/setup_11.x | bash - && apt-get install -y nodejs && \
+    curl -sL https://deb.nodesource.com/setup_12.x | bash - && apt-get install -y nodejs && \
     echo "$LOG_TAG Download Zeppelin source" && \
     git clone https://github.com/apache/zeppelin.git /zeppelin-${Z_VERSION}-bin-all && \
     mv /zeppelin-${Z_VERSION}-bin-all ${Z_HOME}_src && \
@@ -48,18 +97,18 @@ RUN echo "$LOG_TAG install nodejs" && \
     mkdir -p /usr/local/lib/node_modules && \
     npm install -g @angular/cli && \
     npm install -g grunt-cli bower && \
-    bower install && \
-    cd ${Z_HOME}_src && \
-    export MAVEN_OPTS="-Xmx2g -Xss128M -XX:MetaspaceSize=512M -XX:MaxMetaspaceSize=1024M -XX:+CMSClassUnloadingEnabled" && \
+    bower install
+
+RUN cd ${Z_HOME}_src && \
+    export MAVEN_OPTS="-Xmx2048m -XX:MaxPermSize=256m" && \
     mvn -e -B package -DskipTests -Pscala-2.11 -Pbuild-distr && \
     tar xvf ${Z_HOME}_src/zeppelin-distribution/target/zeppelin-0.9.0-SNAPSHOT.tar.gz && \
     rm -rf ${Z_HOME}/* && \
     mv zeppelin-0.9.0-SNAPSHOT ${Z_HOME}_dist && \
     mv ${Z_HOME}_dist/* ${Z_HOME} && \
     echo "$LOG_TAG Cleanup" && \
-    apt-get remove --purge -y r-base-dev r-cran-evaluate libfontconfig && \
+    rm -rf /usr/local/apache-maven-3.3.9 && \
     npm uninstall -g @angular/cli grunt-cli bower && \
-    apt-get autoclean && apt-get autoremove -y && \
     rm -rf ${Z_HOME}_dist && \
     rm -rf ${Z_HOME}_src && \
     rm -rf /root/.ivy2 && \
@@ -68,83 +117,19 @@ RUN echo "$LOG_TAG install nodejs" && \
     rm -rf /root/.cache && \
     rm -rf /tmp/*
 
-RUN echo "$LOG_TAG install tini related packages" && \
-    apt-get install -y wget curl grep sed dpkg && \
-    TINI_VERSION=`curl https://github.com/krallin/tini/releases/latest | grep -o "/v.*\"" | sed 's:^..\(.*\).$:\1:'` && \
-    curl -L "https://github.com/krallin/tini/releases/download/v${TINI_VERSION}/tini_${TINI_VERSION}.deb" > tini.deb && \
-    dpkg -i tini.deb && \
-    rm tini.deb
-
-RUN echo "$LOG_TAG installing python related packages" && \
-    curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
-    python get-pip.py && \
-    rm get-pip.py && \
-    apt-get install -y python-dev libpython3-dev build-essential pkg-config gfortran && \
-    pip install -U pip setuptools wheel && \
-    pip install numpy && \
-    pip install matplotlib && \
-    pip install pandas && \
-    apt-get update && \
-    apt-get upgrade -y && \
-    echo "deb http://deb.debian.org/debian stretch main" >> /etc/apt/sources.list && \
-    apt-get update && \
-    apt-get install -y g++ gcc-6 libstdc++-6-dev && \
-    echo "$LOG_TAG Cleanup" && \
-    apt-get purge -y --auto-remove build-essential pkg-config gfortran libpython3-dev && \
-    apt-get autoremove -y && \
-    apt-get autoclean && \
-    apt-get clean && \
-    rm -rf /root/.npm && \
-    rm -rf /root/.m2 && \
-    rm -rf /root/.cache && \
-    rm -rf /tmp/*
-
-ENV APACHE_SPARK_VERSION 2.4.0
-ENV HADOOP_VERSION 3.2.0
-ENV HADOOP_GIT_COMMIT="release-3.2.0-RC1"
-
-RUN echo "$LOG_TAG Getting SPARK_HOME" && \
-    mkdir -p /opt && \
-    cd /opt && \
-    curl http://apache.claz.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-without-hadoop.tgz | \
-        tar -xz && \
-    ln -s spark-${APACHE_SPARK_VERSION}-bin-without-hadoop spark && \
-    echo Spark ${APACHE_SPARK_VERSION} installed in /opt/spark && \
-    export SPARK_HOME=/opt/spark
-
-RUN echo "$LOG_TAG building hadoop" && \
-    apt-get update && \
-    apt-get install -y make && \
-    cd / && \
-    git clone https://github.com/apache/hadoop.git hadoop_src&& \
-    mkdir /hadoop_deps && cd /hadoop_deps && \
-    wget https://github.com/protocolbuffers/protobuf/releases/download/v2.5.0/protobuf-2.5.0.tar.bz2 && \
-    tar xvf protobuf-2.5.0.tar.bz2 && \
-    cd protobuf-2.5.0 && \
-    ./configure && make && make install && ldconfig && \
-    cd /hadoop_src && git checkout ${HADOOP_GIT_COMMIT} && mvn package -Pdist -DskipTests -Dtar && \
-    mv hadoop-dist/target/hadoop-${HADOOP_VERSION} /opt/hadoop && \
-    rm -r /hadoop_src && \
-    rm -rf /root/.ivy2 && \
-    rm -rf /root/.m2 && \
-    export HADOOP_HOME=/opt/hadoop && \
-    echo "\nexport HADOOP_CLASSPATH=/opt/hadoop/share/hadoop/tools/lib/*" >> /opt/hadoop/etc/hadoop/hadoop-env.sh && \
-    echo Hadoop ${HADOOP_VERSION} installed in /opt/hadoop && \
-    apt-get purge -y --auto-remove g++ make build-essential autoconf automake && \
-    cd / && rm -rf /hadoop_deps
-
-RUN echo "\nSPARK_DIST_CLASSPATH=/jars:/jars/*:$(/opt/hadoop/bin/hadoop classpath)" >> /opt/spark/conf/spark-env.sh
-ENV HADOOP_HOME=/opt/hadoop
 ADD jars /jars
 
 # add notebooks
-ADD mmlsparkExamples ${Z_HOME}/notebook/mmlspark/
+ADD mmlsparkExamples/ ${Z_HOME}/notebook/mmlspark/
 
 ADD spark-defaults.conf /opt/spark/conf/spark-defaults.conf
 ADD zeppelin-env.sh ${Z_HOME}/conf/
 
+# use python3 as default since that's what's in the base image
+RUN echo "export PYSPARK_DRIVER_PYTHON=python3" >> ${Z_HOME}/conf/zeppelin-env.sh && \
+    echo "export PYSPARK_PYTHON=python3" >> ${Z_HOME}/conf/zeppelin-env.sh
+
 EXPOSE 8080
 
-ENTRYPOINT [ "/usr/bin/tini", "--" ]
 WORKDIR ${Z_HOME}
-CMD ["sh", "-c", "echo '\nspark.driver.host' $(hostname -i) >> /opt/spark/conf/spark-defaults.conf && bin/zeppelin.sh"]
+CMD ["sh", "-c", "echo '\nspark.driver.host' $(hostname -i) >> /opt/spark/conf/spark-defaults.conf && bin/zeppelin.sh"]
\ No newline at end of file
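The two exports appended to zeppelin-env.sh point both the Zeppelin driver and the Spark executors at python3, matching the interpreter actually installed in the slim base image. A quick check, with a hypothetical image tag:

```bash
# The appended exports should be the last two lines of zeppelin-env.sh;
# $Z_HOME is set by the Dockerfile's ENV block, so it resolves in-container.
docker run --rm mmlspark/zeppelin:dev sh -c 'tail -n 2 "$Z_HOME/conf/zeppelin-env.sh"'
```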
diff --git a/tools/helm/zepplin/jars/README.md b/tools/helm/zeppelin/jars/README.md
similarity index 100%
rename from tools/helm/zepplin/jars/README.md
rename to tools/helm/zeppelin/jars/README.md
diff --git a/tools/helm/zepplin/mini.Dockerfile b/tools/helm/zeppelin/mini.Dockerfile
similarity index 100%
rename from tools/helm/zepplin/mini.Dockerfile
rename to tools/helm/zeppelin/mini.Dockerfile
diff --git a/tools/helm/zepplin/mmlsparkExamples/classification_mmlspark_2E3REACQR.zpln b/tools/helm/zeppelin/mmlsparkExamples/classification_mmlspark_2E3REACQR.zpln
similarity index 100%
rename from tools/helm/zepplin/mmlsparkExamples/classification_mmlspark_2E3REACQR.zpln
rename to tools/helm/zeppelin/mmlsparkExamples/classification_mmlspark_2E3REACQR.zpln
diff --git a/tools/helm/zepplin/mmlsparkExamples/serving.py b/tools/helm/zeppelin/mmlsparkExamples/serving.py
similarity index 100%
rename from tools/helm/zepplin/mmlsparkExamples/serving.py
rename to tools/helm/zeppelin/mmlsparkExamples/serving.py
diff --git a/tools/helm/zepplin/mmlsparkExamples/simplification_mmlspark.zpln b/tools/helm/zeppelin/mmlsparkExamples/simplification_mmlspark.zpln
similarity index 100%
rename from tools/helm/zepplin/mmlsparkExamples/simplification_mmlspark.zpln
rename to tools/helm/zeppelin/mmlsparkExamples/simplification_mmlspark.zpln
diff --git a/tools/helm/zepplin/mmlsparkExamples/sparkPi_2E12S8C29.zpln b/tools/helm/zeppelin/mmlsparkExamples/sparkPi_2E12S8C29.zpln
similarity index 100%
rename from tools/helm/zepplin/mmlsparkExamples/sparkPi_2E12S8C29.zpln
rename to tools/helm/zeppelin/mmlsparkExamples/sparkPi_2E12S8C29.zpln
diff --git a/tools/helm/zepplin/mmlsparkExamples/sparkserving_2DZFNGU8A.zpln b/tools/helm/zeppelin/mmlsparkExamples/sparkserving_2DZFNGU8A.zpln
similarity index 100%
rename from tools/helm/zepplin/mmlsparkExamples/sparkserving_2DZFNGU8A.zpln
rename to tools/helm/zeppelin/mmlsparkExamples/sparkserving_2DZFNGU8A.zpln
diff --git a/tools/helm/zepplin/mmlsparkExamples/submitjob_2DZ7DHX6E.zpln b/tools/helm/zeppelin/mmlsparkExamples/submitjob_2DZ7DHX6E.zpln
similarity index 100%
rename from tools/helm/zepplin/mmlsparkExamples/submitjob_2DZ7DHX6E.zpln
rename to tools/helm/zeppelin/mmlsparkExamples/submitjob_2DZ7DHX6E.zpln
diff --git a/tools/helm/zepplin/patch_beam.patch b/tools/helm/zeppelin/patch_beam.patch
similarity index 100%
rename from tools/helm/zepplin/patch_beam.patch
rename to tools/helm/zeppelin/patch_beam.patch
diff --git a/tools/helm/zeppelin/spark-config/core-site.xml b/tools/helm/zeppelin/spark-config/core-site.xml
new file mode 100644
index 0000000000..2fecabedc8
--- /dev/null
+++ b/tools/helm/zeppelin/spark-config/core-site.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<configuration>
+  <property>
+    <name>fs.gs.impl</name>
+    <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem</value>
+    <description>The FileSystem for gs: (GCS) uris.</description>
+  </property>
+  <property>
+    <name>fs.AbstractFileSystem.gs.impl</name>
+    <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS</value>
+    <description>The AbstractFileSystem for gs: (GCS) uris. Only necessary for use with Hadoop 2.</description>
+  </property>
+  <property>
+    <name>fs.gs.project.id</name>
+    <value>NOT_RUNNING_INSIDE_GCE</value>
+  </property>
+</configuration>
diff --git a/tools/helm/zeppelin/spark-config/log4j.properties b/tools/helm/zeppelin/spark-config/log4j.properties
new file mode 100644
index 0000000000..3a2a882198
--- /dev/null
+++ b/tools/helm/zeppelin/spark-config/log4j.properties
@@ -0,0 +1,12 @@
+# Set everything to be logged to the console
+log4j.rootCategory=INFO, console
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
+
+# Settings to quiet third party logs that are too verbose
+log4j.logger.org.spark-project.jetty=WARN
+log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
+log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
+log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
diff --git a/tools/helm/zeppelin/spark-config/spark-defaults.conf b/tools/helm/zeppelin/spark-config/spark-defaults.conf
new file mode 100644
index 0000000000..5b3e62b9f4
--- /dev/null
+++ b/tools/helm/zeppelin/spark-config/spark-defaults.conf
@@ -0,0 +1 @@
+spark.app.id KubernetesSpark
diff --git a/tools/helm/zeppelin/spark-config/start-common.sh b/tools/helm/zeppelin/spark-config/start-common.sh
new file mode 100644
index 0000000000..ac8d505838
--- /dev/null
+++ b/tools/helm/zeppelin/spark-config/start-common.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Copyright 2015 The Kubernetes Authors All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+PROJECT_ID=$(curl -s -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/project/project-id)
+
+if [[ -n "${PROJECT_ID}" ]]; then
+  sed -i "s/NOT_RUNNING_INSIDE_GCE/${PROJECT_ID}/" /opt/spark/conf/core-site.xml
+fi
+
+# We don't want any of the incoming service variables, we'd rather use
+# DNS. But this one interferes directly with Spark.
+unset SPARK_MASTER_PORT
+
+# spark.{executor,driver}.extraLibraryPath don't actually seem to
+# work, this seems to be the only reliable way to get the native libs
+# picked up.
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/hadoop/lib/native
diff --git a/tools/helm/zeppelin/spark-config/start-master b/tools/helm/zeppelin/spark-config/start-master
new file mode 100644
index 0000000000..f5e83a3074
--- /dev/null
+++ b/tools/helm/zeppelin/spark-config/start-master
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Copyright 2015 The Kubernetes Authors All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+. start-common.sh
+
+echo "$(hostname -i) spark-master" >> /etc/hosts
+
+# Run spark-class directly so that when it exits (or crashes), the pod restarts.
+/opt/spark/bin/spark-class org.apache.spark.deploy.master.Master --ip spark-master --port 7077 --webui-port 8080
diff --git a/tools/helm/zeppelin/spark-config/start-worker b/tools/helm/zeppelin/spark-config/start-worker
new file mode 100644
index 0000000000..5b9ccaebce
--- /dev/null
+++ b/tools/helm/zeppelin/spark-config/start-worker
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# Copyright 2015 The Kubernetes Authors All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+. start-common.sh
+
+if ! getent hosts spark-master; then
+  echo "=== Cannot resolve the DNS entry for spark-master. Has the service been created yet, and is SkyDNS functional?"
+  echo "=== See http://kubernetes.io/v1.1/docs/admin/dns.html for more details on DNS integration."
+  echo "=== Sleeping 10s before pod exit."
+  sleep 10
+  exit 0
+fi
+
+# Run spark-class directly so that when it exits (or crashes), the pod restarts.
+/opt/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077 --webui-port 8081
diff --git a/tools/helm/zepplin/spark-defaults.conf b/tools/helm/zeppelin/spark-defaults.conf
similarity index 100%
rename from tools/helm/zepplin/spark-defaults.conf
rename to tools/helm/zeppelin/spark-defaults.conf
diff --git a/tools/helm/zepplin/zeppelin-env.sh b/tools/helm/zeppelin/zeppelin-env.sh
similarity index 100%
rename from tools/helm/zepplin/zeppelin-env.sh
rename to tools/helm/zeppelin/zeppelin-env.sh
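Taken together, the three images share the same Spark 2.4.5 / Hadoop 3.2.1 layout and are built from their sibling contexts. A build sketch — the tags are hypothetical, and each context needs a populated jars/ directory for its ADD jars /jars step:

```bash
# Hypothetical tags; each build context is a directory touched by this diff.
docker build -t mmlspark/spark:dev    tools/helm/spark
docker build -t mmlspark/livy:dev     tools/helm/livy
docker build -t mmlspark/zeppelin:dev tools/helm/zeppelin
```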