fix: Updating regular Docker Images for helm chart. (#885)
* Updated regular Docker images for the helm chart.

* Removed image dependency.

* Removed FBProphet.

* Added maintainer.
WaterKnight1998 authored Aug 19, 2020
1 parent 96f0b77 commit b431a61
Showing 26 changed files with 377 additions and 181 deletions.
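The two Dockerfiles changed below can be rebuilt straight from the repository checkout to verify the switch to the slim base image. A minimal sketch, assuming the jars/ and config directories in each build context are populated as before; the local/ image tags are placeholders, not names used by the chart:

# Build the updated Livy and Spark images locally (tags are hypothetical).
docker build -t local/livy:b431a61 tools/helm/livy
docker build -t local/spark:b431a61 tools/helm/spark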
122 changes: 62 additions & 60 deletions tools/helm/livy/Dockerfile
@@ -1,5 +1,55 @@
FROM java:openjdk-8-jdk
MAINTAINER Dalitso Banda <[email protected]>
FROM openjdk:8-jdk-slim-buster
LABEL maintainer="Dalitso Banda [email protected]"

# Get Spark from US Apache mirror.
ENV APACHE_SPARK_VERSION 2.4.5
ENV HADOOP_VERSION 3.2.1

RUN echo "$LOG_TAG Getting SPARK_HOME" && \
apt-get update && \
# build deps and deps for c bindings for cntk
apt-get install -y build-essential && \
apt-get install -y autoconf automake libtool curl make unzip && \
mkdir -p /opt && \
cd /opt && \
curl http://apache.claz.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-without-hadoop.tgz | \
tar -xz && \
ln -s spark-${APACHE_SPARK_VERSION}-bin-without-hadoop spark && \
echo Spark ${APACHE_SPARK_VERSION} installed in /opt/spark && \
export SPARK_HOME=/opt/spark

RUN echo "downloading hadoop" && \
apt-get install -y wget && \
cd /tmp && \
wget http://apache.claz.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz -O - | \
tar -xz && \
mv /tmp/hadoop-${HADOOP_VERSION} /opt/hadoop && \
echo "export HADOOP_CLASSPATH=/opt/hadoop/share/hadoop/tools/lib/*" >> /opt/hadoop/etc/hadoop/hadoop-env.sh && \
echo Hadoop ${HADOOP_VERSION} installed in /opt/hadoop && \
rm -rf /opt/hadoop/share/doc

RUN echo "\nSPARK_DIST_CLASSPATH=/jars:/jars/*:$(/opt/hadoop/bin/hadoop classpath)" >> /opt/spark/conf/spark-env.sh
ENV HADOOP_HOME=/opt/hadoop
ADD jars /jars

# if numpy is installed on a driver it needs to be installed on all
# workers, so install it everywhere
RUN apt-get update && \
apt install -y python3-pip && \
pip3 install numpy && \
pip3 install matplotlib && \
pip3 install pandas==0.24.1 && \
pip3 install scikit-learn && \
pip3 install pyarrow==0.11.1 && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Final config
ADD spark-config/log4j.properties /opt/spark/conf/log4j.properties
ADD spark-config/start-common.sh spark-config/start-worker spark-config/start-master /
ADD spark-config/core-site.xml /opt/spark/conf/core-site.xml
ADD spark-config/spark-defaults.conf /opt/spark/conf/spark-defaults.conf
ENV PATH $PATH:/opt/spark/bin

ENV LIVY_VERSION="git_master"
ENV LIVY_COMMIT="02550f7919b7348b6a7270cf806e031670037b2f"
@@ -9,91 +59,43 @@ ENV LOG_TAG="[LIVY_${LIVY_VERSION}]:" \
LC_ALL=en_US.UTF-8

RUN echo "$LOG_TAG Install essentials" && \
apt-get -y update && \
apt-get install -y locales && \
locale-gen $LANG && \
apt-get install -y git wget grep curl sed && \
apt-get install -y python-setuptools && \
apt-get autoclean && apt-get autoremove && \
echo "$LOG_TAG Install python dependencies" && \
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
python get-pip.py && \
rm get-pip.py && \
apt-get install -y python-dev libpython3-dev build-essential pkg-config gfortran && \
pip install -U pip setuptools wheel && \
apt-get update && \
apt-get install -y git wget curl && \
echo "$LOG_TAG setting python dependencies" && \
ln -s /usr/bin/python3 /usr/bin/python && \
echo "$LOG_TAG Getting maven" && \
wget http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz && \
tar -zxf apache-maven-3.3.9-bin.tar.gz -C /usr/local/ && \
rm -rf apache-maven-3.3.9-bin.tar.gz && \
ln -s /usr/local/apache-maven-3.3.9/bin/mvn /usr/local/bin/mvn && \
echo "$LOG_TAG Download and build Livy source" && \
git clone https://github.com/apache/incubator-livy.git ${LIVY_HOME}_src && \
cd ${LIVY_HOME}_src && \
git checkout ${LIVY_COMMIT} && \
mvn package -DskipTests && \
ls ${LIVY_HOME}_src && \
mv ${LIVY_HOME}_src ${LIVY_HOME} && \
echo "$LOG_TAG Cleanup" && \
apt-get purge -y --auto-remove build-essential pkg-config gfortran libpython3-dev && \
apt-get autoremove && \
apt-get autoclean && \
apt-get clean && \
rm -rf /usr/local/apache-maven-3.3.9 && \
rm -rf /root/.ivy2 && \
rm -rf /root/.npm && \
rm -rf /root/.m2 && \
rm -rf /root/.cache && \
rm -rf /tmp/*

# Get Spark from US Apache mirror.
ENV APACHE_SPARK_VERSION 2.4.0
ENV HADOOP_VERSION 3.2.0
ENV HADOOP_GIT_COMMIT="release-3.2.0-RC1"

RUN echo "$LOG_TAG Getting SPARK_HOME" && \
mkdir -p /opt && \
cd /opt && \
curl http://apache.claz.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-without-hadoop.tgz | \
tar -xz && \
ln -s spark-${APACHE_SPARK_VERSION}-bin-without-hadoop spark && \
echo Spark ${APACHE_SPARK_VERSION} installed in /opt/spark && \
export SPARK_HOME=/opt/spark

RUN echo "$LOG_TAG building hadoop" && \
apt-get update && \
apt-get install -y make autoconf automake libtool g++ unzip && \
cd / && \
git clone https://github.com/apache/hadoop.git hadoop_src&& \
mkdir /hadoop_deps && cd /hadoop_deps && \
wget https://github.com/protocolbuffers/protobuf/releases/download/v2.5.0/protobuf-2.5.0.tar.bz2 && \
tar xvf protobuf-2.5.0.tar.bz2 && \
cd protobuf-2.5.0 && \
./configure && make && make install && ldconfig && \
cd /hadoop_src && git checkout ${HADOOP_GIT_COMMIT} && mvn package -Pdist -DskipTests -Dtar && \
mv hadoop-dist/target/hadoop-${HADOOP_VERSION} /opt/hadoop && \
rm -r /hadoop_src && \
rm -rf /root/.ivy2 && \
rm -rf /root/.m2 && \
export HADOOP_HOME=/opt/hadoop && \
echo "\nexport HADOOP_CLASSPATH=/opt/hadoop/share/hadoop/tools/lib/*" >> /opt/hadoop/etc/hadoop/hadoop-env.sh && \
echo Hadoop ${HADOOP_VERSION} installed in /opt/hadoop && \
apt-get purge -y --auto-remove g++ make build-essential autoconf automake && \
cd / && rm -rf /hadoop_deps

RUN echo "\nSPARK_DIST_CLASSPATH=/jars:/jars/*:$(/opt/hadoop/bin/hadoop classpath)" >> /opt/spark/conf/spark-env.sh
ENV HADOOP_HOME=/opt/hadoop
ADD jars /jars

ENV HADOOP_CONF_DIR /opt/hadoop/conf
ENV CONF_DIR /livy/conf
ENV SPARK_CONF_DIR /opt/spark/conf

RUN mv ${LIVY_HOME}_src ${LIVY_HOME}
ADD livy.conf ${LIVY_HOME}/conf
EXPOSE 8998

WORKDIR ${LIVY_HOME}

RUN mkdir logs && export SPARK_HOME=/opt/spark && export HADOOP_HOME=/opt/hadoop && export SPARK_CONF_DIR=/opt/spark/conf
RUN mkdir logs

#hive needed for livy pyspark
RUN wget http://central.maven.org/maven2/org/apache/spark/spark-hive_2.11/2.4.0/spark-hive_2.11-2.4.0.jar -P /opt/spark/jars
RUN wget https://repo1.maven.org/maven2/org/apache/spark/spark-hive_2.11/2.4.5/spark-hive_2.11-2.4.5.jar -P /opt/spark/jars

CMD ["sh", "-c", "echo '\nspark.driver.host' $(hostname -i) >> /opt/spark/conf/spark-defaults.conf && echo '\nlivy.spark.master' $SPARK_MASTER >> /livy/conf/livy.conf && bin/livy-server"]
CMD ["sh", "-c", "echo '\nspark.driver.host' $(hostname -i) >> /opt/spark/conf/spark-defaults.conf && echo '\nlivy.spark.master' $SPARK_MASTER >> /livy/conf/livy.conf && bin/livy-server"]
19 changes: 19 additions & 0 deletions tools/helm/livy/spark-config/core-site.xml
@@ -0,0 +1,19 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
<property>
<name>fs.gs.impl</name>
<value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem</value>
<description>The FileSystem for gs: (GCS) uris.</description>
</property>
<property>
<name>fs.AbstractFileSystem.gs.impl</name>
<value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS</value>
<description>The AbstractFileSystem for gs: (GCS) uris. Only necessary for use with Hadoop 2.</description>
</property>
<property>
<name>fs.gs.project.id</name>
<value>NOT_RUNNING_INSIDE_GCE</value>
</property>
</configuration>
12 changes: 12 additions & 0 deletions tools/helm/livy/spark-config/log4j.properties
@@ -0,0 +1,12 @@
# Set everything to be logged to the console
log4j.rootCategory=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Settings to quiet third party logs that are too verbose
log4j.logger.org.spark-project.jetty=WARN
log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
1 change: 1 addition & 0 deletions tools/helm/livy/spark-config/spark-defaults.conf
@@ -0,0 +1 @@
spark.app.id KubernetesSpark
30 changes: 30 additions & 0 deletions tools/helm/livy/spark-config/start-common.sh
@@ -0,0 +1,30 @@
#!/bin/bash

# Copyright 2015 The Kubernetes Authors All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

PROJECT_ID=$(curl -s -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/project/project-id)

if [[ -n "${PROJECT_ID}" ]]; then
sed -i "s/NOT_RUNNING_INSIDE_GCE/${PROJECT_ID}/" /opt/spark/conf/core-site.xml
fi

# We don't want any of the incoming service variables, we'd rather use
# DNS. But this one interferes directly with Spark.
unset SPARK_MASTER_PORT

# spark.{executor,driver}.extraLibraryPath don't actually seem to
# work, this seems to be the only reliable way to get the native libs
# picked up.
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/hadoop/lib/native
22 changes: 22 additions & 0 deletions tools/helm/livy/spark-config/start-master
@@ -0,0 +1,22 @@
#!/bin/bash

# Copyright 2015 The Kubernetes Authors All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

. start-common.sh

echo "$(hostname -i) spark-master" >> /etc/hosts

# Run spark-class directly so that when it exits (or crashes), the pod restarts.
/opt/spark/bin/spark-class org.apache.spark.deploy.master.Master --ip spark-master --port 7077 --webui-port 8080
28 changes: 28 additions & 0 deletions tools/helm/livy/spark-config/start-worker
@@ -0,0 +1,28 @@
#!/bin/bash

# Copyright 2015 The Kubernetes Authors All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

. start-common.sh

if ! getent hosts spark-master; then
echo "=== Cannot resolve the DNS entry for spark-master. Has the service been created yet, and is SkyDNS functional?"
echo "=== See http://kubernetes.io/v1.1/docs/admin/dns.html for more details on DNS integration."
echo "=== Sleeping 10s before pod exit."
sleep 10
exit 0
fi

# Run spark-class directly so that when it exits (or crashes), the pod restarts.
/opt/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077 --webui-port 8081
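The start scripts target the Kubernetes deployment, but the master/worker pairing can be exercised roughly with plain Docker as well. A sketch under stated assumptions: the local/spark tag from the build step above, a throwaway user-defined network for DNS, and bash invoked explicitly in case ADD did not preserve the scripts' exec bit:

# Rough standalone check of start-master / start-worker (names and tag are hypothetical).
docker network create spark-net
docker run -d --name spark-master --hostname spark-master --network spark-net \
    local/spark:b431a61 bash /start-master
docker run -d --name spark-worker --network spark-net \
    local/spark:b431a61 bash /start-worker
docker logs spark-worker    # the worker should report registering with spark-master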
64 changes: 24 additions & 40 deletions tools/helm/spark/Dockerfile
@@ -1,11 +1,15 @@
FROM java:openjdk-8-jdk
FROM openjdk:8-jdk-slim-buster
LABEL maintainer="Dalitso Banda [email protected]"

# Get Spark from US Apache mirror.
ENV APACHE_SPARK_VERSION 2.4.0
ENV HADOOP_VERSION 3.2.0
ENV HADOOP_GIT_COMMIT="release-3.2.0-RC1"
ENV APACHE_SPARK_VERSION 2.4.5
ENV HADOOP_VERSION 3.2.1

RUN echo "$LOG_TAG Getting SPARK_HOME" && \
apt-get update && \
# build deps and deps for c bindings for cntk
apt-get install -y build-essential && \
apt-get install -y autoconf automake libtool curl make unzip && \
mkdir -p /opt && \
cd /opt && \
curl http://apache.claz.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-without-hadoop.tgz | \
@@ -14,55 +18,35 @@ RUN echo "$LOG_TAG Getting SPARK_HOME" && \
echo Spark ${APACHE_SPARK_VERSION} installed in /opt/spark && \
export SPARK_HOME=/opt/spark

RUN echo "$LOG_TAG Getting maven" && \
wget http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz && \
tar -zxf apache-maven-3.3.9-bin.tar.gz -C /usr/local/ && \
ln -s /usr/local/apache-maven-3.3.9/bin/mvn /usr/local/bin/mvn

RUN echo "$LOG_TAG building hadoop" && \
echo "deb http://deb.debian.org/debian stretch main" >> /etc/apt/sources.list && \
apt-get update && \
# build deps and deps for c bindings for cntk
apt-get install -y g++ gcc-6 libstdc++-6-dev make build-essential && \
apt-get install -y autoconf automake libtool curl make unzip && \
cd / && \
git clone https://github.com/apache/hadoop.git hadoop_src&& \
mkdir /hadoop_deps && cd /hadoop_deps && \
wget https://github.com/protocolbuffers/protobuf/releases/download/v2.5.0/protobuf-2.5.0.tar.bz2 && \
tar xvf protobuf-2.5.0.tar.bz2 && \
cd protobuf-2.5.0 && \
./configure && make && make install && ldconfig && \
cd /hadoop_src && git checkout ${HADOOP_GIT_COMMIT} && mvn package -Pdist -DskipTests -Dtar && \
mv hadoop-dist/target/hadoop-${HADOOP_VERSION} /opt/hadoop && \
rm -r /hadoop_src && \
export HADOOP_HOME=/opt/hadoop && \
echo "\nexport HADOOP_CLASSPATH=/opt/hadoop/share/hadoop/tools/lib/*" >> /opt/hadoop/etc/hadoop/hadoop-env.sh && \
RUN echo "downloading hadoop" && \
apt-get install -y wget && \
cd /tmp && \
wget http://apache.claz.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz -O - | \
tar -xz && \
mv /tmp/hadoop-${HADOOP_VERSION} /opt/hadoop && \
echo "export HADOOP_CLASSPATH=/opt/hadoop/share/hadoop/tools/lib/*" >> /opt/hadoop/etc/hadoop/hadoop-env.sh && \
echo Hadoop ${HADOOP_VERSION} installed in /opt/hadoop && \
apt-get purge -y --auto-remove g++ make build-essential autoconf automake && \
cd / && rm -rf /hadoop_deps
rm -rf /opt/hadoop/share/doc

RUN echo "\nSPARK_DIST_CLASSPATH=/jars:/jars/*:$(/opt/hadoop/bin/hadoop classpath)" >> /opt/spark/conf/spark-env.sh
ENV HADOOP_HOME=/opt/hadoop
ADD jars /jars


# if numpy is installed on a driver it needs to be installed on all
# workers, so install it everywhere
RUN apt-get update && \
apt-get install -y g++ python-dev build-essential python3-dev && \
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
python get-pip.py && \
rm get-pip.py && \
pip install -U pip setuptools wheel && \
pip install numpy && \
pip install matplotlib && \
pip install pandas && \
apt-get purge -y --auto-remove python-dev build-essential python3-dev && \
apt install -y python3-pip && \
pip3 install numpy && \
pip3 install matplotlib && \
pip3 install pandas==0.24.1 && \
pip3 install scikit-learn && \
pip3 install pyarrow==0.11.1 && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Final config
ADD log4j.properties /opt/spark/conf/log4j.properties
ADD start-common.sh start-worker start-master /
ADD core-site.xml /opt/spark/conf/core-site.xml
ADD spark-defaults.conf /opt/spark/conf/spark-defaults.conf
ENV PATH $PATH:/opt/spark/bin
ENV PATH $PATH:/opt/spark/bin
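Since the Python stack now comes from the distribution's python3-pip rather than python 2 plus get-pip.py, a one-liner is enough to confirm the pinned libraries import cleanly in the finished image. Again a sketch with a placeholder tag:

# Check the baked-in Python libraries (tag is hypothetical).
docker run --rm local/spark:b431a61 \
    python3 -c "import numpy, pandas, pyarrow, sklearn, matplotlib; print(pandas.__version__, pyarrow.__version__)"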