fix: Updating regular Docker Images for helm chart. (#885)
* Updated regular Docker images for the helm chart.

* Removed image dependency.

* Removed FBProphet.

* Added maintainer.
WaterKnight1998 authored Aug 19, 2020
1 parent 96f0b77 commit b431a61
Showing 26 changed files with 377 additions and 181 deletions.
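The two Dockerfiles changed below can be rebuilt straight from the repository checkout to verify the switch to the slim base image. A minimal sketch, assuming the jars/ and config directories in each build context are populated as before; the local/ image tags are placeholders, not names used by the chart:

# Build the updated Livy and Spark images locally (tags are hypothetical).
docker build -t local/livy:b431a61 tools/helm/livy
docker build -t local/spark:b431a61 tools/helm/spark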
122 changes: 62 additions & 60 deletions tools/helm/livy/Dockerfile
@@ -1,5 +1,55 @@
FROM java:openjdk-8-jdk
MAINTAINER Dalitso Banda <[email protected]>
FROM openjdk:8-jdk-slim-buster
LABEL maintainer="Dalitso Banda [email protected]"

# Get Spark from US Apache mirror.
ENV APACHE_SPARK_VERSION 2.4.5
ENV HADOOP_VERSION 3.2.1

RUN echo "$LOG_TAG Getting SPARK_HOME" && \
apt-get update && \
# build deps and deps for c bindings for cntk
apt-get install -y build-essential && \
apt-get install -y autoconf automake libtool curl make unzip && \
mkdir -p /opt && \
cd /opt && \
curl http://apache.claz.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-without-hadoop.tgz | \
tar -xz && \
ln -s spark-${APACHE_SPARK_VERSION}-bin-without-hadoop spark && \
echo Spark ${APACHE_SPARK_VERSION} installed in /opt/spark && \
export SPARK_HOME=/opt/spark

RUN echo "downloading hadoop" && \
apt-get install -y wget && \
cd /tmp && \
wget http://apache.claz.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz -O - | \
tar -xz && \
mv /tmp/hadoop-${HADOOP_VERSION} /opt/hadoop && \
echo "export HADOOP_CLASSPATH=/opt/hadoop/share/hadoop/tools/lib/*" >> /opt/hadoop/etc/hadoop/hadoop-env.sh && \
echo Hadoop ${HADOOP_VERSION} installed in /opt/hadoop && \
rm -rf /opt/hadoop/share/doc

RUN echo "\nSPARK_DIST_CLASSPATH=/jars:/jars/*:$(/opt/hadoop/bin/hadoop classpath)" >> /opt/spark/conf/spark-env.sh
ENV HADOOP_HOME=/opt/hadoop
ADD jars /jars

# if numpy is installed on a driver it needs to be installed on all
# workers, so install it everywhere
RUN apt-get update && \
apt install -y python3-pip && \
pip3 install numpy && \
pip3 install matplotlib && \
pip3 install pandas==0.24.1 && \
pip3 install scikit-learn && \
pip3 install pyarrow==0.11.1 && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Final config
ADD spark-config/log4j.properties /opt/spark/conf/log4j.properties
ADD spark-config/start-common.sh spark-config/start-worker spark-config/start-master /
ADD spark-config/core-site.xml /opt/spark/conf/core-site.xml
ADD spark-config/spark-defaults.conf /opt/spark/conf/spark-defaults.conf
ENV PATH $PATH:/opt/spark/bin

ENV LIVY_VERSION="git_master"
ENV LIVY_COMMIT="02550f7919b7348b6a7270cf806e031670037b2f"
@@ -9,91 +59,43 @@ ENV LOG_TAG="[LIVY_${LIVY_VERSION}]:" \
LC_ALL=en_US.UTF-8

RUN echo "$LOG_TAG Install essentials" && \
apt-get -y update && \
apt-get install -y locales && \
locale-gen $LANG && \
apt-get install -y git wget grep curl sed && \
apt-get install -y python-setuptools && \
apt-get autoclean && apt-get autoremove && \
echo "$LOG_TAG Install python dependencies" && \
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
python get-pip.py && \
rm get-pip.py && \
apt-get install -y python-dev libpython3-dev build-essential pkg-config gfortran && \
pip install -U pip setuptools wheel && \
apt-get update && \
apt-get install -y git wget curl && \
echo "$LOG_TAG setting python dependencies" && \
ln -s /usr/bin/python3 /usr/bin/python && \
echo "$LOG_TAG Getting maven" && \
wget http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz && \
tar -zxf apache-maven-3.3.9-bin.tar.gz -C /usr/local/ && \
rm -rf apache-maven-3.3.9-bin.tar.gz && \
ln -s /usr/local/apache-maven-3.3.9/bin/mvn /usr/local/bin/mvn && \
echo "$LOG_TAG Download and build Livy source" && \
git clone https://github.com/apache/incubator-livy.git ${LIVY_HOME}_src && \
cd ${LIVY_HOME}_src && \
git checkout ${LIVY_COMMIT} && \
mvn package -DskipTests && \
ls ${LIVY_HOME}_src && \
mv ${LIVY_HOME}_src ${LIVY_HOME} && \
echo "$LOG_TAG Cleanup" && \
apt-get purge -y --auto-remove build-essential pkg-config gfortran libpython3-dev && \
apt-get autoremove && \
apt-get autoclean && \
apt-get clean && \
rm -rf /usr/local/apache-maven-3.3.9 && \
rm -rf /root/.ivy2 && \
rm -rf /root/.npm && \
rm -rf /root/.m2 && \
rm -rf /root/.cache && \
rm -rf /tmp/*

# Get Spark from US Apache mirror.
ENV APACHE_SPARK_VERSION 2.4.0
ENV HADOOP_VERSION 3.2.0
ENV HADOOP_GIT_COMMIT="release-3.2.0-RC1"

RUN echo "$LOG_TAG Getting SPARK_HOME" && \
mkdir -p /opt && \
cd /opt && \
curl http://apache.claz.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-without-hadoop.tgz | \
tar -xz && \
ln -s spark-${APACHE_SPARK_VERSION}-bin-without-hadoop spark && \
echo Spark ${APACHE_SPARK_VERSION} installed in /opt/spark && \
export SPARK_HOME=/opt/spark

RUN echo "$LOG_TAG building hadoop" && \
apt-get update && \
apt-get install -y make autoconf automake libtool g++ unzip && \
cd / && \
git clone https://github.com/apache/hadoop.git hadoop_src&& \
mkdir /hadoop_deps && cd /hadoop_deps && \
wget https://github.com/protocolbuffers/protobuf/releases/download/v2.5.0/protobuf-2.5.0.tar.bz2 && \
tar xvf protobuf-2.5.0.tar.bz2 && \
cd protobuf-2.5.0 && \
./configure && make && make install && ldconfig && \
cd /hadoop_src && git checkout ${HADOOP_GIT_COMMIT} && mvn package -Pdist -DskipTests -Dtar && \
mv hadoop-dist/target/hadoop-${HADOOP_VERSION} /opt/hadoop && \
rm -r /hadoop_src && \
rm -rf /root/.ivy2 && \
rm -rf /root/.m2 && \
export HADOOP_HOME=/opt/hadoop && \
echo "\nexport HADOOP_CLASSPATH=/opt/hadoop/share/hadoop/tools/lib/*" >> /opt/hadoop/etc/hadoop/hadoop-env.sh && \
echo Hadoop ${HADOOP_VERSION} installed in /opt/hadoop && \
apt-get purge -y --auto-remove g++ make build-essential autoconf automake && \
cd / && rm -rf /hadoop_deps

RUN echo "\nSPARK_DIST_CLASSPATH=/jars:/jars/*:$(/opt/hadoop/bin/hadoop classpath)" >> /opt/spark/conf/spark-env.sh
ENV HADOOP_HOME=/opt/hadoop
ADD jars /jars

ENV HADOOP_CONF_DIR /opt/hadoop/conf
ENV CONF_DIR /livy/conf
ENV SPARK_CONF_DIR /opt/spark/conf

RUN mv ${LIVY_HOME}_src ${LIVY_HOME}
ADD livy.conf ${LIVY_HOME}/conf
EXPOSE 8998

WORKDIR ${LIVY_HOME}

RUN mkdir logs && export SPARK_HOME=/opt/spark && export HADOOP_HOME=/opt/hadoop && export SPARK_CONF_DIR=/opt/spark/conf
RUN mkdir logs

#hive needed for livy pyspark
RUN wget http://central.maven.org/maven2/org/apache/spark/spark-hive_2.11/2.4.0/spark-hive_2.11-2.4.0.jar -P /opt/spark/jars
RUN wget https://repo1.maven.org/maven2/org/apache/spark/spark-hive_2.11/2.4.5/spark-hive_2.11-2.4.5.jar -P /opt/spark/jars

CMD ["sh", "-c", "echo '\nspark.driver.host' $(hostname -i) >> /opt/spark/conf/spark-defaults.conf && echo '\nlivy.spark.master' $SPARK_MASTER >> /livy/conf/livy.conf && bin/livy-server"]
CMD ["sh", "-c", "echo '\nspark.driver.host' $(hostname -i) >> /opt/spark/conf/spark-defaults.conf && echo '\nlivy.spark.master' $SPARK_MASTER >> /livy/conf/livy.conf && bin/livy-server"]
19 changes: 19 additions & 0 deletions tools/helm/livy/spark-config/core-site.xml
@@ -0,0 +1,19 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
<property>
<name>fs.gs.impl</name>
<value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem</value>
<description>The FileSystem for gs: (GCS) uris.</description>
</property>
<property>
<name>fs.AbstractFileSystem.gs.impl</name>
<value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS</value>
<description>The AbstractFileSystem for gs: (GCS) uris. Only necessary for use with Hadoop 2.</description>
</property>
<property>
<name>fs.gs.project.id</name>
<value>NOT_RUNNING_INSIDE_GCE</value>
</property>
</configuration>
12 changes: 12 additions & 0 deletions tools/helm/livy/spark-config/log4j.properties
@@ -0,0 +1,12 @@
# Set everything to be logged to the console
log4j.rootCategory=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Settings to quiet third party logs that are too verbose
log4j.logger.org.spark-project.jetty=WARN
log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
1 change: 1 addition & 0 deletions tools/helm/livy/spark-config/spark-defaults.conf
@@ -0,0 +1 @@
spark.app.id KubernetesSpark
30 changes: 30 additions & 0 deletions tools/helm/livy/spark-config/start-common.sh
@@ -0,0 +1,30 @@
#!/bin/bash

# Copyright 2015 The Kubernetes Authors All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

PROJECT_ID=$(curl -s -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/project/project-id)

if [[ -n "${PROJECT_ID}" ]]; then
sed -i "s/NOT_RUNNING_INSIDE_GCE/${PROJECT_ID}/" /opt/spark/conf/core-site.xml
fi

# We don't want any of the incoming service variables, we'd rather use
# DNS. But this one interferes directly with Spark.
unset SPARK_MASTER_PORT

# spark.{executor,driver}.extraLibraryPath don't actually seem to
# work, this seems to be the only reliable way to get the native libs
# picked up.
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/hadoop/lib/native
22 changes: 22 additions & 0 deletions tools/helm/livy/spark-config/start-master
@@ -0,0 +1,22 @@
#!/bin/bash

# Copyright 2015 The Kubernetes Authors All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

. start-common.sh

echo "$(hostname -i) spark-master" >> /etc/hosts

# Run spark-class directly so that when it exits (or crashes), the pod restarts.
/opt/spark/bin/spark-class org.apache.spark.deploy.master.Master --ip spark-master --port 7077 --webui-port 8080
28 changes: 28 additions & 0 deletions tools/helm/livy/spark-config/start-worker
@@ -0,0 +1,28 @@
#!/bin/bash

# Copyright 2015 The Kubernetes Authors All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

. start-common.sh

if ! getent hosts spark-master; then
echo "=== Cannot resolve the DNS entry for spark-master. Has the service been created yet, and is SkyDNS functional?"
echo "=== See http://kubernetes.io/v1.1/docs/admin/dns.html for more details on DNS integration."
echo "=== Sleeping 10s before pod exit."
sleep 10
exit 0
fi

# Run spark-class directly so that when it exits (or crashes), the pod restarts.
/opt/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077 --webui-port 8081
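The start scripts target the Kubernetes deployment, but the master/worker pairing can be exercised roughly with plain Docker as well. A sketch under stated assumptions: the local/spark tag from the build step above, a throwaway user-defined network for DNS, and bash invoked explicitly in case ADD did not preserve the scripts' exec bit:

# Rough standalone check of start-master / start-worker (names and tag are hypothetical).
docker network create spark-net
docker run -d --name spark-master --hostname spark-master --network spark-net \
    local/spark:b431a61 bash /start-master
docker run -d --name spark-worker --network spark-net \
    local/spark:b431a61 bash /start-worker
docker logs spark-worker    # the worker should report registering with spark-master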
64 changes: 24 additions & 40 deletions tools/helm/spark/Dockerfile
@@ -1,11 +1,15 @@
FROM java:openjdk-8-jdk
FROM openjdk:8-jdk-slim-buster
LABEL maintainer="Dalitso Banda [email protected]"

# Get Spark from US Apache mirror.
ENV APACHE_SPARK_VERSION 2.4.0
ENV HADOOP_VERSION 3.2.0
ENV HADOOP_GIT_COMMIT="release-3.2.0-RC1"
ENV APACHE_SPARK_VERSION 2.4.5
ENV HADOOP_VERSION 3.2.1

RUN echo "$LOG_TAG Getting SPARK_HOME" && \
apt-get update && \
# build deps and deps for c bindings for cntk
apt-get install -y build-essential && \
apt-get install -y autoconf automake libtool curl make unzip && \
mkdir -p /opt && \
cd /opt && \
curl http://apache.claz.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-without-hadoop.tgz | \
@@ -14,55 +18,35 @@ RUN echo "$LOG_TAG Getting SPARK_HOME" && \
echo Spark ${APACHE_SPARK_VERSION} installed in /opt/spark && \
export SPARK_HOME=/opt/spark

RUN echo "$LOG_TAG Getting maven" && \
wget http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz && \
tar -zxf apache-maven-3.3.9-bin.tar.gz -C /usr/local/ && \
ln -s /usr/local/apache-maven-3.3.9/bin/mvn /usr/local/bin/mvn

RUN echo "$LOG_TAG building hadoop" && \
echo "deb http://deb.debian.org/debian stretch main" >> /etc/apt/sources.list && \
apt-get update && \
# build deps and deps for c bindings for cntk
apt-get install -y g++ gcc-6 libstdc++-6-dev make build-essential && \
apt-get install -y autoconf automake libtool curl make unzip && \
cd / && \
git clone https://github.com/apache/hadoop.git hadoop_src&& \
mkdir /hadoop_deps && cd /hadoop_deps && \
wget https://github.com/protocolbuffers/protobuf/releases/download/v2.5.0/protobuf-2.5.0.tar.bz2 && \
tar xvf protobuf-2.5.0.tar.bz2 && \
cd protobuf-2.5.0 && \
./configure && make && make install && ldconfig && \
cd /hadoop_src && git checkout ${HADOOP_GIT_COMMIT} && mvn package -Pdist -DskipTests -Dtar && \
mv hadoop-dist/target/hadoop-${HADOOP_VERSION} /opt/hadoop && \
rm -r /hadoop_src && \
export HADOOP_HOME=/opt/hadoop && \
echo "\nexport HADOOP_CLASSPATH=/opt/hadoop/share/hadoop/tools/lib/*" >> /opt/hadoop/etc/hadoop/hadoop-env.sh && \
RUN echo "downloading hadoop" && \
apt-get install -y wget && \
cd /tmp && \
wget http://apache.claz.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz -O - | \
tar -xz && \
mv /tmp/hadoop-${HADOOP_VERSION} /opt/hadoop && \
echo "export HADOOP_CLASSPATH=/opt/hadoop/share/hadoop/tools/lib/*" >> /opt/hadoop/etc/hadoop/hadoop-env.sh && \
echo Hadoop ${HADOOP_VERSION} installed in /opt/hadoop && \
apt-get purge -y --auto-remove g++ make build-essential autoconf automake && \
cd / && rm -rf /hadoop_deps
rm -rf /opt/hadoop/share/doc

RUN echo "\nSPARK_DIST_CLASSPATH=/jars:/jars/*:$(/opt/hadoop/bin/hadoop classpath)" >> /opt/spark/conf/spark-env.sh
ENV HADOOP_HOME=/opt/hadoop
ADD jars /jars


# if numpy is installed on a driver it needs to be installed on all
# workers, so install it everywhere
RUN apt-get update && \
apt-get install -y g++ python-dev build-essential python3-dev && \
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
python get-pip.py && \
rm get-pip.py && \
pip install -U pip setuptools wheel && \
pip install numpy && \
pip install matplotlib && \
pip install pandas && \
apt-get purge -y --auto-remove python-dev build-essential python3-dev && \
apt install -y python3-pip && \
pip3 install numpy && \
pip3 install matplotlib && \
pip3 install pandas==0.24.1 && \
pip3 install scikit-learn && \
pip3 install pyarrow==0.11.1 && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Final config
ADD log4j.properties /opt/spark/conf/log4j.properties
ADD start-common.sh start-worker start-master /
ADD core-site.xml /opt/spark/conf/core-site.xml
ADD spark-defaults.conf /opt/spark/conf/spark-defaults.conf
ENV PATH $PATH:/opt/spark/bin
ENV PATH $PATH:/opt/spark/bin
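Since the Python stack now comes from the distribution's python3-pip rather than python 2 plus get-pip.py, a one-liner is enough to confirm the pinned libraries import cleanly in the finished image. Again a sketch with a placeholder tag:

# Check the baked-in Python libraries (tag is hypothetical).
docker run --rm local/spark:b431a61 \
    python3 -c "import numpy, pandas, pyarrow, sklearn, matplotlib; print(pandas.__version__, pyarrow.__version__)"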