diff --git a/Dockerfile b/Dockerfile
index 0a3eb53..78b0c1a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM debian:jessie
+FROM debian:stretch
 MAINTAINER Getty Images "https://github.com/gettyimages"
 
 RUN apt-get update \
@@ -30,22 +30,13 @@ ENV PYTHONIOENCODING UTF-8
 ENV PIP_DISABLE_PIP_VERSION_CHECK 1
 
 # JAVA
-ARG JAVA_MAJOR_VERSION=8
-ARG JAVA_UPDATE_VERSION=112
-ARG JAVA_BUILD_NUMBER=15
-ENV JAVA_HOME /usr/jdk1.${JAVA_MAJOR_VERSION}.0_${JAVA_UPDATE_VERSION}
-
-ENV PATH $PATH:$JAVA_HOME/bin
-RUN curl -sL --retry 3 --insecure \
-  --header "Cookie: oraclelicense=accept-securebackup-cookie;" \
-  "http://download.oracle.com/otn-pub/java/jdk/$%7BJAVA_MAJOR_VERSION%7Du$%7BJAVA_UPDATE_VERSION%7D-b$%7BJAVA_BUILD_NUMBER%7D/server-jre-$%7BJAVA_MAJOR_VERSION%7Du$%7BJAVA_UPDATE_VERSION%7D-linux-x64.tar.gz" \
-  | gunzip \
-  | tar x -C /usr/ \
-  && ln -s $JAVA_HOME /usr/java \
-  && rm -rf $JAVA_HOME/man
+RUN apt-get update \
+ && apt-get install -y openjdk-8-jre \
+ && apt-get clean \
+ && rm -rf /var/lib/apt/lists/*
 
 # HADOOP
-ENV HADOOP_VERSION 2.7.3
+ENV HADOOP_VERSION 3.0.0
 ENV HADOOP_HOME /usr/hadoop-$HADOOP_VERSION
 ENV HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
 ENV PATH $PATH:$HADOOP_HOME/bin
@@ -57,13 +48,13 @@ RUN curl -sL --retry 3 \
   && chown -R root:root $HADOOP_HOME
 
 # SPARK
-ENV SPARK_VERSION 2.0.2
+ENV SPARK_VERSION 2.4.1
 ENV SPARK_PACKAGE spark-${SPARK_VERSION}-bin-without-hadoop
 ENV SPARK_HOME /usr/spark-${SPARK_VERSION}
 ENV SPARK_DIST_CLASSPATH="$HADOOP_HOME/etc/hadoop/*:$HADOOP_HOME/share/hadoop/common/lib/*:$HADOOP_HOME/share/hadoop/common/*:$HADOOP_HOME/share/hadoop/hdfs/*:$HADOOP_HOME/share/hadoop/hdfs/lib/*:$HADOOP_HOME/share/hadoop/hdfs/*:$HADOOP_HOME/share/hadoop/yarn/lib/*:$HADOOP_HOME/share/hadoop/yarn/*:$HADOOP_HOME/share/hadoop/mapreduce/lib/*:$HADOOP_HOME/share/hadoop/mapreduce/*:$HADOOP_HOME/share/hadoop/tools/lib/*"
 ENV PATH $PATH:${SPARK_HOME}/bin
 RUN curl -sL --retry 3 \
-  "http://d3kbcqa49mib13.cloudfront.net/$%7BSPARK_PACKAGE%7D.tgz" \
+  "https://archive.apache.org/dist/spark/spark-$%7BSPARK_VERSION%7D/$%7BSPARK_PACKAGE%7D.tgz" \
   | gunzip \
   | tar x -C /usr/ \
   && mv /usr/$SPARK_PACKAGE $SPARK_HOME \
diff --git a/README.md b/README.md
index d195025..5679220 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # spark
 
-A `debian:jessie` based [Spark](http://spark.apache.org) container. Use it in a standalone cluster with the accompanying `docker-compose.yml`, or as a base for more complex recipes.
+A `debian:stretch` based [Spark](http://spark.apache.org) container. Use it in a standalone cluster with the accompanying `docker-compose.yml`, or as a base for more complex recipes.
 
 ## docker example
 
@@ -15,7 +15,7 @@ To start `spark-shell` with your AWS credentials:
 
 To do a thing with Pyspark
 
-    echo "import pyspark\nprint(pyspark.SparkContext().parallelize(range(0, 10)).count())" > count.py
+    echo -e "import pyspark\n\nprint(pyspark.SparkContext().parallelize(range(0, 10)).count())" > count.py
     docker run --rm -it -p 4040:4040 -v $(pwd)/count.py:/count.py gettyimages/spark bin/spark-submit /count.py
 
 ## docker-compose example
 
@@ -26,12 +26,12 @@ To create a simplistic standalone cluster with [docker-compose](http://docs.dock
 
 The SparkUI will be running at `http://${YOUR_DOCKER_HOST}:8080` with one worker listed.
 
 To run `pyspark`, exec into a container:
-    docker exec -it dockerspark_master_1 /bin/bash
+    docker exec -it docker-spark_master_1 /bin/bash
     bin/pyspark
 
 To run `SparkPi`, exec into a container:
-    docker exec -it dockerspark_master_1 /bin/bash
+    docker exec -it docker-spark_master_1 /bin/bash
     bin/run-example SparkPi 10
 
 ## license
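For reference, the `echo -e` one-liner in the README change above expands to the following `count.py`. This is a minimal sketch, not part of the diff: it assumes the Spark 2.4.1 image built by this Dockerfile, and that the script is run via `bin/spark-submit /count.py`, which supplies the master URL and application name.

    # count.py -- the README smoke test, written out (sketch, not in the diff).
    # Run inside the container: bin/spark-submit /count.py
    import pyspark

    sc = pyspark.SparkContext()           # master/app name come from spark-submit
    rdd = sc.parallelize(range(0, 10))    # distribute ten integers
    print(rdd.count())                    # expected output: 10
    sc.stop()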
diff --git a/cloudbuild.yaml b/cloudbuild.yaml
new file mode 100644
index 0000000..67e7e20
--- /dev/null
+++ b/cloudbuild.yaml
@@ -0,0 +1,24 @@
+# https://cloud.google.com/cloud-build/docs/speeding-up-builds
+# https://cloud.google.com/cloud-build/docs/configuring-builds/substitute-variable-values
+substitutions:
+  _IMAGE: 'gcr.io/whiteblock/spark'
+timeout: '45m'
+steps:
+# allow these steps to fail, they try to pull cache first
+- name: 'gcr.io/cloud-builders/docker'
+  entrypoint: 'bash'
+  args: ['-c', 'docker pull $_IMAGE:$BRANCH_NAME || true' ]
+# build final docker image
+- name: 'gcr.io/cloud-builders/docker'
+  args: [
+    'build',
+    '-t', '$_IMAGE:$BRANCH_NAME',
+    '-t', '$_IMAGE:$COMMIT_SHA',
+    '--cache-from', '$_IMAGE:$BRANCH_NAME',
+    '.'
+  ]
+# push docker image tag(s) one branch, one immutable
+- name: 'gcr.io/cloud-builders/docker'
+  args: [ 'push', '$_IMAGE:$COMMIT_SHA' ]
+- name: 'gcr.io/cloud-builders/docker'
+  args: [ 'push', '$_IMAGE:$BRANCH_NAME' ]
diff --git a/conf/master/spark-defaults.conf b/conf/master/spark-defaults.conf
index 25ac600..90be208 100644
--- a/conf/master/spark-defaults.conf
+++ b/conf/master/spark-defaults.conf
@@ -6,7 +6,6 @@ spark.fileserver.port 7002
 spark.broadcast.port 7003
 spark.replClassServer.port 7004
 spark.blockManager.port 7005
-spark.executor.port 7006
 spark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory
 spark.port.maxRetries 4
 
diff --git a/conf/worker/spark-defaults.conf b/conf/worker/spark-defaults.conf
index 0e70efa..7c1e38a 100644
--- a/conf/worker/spark-defaults.conf
+++ b/conf/worker/spark-defaults.conf
@@ -6,7 +6,6 @@ spark.fileserver.port 7012
 spark.broadcast.port 7013
 spark.replClassServer.port 7014
 spark.blockManager.port 7015
-spark.executor.port 7016
 spark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory
 spark.port.maxRetries 4
 
diff --git a/docker-compose.yml b/docker-compose.yml
index 7de9ab3..70c65e2 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,51 +1,51 @@
-master:
-  image: gettyimages/spark
-  command: bin/spark-class org.apache.spark.deploy.master.Master -h master
-  hostname: master
-  environment:
-    MASTER: spark://master:7077
-    SPARK_CONF_DIR: /conf
-    SPARK_PUBLIC_DNS: localhost
-  expose:
-    - 7001
-    - 7002
-    - 7003
-    - 7004
-    - 7005
-    - 7006
-    - 7077
-    - 6066
-  ports:
-    - 4040:4040
-    - 6066:6066
-    - 7077:7077
-    - 8080:8080
-  volumes:
-    - ./conf/master:/conf
-    - ./data:/tmp/data
+version: "2.2"
+services:
+  master:
+    image: gettyimages/spark
+    command: bin/spark-class org.apache.spark.deploy.master.Master -h master
+    hostname: master
+    environment:
+      MASTER: spark://master:7077
+      SPARK_CONF_DIR: /conf
+      SPARK_PUBLIC_DNS: localhost
+    expose:
+      - 7001
+      - 7002
+      - 7003
+      - 7004
+      - 7005
+      - 7077
+      - 6066
+    ports:
+      - 4040:4040
+      - 6066:6066
+      - 7077:7077
+      - 8080:8080
+    volumes:
+      - ./conf/master:/conf
+      - ./data:/tmp/data
 
-worker:
-  image: gettyimages/spark
-  command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077
-  hostname: worker
-  environment:
-    SPARK_CONF_DIR: /conf
-    SPARK_WORKER_CORES: 2
-    SPARK_WORKER_MEMORY: 1g
-    SPARK_WORKER_PORT: 8881
-    SPARK_WORKER_WEBUI_PORT: 8081
-    SPARK_PUBLIC_DNS: localhost
-  links:
-    - master
-  expose:
-    - 7012
-    - 7013
-    - 7014
-    - 7015
-    - 7016
-    - 8881
-  ports:
-    - 8081:8081
-  volumes:
-    - ./conf/worker:/conf
-    - ./data:/tmp/data
+  worker:
+    image: gettyimages/spark
+    command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077
+    hostname: worker
+    environment:
+      SPARK_CONF_DIR: /conf
+      SPARK_WORKER_CORES: 2
+      SPARK_WORKER_MEMORY: 1g
+      SPARK_WORKER_PORT: 8881
+      SPARK_WORKER_WEBUI_PORT: 8081
+      SPARK_PUBLIC_DNS: localhost
+    links:
+      - master
+    expose:
+      - 7012
+      - 7013
+      - 7014
+      - 7015
+      - 8881
+    ports:
+      - 8081:8081
+    volumes:
+      - ./conf/worker:/conf
+      - ./data:/tmp/data
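Once `docker-compose up` is running, a quick way to confirm that jobs actually reach the worker is to submit a small job against the standalone master. The following is a sketch, not part of the diff: the service name `master` and container name `docker-spark_master_1` come from the compose file and README above, while the script name, its placement under the `./conf/master` mount, and the two-partition hint are illustrative assumptions.

    # cluster_check.py -- hypothetical smoke test for the compose cluster.
    # Copy it under ./conf/master (mounted at /conf) and run on the master:
    #   docker exec -it docker-spark_master_1 bin/spark-submit /conf/cluster_check.py
    import pyspark

    # spark://master:7077 is the standalone master defined in docker-compose.yml
    sc = pyspark.SparkContext(master="spark://master:7077", appName="cluster-check")
    total = sc.parallelize(range(100), 2).sum()  # two partitions, so the worker gets tasks
    print("sum of 0..99 =", total)               # expected output: 4950
    sc.stop()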