Commit c8ac2f3

adoroszlai authored and Reamer committed
[ZEPPELIN-6157] Download artifacts from CDN if available
## What changes were proposed in this pull request?

Artifacts currently available on the CDN (`dlcdn.apache.org`) may be removed without notice when new releases appear. To avoid broken links, the build scripts contain permanent addresses from `archive.apache.org`, but downloads from `archive.apache.org` may be slow:

```
Thu, 05 Dec 2024 08:39:53 GMT [INFO] --- download:1.6.0:wget (download-sparkr-files) @ r ---
Thu, 05 Dec 2024 08:39:54 GMT Warning: No signatures were supplied, skipping file validation
Thu, 05 Dec 2024 08:39:54 GMT [INFO] Read Timeout is set to 60000 milliseconds (apprx 1 minutes)
Thu, 05 Dec 2024 08:45:46 GMT [INFO] Expanding: /home/runner/work/zeppelin/zeppelin/rlang/target/spark-3.5.3-bin-without-hadoop.tgz into /home/runner/work/zeppelin/zeppelin/rlang/target
```

Apache Infra's [`closer.lua` script](https://infra.apache.org/release-download-pages.html#closer) can redirect to the CDN or to the archive, depending on artifact availability. This change replaces `archive.apache.org` URLs, and one instance of `dist.apache.org`, with their `closer.lua` equivalents. Unfortunately, the output filename then has to be specified explicitly for `wget`.

https://issues.apache.org/jira/browse/ZEPPELIN-6157

## How was this patch tested?

Tried some of the URLs locally, both from the CLI (`curl -L --head`) and in a regular build (`mvn -DskipTests clean package`).

Full CI:
- quick: https://github.com/adoroszlai/zeppelin/actions/runs/12319072153
- frontend: https://github.com/adoroszlai/zeppelin/actions/runs/12319072142
- core: https://github.com/adoroszlai/zeppelin/actions/runs/12319072156

Closes #4901 from adoroszlai/ZEPPELIN-6157.

Signed-off-by: Philipp Dallig <[email protected]>
1 parent 5551cd4 commit c8ac2f3
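The redirect behaviour is easy to inspect before building. A minimal sketch of the CLI check mentioned above, using the Spark artifact from the log as an illustrative example:

```bash
# closer.lua answers with an HTTP redirect to a mirror/CDN host, or to
# archive.apache.org when the artifact is no longer mirrored.
# -L follows the redirect, --head fetches headers only, so nothing large
# is transferred; the final Location header reveals the selected host.
curl -L --head \
  "https://www.apache.org/dyn/closer.lua/spark/spark-3.5.3/spark-3.5.3-bin-without-hadoop.tgz?action=download"
```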

File tree

11 files changed (+26 -18 lines)


docs/quickstart/kubernetes.md (+1 -1)

```diff
@@ -179,7 +179,7 @@ $ mv zeppelin-distribution/target/zeppelin-*-bin.tgz scripts/docker/zeppelin/bin
 
 # Find following section and comment out
 #RUN echo "$LOG_TAG Download Zeppelin binary" && \
-# wget -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz http://archive.apache.org/dist/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz && \
+# wget -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz "https://www.apache.org/dyn/closer.lua/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz?action=download" && \
 # tar -zxvf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \
 # rm -rf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \
 # mv /zeppelin-${Z_VERSION}-bin-all ${ZEPPELIN_HOME}
```

docs/setup/deployment/flink_and_spark_cluster.md (+2 -2)

````diff
@@ -215,7 +215,7 @@ Building from source is recommended where possible, for simplicity in this tuto
 To download the Flink Binary use `wget`
 
 ```bash
-wget "https://archive.apache.org/dist/flink/flink-1.17.1/flink-1.17.1-bin-scala_2.12.tgz"
+wget -O flink-1.17.1-bin-scala_2.12.tgz "https://www.apache.org/dyn/closer.lua/flink/flink-1.17.1/flink-1.17.1-bin-scala_2.12.tgz?action=download"
 tar -xzvf flink-1.17.1-bin-scala_2.12.tgz
 ```
 
@@ -285,7 +285,7 @@ Using binaries is also
 To download the Spark Binary use `wget`
 
 ```bash
-wget "https://archive.apache.org/dist/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz"
+wget -O spark-3.5.2-bin-hadoop3.tgz "https://www.apache.org/dyn/closer.lua/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz?action=download"
 tar -xzvf spark-3.5.2-bin-hadoop3.tgz
 mv spark-3.5.2-bin-hadoop3 spark
 ```
````
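The newly added `-O` is not cosmetic: by default `wget` derives the local filename from the request URL, query string included, so the following `tar` step would not find the expected archive. A quick sketch of the default behaviour (illustrative run, not part of the tutorial):

```bash
# Without -O, wget keeps the query string in the derived filename:
wget -q "https://www.apache.org/dyn/closer.lua/flink/flink-1.17.1/flink-1.17.1-bin-scala_2.12.tgz?action=download"
ls
# -> flink-1.17.1-bin-scala_2.12.tgz?action=download
```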

flink/flink-scala-2.12/pom.xml (+2 -1)

```diff
@@ -42,7 +42,7 @@
 <derby.version>10.14.2.0</derby.version>
 <hiverunner.version>5.3.0</hiverunner.version>
 
-<flink.bin.download.url>https://archive.apache.org/dist/flink/flink-${flink.version}/flink-${flink.version}-bin-scala_${flink.scala.binary.version}.tgz</flink.bin.download.url>
+<flink.bin.download.url>https://www.apache.org/dyn/closer.lua/flink/flink-${flink.version}/flink-${flink.version}-bin-scala_${flink.scala.binary.version}.tgz?action=download</flink.bin.download.url>
 </properties>
 
 <dependencies>
@@ -1056,6 +1056,7 @@
 <url>${flink.bin.download.url}</url>
 <unpack>true</unpack>
 <outputDirectory>${project.build.directory}</outputDirectory>
+<outputFileName>flink-${flink.version}-bin-scala_${flink.scala.binary.version}.tgz</outputFileName>
 </configuration>
 </execution>
 </executions>
```

rlang/pom.xml (+2 -1)

```diff
@@ -38,7 +38,7 @@
 
 <spark.archive>spark-${spark.version}</spark.archive>
 <spark.bin.download.url>
-https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz
+https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz?action=download
 </spark.bin.download.url>
 <interpreter.jar.name>zeppelin-interpreter-r</interpreter.jar.name>
 </properties>
@@ -154,6 +154,7 @@
 <url>${spark.bin.download.url}</url>
 <unpack>true</unpack>
 <outputDirectory>${project.build.directory}</outputDirectory>
+<outputFileName>${spark.archive}-bin-without-hadoop.tgz</outputFileName>
 </configuration>
 </execution>
 </executions>
```

scripts/docker/interpreter/Dockerfile (+1 -1)

```diff
@@ -30,7 +30,7 @@ RUN apt-get update && apt-get install -y curl unzip wget grep sed vim tzdata &&
 RUN rm -rf /opt/zeppelin
 
 RUN rm -rf /spark
-RUN wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz
+RUN wget -O spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz "https://www.apache.org/dyn/closer.lua/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz?action=download"
 RUN tar zxvf spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz
 RUN mv spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME} /opt/spark
 RUN rm spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz
```

scripts/docker/spark-cluster-managers/spark_yarn_cluster/Dockerfile (+2 -2)

```diff
@@ -42,7 +42,7 @@ ENV PATH $PATH:$JAVA_HOME/bin
 RUN yum install -y curl which tar sudo openssh-server openssh-clients rsync
 
 # hadoop
-RUN curl -s https://archive.apache.org/dist/hadoop/core/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz | tar -xz -C /usr/local/
+RUN curl -s "https://www.apache.org/dyn/closer.lua/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz?action=download" | tar -xz -C /usr/local/
 RUN cd /usr/local && ln -s ./hadoop-$HADOOP_VERSION hadoop
 
 ENV HADOOP_PREFIX /usr/local/hadoop
@@ -72,7 +72,7 @@ RUN rm /usr/local/hadoop/lib/native/*
 RUN curl -Ls http://dl.bintray.com/sequenceiq/sequenceiq-bin/hadoop-native-64-$HADOOP_VERSION.tar|tar -x -C /usr/local/hadoop/lib/native/
 
 # install spark
-RUN curl -s http://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_PROFILE.tgz | tar -xz -C /usr/local/
+RUN curl -s "https://www.apache.org/dyn/closer.lua/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_PROFILE.tgz?action=download" | tar -xz -C /usr/local/
 RUN cd /usr/local && ln -s spark-$SPARK_VERSION-bin-hadoop$HADOOP_PROFILE spark
 ENV SPARK_HOME /usr/local/spark
 
```
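Because `closer.lua` replies with a redirect rather than the bytes themselves, piped downloads of this shape are easiest to smoke-test locally with `curl -L`. A hedged sketch, with an illustrative Hadoop version:

```bash
# Lists the first archive entries without installing anything.
# HADOOP_VERSION is illustrative; -L follows the closer.lua redirect,
# -s silences progress output so only tar's listing is printed.
HADOOP_VERSION=3.3.6
curl -sL "https://www.apache.org/dyn/closer.lua/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz?action=download" \
  | tar -tz | head
```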

scripts/docker/zeppelin/bin/Dockerfile (+1 -1)

```diff
@@ -65,7 +65,7 @@ ENV PATH /opt/conda/envs/python_3_with_R/bin:/opt/conda/bin:$PATH
 
 RUN echo "$LOG_TAG Download Zeppelin binary" && \
 mkdir -p ${ZEPPELIN_HOME} && \
-wget -nv -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz https://archive.apache.org/dist/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz && \
+wget -nv -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz "https://www.apache.org/dyn/closer.lua/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz?action=download" && \
 tar --strip-components=1 -zxvf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz -C ${ZEPPELIN_HOME} && \
 rm -f /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \
 chown -R root:root ${ZEPPELIN_HOME} && \
```

spark/interpreter/pom.xml (+4 -2)

```diff
@@ -48,10 +48,10 @@
 
 <spark.archive>spark-${spark.version}</spark.archive>
 <spark.src.download.url>
-https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}.tgz
+https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}.tgz?action=download
 </spark.src.download.url>
 <spark.bin.download.url>
-https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz
+https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz?action=download
 </spark.bin.download.url>
 
 <scala.compile.version>${spark.scala.version}</scala.compile.version>
@@ -280,6 +280,7 @@
 <unpack>true</unpack>
 <url>${spark.src.download.url}</url>
 <outputDirectory>${project.build.directory}</outputDirectory>
+<outputFileName>${spark.archive}.tgz</outputFileName>
 </configuration>
 </execution>
 <!-- include sparkr by default -->
@@ -295,6 +296,7 @@
 <url>${spark.bin.download.url}</url>
 <unpack>true</unpack>
 <outputDirectory>${project.build.directory}</outputDirectory>
+<outputFileName>${spark.archive}-bin-without-hadoop.tgz</outputFileName>
 </configuration>
 </execution>
 </executions>
```

spark/pom.xml (+2 -2)

```diff
@@ -45,10 +45,10 @@
 
 <spark.archive>spark-${spark.version}</spark.archive>
 <spark.src.download.url>
-https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}.tgz
+https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}.tgz?action=download
 </spark.src.download.url>
 <spark.bin.download.url>
-https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz
+https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz?action=download
 </spark.bin.download.url>
 </properties>
```

testing/downloadLivy.sh (+4 -2)

```diff
@@ -45,12 +45,14 @@ ZEPPELIN_HOME="$(cd "${FWDIR}/.."; pwd)"
 # None
 # Arguments:
 # url - source URL
+# file - output filename
 # Returns:
 # None
 #######################################
 download_with_retry() {
 local url="$1"
-wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3 "${url}"
+local file="${2:-$(basename $url)}"
+wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3 --output-document "${file}" "${url}"
 if [[ "$?" -ne 0 ]]; then
 echo "3 download attempts for ${url} failed"
 fi
@@ -72,7 +74,7 @@ if [[ ! -d "${LIVY_HOME}" ]]; then
 # download livy from archive if not cached
 echo "${LIVY_VERSION} being downloaded from archives"
 STARTTIME=`date +%s`
-download_with_retry "https://dist.apache.org/repos/dist/release/incubator/livy/${LIVY_VERSION}/${LIVY_ARCHIVE}.zip"
+download_with_retry "https://www.apache.org/dyn/closer.lua/incubator/livy/${LIVY_VERSION}/${LIVY_ARCHIVE}.zip?action=download" "${LIVY_ARCHIVE}.zip"
 ENDTIME=`date +%s`
 DOWNLOADTIME="$((ENDTIME-STARTTIME))"
 fi
```
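The optional second parameter exists because the one-argument fallback, `basename`, keeps the query string of a `closer.lua` URL. A minimal illustration with a hypothetical Livy version (the script itself supplies `${LIVY_VERSION}` and `${LIVY_ARCHIVE}`):

```bash
# basename only strips directory components, so the fallback filename
# for a closer.lua URL would carry the "?action=download" suffix:
url="https://www.apache.org/dyn/closer.lua/incubator/livy/0.8.0-incubating/apache-livy-0.8.0-incubating_2.12-bin.zip?action=download"
basename "$url"
# -> apache-livy-0.8.0-incubating_2.12-bin.zip?action=download
```

Passing `"${LIVY_ARCHIVE}.zip"` as the second argument pins the intended name instead.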

testing/downloadSpark.sh (+5 -3)

```diff
@@ -38,12 +38,14 @@ ZEPPELIN_HOME="$(cd "${FWDIR}/.."; pwd)"
 # None
 # Arguments:
 # url - source URL
+# file - output filename
 # Returns:
 # None
 #######################################
 download_with_retry() {
 local url="$1"
-wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3 "${url}"
+local file="${2:-$(basename $url)}"
+wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3 --output-document "${file}" "${url}"
 if [[ "$?" -ne 0 ]]; then
 echo "3 download attempts for ${url} failed"
 fi
@@ -65,8 +67,8 @@ if [[ ! -d "${SPARK_HOME}" ]]; then
 # download spark from archive if not cached
 echo "${SPARK_VERSION} being downloaded from archives"
 STARTTIME=`date +%s`
-#timeout -s KILL "${MAX_DOWNLOAD_TIME_SEC}" wget "http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz"
-download_with_retry "http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz"
+#timeout -s KILL "${MAX_DOWNLOAD_TIME_SEC}" wget -O "${SPARK_ARCHIVE}.tgz" "https://www.apache.org/dyn/closer.lua/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz?action=download"
+download_with_retry "https://www.apache.org/dyn/closer.lua/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz?action=download" "${SPARK_ARCHIVE}.tgz"
 ENDTIME=`date +%s`
 DOWNLOADTIME="$((ENDTIME-STARTTIME))"
 fi
```
