Commit c8ac2f3

adoroszlai authored and Reamer committed
[ZEPPELIN-6157] Download artifacts from CDN if available
## What changes were proposed in this pull request?

Artifacts currently available on the CDN (`dlcdn.apache.org`) may be removed without notice when new releases appear. To avoid broken links, the build scripts contain permanent addresses from `archive.apache.org`, but downloads from `archive.apache.org` may be slow:

```
Thu, 05 Dec 2024 08:39:53 GMT [INFO] --- download:1.6.0:wget (download-sparkr-files) @ r ---
Thu, 05 Dec 2024 08:39:54 GMT Warning: No signatures were supplied, skipping file validation
Thu, 05 Dec 2024 08:39:54 GMT [INFO] Read Timeout is set to 60000 milliseconds (apprx 1 minutes)
Thu, 05 Dec 2024 08:45:46 GMT [INFO] Expanding: /home/runner/work/zeppelin/zeppelin/rlang/target/spark-3.5.3-bin-without-hadoop.tgz into /home/runner/work/zeppelin/zeppelin/rlang/target
```

Apache Infra's [`closer.lua` script](https://infra.apache.org/release-download-pages.html#closer) can redirect to the CDN or to the archive, depending on artifact availability. This change replaces `archive.apache.org` URLs, and one instance of `dist.apache.org`, with their `closer.lua` equivalents. Unfortunately, the output filename then has to be specified explicitly for `wget`.

https://issues.apache.org/jira/browse/ZEPPELIN-6157

## How was this patch tested?

Tried some of the URLs locally, both from the CLI (`curl -L --head`) and in a regular build (`mvn -DskipTests clean package`).

Full CI:
- quick: https://github.com/adoroszlai/zeppelin/actions/runs/12319072153
- frontend: https://github.com/adoroszlai/zeppelin/actions/runs/12319072142
- core: https://github.com/adoroszlai/zeppelin/actions/runs/12319072156

Closes #4901 from adoroszlai/ZEPPELIN-6157.

Signed-off-by: Philipp Dallig <[email protected]>
1 parent 5551cd4 commit c8ac2f3
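The redirect behaviour is easy to inspect before building. A minimal sketch of the CLI check mentioned above, using the Spark artifact from the log as an illustrative example:

```bash
# closer.lua answers with an HTTP redirect to a mirror/CDN host, or to
# archive.apache.org when the artifact is no longer mirrored.
# -L follows the redirect, --head fetches headers only, so nothing large
# is transferred; the final Location header reveals the selected host.
curl -L --head \
  "https://www.apache.org/dyn/closer.lua/spark/spark-3.5.3/spark-3.5.3-bin-without-hadoop.tgz?action=download"
```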

File tree

11 files changed (+26 -18 lines)


docs/quickstart/kubernetes.md (+1 -1)

```diff
@@ -179,7 +179,7 @@ $ mv zeppelin-distribution/target/zeppelin-*-bin.tgz scripts/docker/zeppelin/bin
 
 # Find following section and comment out
 #RUN echo "$LOG_TAG Download Zeppelin binary" && \
-# wget -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz http://archive.apache.org/dist/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz && \
+# wget -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz "https://www.apache.org/dyn/closer.lua/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz?action=download" && \
 # tar -zxvf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \
 # rm -rf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \
 # mv /zeppelin-${Z_VERSION}-bin-all ${ZEPPELIN_HOME}
```

docs/setup/deployment/flink_and_spark_cluster.md (+2 -2)

````diff
@@ -215,7 +215,7 @@ Building from source is recommended where possible, for simplicity in this tuto
 To download the Flink Binary use `wget`
 
 ```bash
-wget "https://archive.apache.org/dist/flink/flink-1.17.1/flink-1.17.1-bin-scala_2.12.tgz"
+wget -O flink-1.17.1-bin-scala_2.12.tgz "https://www.apache.org/dyn/closer.lua/flink/flink-1.17.1/flink-1.17.1-bin-scala_2.12.tgz?action=download"
 tar -xzvf flink-1.17.1-bin-scala_2.12.tgz
 ```
 
@@ -285,7 +285,7 @@ Using binaries is also
 To download the Spark Binary use `wget`
 
 ```bash
-wget "https://archive.apache.org/dist/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz"
+wget -O spark-3.5.2-bin-hadoop3.tgz "https://www.apache.org/dyn/closer.lua/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz?action=download"
 tar -xzvf spark-3.5.2-bin-hadoop3.tgz
 mv spark-3.5.2-bin-hadoop3 spark
 ```
````
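The newly added `-O` is not cosmetic: by default `wget` derives the local filename from the request URL, query string included, so the following `tar` step would not find the expected archive. A quick sketch of the default behaviour (illustrative run, not part of the tutorial):

```bash
# Without -O, wget keeps the query string in the derived filename:
wget -q "https://www.apache.org/dyn/closer.lua/flink/flink-1.17.1/flink-1.17.1-bin-scala_2.12.tgz?action=download"
ls
# -> flink-1.17.1-bin-scala_2.12.tgz?action=download
```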

flink/flink-scala-2.12/pom.xml (+2 -1)

```diff
@@ -42,7 +42,7 @@
 <derby.version>10.14.2.0</derby.version>
 <hiverunner.version>5.3.0</hiverunner.version>
 
-<flink.bin.download.url>https://archive.apache.org/dist/flink/flink-${flink.version}/flink-${flink.version}-bin-scala_${flink.scala.binary.version}.tgz</flink.bin.download.url>
+<flink.bin.download.url>https://www.apache.org/dyn/closer.lua/flink/flink-${flink.version}/flink-${flink.version}-bin-scala_${flink.scala.binary.version}.tgz?action=download</flink.bin.download.url>
 </properties>
 
 <dependencies>
@@ -1056,6 +1056,7 @@
 <url>${flink.bin.download.url}</url>
 <unpack>true</unpack>
 <outputDirectory>${project.build.directory}</outputDirectory>
+<outputFileName>flink-${flink.version}-bin-scala_${flink.scala.binary.version}.tgz</outputFileName>
 </configuration>
 </execution>
 </executions>
```

rlang/pom.xml (+2 -1)

```diff
@@ -38,7 +38,7 @@
 
 <spark.archive>spark-${spark.version}</spark.archive>
 <spark.bin.download.url>
-https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz
+https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz?action=download
 </spark.bin.download.url>
 <interpreter.jar.name>zeppelin-interpreter-r</interpreter.jar.name>
 </properties>
@@ -154,6 +154,7 @@
 <url>${spark.bin.download.url}</url>
 <unpack>true</unpack>
 <outputDirectory>${project.build.directory}</outputDirectory>
+<outputFileName>${spark.archive}-bin-without-hadoop.tgz</outputFileName>
 </configuration>
 </execution>
 </executions>
```

scripts/docker/interpreter/Dockerfile (+1 -1)

```diff
@@ -30,7 +30,7 @@ RUN apt-get update && apt-get install -y curl unzip wget grep sed vim tzdata &&
 RUN rm -rf /opt/zeppelin
 
 RUN rm -rf /spark
-RUN wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz
+RUN wget -O spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz "https://www.apache.org/dyn/closer.lua/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz?action=download"
 RUN tar zxvf spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz
 RUN mv spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME} /opt/spark
 RUN rm spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz
```

scripts/docker/spark-cluster-managers/spark_yarn_cluster/Dockerfile (+2 -2)

```diff
@@ -42,7 +42,7 @@ ENV PATH $PATH:$JAVA_HOME/bin
 RUN yum install -y curl which tar sudo openssh-server openssh-clients rsync
 
 # hadoop
-RUN curl -s https://archive.apache.org/dist/hadoop/core/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz | tar -xz -C /usr/local/
+RUN curl -s "https://www.apache.org/dyn/closer.lua/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz?action=download" | tar -xz -C /usr/local/
 RUN cd /usr/local && ln -s ./hadoop-$HADOOP_VERSION hadoop
 
 ENV HADOOP_PREFIX /usr/local/hadoop
@@ -72,7 +72,7 @@ RUN rm /usr/local/hadoop/lib/native/*
 RUN curl -Ls http://dl.bintray.com/sequenceiq/sequenceiq-bin/hadoop-native-64-$HADOOP_VERSION.tar|tar -x -C /usr/local/hadoop/lib/native/
 
 # install spark
-RUN curl -s http://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_PROFILE.tgz | tar -xz -C /usr/local/
+RUN curl -s "https://www.apache.org/dyn/closer.lua/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_PROFILE.tgz?action=download" | tar -xz -C /usr/local/
 RUN cd /usr/local && ln -s spark-$SPARK_VERSION-bin-hadoop$HADOOP_PROFILE spark
 ENV SPARK_HOME /usr/local/spark
 
```
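Because `closer.lua` replies with a redirect rather than the bytes themselves, piped downloads of this shape are easiest to smoke-test locally with `curl -L`. A hedged sketch, with an illustrative Hadoop version:

```bash
# Lists the first archive entries without installing anything.
# HADOOP_VERSION is illustrative; -L follows the closer.lua redirect,
# -s silences progress output so only tar's listing is printed.
HADOOP_VERSION=3.3.6
curl -sL "https://www.apache.org/dyn/closer.lua/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz?action=download" \
  | tar -tz | head
```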

scripts/docker/zeppelin/bin/Dockerfile (+1 -1)

```diff
@@ -65,7 +65,7 @@ ENV PATH /opt/conda/envs/python_3_with_R/bin:/opt/conda/bin:$PATH
 
 RUN echo "$LOG_TAG Download Zeppelin binary" && \
 mkdir -p ${ZEPPELIN_HOME} && \
-wget -nv -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz https://archive.apache.org/dist/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz && \
+wget -nv -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz "https://www.apache.org/dyn/closer.lua/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz?action=download" && \
 tar --strip-components=1 -zxvf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz -C ${ZEPPELIN_HOME} && \
 rm -f /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \
 chown -R root:root ${ZEPPELIN_HOME} && \
```

spark/interpreter/pom.xml (+4 -2)

```diff
@@ -48,10 +48,10 @@
 
 <spark.archive>spark-${spark.version}</spark.archive>
 <spark.src.download.url>
-https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}.tgz
+https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}.tgz?action=download
 </spark.src.download.url>
 <spark.bin.download.url>
-https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz
+https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz?action=download
 </spark.bin.download.url>
 
 <scala.compile.version>${spark.scala.version}</scala.compile.version>
@@ -280,6 +280,7 @@
 <unpack>true</unpack>
 <url>${spark.src.download.url}</url>
 <outputDirectory>${project.build.directory}</outputDirectory>
+<outputFileName>${spark.archive}.tgz</outputFileName>
 </configuration>
 </execution>
 <!-- include sparkr by default -->
@@ -295,6 +296,7 @@
 <url>${spark.bin.download.url}</url>
 <unpack>true</unpack>
 <outputDirectory>${project.build.directory}</outputDirectory>
+<outputFileName>${spark.archive}-bin-without-hadoop.tgz</outputFileName>
 </configuration>
 </execution>
 </executions>
```

spark/pom.xml (+2 -2)

```diff
@@ -45,10 +45,10 @@
 
 <spark.archive>spark-${spark.version}</spark.archive>
 <spark.src.download.url>
-https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}.tgz
+https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}.tgz?action=download
 </spark.src.download.url>
 <spark.bin.download.url>
-https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz
+https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz?action=download
 </spark.bin.download.url>
 </properties>
```

testing/downloadLivy.sh (+4 -2)

```diff
@@ -45,12 +45,14 @@ ZEPPELIN_HOME="$(cd "${FWDIR}/.."; pwd)"
 # None
 # Arguments:
 # url - source URL
+# file - output filename
 # Returns:
 # None
 #######################################
 download_with_retry() {
 local url="$1"
-wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3 "${url}"
+local file="${2:-$(basename $url)}"
+wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3 --output-document "${file}" "${url}"
 if [[ "$?" -ne 0 ]]; then
 echo "3 download attempts for ${url} failed"
 fi
@@ -72,7 +74,7 @@ if [[ ! -d "${LIVY_HOME}" ]]; then
 # download livy from archive if not cached
 echo "${LIVY_VERSION} being downloaded from archives"
 STARTTIME=`date +%s`
-download_with_retry "https://dist.apache.org/repos/dist/release/incubator/livy/${LIVY_VERSION}/${LIVY_ARCHIVE}.zip"
+download_with_retry "https://www.apache.org/dyn/closer.lua/incubator/livy/${LIVY_VERSION}/${LIVY_ARCHIVE}.zip?action=download" "${LIVY_ARCHIVE}.zip"
 ENDTIME=`date +%s`
 DOWNLOADTIME="$((ENDTIME-STARTTIME))"
 fi
```
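The optional second parameter exists because the one-argument fallback, `basename`, keeps the query string of a `closer.lua` URL. A minimal illustration with a hypothetical Livy version (the script itself supplies `${LIVY_VERSION}` and `${LIVY_ARCHIVE}`):

```bash
# basename only strips directory components, so the fallback filename
# for a closer.lua URL would carry the "?action=download" suffix:
url="https://www.apache.org/dyn/closer.lua/incubator/livy/0.8.0-incubating/apache-livy-0.8.0-incubating_2.12-bin.zip?action=download"
basename "$url"
# -> apache-livy-0.8.0-incubating_2.12-bin.zip?action=download
```

Passing `"${LIVY_ARCHIVE}.zip"` as the second argument pins the intended name instead.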

testing/downloadSpark.sh (+5 -3)

```diff
@@ -38,12 +38,14 @@ ZEPPELIN_HOME="$(cd "${FWDIR}/.."; pwd)"
 # None
 # Arguments:
 # url - source URL
+# file - output filename
 # Returns:
 # None
 #######################################
 download_with_retry() {
 local url="$1"
-wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3 "${url}"
+local file="${2:-$(basename $url)}"
+wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3 --output-document "${file}" "${url}"
 if [[ "$?" -ne 0 ]]; then
 echo "3 download attempts for ${url} failed"
 fi
@@ -65,8 +67,8 @@ if [[ ! -d "${SPARK_HOME}" ]]; then
 # download spark from archive if not cached
 echo "${SPARK_VERSION} being downloaded from archives"
 STARTTIME=`date +%s`
-#timeout -s KILL "${MAX_DOWNLOAD_TIME_SEC}" wget "http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz"
-download_with_retry "http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz"
+#timeout -s KILL "${MAX_DOWNLOAD_TIME_SEC}" wget -O "${SPARK_ARCHIVE}.tgz" "https://www.apache.org/dyn/closer.lua/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz?action=download"
+download_with_retry "https://www.apache.org/dyn/closer.lua/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz?action=download" "${SPARK_ARCHIVE}.tgz"
 ENDTIME=`date +%s`
 DOWNLOADTIME="$((ENDTIME-STARTTIME))"
 fi
```
