diff --git a/scale-tests/batch_test.py b/scale-tests/batch_test.py
index 1862b5a0..e085708f 100755
--- a/scale-tests/batch_test.py
+++ b/scale-tests/batch_test.py
@@ -10,6 +10,7 @@
Options:
--docker-image
docker image to run on executors
+ --group-role root-level group to apply quotas against (e.g. '/dev') [default: None]
--max-num-dispatchers maximum number of dispatchers to use from dispatchers file
--submits-per-min number of jobs to submit per minute [default: 1]
--spark-cores-max max executor cores per job [default: 1]
@@ -24,13 +25,15 @@
"""
+from docopt import docopt
+from threading import Thread
+
import json
import logging
+import os
import random
import sys
import time
-from docopt import docopt
-from threading import Thread
import typing
import sdk_utils
@@ -48,9 +51,10 @@
logging.basicConfig(
- format='[%(asctime)s|%(name)s|%(levelname)s]: %(message)s',
+ format="[%(asctime)s|%(name)s|%(levelname)s]: %(message)s",
level=logging.INFO,
- stream=sys.stdout)
+ stream=sys.stdout,
+)
log = logging.getLogger(__name__)
MONTE_CARLO_APP_URL = "https://raw.githubusercontent.com/mesosphere/spark-build/master/scale-tests/apps/monte-carlo-portfolio.py"
@@ -76,15 +80,19 @@ def _get_duration() -> int:
def _get_gpu_user_conf(args):
def _verify_required_args():
- if not (args["--spark-mesos-max-gpus"] and
- args["--spark-mesos-executor-gpus"] and
- args["--docker-image"]):
- log.error("""
+ if not (
+ args["--spark-mesos-max-gpus"]
+ and args["--spark-mesos-executor-gpus"]
+ and args["--docker-image"]
+ ):
+ log.error(
+ """
Missing required arguments for running gpu jobs. Please include:
--spark-mesos-max-gpus
--spark-mesos-executor-gpus
--docker-image
- """)
+ """
+ )
_verify_required_args()
@@ -92,38 +100,61 @@ def _verify_required_args():
# This is due to memory being divvied up and allocated to each GPU device.
memory_multiplier = 20
memory = int(args["--spark-mesos-executor-gpus"]) * memory_multiplier
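+    # e.g. --spark-mesos-executor-gpus=4 -> 80g for both driver and executor memory.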
- return ["--conf", "spark.driver.memory={}g".format(str(memory)),
- "--conf", "spark.executor.memory={}g".format(str(memory)),
- "--conf", "spark.mesos.gpus.max={}".format(args["--spark-mesos-max-gpus"]),
- "--conf", "spark.mesos.executor.gpus={}".format(args["--spark-mesos-executor-gpus"]),
- "--conf", "spark.mesos.executor.docker.image={}".format(args["--docker-image"]),
- "--conf", "spark.mesos.executor.docker.forcePullImage=false"
- ]
-
-
-def submit_job(app_url: str, app_args: str, dispatcher: typing.Dict, duration: int, config: typing.List[str]):
+ return [
+ "--conf",
+ "spark.driver.memory={}g".format(str(memory)),
+ "--conf",
+ "spark.executor.memory={}g".format(str(memory)),
+ "--conf",
+ "spark.mesos.gpus.max={}".format(args["--spark-mesos-max-gpus"]),
+ "--conf",
+ "spark.mesos.executor.gpus={}".format(args["--spark-mesos-executor-gpus"]),
+ "--conf",
+ "spark.mesos.executor.docker.image={}".format(args["--docker-image"]),
+ "--conf",
+ "spark.mesos.executor.docker.forcePullImage=false",
+ ]
+
+
+def submit_job(
+ app_url: str,
+ app_args: str,
+ dispatcher: typing.Dict,
+ duration: int,
+ config: typing.List[str],
+ group_role: str,
+):
dispatcher_name = dispatcher["service"]["name"]
log.info("Submitting job to dispatcher: %s, with duration: %s min.", dispatcher_name, duration)
+ driver_role = None if group_role else dispatcher["roles"]["executors"]
+
spark_utils.submit_job(
service_name=dispatcher_name,
app_url=app_url,
app_args=app_args,
verbose=False,
args=config,
- driver_role=dispatcher["roles"]["executors"],
+ driver_role=driver_role,
spark_user=dispatcher["service"]["user"] if sdk_utils.is_strict_mode() else None,
- principal=dispatcher["service"]["service_account"] if sdk_utils.is_strict_mode() else None)
+ principal=dispatcher["service"]["service_account"] if sdk_utils.is_strict_mode() else None,
+ )
-def submit_loop(app_url: str, submits_per_min: int, dispatchers: typing.List[typing.Dict], user_conf: typing.List[str]):
+def submit_loop(
+ app_url: str,
+ submits_per_min: int,
+ dispatchers: typing.List[typing.Dict],
+ user_conf: typing.List[str],
+ group_role: str,
+):
sec_between_submits = 60 / submits_per_min
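+    # e.g. --submits-per-min=13 -> one submission roughly every 4.6 seconds.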
log.info("sec_between_submits: %s", sec_between_submits)
num_dispatchers = len(dispatchers)
log.info("num_dispatchers: %s", num_dispatchers)
dispatcher_index = 0
- while(True):
+ while True:
duration = _get_duration()
if app_url == MONTE_CARLO_APP_URL:
@@ -131,7 +162,17 @@ def submit_loop(app_url: str, submits_per_min: int, dispatchers: typing.List[typ
else:
app_args = "550 3" # 550 images in 3 batches
- t = Thread(target=submit_job, args=(app_url, app_args, dispatchers[dispatcher_index], duration, user_conf))
+ t = Thread(
+ target=submit_job,
+ args=(
+ app_url,
+ app_args,
+ dispatchers[dispatcher_index],
+ duration,
+ user_conf,
+ group_role,
+ ),
+ )
t.start()
dispatcher_index = (dispatcher_index + 1) % num_dispatchers
log.info("sleeping %s sec.", sec_between_submits)
@@ -151,35 +192,57 @@ def submit_loop(app_url: str, submits_per_min: int, dispatchers: typing.List[typ
if end <= len(dispatchers):
dispatchers = dispatchers[0:end]
else:
- log.warning("""
+ log.warning(
+ """
Specified --max-num-dispatchers is greater than actual dispatcher count in {}.
Using list of dispatchers from file instead.
- """.format(args[""]))
-
- user_conf = ["--conf", "spark.cores.max={}".format(args["--spark-cores-max"]),
- "--conf", "spark.executor.cores={}".format(args["--spark-executor-cores"]),
- "--conf", "spark.mesos.containerizer={}".format(args["--spark-mesos-containerizer"]),
- "--conf", "spark.port.maxRetries={}".format(args["--spark-port-max-retries"]),
- "--conf", "spark.mesos.driver.failoverTimeout={}".format(args["--spark-mesos-driver-failover-timeout"])
- ]
+ """.format(
+ args[""]
+ )
+ )
+
+ user_conf = [
+ "--conf",
+ "spark.cores.max={}".format(args["--spark-cores-max"]),
+ "--conf",
+ "spark.executor.cores={}".format(args["--spark-executor-cores"]),
+ "--conf",
+ "spark.mesos.containerizer={}".format(args["--spark-mesos-containerizer"]),
+ "--conf",
+ "spark.port.maxRetries={}".format(args["--spark-port-max-retries"]),
+ "--conf",
+ "spark.mesos.driver.failoverTimeout={}".format(
+ args["--spark-mesos-driver-failover-timeout"]
+ ),
+ ]
if args["--spark-mesos-executor-gpus"]:
user_conf += _get_gpu_user_conf(args)
MEMORY_MULTIPLIER = 20
memory = int(args["--spark-mesos-executor-gpus"]) * MEMORY_MULTIPLIER
- user_conf += ["--conf", "spark.driver.memory={}g".format(str(memory)),
- "--conf", "spark.executor.memory={}g".format(str(memory)),
- "--conf", "spark.mesos.gpus.max={}".format(args["--spark-mesos-max-gpus"]),
- "--conf", "spark.mesos.executor.gpus={}".format(args["--spark-mesos-executor-gpus"]),
- "--conf", "spark.mesos.executor.docker.image={}".format(args["--docker-image"]),
- "--conf", "spark.mesos.executor.docker.forcePullImage=false"
- ]
+ user_conf += [
+ "--conf",
+ "spark.driver.memory={}g".format(str(memory)),
+ "--conf",
+ "spark.executor.memory={}g".format(str(memory)),
+ "--conf",
+ "spark.mesos.gpus.max={}".format(args["--spark-mesos-max-gpus"]),
+ "--conf",
+ "spark.mesos.executor.gpus={}".format(args["--spark-mesos-executor-gpus"]),
+ "--conf",
+ "spark.mesos.executor.docker.image={}".format(args["--docker-image"]),
+ "--conf",
+ "spark.mesos.executor.docker.forcePullImage=false",
+ ]
app_url = GPU_IMAGE_RECOGNITION_APP_URL
else:
app_url = MONTE_CARLO_APP_URL
if args["--spark-mesos-driver-labels"] is not None:
- user_conf += ["--conf", "spark.mesos.driver.labels={}".format(args["--spark-mesos-driver-labels"])]
+ user_conf += [
+ "--conf",
+ "spark.mesos.driver.labels={}".format(args["--spark-mesos-driver-labels"]),
+ ]
if not args["--no-supervise"]:
user_conf += ["--supervise"]
@@ -188,4 +251,6 @@ def submit_loop(app_url: str, submits_per_min: int, dispatchers: typing.List[typ
end = int(args["--max-num-dispatchers"])
dispatchers = dispatchers[0:end]
- submit_loop(app_url, int(args["--submits-per-min"]), dispatchers, user_conf)
+ group_role = args["--group-role"]
+
+ submit_loop(app_url, int(args["--submits-per-min"]), dispatchers, user_conf, group_role)
diff --git a/scale-tests/configs/2020-05-14-mwt25dr.env b/scale-tests/configs/2020-05-14-mwt25dr.env
new file mode 100644
index 00000000..e3253a76
--- /dev/null
+++ b/scale-tests/configs/2020-05-14-mwt25dr.env
@@ -0,0 +1,168 @@
+# Depends on:
+# - TEST_NAME
+# - TEST_S3_BUCKET
+# - TEST_S3_FOLDER
+
+# Workload configuration #######################################################
+#
+# Total CPU quota: 88
+# Total MEM quota: 200000
+# Total GPU quota: 40
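+#
+# Rough breakdown (derived from the values below): CPUs = 4 infrastructure
+# services x 10 + 3 dispatchers x (8 driver + 8 executor) = 40 + 48 = 88;
+# MEM = 4 x 20000 + 3 x (20000 + 20000) = 200000; GPUs = 40 (DSEngine).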
+
+CLUSTER_URL="https://mw25dry.scaletesting.mesosphe.re/"
+SECURITY="strict"
+
+DCOS_CLI_URL="https://downloads.dcos.io/cli/releases/binaries/dcos/linux/x86-64/latest/dcos"
+
+# Test configuration ###########################################################
+
+SHOULD_INSTALL_INFRASTRUCTURE=true
+SHOULD_INSTALL_NON_GPU_DISPATCHERS=true
+SHOULD_INSTALL_GPU_DISPATCHERS=false
+SHOULD_RUN_FINITE_STREAMING_JOBS=true
+SHOULD_RUN_INFINITE_STREAMING_JOBS=true
+SHOULD_RUN_BATCH_JOBS=true
+SHOULD_RUN_GPU_BATCH_JOBS=false
+SHOULD_UNINSTALL_INFRASTRUCTURE_AT_THE_END=false
+
+# Infrastructure configuration #################################################
+
+GROUP_NAME="${TEST_NAME}"
+
+SERVICE_NAMES_PREFIX="${TEST_NAME}/"
+INFRASTRUCTURE_OUTPUT_FILE="infrastructure.json"
+
+KAFKA_CLUSTER_COUNT=1
+CASSANDRA_CLUSTER_COUNT=1
+
+ZOOKEEPER_CPUS=10
+ZOOKEEPER_MEM=20000
+ZOOKEEPER_CONFIG='scale-tests/configs/kafka-zookeeper-options.json'
+# Note: empty package repo values will default to latest Universe packages.
+ZOOKEEPER_PACKAGE_REPO=
+# 2.7.0-5.1.2e from the Universe.
+
+KAFKA_CPUS=10
+KAFKA_MEM=20000
+KAFKA_CONFIG='scale-tests/configs/kafka-options.json'
+# Note: empty package repo values will default to latest Universe packages.
+KAFKA_PACKAGE_REPO=
+# 2.9.0-5.4.0 from the Universe.
+
+CASSANDRA_CPUS=10
+CASSANDRA_MEM=20000
+CASSANDRA_CONFIG='scale-tests/configs/cassandra-options.json'
+# Note: empty package repo values will default to latest Universe packages.
+CASSANDRA_PACKAGE_REPO=
+# 2.9.0-3.11.6 from the Universe.
+
+# DSEngine configuration #######################################################
+
+DSENGINE_CPUS=10
+DSENGINE_MEM=20000
+DSENGINE_GPUS=40
+DSENGINE_PACKAGE_REPO=
+
+# Spark configuration ##########################################################
+
+SPARK_CONFIG='scale-tests/configs/spark-options.json'
+
+# Note: empty package repo values will default to latest Universe packages.
+# Spark version 2.10.0-2.4.5
+SPARK_PACKAGE_REPO=https://infinity-artifacts.s3.amazonaws.com/permanent/spark/2.10.0-2.4.5/stub-universe-spark.json
+
+# Note: leaving the Spark executor Docker image empty so that executors inherit
+# the image used for dispatchers.
+SPARK_EXECUTOR_DOCKER_IMAGE=
+
+# Non-GPU Spark dispatchers configuration ######################################
+
+# Not currently used.
+BATCH_MAX_NON_GPU_JOBS=30
+
+SPARK_NON_GPU_DISPATCHERS=3
+SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE="non-gpu-dispatchers.out"
+# Note: this name is built internally by the deploy-dispatchers.py script.
+SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json"
+# Note: driver resources used per dispatcher (1 dispatcher will be able to run
+# 8 drivers since each driver requires 1 CPU).
+SPARK_NON_GPU_QUOTA_DRIVERS_CPUS=8
+SPARK_NON_GPU_QUOTA_DRIVERS_MEM=20000
+# Note: executor resources used per job (1 driver will run 1 job).
+SPARK_NON_GPU_QUOTA_EXECUTORS_CPUS=8
+SPARK_NON_GPU_QUOTA_EXECUTORS_MEM=20000
+
+# GPU Spark dispatchers configuration ##########################################
+
+# Not currently used.
+BATCH_MAX_GPU_JOBS=2
+
+SPARK_GPU_DISPATCHERS=0
+SPARK_GPU_DISPATCHERS_OUTPUT_FILE="gpu-dispatchers.out"
+SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json" # NOTE: this name is built internally by the deploy-dispatchers.py script.
+SPARK_GPU_QUOTA_DRIVERS_CPUS=
+SPARK_GPU_QUOTA_DRIVERS_MEM=
+SPARK_GPU_QUOTA_DRIVERS_GPUS=
+SPARK_GPU_QUOTA_EXECUTORS_CPUS=
+SPARK_GPU_QUOTA_EXECUTORS_MEM=
+SPARK_GPU_QUOTA_EXECUTORS_GPUS=
+
+# Common streaming jobs configuration ##########################################
+
+TEST_ASSEMBLY_JAR_URL='http://infinity-artifacts.s3.amazonaws.com/scale-tests/dcos-spark-scala-tests-assembly-2.4.0-20190325.jar'
+DISPATCHERS_JSON_OUTPUT_FILE="all-dispatchers.json"
+
+# Finite streaming jobs configuration ##########################################
+
+STREAMING_FINITE_SUBMISSIONS_OUTPUT_FILE="finite-submissions.out"
+STREAMING_FINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 3 dispatchers -> 3 producers.
+STREAMING_FINITE_CONSUMERS_PER_PRODUCER=1 # 3 producers -> 3 consumers.
+# 3 producers + 3 consumers = 6 total finite streaming jobs
+STREAMING_FINITE_PRODUCER_NUMBER_OF_WORDS=7692
+STREAMING_FINITE_PRODUCER_WORDS_PER_SECOND=1
+# 7692 words / 1 word per second -> ~2h runtime.
+STREAMING_FINITE_PRODUCER_SPARK_CORES_MAX=2
+STREAMING_FINITE_PRODUCER_SPARK_EXECUTOR_CORES=2
+STREAMING_FINITE_CONSUMER_BATCH_SIZE_SECONDS=10
+STREAMING_FINITE_CONSUMER_SPARK_CORES_MAX=1
+STREAMING_FINITE_CONSUMER_SPARK_EXECUTOR_CORES=1
+
+# Infinite streaming jobs configuration ########################################
+
+STREAMING_INFINITE_SUBMISSIONS_OUTPUT_FILE="infinite-submissions.out"
+STREAMING_INFINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 3 dispatchers -> 3 producers.
+STREAMING_INFINITE_CONSUMERS_PER_PRODUCER=1 # 3 producers -> 3 consumers.
+# 3 producers + 3 consumers = 6 total infinite streaming jobs
+STREAMING_INFINITE_PRODUCER_NUMBER_OF_WORDS=0
+STREAMING_INFINITE_PRODUCER_WORDS_PER_SECOND=1
+STREAMING_INFINITE_PRODUCER_SPARK_CORES_MAX=2
+STREAMING_INFINITE_PRODUCER_SPARK_EXECUTOR_CORES=2
+STREAMING_INFINITE_CONSUMER_BATCH_SIZE_SECONDS=10
+STREAMING_INFINITE_CONSUMER_SPARK_CORES_MAX=1
+STREAMING_INFINITE_CONSUMER_SPARK_EXECUTOR_CORES=1
+
+# Batch jobs configuration #####################################################
+
+SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}"
+
+BATCH_APP_ID="/${SERVICE_NAMES_PREFIX}batch-workload"
+BATCH_SCRIPT_CPUS=6
+BATCH_SCRIPT_MEM=12288
+BATCH_SUBMITS_PER_MIN=13
+# TODO: update to master for the next MWT.
+BATCH_SPARK_BUILD_BRANCH="dcos-58437-deploy-workloads-under-role-enforced-group"
+
+# Batch GPU jobs configuration #################################################
+
+SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE}"
+
+GPU_APP_ID="/${SERVICE_NAMES_PREFIX}gpu-batch-workload"
+GPU_SCRIPT_CPUS=2
+GPU_SCRIPT_MEM=4096
+GPU_DOCKER_IMAGE='samvantran/spark-dcos-gpu:metrics'
+GPU_SUBMITS_PER_MIN=5
+GPU_MAX_DISPATCHERS=${SPARK_GPU_DISPATCHERS}
+GPU_SPARK_CORES_MAX=4
+GPU_SPARK_MESOS_EXECUTOR_GPUS=4
+GPU_SPARK_MESOS_MAX_GPUS=4
+GPU_SPARK_BUILD_BRANCH=master
diff --git a/scale-tests/configs/2020-05-20-mwt25.env b/scale-tests/configs/2020-05-20-mwt25.env
new file mode 100644
index 00000000..5395a411
--- /dev/null
+++ b/scale-tests/configs/2020-05-20-mwt25.env
@@ -0,0 +1,168 @@
+# Depends on:
+# - TEST_NAME
+# - TEST_S3_BUCKET
+# - TEST_S3_FOLDER
+
+# Workload configuration #######################################################
+#
+# Total CPU quota: 2290
+# Total MEM quota: 4580000
+# Total GPU quota: 40
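+#
+# Rough breakdown (derived from the values below): CPUs = 4 infrastructure
+# services x 10 + 50 dispatchers x (20 driver + 25 executor) = 40 + 2250 = 2290;
+# MEM = 4 x 20000 + 50 x (50000 + 40000) = 4580000; GPUs = 40 (DSEngine).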
+
+CLUSTER_URL="https://mw25dry.scaletesting.mesosphe.re/"
+SECURITY="strict"
+
+DCOS_CLI_URL="https://downloads.dcos.io/cli/releases/binaries/dcos/linux/x86-64/latest/dcos"
+
+# Test configuration ###########################################################
+
+SHOULD_INSTALL_INFRASTRUCTURE=true
+SHOULD_INSTALL_NON_GPU_DISPATCHERS=true
+SHOULD_INSTALL_GPU_DISPATCHERS=false
+SHOULD_RUN_FINITE_STREAMING_JOBS=true
+SHOULD_RUN_INFINITE_STREAMING_JOBS=true
+SHOULD_RUN_BATCH_JOBS=true
+SHOULD_RUN_GPU_BATCH_JOBS=false
+SHOULD_UNINSTALL_INFRASTRUCTURE_AT_THE_END=false
+
+# Infrastructure configuration #################################################
+
+GROUP_NAME="${TEST_NAME}"
+
+SERVICE_NAMES_PREFIX="${TEST_NAME}/"
+INFRASTRUCTURE_OUTPUT_FILE="infrastructure.json"
+
+KAFKA_CLUSTER_COUNT=1
+CASSANDRA_CLUSTER_COUNT=1
+
+ZOOKEEPER_CPUS=10
+ZOOKEEPER_MEM=20000
+ZOOKEEPER_CONFIG='scale-tests/configs/kafka-zookeeper-options.json'
+# Note: empty package repo values will default to latest Universe packages.
+ZOOKEEPER_PACKAGE_REPO=
+# 2.7.0-5.1.2e from the Universe.
+
+KAFKA_CPUS=10
+KAFKA_MEM=20000
+KAFKA_CONFIG='scale-tests/configs/kafka-options.json'
+# Note: empty package repo values will default to latest Universe packages.
+KAFKA_PACKAGE_REPO=
+# 2.9.0-5.4.0 from the Universe.
+
+CASSANDRA_CPUS=10
+CASSANDRA_MEM=20000
+CASSANDRA_CONFIG='scale-tests/configs/cassandra-options.json'
+# Note: empty package repo values will default to latest Universe packages.
+CASSANDRA_PACKAGE_REPO=
+# 2.9.0-3.11.6 from the Universe.
+
+# DSEngine configuration #######################################################
+
+DSENGINE_CPUS=10
+DSENGINE_MEM=20000
+DSENGINE_GPUS=40
+DSENGINE_PACKAGE_REPO=
+
+# Spark configuration ##########################################################
+
+SPARK_CONFIG='scale-tests/configs/spark-options.json'
+
+# Note: empty package repo values will default to latest Universe packages.
+# Spark version 2.10.0-2.4.5
+SPARK_PACKAGE_REPO=https://infinity-artifacts.s3.amazonaws.com/permanent/spark/2.10.0-2.4.5/stub-universe-spark.json
+
+# Note: leaving the Spark executor Docker image empty so that executors inherit
+# the image used for dispatchers.
+SPARK_EXECUTOR_DOCKER_IMAGE=
+
+# Non-GPU Spark dispatchers configuration ######################################
+
+# Not currently used.
+BATCH_MAX_NON_GPU_JOBS=1000
+
+SPARK_NON_GPU_DISPATCHERS=50
+SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE="non-gpu-dispatchers.out"
+# Note: this name is built internally by the deploy-dispatchers.py script.
+SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json"
+# Note: driver resources used per dispatcher (1 dispatcher will be able to run
+# 20 drivers since each driver requires 1 CPU).
+SPARK_NON_GPU_QUOTA_DRIVERS_CPUS=20
+SPARK_NON_GPU_QUOTA_DRIVERS_MEM=50000
+# Note: executor resources used per job (1 driver will run 1 job).
+SPARK_NON_GPU_QUOTA_EXECUTORS_CPUS=25
+SPARK_NON_GPU_QUOTA_EXECUTORS_MEM=40000
+
+# GPU Spark dispatchers configuration ##########################################
+
+# Not currently used.
+BATCH_MAX_GPU_JOBS=10
+
+SPARK_GPU_DISPATCHERS=0
+SPARK_GPU_DISPATCHERS_OUTPUT_FILE="gpu-dispatchers.out"
+SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json" # NOTE: this name is built internally by the deploy-dispatchers.py script.
+SPARK_GPU_QUOTA_DRIVERS_CPUS=
+SPARK_GPU_QUOTA_DRIVERS_MEM=
+SPARK_GPU_QUOTA_DRIVERS_GPUS=
+SPARK_GPU_QUOTA_EXECUTORS_CPUS=
+SPARK_GPU_QUOTA_EXECUTORS_MEM=
+SPARK_GPU_QUOTA_EXECUTORS_GPUS=
+SPARK_GPU_REMOVE_EXECUTORS_ROLES_QUOTAS=true
+
+# Common streaming jobs configuration ##########################################
+
+TEST_ASSEMBLY_JAR_URL='http://infinity-artifacts.s3.amazonaws.com/scale-tests/dcos-spark-scala-tests-assembly-2.4.0-20190325.jar'
+DISPATCHERS_JSON_OUTPUT_FILE="all-dispatchers.json"
+
+# Finite streaming jobs configuration ##########################################
+
+STREAMING_FINITE_SUBMISSIONS_OUTPUT_FILE="finite-submissions.out"
+STREAMING_FINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 50 dispatchers -> 50 producers.
+STREAMING_FINITE_CONSUMERS_PER_PRODUCER=1 # 50 producers -> 50 consumers.
+# 50 producers + 50 consumers = 100 total finite streaming jobs
+STREAMING_FINITE_PRODUCER_NUMBER_OF_WORDS=7692
+STREAMING_FINITE_PRODUCER_WORDS_PER_SECOND=1
+# 7692 words / 1 word per second -> ~2h runtime.
+STREAMING_FINITE_PRODUCER_SPARK_CORES_MAX=2
+STREAMING_FINITE_PRODUCER_SPARK_EXECUTOR_CORES=2
+STREAMING_FINITE_CONSUMER_BATCH_SIZE_SECONDS=10
+STREAMING_FINITE_CONSUMER_SPARK_CORES_MAX=1
+STREAMING_FINITE_CONSUMER_SPARK_EXECUTOR_CORES=1
+
+# Infinite streaming jobs configuration ########################################
+
+STREAMING_INFINITE_SUBMISSIONS_OUTPUT_FILE="infinite-submissions.out"
+STREAMING_INFINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 50 dispatchers -> 50 producers.
+STREAMING_INFINITE_CONSUMERS_PER_PRODUCER=1 # 50 producers -> 50 consumers.
+# 50 producers + 50 consumers = 100 total infinite streaming jobs
+STREAMING_INFINITE_PRODUCER_NUMBER_OF_WORDS=0
+STREAMING_INFINITE_PRODUCER_WORDS_PER_SECOND=1
+STREAMING_INFINITE_PRODUCER_SPARK_CORES_MAX=2
+STREAMING_INFINITE_PRODUCER_SPARK_EXECUTOR_CORES=2
+STREAMING_INFINITE_CONSUMER_BATCH_SIZE_SECONDS=10
+STREAMING_INFINITE_CONSUMER_SPARK_CORES_MAX=1
+STREAMING_INFINITE_CONSUMER_SPARK_EXECUTOR_CORES=1
+
+# Batch jobs configuration #####################################################
+
+SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}"
+
+BATCH_APP_ID="/${SERVICE_NAMES_PREFIX}batch-workload"
+BATCH_SCRIPT_CPUS=6
+BATCH_SCRIPT_MEM=12288
+BATCH_SUBMITS_PER_MIN=13
+# TODO: update to master for the next MWT.
+BATCH_SPARK_BUILD_BRANCH="dcos-58437-deploy-workloads-under-role-enforced-group"
+
+# Batch GPU jobs configuration #################################################
+
+SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE}"
+
+GPU_APP_ID="/${SERVICE_NAMES_PREFIX}gpu-batch-workload"
+GPU_SCRIPT_CPUS=2
+GPU_SCRIPT_MEM=4096
+GPU_DOCKER_IMAGE='samvantran/spark-dcos-gpu:metrics'
+GPU_SUBMITS_PER_MIN=5
+GPU_MAX_DISPATCHERS=${SPARK_GPU_DISPATCHERS}
+GPU_SPARK_CORES_MAX=4
+GPU_SPARK_MESOS_EXECUTOR_GPUS=4
+GPU_SPARK_MESOS_MAX_GPUS=4
+GPU_SPARK_BUILD_BRANCH=master
diff --git a/scale-tests/configs/2020-09-14-mwt26dr.env b/scale-tests/configs/2020-09-14-mwt26dr.env
new file mode 100644
index 00000000..87bd43bb
--- /dev/null
+++ b/scale-tests/configs/2020-09-14-mwt26dr.env
@@ -0,0 +1,176 @@
+# Depends on:
+# - TEST_NAME
+# - TEST_S3_BUCKET
+# - TEST_S3_FOLDER
+
+# Workload configuration #######################################################
+#
+# Total CPU quota: 88
+# Total MEM quota: 200000
+# Total GPU quota: 40
+
+CLUSTER_URL="https://mwt26-dry.scaletesting.mesosphe.re/"
+SECURITY="strict"
+
+DCOS_CLI_URL="https://downloads.dcos.io/cli/releases/binaries/dcos/linux/x86-64/latest/dcos"
+
+# Test configuration ###########################################################
+
+SHOULD_INSTALL_INFRASTRUCTURE=true
+SHOULD_INSTALL_NON_GPU_DISPATCHERS=true
+SHOULD_INSTALL_GPU_DISPATCHERS=false
+SHOULD_RUN_FINITE_STREAMING_JOBS=true
+SHOULD_RUN_INFINITE_STREAMING_JOBS=true
+SHOULD_RUN_BATCH_JOBS=true
+SHOULD_RUN_GPU_BATCH_JOBS=false
+SHOULD_UNINSTALL_INFRASTRUCTURE_AT_THE_END=false
+
+# Infrastructure configuration #################################################
+
+GROUP_NAME="${TEST_NAME}"
+
+SERVICE_NAMES_PREFIX="${TEST_NAME}/"
+INFRASTRUCTURE_OUTPUT_FILE="infrastructure.json"
+
+KAFKA_CLUSTER_COUNT=1
+CASSANDRA_CLUSTER_COUNT=1
+
+ZOOKEEPER_CPUS=10
+ZOOKEEPER_MEM=20000
+ZOOKEEPER_CONFIG='scale-tests/configs/kafka-zookeeper-options.json'
+# Note: empty package repo values will default to latest Universe packages.
+ZOOKEEPER_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/confluent-zookeeper/assets/2.8.0-5.5.1-beta/stub-universe-confluent-zookeeper.json'
+# 2.8.0-5.5.1-beta from the Universe.
+
+KAFKA_CPUS=10
+KAFKA_MEM=20000
+KAFKA_CONFIG='scale-tests/configs/kafka-options.json'
+KAFKA_USER='root'
+# Note: empty package repo values will default to latest Universe packages.
+KAFKA_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/confluent-kafka/assets/2.10.0-5.5.1-beta/stub-universe-confluent-kafka.json'
+# 2.10.0-5.5.1-beta from the Universe.
+KAFKA_PACKAGE_NAME='beta-confluent-kafka'
+
+CASSANDRA_CPUS=10
+CASSANDRA_MEM=20000
+CASSANDRA_CONFIG='scale-tests/configs/cassandra-options.json'
+CASSANDRA_USER='root'
+# Note: empty package repo values will default to latest Universe packages.
+CASSANDRA_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/cassandra/assets/2.10.0-3.11.6-beta/stub-universe-cassandra.json'
+CASSANDRA_PACKAGE_NAME='beta-cassandra'
+# 2.10.0-3.11.6-beta from the Universe.
+
+# DSEngine configuration #######################################################
+
+DSENGINE_CPUS=10
+DSENGINE_MEM=20000
+DSENGINE_GPUS=40
+DSENGINE_CONFIG='scale-tests/configs/dsengine-options.json'
+# Note: empty package repo values will default to latest Universe packages.
+DSENGINE_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/data-science-engine/assets/2.1.0-beta/stub-universe-data-science-engine.json'
+DSENGINE_PACKAGE_NAME='beta-data-science-engine'
+# 2.1.0-beta from the Universe.
+
+# Spark configuration ##########################################################
+
+SPARK_CONFIG='scale-tests/configs/spark-options.json'
+
+# Note: empty package repo values will default to latest Universe packages.
+# Spark version 2.11.0-2.4.6
+SPARK_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/spark/2.11.0-2.4.6/stub-universe-spark.json'
+
+# Note: leaving the Spark executor Docker image empty so that executors inherit
+# the image used for dispatchers.
+SPARK_EXECUTOR_DOCKER_IMAGE=
+
+# Non-GPU Spark dispatchers configuration ######################################
+
+# Not currently used.
+BATCH_MAX_NON_GPU_JOBS=30
+
+SPARK_NON_GPU_DISPATCHERS=3
+SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE="non-gpu-dispatchers.out"
+# Note: this name is built internally by the deploy-dispatchers.py script.
+SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json"
+# Note: driver resources used per dispatcher (1 dispatcher will be able to run
+# 8 drivers since each driver requires 1 CPU).
+SPARK_NON_GPU_QUOTA_DRIVERS_CPUS=8
+SPARK_NON_GPU_QUOTA_DRIVERS_MEM=20000
+# Note: executor resources used per job (1 driver will run 1 job).
+SPARK_NON_GPU_QUOTA_EXECUTORS_CPUS=8
+SPARK_NON_GPU_QUOTA_EXECUTORS_MEM=20000
+
+# GPU Spark dispatchers configuration ##########################################
+
+# Not currently used.
+BATCH_MAX_GPU_JOBS=2
+
+SPARK_GPU_DISPATCHERS=0
+SPARK_GPU_DISPATCHERS_OUTPUT_FILE="gpu-dispatchers.out"
+SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json" # NOTE: this name is built internally by the deploy-dispatchers.py script.
+SPARK_GPU_QUOTA_DRIVERS_CPUS=
+SPARK_GPU_QUOTA_DRIVERS_MEM=
+SPARK_GPU_QUOTA_DRIVERS_GPUS=
+SPARK_GPU_QUOTA_EXECUTORS_CPUS=
+SPARK_GPU_QUOTA_EXECUTORS_MEM=
+SPARK_GPU_QUOTA_EXECUTORS_GPUS=
+
+# Common streaming jobs configuration ##########################################
+
+TEST_ASSEMBLY_JAR_URL='http://infinity-artifacts.s3.amazonaws.com/scale-tests/dcos-spark-scala-tests-assembly-2.4.0-20190325.jar'
+DISPATCHERS_JSON_OUTPUT_FILE="all-dispatchers.json"
+
+# Finite streaming jobs configuration ##########################################
+
+STREAMING_FINITE_SUBMISSIONS_OUTPUT_FILE="finite-submissions.out"
+STREAMING_FINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 3 dispatchers -> 3 producers.
+STREAMING_FINITE_CONSUMERS_PER_PRODUCER=1 # 3 producers -> 3 consumers.
+# 3 producers + 3 consumers = 6 total finite streaming jobs
+STREAMING_FINITE_PRODUCER_NUMBER_OF_WORDS=7692
+STREAMING_FINITE_PRODUCER_WORDS_PER_SECOND=1
+# 7692 words / 1 word per second -> ~2h runtime.
+STREAMING_FINITE_PRODUCER_SPARK_CORES_MAX=2
+STREAMING_FINITE_PRODUCER_SPARK_EXECUTOR_CORES=2
+STREAMING_FINITE_CONSUMER_BATCH_SIZE_SECONDS=10
+STREAMING_FINITE_CONSUMER_SPARK_CORES_MAX=1
+STREAMING_FINITE_CONSUMER_SPARK_EXECUTOR_CORES=1
+
+# Infinite streaming jobs configuration ########################################
+
+STREAMING_INFINITE_SUBMISSIONS_OUTPUT_FILE="infinite-submissions.out"
+STREAMING_INFINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 3 dispatchers -> 3 producers.
+STREAMING_INFINITE_CONSUMERS_PER_PRODUCER=1 # 3 producers -> 3 consumers.
+# 3 producers + 3 consumers = 6 total infinite streaming jobs
+STREAMING_INFINITE_PRODUCER_NUMBER_OF_WORDS=0
+STREAMING_INFINITE_PRODUCER_WORDS_PER_SECOND=1
+STREAMING_INFINITE_PRODUCER_SPARK_CORES_MAX=2
+STREAMING_INFINITE_PRODUCER_SPARK_EXECUTOR_CORES=2
+STREAMING_INFINITE_CONSUMER_BATCH_SIZE_SECONDS=10
+STREAMING_INFINITE_CONSUMER_SPARK_CORES_MAX=1
+STREAMING_INFINITE_CONSUMER_SPARK_EXECUTOR_CORES=1
+
+# Batch jobs configuration #####################################################
+
+SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}"
+
+BATCH_APP_ID="/${SERVICE_NAMES_PREFIX}batch-workload"
+BATCH_SCRIPT_CPUS=6
+BATCH_SCRIPT_MEM=12288
+BATCH_SUBMITS_PER_MIN=13
+# TODO: update to master for the next MWT.
+BATCH_SPARK_BUILD_BRANCH="dcos-58437-deploy-workloads-under-role-enforced-group"
+
+# Batch GPU jobs configuration #################################################
+
+SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE}"
+
+GPU_APP_ID="/${SERVICE_NAMES_PREFIX}gpu-batch-workload"
+GPU_SCRIPT_CPUS=2
+GPU_SCRIPT_MEM=4096
+GPU_DOCKER_IMAGE='samvantran/spark-dcos-gpu:metrics'
+GPU_SUBMITS_PER_MIN=5
+GPU_MAX_DISPATCHERS=${SPARK_GPU_DISPATCHERS}
+GPU_SPARK_CORES_MAX=4
+GPU_SPARK_MESOS_EXECUTOR_GPUS=4
+GPU_SPARK_MESOS_MAX_GPUS=4
+GPU_SPARK_BUILD_BRANCH=master
diff --git a/scale-tests/configs/2020-10-01-mwt26run.env b/scale-tests/configs/2020-10-01-mwt26run.env
new file mode 100644
index 00000000..89eb4292
--- /dev/null
+++ b/scale-tests/configs/2020-10-01-mwt26run.env
@@ -0,0 +1,176 @@
+# Depends on:
+# - TEST_NAME
+# - TEST_S3_BUCKET
+# - TEST_S3_FOLDER
+
+# Workload configuration #######################################################
+#
+# Total CPU quota: 2290
+# Total MEM quota: 4580000
+# Total GPU quota: 40
+
+CLUSTER_URL="https://mwt26.scaletesting.mesosphe.re"
+SECURITY="strict"
+
+DCOS_CLI_URL="https://downloads.dcos.io/cli/releases/binaries/dcos/linux/x86-64/latest/dcos"
+
+# Test configuration ###########################################################
+
+SHOULD_INSTALL_INFRASTRUCTURE=true
+SHOULD_INSTALL_NON_GPU_DISPATCHERS=true
+SHOULD_INSTALL_GPU_DISPATCHERS=false
+SHOULD_RUN_FINITE_STREAMING_JOBS=true
+SHOULD_RUN_INFINITE_STREAMING_JOBS=true
+SHOULD_RUN_BATCH_JOBS=true
+SHOULD_RUN_GPU_BATCH_JOBS=false
+SHOULD_UNINSTALL_INFRASTRUCTURE_AT_THE_END=false
+
+# Infrastructure configuration #################################################
+
+GROUP_NAME="${TEST_NAME}"
+
+SERVICE_NAMES_PREFIX="${TEST_NAME}/"
+INFRASTRUCTURE_OUTPUT_FILE="infrastructure.json"
+
+KAFKA_CLUSTER_COUNT=1
+CASSANDRA_CLUSTER_COUNT=1
+
+ZOOKEEPER_CPUS=10
+ZOOKEEPER_MEM=20000
+ZOOKEEPER_CONFIG='scale-tests/configs/kafka-zookeeper-options.json'
+# Note: empty package repo values will default to latest Universe packages.
+ZOOKEEPER_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/confluent-zookeeper/assets/2.8.0-5.5.1-beta/stub-universe-confluent-zookeeper.json'
+# 2.8.0-5.5.1-beta from the Universe.
+
+KAFKA_CPUS=10
+KAFKA_MEM=20000
+KAFKA_CONFIG='scale-tests/configs/kafka-options.json'
+KAFKA_USER='root'
+# Note: empty package repo values will default to latest Universe packages.
+KAFKA_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/confluent-kafka/assets/2.10.0-5.5.1-beta/stub-universe-confluent-kafka.json'
+# 2.10.0-5.5.1-beta from the Universe.
+KAFKA_PACKAGE_NAME='beta-confluent-kafka'
+
+CASSANDRA_CPUS=10
+CASSANDRA_MEM=20000
+CASSANDRA_CONFIG='scale-tests/configs/cassandra-options.json'
+CASSANDRA_USER='root'
+# Note: empty package repo values will default to latest Universe packages.
+CASSANDRA_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/cassandra/assets/2.10.0-3.11.6-beta/stub-universe-cassandra.json'
+CASSANDRA_PACKAGE_NAME='beta-cassandra'
+# 2.10.0-3.11.6-beta from the Universe.
+
+# DSEngine configuration #######################################################
+
+DSENGINE_CPUS=10
+DSENGINE_MEM=20000
+DSENGINE_GPUS=40
+DSENGINE_CONFIG='scale-tests/configs/dsengine-options.json'
+# Note: empty package repo values will default to latest Universe packages.
+DSENGINE_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/data-science-engine/assets/2.1.0-beta/stub-universe-data-science-engine.json'
+DSENGINE_PACKAGE_NAME='beta-data-science-engine'
+# 2.1.0-beta from the Universe.
+
+# Spark configuration ##########################################################
+
+SPARK_CONFIG='scale-tests/configs/spark-options.json'
+
+# Note: empty package repo values will default to latest Universe packages.
+# Spark version 2.11.0-2.4.6
+SPARK_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/spark/2.11.0-2.4.6/stub-universe-spark.json'
+
+# Note: leaving the Spark executor Docker image empty so that executors inherit
+# the image used for dispatchers.
+SPARK_EXECUTOR_DOCKER_IMAGE=
+
+# Non-GPU Spark dispatchers configuration ######################################
+
+# Not currently used.
+BATCH_MAX_NON_GPU_JOBS=1000
+
+SPARK_NON_GPU_DISPATCHERS=50
+SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE="non-gpu-dispatchers.out"
+# Note: this name is built internally by the deploy-dispatchers.py script.
+SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json"
+# Note: driver resources used per dispatcher (1 dispatcher will be able to run
+# 20 drivers since each driver requires 1 CPU).
+SPARK_NON_GPU_QUOTA_DRIVERS_CPUS=20
+SPARK_NON_GPU_QUOTA_DRIVERS_MEM=50000
+# Note: executor resources used per job (1 driver will run 1 job).
+SPARK_NON_GPU_QUOTA_EXECUTORS_CPUS=25
+SPARK_NON_GPU_QUOTA_EXECUTORS_MEM=40000
+
+# GPU Spark dispatchers configuration ##########################################
+
+# Not currently used.
+BATCH_MAX_GPU_JOBS=10
+
+SPARK_GPU_DISPATCHERS=0
+SPARK_GPU_DISPATCHERS_OUTPUT_FILE="gpu-dispatchers.out"
+SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json" # NOTE: this name is built internally by the deploy-dispatchers.py script.
+SPARK_GPU_QUOTA_DRIVERS_CPUS=
+SPARK_GPU_QUOTA_DRIVERS_MEM=
+SPARK_GPU_QUOTA_DRIVERS_GPUS=
+SPARK_GPU_QUOTA_EXECUTORS_CPUS=
+SPARK_GPU_QUOTA_EXECUTORS_MEM=
+SPARK_GPU_QUOTA_EXECUTORS_GPUS=
+SPARK_GPU_REMOVE_EXECUTORS_ROLES_QUOTAS=true
+
+# Common streaming jobs configuration ##########################################
+
+TEST_ASSEMBLY_JAR_URL='http://infinity-artifacts.s3.amazonaws.com/scale-tests/dcos-spark-scala-tests-assembly-2.4.0-20190325.jar'
+DISPATCHERS_JSON_OUTPUT_FILE="all-dispatchers.json"
+
+# Finite streaming jobs configuration ##########################################
+
+STREAMING_FINITE_SUBMISSIONS_OUTPUT_FILE="finite-submissions.out"
+STREAMING_FINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 50 dispatchers -> 50 producers.
+STREAMING_FINITE_CONSUMERS_PER_PRODUCER=1 # 50 producers -> 50 consumers.
+# 50 producers + 50 consumers = 100 total finite streaming jobs
+STREAMING_FINITE_PRODUCER_NUMBER_OF_WORDS=7692
+STREAMING_FINITE_PRODUCER_WORDS_PER_SECOND=1
+# 7692 words / 1 word per second -> ~2h runtime.
+STREAMING_FINITE_PRODUCER_SPARK_CORES_MAX=2
+STREAMING_FINITE_PRODUCER_SPARK_EXECUTOR_CORES=2
+STREAMING_FINITE_CONSUMER_BATCH_SIZE_SECONDS=10
+STREAMING_FINITE_CONSUMER_SPARK_CORES_MAX=1
+STREAMING_FINITE_CONSUMER_SPARK_EXECUTOR_CORES=1
+
+# Infinite streaming jobs configuration ########################################
+
+STREAMING_INFINITE_SUBMISSIONS_OUTPUT_FILE="infinite-submissions.out"
+STREAMING_INFINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 50 dispatchers -> 50 producers.
+STREAMING_INFINITE_CONSUMERS_PER_PRODUCER=1 # 50 producers -> 50 consumers.
+# 50 producers + 50 consumers = 100 total infinite streaming jobs
+STREAMING_INFINITE_PRODUCER_NUMBER_OF_WORDS=0
+STREAMING_INFINITE_PRODUCER_WORDS_PER_SECOND=1
+STREAMING_INFINITE_PRODUCER_SPARK_CORES_MAX=2
+STREAMING_INFINITE_PRODUCER_SPARK_EXECUTOR_CORES=2
+STREAMING_INFINITE_CONSUMER_BATCH_SIZE_SECONDS=10
+STREAMING_INFINITE_CONSUMER_SPARK_CORES_MAX=1
+STREAMING_INFINITE_CONSUMER_SPARK_EXECUTOR_CORES=1
+
+# Batch jobs configuration #####################################################
+
+SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}"
+
+BATCH_APP_ID="/${SERVICE_NAMES_PREFIX}batch-workload"
+BATCH_SCRIPT_CPUS=6
+BATCH_SCRIPT_MEM=12288
+BATCH_SUBMITS_PER_MIN=13
+# TODO: update to master for the next MWT.
+BATCH_SPARK_BUILD_BRANCH="mwt-26"
+
+# Batch GPU jobs configuration #################################################
+
+SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE}"
+
+GPU_APP_ID="/${SERVICE_NAMES_PREFIX}gpu-batch-workload"
+GPU_SCRIPT_CPUS=2
+GPU_SCRIPT_MEM=4096
+GPU_DOCKER_IMAGE='samvantran/spark-dcos-gpu:metrics'
+GPU_SUBMITS_PER_MIN=5
+GPU_MAX_DISPATCHERS=${SPARK_GPU_DISPATCHERS}
+GPU_SPARK_CORES_MAX=4
+GPU_SPARK_MESOS_EXECUTOR_GPUS=4
+GPU_SPARK_MESOS_MAX_GPUS=4
+GPU_SPARK_BUILD_BRANCH=master
diff --git a/scale-tests/configs/2020-10-20-mwt27dr.env b/scale-tests/configs/2020-10-20-mwt27dr.env
new file mode 100644
index 00000000..acf97bd5
--- /dev/null
+++ b/scale-tests/configs/2020-10-20-mwt27dr.env
@@ -0,0 +1,177 @@
+# Depends on:
+# - TEST_NAME
+# - TEST_S3_BUCKET
+# - TEST_S3_FOLDER
+
+# Workload configuration #######################################################
+#
+# Total CPU quota: 88
+# Total MEM quota: 200000
+# Total GPU quota: 40
+
+CLUSTER_URL="https://mwt27-dry.scaletesting.mesosphe.re/"
+SECURITY="strict"
+
+DCOS_CLI_URL="https://downloads.dcos.io/cli/releases/binaries/dcos/linux/x86-64/latest/dcos"
+
+# Test configuration ###########################################################
+
+SHOULD_INSTALL_INFRASTRUCTURE=true
+SHOULD_INSTALL_NON_GPU_DISPATCHERS=true
+SHOULD_INSTALL_GPU_DISPATCHERS=false
+SHOULD_RUN_FINITE_STREAMING_JOBS=true
+SHOULD_RUN_INFINITE_STREAMING_JOBS=true
+SHOULD_RUN_BATCH_JOBS=true
+SHOULD_RUN_GPU_BATCH_JOBS=false
+SHOULD_UNINSTALL_INFRASTRUCTURE_AT_THE_END=false
+
+# Infrastructure configuration #################################################
+
+GROUP_NAME="${TEST_NAME}"
+
+SERVICE_NAMES_PREFIX="${TEST_NAME}/"
+INFRASTRUCTURE_OUTPUT_FILE="infrastructure.json"
+
+KAFKA_CLUSTER_COUNT=1
+CASSANDRA_CLUSTER_COUNT=1
+
+ZOOKEEPER_CPUS=10
+ZOOKEEPER_MEM=20000
+ZOOKEEPER_CONFIG='scale-tests/configs/kafka-zookeeper-options.json'
+# Note: empty package repo values will default to latest Universe packages.
+ZOOKEEPER_PACKAGE_REPO=
+ZOOKEEPER_PACKAGE_NAME='beta-confluent-zookeeper'
+# 2.8.0-5.5.1-beta from the Universe.
+
+KAFKA_CPUS=10
+KAFKA_MEM=20000
+KAFKA_CONFIG='scale-tests/configs/kafka-options.json'
+KAFKA_USER='root'
+# Note: empty package repo values will default to latest Universe packages.
+KAFKA_PACKAGE_REPO=
+# 2.10.0-5.5.1-3-beta from the Universe.
+KAFKA_PACKAGE_NAME='beta-confluent-kafka'
+
+CASSANDRA_CPUS=10
+CASSANDRA_MEM=20000
+CASSANDRA_CONFIG='scale-tests/configs/cassandra-options.json'
+CASSANDRA_USER='root'
+# Note: empty package repo values will default to latest Universe packages.
+CASSANDRA_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/cassandra/assets/2.10.0-3.11.6-beta/stub-universe-cassandra.json'
+CASSANDRA_PACKAGE_NAME='beta-cassandra'
+# 2.10.0-3.11.6-beta from the Universe.
+
+# DSEngine configuration #######################################################
+
+DSENGINE_CPUS=10
+DSENGINE_MEM=20000
+DSENGINE_GPUS=40
+DSENGINE_CONFIG='scale-tests/configs/dsengine-options.json'
+# Note: empty package repo values will default to latest Universe packages.
+DSENGINE_PACKAGE_REPO=
+DSENGINE_PACKAGE_NAME='beta-data-science-engine'
+# 2.1.0-beta from the Universe.
+
+# Spark configuration ##########################################################
+
+SPARK_CONFIG='scale-tests/configs/spark-options.json'
+
+# Note: empty package repo values will default to latest Universe packages.
+# Spark version 2.11.0-2.4.6
+SPARK_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/spark/2.11.0-2.4.6/stub-universe-spark.json'
+
+# Note: leaving the Spark executor Docker image empty so that executors inherit
+# the image used for dispatchers.
+SPARK_EXECUTOR_DOCKER_IMAGE=
+
+# Non-GPU Spark dispatchers configuration ######################################
+
+# Not currently used.
+BATCH_MAX_NON_GPU_JOBS=30
+
+SPARK_NON_GPU_DISPATCHERS=3
+SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE="non-gpu-dispatchers.out"
+# Note: this name is built internally by the deploy-dispatchers.py script.
+SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json"
+# Note: driver resources used per dispatcher (1 dispatcher will be able to run
+# 8 drivers since each driver requires 1 CPU).
+SPARK_NON_GPU_QUOTA_DRIVERS_CPUS=8
+SPARK_NON_GPU_QUOTA_DRIVERS_MEM=20000
+# Note: executor resources used per job (1 driver will run 1 job).
+SPARK_NON_GPU_QUOTA_EXECUTORS_CPUS=8
+SPARK_NON_GPU_QUOTA_EXECUTORS_MEM=20000
+
+# GPU Spark dispatchers configuration ##########################################
+
+# Not currently used.
+BATCH_MAX_GPU_JOBS=2
+
+SPARK_GPU_DISPATCHERS=0
+SPARK_GPU_DISPATCHERS_OUTPUT_FILE="gpu-dispatchers.out"
+SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json" # NOTE: this name is built internally by the deploy-dispatchers.py script.
+SPARK_GPU_QUOTA_DRIVERS_CPUS=
+SPARK_GPU_QUOTA_DRIVERS_MEM=
+SPARK_GPU_QUOTA_DRIVERS_GPUS=
+SPARK_GPU_QUOTA_EXECUTORS_CPUS=
+SPARK_GPU_QUOTA_EXECUTORS_MEM=
+SPARK_GPU_QUOTA_EXECUTORS_GPUS=
+
+# Common streaming jobs configuration ##########################################
+
+TEST_ASSEMBLY_JAR_URL='http://infinity-artifacts.s3.amazonaws.com/scale-tests/dcos-spark-scala-tests-assembly-2.4.0-20190325.jar'
+DISPATCHERS_JSON_OUTPUT_FILE="all-dispatchers.json"
+
+# Finite streaming jobs configuration ##########################################
+
+STREAMING_FINITE_SUBMISSIONS_OUTPUT_FILE="finite-submissions.out"
+STREAMING_FINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 3 dispatchers -> 3 producers.
+STREAMING_FINITE_CONSUMERS_PER_PRODUCER=1 # 3 producers -> 3 consumers.
+# 3 producers + 3 consumers = 6 total finite streaming jobs
+STREAMING_FINITE_PRODUCER_NUMBER_OF_WORDS=7692
+STREAMING_FINITE_PRODUCER_WORDS_PER_SECOND=1
+# 7692 words / 1 word per second -> ~2h runtime.
+STREAMING_FINITE_PRODUCER_SPARK_CORES_MAX=2
+STREAMING_FINITE_PRODUCER_SPARK_EXECUTOR_CORES=2
+STREAMING_FINITE_CONSUMER_BATCH_SIZE_SECONDS=10
+STREAMING_FINITE_CONSUMER_SPARK_CORES_MAX=1
+STREAMING_FINITE_CONSUMER_SPARK_EXECUTOR_CORES=1
+
+# Infinite streaming jobs configuration ########################################
+
+STREAMING_INFINITE_SUBMISSIONS_OUTPUT_FILE="infinite-submissions.out"
+STREAMING_INFINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 3 dispatchers -> 3 producers.
+STREAMING_INFINITE_CONSUMERS_PER_PRODUCER=1 # 3 producers -> 3 consumers.
+# 3 producers + 3 consumers = 6 total infinite streaming jobs
+STREAMING_INFINITE_PRODUCER_NUMBER_OF_WORDS=0
+STREAMING_INFINITE_PRODUCER_WORDS_PER_SECOND=1
+STREAMING_INFINITE_PRODUCER_SPARK_CORES_MAX=2
+STREAMING_INFINITE_PRODUCER_SPARK_EXECUTOR_CORES=2
+STREAMING_INFINITE_CONSUMER_BATCH_SIZE_SECONDS=10
+STREAMING_INFINITE_CONSUMER_SPARK_CORES_MAX=1
+STREAMING_INFINITE_CONSUMER_SPARK_EXECUTOR_CORES=1
+
+# Batch jobs configuration #####################################################
+
+SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}"
+
+BATCH_APP_ID="/${SERVICE_NAMES_PREFIX}batch-workload"
+BATCH_SCRIPT_CPUS=6
+BATCH_SCRIPT_MEM=12288
+BATCH_SUBMITS_PER_MIN=13
+# TODO: update to master for the next MWT.
+BATCH_SPARK_BUILD_BRANCH="dcos-58437-deploy-workloads-under-role-enforced-group"
+
+# Batch GPU jobs configuration #################################################
+
+SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE}"
+
+GPU_APP_ID="/${SERVICE_NAMES_PREFIX}gpu-batch-workload"
+GPU_SCRIPT_CPUS=2
+GPU_SCRIPT_MEM=4096
+GPU_DOCKER_IMAGE='samvantran/spark-dcos-gpu:metrics'
+GPU_SUBMITS_PER_MIN=5
+GPU_MAX_DISPATCHERS=${SPARK_GPU_DISPATCHERS}
+GPU_SPARK_CORES_MAX=4
+GPU_SPARK_MESOS_EXECUTOR_GPUS=4
+GPU_SPARK_MESOS_MAX_GPUS=4
+GPU_SPARK_BUILD_BRANCH=master
diff --git a/scale-tests/configs/2020-10-22-mwt27run.env b/scale-tests/configs/2020-10-22-mwt27run.env
new file mode 100644
index 00000000..80ca6576
--- /dev/null
+++ b/scale-tests/configs/2020-10-22-mwt27run.env
@@ -0,0 +1,177 @@
+# Depends on:
+# - TEST_NAME
+# - TEST_S3_BUCKET
+# - TEST_S3_FOLDER
+
+# Workload configuration #######################################################
+#
+# Total CPU quota: 2290
+# Total MEM quota: 4580000
+# Total GPU quota: 40
+
+CLUSTER_URL="https://mwt27.scaletesting.mesosphe.re"
+SECURITY="strict"
+
+DCOS_CLI_URL="https://downloads.dcos.io/cli/releases/binaries/dcos/linux/x86-64/latest/dcos"
+
+# Test configuration ###########################################################
+
+SHOULD_INSTALL_INFRASTRUCTURE=true
+SHOULD_INSTALL_NON_GPU_DISPATCHERS=true
+SHOULD_INSTALL_GPU_DISPATCHERS=false
+SHOULD_RUN_FINITE_STREAMING_JOBS=true
+SHOULD_RUN_INFINITE_STREAMING_JOBS=true
+SHOULD_RUN_BATCH_JOBS=true
+SHOULD_RUN_GPU_BATCH_JOBS=false
+SHOULD_UNINSTALL_INFRASTRUCTURE_AT_THE_END=false
+
+# Infrastructure configuration #################################################
+
+GROUP_NAME="${TEST_NAME}"
+
+SERVICE_NAMES_PREFIX="${TEST_NAME}/"
+INFRASTRUCTURE_OUTPUT_FILE="infrastructure.json"
+
+KAFKA_CLUSTER_COUNT=1
+CASSANDRA_CLUSTER_COUNT=1
+
+ZOOKEEPER_CPUS=10
+ZOOKEEPER_MEM=20000
+ZOOKEEPER_CONFIG='scale-tests/configs/kafka-zookeeper-options.json'
+# Note: empty package repo values will default to latest Universe packages.
+ZOOKEEPER_PACKAGE_REPO=
+ZOOKEEPER_PACKAGE_NAME='beta-confluent-zookeeper'
+# 2.8.0-5.5.1-beta from the Universe.
+
+KAFKA_CPUS=10
+KAFKA_MEM=20000
+KAFKA_CONFIG='scale-tests/configs/kafka-options.json'
+KAFKA_USER='root'
+# Note: empty package repo values will default to latest Universe packages.
+KAFKA_PACKAGE_REPO=
+# 2.10.0-5.5.1-3-beta from the Universe.
+KAFKA_PACKAGE_NAME='beta-confluent-kafka'
+
+CASSANDRA_CPUS=10
+CASSANDRA_MEM=20000
+CASSANDRA_CONFIG='scale-tests/configs/cassandra-options.json'
+CASSANDRA_USER='root'
+# Note: empty package repo values will default to latest Universe packages.
+CASSANDRA_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/cassandra/assets/2.10.0-3.11.6-beta/stub-universe-cassandra.json'
+CASSANDRA_PACKAGE_NAME='beta-cassandra'
+# 2.10.0-3.11.6-beta from the Universe.
+
+# DSEngine configuration #######################################################
+
+DSENGINE_CPUS=10
+DSENGINE_MEM=20000
+DSENGINE_GPUS=40
+DSENGINE_CONFIG='scale-tests/configs/dsengine-options.json'
+# Note: empty package repo values will default to latest Universe packages.
+DSENGINE_PACKAGE_REPO=
+DSENGINE_PACKAGE_NAME='beta-data-science-engine'
+# 2.1.0-beta from the Universe.
+
+# Spark configuration ##########################################################
+
+SPARK_CONFIG='scale-tests/configs/spark-options.json'
+
+# Note: empty package repo values will default to latest Universe packages.
+# Spark version 2.11.0-2.4.6
+SPARK_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/spark/2.11.0-2.4.6/stub-universe-spark.json'
+
+# Note: leaving the Spark executor Docker image empty so that executors inherit
+# the image used for dispatchers.
+SPARK_EXECUTOR_DOCKER_IMAGE=
+
+# Non-GPU Spark dispatchers configuration ######################################
+
+# Not currently used.
+BATCH_MAX_NON_GPU_JOBS=1000
+
+SPARK_NON_GPU_DISPATCHERS=50
+SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE="non-gpu-dispatchers.out"
+# Note: this name is built internally by the deploy-dispatchers.py script.
+SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json"
+# Note: driver resources used per dispatcher (1 dispatcher will be able to run
+# 20 drivers since each driver requires 1 CPU).
+SPARK_NON_GPU_QUOTA_DRIVERS_CPUS=20
+SPARK_NON_GPU_QUOTA_DRIVERS_MEM=50000
+# Note: executor resources used per job (1 driver will run 1 job).
+SPARK_NON_GPU_QUOTA_EXECUTORS_CPUS=25
+SPARK_NON_GPU_QUOTA_EXECUTORS_MEM=40000
+
+# GPU Spark dispatchers configuration ##########################################
+
+# Not currently used.
+BATCH_MAX_GPU_JOBS=10
+
+SPARK_GPU_DISPATCHERS=0
+SPARK_GPU_DISPATCHERS_OUTPUT_FILE="gpu-dispatchers.out"
+SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json" # NOTE: this name is built internally by the deploy-dispatchers.py script.
+SPARK_GPU_QUOTA_DRIVERS_CPUS=
+SPARK_GPU_QUOTA_DRIVERS_MEM=
+SPARK_GPU_QUOTA_DRIVERS_GPUS=
+SPARK_GPU_QUOTA_EXECUTORS_CPUS=
+SPARK_GPU_QUOTA_EXECUTORS_MEM=
+SPARK_GPU_QUOTA_EXECUTORS_GPUS=
+SPARK_GPU_REMOVE_EXECUTORS_ROLES_QUOTAS=true
+
+# Common streaming jobs configuration ##########################################
+
+TEST_ASSEMBLY_JAR_URL='http://infinity-artifacts.s3.amazonaws.com/scale-tests/dcos-spark-scala-tests-assembly-2.4.0-20190325.jar'
+DISPATCHERS_JSON_OUTPUT_FILE="all-dispatchers.json"
+
+# Finite streaming jobs configuration ##########################################
+
+STREAMING_FINITE_SUBMISSIONS_OUTPUT_FILE="finite-submissions.out"
+STREAMING_FINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 50 dispatchers -> 50 producers.
+STREAMING_FINITE_CONSUMERS_PER_PRODUCER=1 # 50 producers -> 50 consumers.
+# 50 producers + 50 consumers = 100 total finite streaming jobs
+STREAMING_FINITE_PRODUCER_NUMBER_OF_WORDS=7692
+STREAMING_FINITE_PRODUCER_WORDS_PER_SECOND=1
+# 7692 words / 1 word per second -> ~2h runtime.
+STREAMING_FINITE_PRODUCER_SPARK_CORES_MAX=2
+STREAMING_FINITE_PRODUCER_SPARK_EXECUTOR_CORES=2
+STREAMING_FINITE_CONSUMER_BATCH_SIZE_SECONDS=10
+STREAMING_FINITE_CONSUMER_SPARK_CORES_MAX=1
+STREAMING_FINITE_CONSUMER_SPARK_EXECUTOR_CORES=1
+
+# Infinite streaming jobs configuration ########################################
+
+STREAMING_INFINITE_SUBMISSIONS_OUTPUT_FILE="infinite-submissions.out"
+STREAMING_INFINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 50 dispatchers -> 50 producers.
+STREAMING_INFINITE_CONSUMERS_PER_PRODUCER=1 # 50 producers -> 50 consumers.
+# 50 producers + 50 consumers = 100 total infinite streaming jobs
+STREAMING_INFINITE_PRODUCER_NUMBER_OF_WORDS=0
+STREAMING_INFINITE_PRODUCER_WORDS_PER_SECOND=1
+STREAMING_INFINITE_PRODUCER_SPARK_CORES_MAX=2
+STREAMING_INFINITE_PRODUCER_SPARK_EXECUTOR_CORES=2
+STREAMING_INFINITE_CONSUMER_BATCH_SIZE_SECONDS=10
+STREAMING_INFINITE_CONSUMER_SPARK_CORES_MAX=1
+STREAMING_INFINITE_CONSUMER_SPARK_EXECUTOR_CORES=1
+
+# Batch jobs configuration #####################################################
+
+SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}"
+
+BATCH_APP_ID="/${SERVICE_NAMES_PREFIX}batch-workload"
+BATCH_SCRIPT_CPUS=6
+BATCH_SCRIPT_MEM=12288
+BATCH_SUBMITS_PER_MIN=13
+# TODO: update to master for the next MWT.
+BATCH_SPARK_BUILD_BRANCH="mwt-27"
+
+# Batch GPU jobs configuration #################################################
+
+SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE}"
+
+GPU_APP_ID="/${SERVICE_NAMES_PREFIX}gpu-batch-workload"
+GPU_SCRIPT_CPUS=2
+GPU_SCRIPT_MEM=4096
+GPU_DOCKER_IMAGE='samvantran/spark-dcos-gpu:metrics'
+GPU_SUBMITS_PER_MIN=5
+GPU_MAX_DISPATCHERS=${SPARK_GPU_DISPATCHERS}
+GPU_SPARK_CORES_MAX=4
+GPU_SPARK_MESOS_EXECUTOR_GPUS=4
+GPU_SPARK_MESOS_MAX_GPUS=4
+GPU_SPARK_BUILD_BRANCH=master
diff --git a/scale-tests/configs/cassandra-options.json b/scale-tests/configs/cassandra-options.json
index e1038b2b..de7d1e78 100644
--- a/scale-tests/configs/cassandra-options.json
+++ b/scale-tests/configs/cassandra-options.json
@@ -1,9 +1,16 @@
{
"service": {
"virtual_network_enabled": true,
- "virtual_network_name": "calico"
+ "virtual_network_name": "calico",
+ "user": "root"
},
"nodes": {
- "count": 5
+ "count": 5,
+ "external_volume": {
+ "enabled": true,
+ "portworx_volume_options": "size=100",
+ "volume_name": "",
+ "driver_name": "pxd"
+ }
}
}
diff --git a/scale-tests/configs/dsengine-options.json b/scale-tests/configs/dsengine-options.json
new file mode 100644
index 00000000..2707bb01
--- /dev/null
+++ b/scale-tests/configs/dsengine-options.json
@@ -0,0 +1,37 @@
+{
+ "service": {
+ "user_id": 0,
+ "group_id": 0,
+ "cmd": "/usr/local/bin/start.sh ${CONDA_DIR}/bin/jupyter lab --notebook-dir=\"${JUPYTER_NOTEBOOK_DIR}\" --allow-root",
+ "virtual_network_enabled": true,
+ "virtual_network_name": "calico",
+ "gpu": {
+ "enabled": true,
+ "gpus": 1
+ },
+ "jupyter_notebook_type": "TensorFlow-2.1.0"
+ },
+ "storage": {
+ "local_persistence": {
+ "enabled": false
+ },
+ "external_volume": {
+ "enabled": true,
+ "driver_name": "pxd",
+ "volume_driver_options": "size=100",
+ "volume_path": "jupyter_data"
+ }
+ },
+ "spark": {
+ "spark_mesos_role": "data-services__data-science-engine-00-role",
+ "spark_mesos_principal": "data_services__data_science_engine_00_service_account",
+ "spark_mesos_secret": "data-services/data-science-engine-00-service-account-secret",
+ "spark_cores_max": 4,
+ "spark_driver_cores": 2,
+ "spark_executor_cores": 1,
+ "spark_mesos_gpus_max": 40,
+ "spark_mesos_executor_gpus": 7,
+ "spark_driver_memory": "2g",
+ "spark_executor_memory": "6g"
+ }
+}
\ No newline at end of file
diff --git a/scale-tests/configs/kafka-options.json b/scale-tests/configs/kafka-options.json
index 80fea6f4..5d302e84 100644
--- a/scale-tests/configs/kafka-options.json
+++ b/scale-tests/configs/kafka-options.json
@@ -1,10 +1,17 @@
{
"service": {
"virtual_network_enabled": true,
- "virtual_network_name": "calico"
+ "virtual_network_name": "calico",
+ "user": "root"
},
"brokers": {
"count": 5,
- "disk_type": "ROOT"
+ "disk_type": "ROOT",
+ "external_volume": {
+ "enabled": true,
+ "volume_name": "KafkaVolume",
+ "volume_driver_options": "size=100",
+ "volume_path": "kafka-broker-data"
+ }
}
}
diff --git a/scale-tests/configs/spark-options.json b/scale-tests/configs/spark-options.json
new file mode 100644
index 00000000..d4ac3d92
--- /dev/null
+++ b/scale-tests/configs/spark-options.json
@@ -0,0 +1,7 @@
+{
+ "service": {
+ "role": "data-services",
+ "enforce_role": true,
+ "virtual_network_enabled": true
+ }
+}
diff --git a/scale-tests/deploy-dispatchers.py b/scale-tests/deploy-dispatchers.py
index 33a4c990..537cec68 100755
--- a/scale-tests/deploy-dispatchers.py
+++ b/scale-tests/deploy-dispatchers.py
@@ -31,6 +31,7 @@
--quota-executors-gpus number of GPUs to use for executors quota [default: 0]
--quota-executors-mem amount of memory (mb) to use per executors quota [default: 1524.0]
--role Mesos role registered by dispatcher [default: *]
+ --group-role TODO: description [default: None]
--ucr-containerizer launch using the Universal Container Runtime [default: True]
--user user to run dispatcher service as [default: root]
@@ -57,9 +58,10 @@
logging.basicConfig(
- format='[%(asctime)s|%(name)s|%(levelname)s]: %(message)s',
+ format="[%(asctime)s|%(name)s|%(levelname)s]: %(message)s",
level=logging.INFO,
- stream=sys.stdout)
+ stream=sys.stdout,
+)
log = logging.getLogger(__name__)
@@ -73,25 +75,21 @@
# file.
-def create_quota(
- role_name: str,
- quota: typing.Dict
-):
+def create_quota(role_name: str, quota: typing.Dict):
"""
Create quota for the specified role.
"""
existing_quotas = sdk_cmd.get_json_output("spark quota list --json", print_output=False)
# remove existing quotas matching name
- if role_name in [x['role'] for x in existing_quotas.get('infos', [])]:
+ if role_name in [x["role"] for x in existing_quotas.get("infos", [])]:
rc, _, _ = sdk_cmd.run_raw_cli("spark quota remove {}".format(role_name))
assert rc == 0, "Error removing quota"
-
cmd_list = ["spark", "quota", "create"]
- for r in ["cpus", "mem", "gpus", ]:
+ for r in ["cpus", "mem", "gpus"]:
if r in quota:
- cmd_list.extend(["-{}".format(r[0]), quota[r],])
+ cmd_list.extend(["-{}".format(r[0]), quota[r]])
cmd_list.append(role_name)
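For reference, a small sketch of the command list the loop above produces for a sample quota dict (the role name and values are illustrative):

    # Mirrors the cmd_list construction in create_quota above.
    quota = {"cpus": "4", "mem": "8192"}
    cmd_list = ["spark", "quota", "create"]
    for r in ["cpus", "mem", "gpus"]:
        if r in quota:
            cmd_list.extend(["-{}".format(r[0]), quota[r]])
    cmd_list.append("my-drivers-role")
    # -> ['spark', 'quota', 'create', '-c', '4', '-m', '8192', 'my-drivers-role']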
@@ -117,10 +115,13 @@ def setup_role(service_name: str, role_base: str, quota: typing.Dict) -> str:
return role_name
-def setup_spark_security(service_name: str,
- drivers_role: str,
- executors_role: str,
- service_account_info: typing.Dict):
+def setup_spark_security(
+ service_name: str,
+ group_role: str,
+ drivers_role: str,
+ executors_role: str,
+ service_account_info: typing.Dict,
+):
"""
In strict mode, additional permissions are required for Spark.
@@ -134,48 +135,56 @@ def setup_spark_security(service_name: str,
linux_user = service_account_info.get("linux_user", "nobody")
service_account = service_account_info["name"]
- for role_name in [drivers_role, executors_role]:
+ if group_role:
sdk_security.grant_permissions(
- linux_user=linux_user,
- role_name=role_name,
- service_account_name=service_account,
+ linux_user=linux_user, role_name=group_role, service_account_name=service_account
)
+ else:
+ for role_name in [drivers_role, executors_role]:
+ sdk_security.grant_permissions(
+ linux_user=linux_user, role_name=role_name, service_account_name=service_account
+ )
# TODO: Is this required?
app_id = "/{}".format(service_name)
- app_id = urllib.parse.quote(
- urllib.parse.quote(app_id, safe=''),
- safe=''
+ app_id = urllib.parse.quote(urllib.parse.quote(app_id, safe=""), safe="")
+
+ sdk_security._grant(
+ service_account_info["name"],
+ "dcos:mesos:master:task:app_id:{}".format(app_id),
+ description="Spark drivers may execute Mesos tasks",
+ action="create",
)
- sdk_security._grant(service_account_info["name"],
- "dcos:mesos:master:task:app_id:{}".format(app_id),
- description="Spark drivers may execute Mesos tasks",
- action="create")
if linux_user == "root":
log.info("Marathon must be able to launch tasks as root")
- sdk_security._grant("dcos_marathon",
- "dcos:mesos:master:task:user:root",
- description="Root Marathon may launch tasks as root",
- action="create")
+ sdk_security._grant(
+ "dcos_marathon",
+ "dcos:mesos:master:task:user:root",
+ description="Root Marathon may launch tasks as root",
+ action="create",
+ )
return
-def install_package(package_name: str,
- service_prefix: str,
- index: int,
- linux_user: str,
- service_task_count: int,
- config_path: str,
- additional_options: typing.Dict = None,
- quota_options: typing.Dict = None) -> typing.Dict:
+def install_package(
+ package_name: str,
+ service_prefix: str,
+ index: int,
+ linux_user: str,
+ service_task_count: int,
+ group_role: str,
+ config_path: str,
+ additional_options: typing.Dict = None,
+ quota_options: typing.Dict = None,
+) -> typing.Dict:
"""
Deploy a single dispatcher with the specified index.
"""
if package_name.startswith("beta-"):
- basename = package_name[len("beta-"):]
+ basename = package_name[len("beta-") :]
else:
basename = package_name
@@ -183,15 +192,22 @@ def install_package(package_name: str,
service_account_info = scale_tests_utils.setup_security(service_name, linux_user)
- drivers_role = setup_role(service_name, "drivers", quota_options)
- executors_role = setup_role(service_name, "executors", quota_options)
+ service_options = scale_tests_utils.get_service_options(
+ service_name, service_account_info, additional_options, config_path
+ )
- setup_spark_security(service_name, drivers_role, executors_role, service_account_info)
+ if group_role:
+ setup_spark_security(service_name, group_role, None, None, service_account_info)
- service_options = scale_tests_utils.get_service_options(service_name, service_account_info, additional_options, config_path)
+ service_options["service"]["role"] = group_role
+ roles = {"drivers": None, "executors": None}
+ else:
+ drivers_role = setup_role(service_name, "drivers", quota_options)
+ executors_role = setup_role(service_name, "executors", quota_options)
+ setup_spark_security(service_name, drivers_role, executors_role, service_account_info)
- # install dispatcher with appropriate role
- service_options["service"]["role"] = drivers_role
+ service_options["service"]["role"] = drivers_role
+ roles = {"drivers": drivers_role, "executors": executors_role}
expected_task_count = service_task_count(service_options)
log.info("Expected task count: %s", expected_task_count)
@@ -202,47 +218,62 @@ def install_package(package_name: str,
package_name,
service_name,
expected_task_count,
- timeout_seconds=60*60,
+ timeout_seconds=60 * 60,
additional_options=service_options,
wait_for_deployment=False,
insert_strict_options=False,
- install_cli=False)
+ install_cli=False,
+ )
- return {"package_name": package_name,
- "roles": {"drivers": drivers_role, "executors": executors_role},
- "service_account_info": service_account_info,
- **service_options}
+ return {
+ "package_name": package_name,
+ "roles": roles,
+ "service_account_info": service_account_info,
+ **service_options,
+ }
def deploy_dispatchers(
num_dispatchers: int,
service_name_base: str,
+ group_role: str,
output_file: str,
linux_user: str,
options: typing.Dict,
- quota_options: typing.Dict
+ quota_options: typing.Dict,
) -> typing.Dict:
"""
Deploy the required number of dispatchers and store their information to a text file.
"""
+
def deploy_dispatcher(index: int) -> dict:
- return install_package('spark',
- service_name_base,
- index,
- linux_user,
- lambda x: 0,
- None,
- options,
- quota_options)
+ return install_package(
+ "spark",
+ service_name_base,
+ index,
+ linux_user,
+ lambda x: 0,
+ group_role,
+ None,
+ options,
+ quota_options,
+ )
with ThreadPoolExecutor(max_workers=MAX_THREADPOOL_WORKERS) as executor:
dispatchers = list(executor.map(deploy_dispatcher, range(num_dispatchers)))
- with open(output_file, 'w') as outfile:
+ with open(output_file, "w") as outfile:
for dispatcher in dispatchers:
- outfile.write('{},{},{}\n'.format(dispatcher['service']['name'],
- dispatcher['roles']['drivers'],
- dispatcher['roles']['executors']))
+ if group_role:
+ outfile.write("{},{}\n".format(dispatcher["service"]["name"], group_role))
+ else:
+ outfile.write(
+ "{},{},{}\n".format(
+ dispatcher["service"]["name"],
+ dispatcher["roles"]["drivers"],
+ dispatcher["roles"]["executors"],
+ )
+ )
outfile.flush()
return dispatchers
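Downstream tooling that reads this output file has to handle both line layouts. A minimal parsing sketch, assuming the comma-separated formats written above:

    def parse_dispatcher_line(line: str) -> dict:
        fields = line.strip().split(",")
        if len(fields) == 2:
            # group-role mode: "<service name>,<group role>"
            return {"name": fields[0], "group_role": fields[1]}
        # per-dispatcher mode: "<service name>,<drivers role>,<executors role>"
        return {"name": fields[0], "drivers": fields[1], "executors": fields[2]}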
@@ -260,21 +291,19 @@ def get_default_options(arguments: dict) -> dict:
"spark-history-server-url": arguments["--history-service"] or "",
"UCR_containerizer": ast.literal_eval(arguments.get("--ucr-containerizer", True)),
"virtual_network_enabled": True,
- "virtual_network_name": "dcos"
+ "virtual_network_name": "dcos",
},
"security": {
"kerberos": {
"enabled": ast.literal_eval(arguments.get("--enable-kerberos", False)),
"kdc": {
"hostname": arguments["--kdc-hostname"] or "",
- "port": int(arguments["--kdc-port"])
+ "port": int(arguments["--kdc-port"]),
},
- "realm": arguments["--kerberos-realm"] or ""
+ "realm": arguments["--kerberos-realm"] or "",
}
},
- "hdfs": {
- "config-url": arguments["--hdfs-config"] or ""
- }
+ "hdfs": {"config-url": arguments["--hdfs-config"] or ""},
}
return options
@@ -283,12 +312,12 @@ def get_quota_options(arguments: typing.Dict) -> typing.Dict:
"""
Move the quota options from the command line arguments to a dict.
"""
- create_quotas = ast.literal_eval(arguments.get("--create-quotas", True))
+    # docopt yields strings, so normalize "true"/"false" (any case) to a boolean.
+    create_quotas = str(arguments.get("--create-quotas", True)).lower() == "true"
if not create_quotas:
return {}
- resources = ["cpus", "mem", "gpus", ]
- targets = ["drivers", "executors", ]
+ resources = ["cpus", "mem", "gpus"]
+ targets = ["drivers", "executors"]
quota_options = {}
for t in targets:
@@ -302,19 +331,19 @@ def get_quota_options(arguments: typing.Dict) -> typing.Dict:
def install(args):
- options_file = args['--options-json']
+ options_file = args["--options-json"]
if options_file:
if not os.path.isfile(options_file):
# TODO: Replace with logging
log.error("The specified file does not exist: %s", options_file)
sys.exit(1)
- options = json.load(open(options_file, 'r'))
+ options = json.load(open(options_file, "r"))
else:
options = get_default_options(args)
- if args['--package-repo']:
- sdk_repository.add_stub_universe_urls([args['--package-repo']])
+ if args["--package-repo"]:
+ sdk_repository.add_stub_universe_urls([args["--package-repo"]])
rc, _, _ = sdk_cmd.run_raw_cli("package install spark --cli --yes")
assert rc == 0, "Error installing spark CLI"
@@ -324,12 +353,14 @@ def install(args):
services = {}
services["spark"] = deploy_dispatchers(
- num_dispatchers=int(args['']),
- service_name_base=args[''],
- output_file=args[''],
+ num_dispatchers=int(args[""]),
+ service_name_base=args[""],
+ group_role=args["--group-role"],
+ output_file=args[""],
linux_user=args["--user"],
options=options,
- quota_options=quota_options)
+ quota_options=quota_options,
+ )
output_filename = "{}-dispatchers.json".format(args[""])
with open(output_filename, "w") as fp:
diff --git a/scale-tests/deploy_dsengine.sh b/scale-tests/deploy_dsengine.sh
new file mode 100755
index 00000000..a34aebcc
--- /dev/null
+++ b/scale-tests/deploy_dsengine.sh
@@ -0,0 +1,93 @@
+#!/usr/bin/env bash
+
+set -x
+
+create_service_account() {
+ SERVICE_ACCOUNT="${1}"
+ SECRET_NAME="${SERVICE_ACCOUNT}-secret"
+
+ dcos security org service-accounts delete "${SERVICE_ACCOUNT}"
+ dcos security secrets delete "${SECRET_NAME}"
+
+ dcos security org service-accounts keypair private.pem public.pem
+ dcos security org service-accounts create \
+ -p public.pem \
+ -d "Service account for ${SERVICE_ACCOUNT}" "${SERVICE_ACCOUNT}"
+ dcos security secrets create-sa-secret \
+ --strict private.pem \
+ "${SERVICE_ACCOUNT}" \
+ "${SECRET_NAME}"
+
+ rm -f private.pem public.pem
+}
+
+grant_permissions() {
+ SERVICE_ACCOUNT="${1}"
+
+ echo "Granting permissions to Service Account ${SERVICE_ACCOUNT}"
+
+ dcos security org users grant "${SERVICE_ACCOUNT}" \
+ "dcos:mesos:master:task:user:nobody" \
+ create
+ dcos security org users grant "${SERVICE_ACCOUNT}" \
+ "dcos:mesos:agent:task:user:nobody" \
+ create
+ dcos security org users grant "${SERVICE_ACCOUNT}" \
+ "dcos:mesos:master:task:user:root" \
+ create
+ dcos security org users grant "${SERVICE_ACCOUNT}" \
+ "dcos:mesos:agent:task:user:root" \
+ create
+ dcos security org users grant "${SERVICE_ACCOUNT}" \
+ "dcos:secrets:list:default:__dcos_base64__hdfs_jupyter_keytab" \
+ read
+}
+
+grant_spark_permissions() {
+ SERVICE_ACCOUNT=$1
+ echo "Granting Spark permissions to Jupyter Service Account ${SERVICE_ACCOUNT}"
+
+ dcos security org users grant "${SERVICE_ACCOUNT}" \
+ "dcos:mesos:master:framework:role:${SERVICE_ACCOUNT}" \
+ create
+ dcos security org users grant "${SERVICE_ACCOUNT}" \
+ "dcos:mesos:master:reservation:role:${SERVICE_ACCOUNT}" \
+ create
+ dcos security org users grant "${SERVICE_ACCOUNT}" \
+ "dcos:mesos:master:reservation:principal:${SERVICE_ACCOUNT}" \
+ delete
+ dcos security org users grant "${SERVICE_ACCOUNT}" \
+ "dcos:mesos:master:volume:role:${SERVICE_ACCOUNT}" \
+ create
+ dcos security org users grant "${SERVICE_ACCOUNT}" \
+ "dcos:mesos:master:volume:principal:${SERVICE_ACCOUNT}" \
+ delete
+
+ dcos security org users grant "${SERVICE_ACCOUNT}" \
+ "dcos:mesos:master:task:role:${SERVICE_ACCOUNT}" \
+ create
+ dcos security org users grant "${SERVICE_ACCOUNT}" \
+ "dcos:mesos:master:task:principal:${SERVICE_ACCOUNT}" \
+ create
+ dcos security org users grant "${SERVICE_ACCOUNT}" \
+ "dcos:mesos:master:task:app_id:data-services/jupyter" \
+ create
+}
+create_service_account data_services__jupyter
+
+grant_permissions data_services__jupyter
+grant_spark_permissions data_services__jupyter
+
+dcos package install --yes data-science-engine \
+ --options=scale-tests/configs/dsengine-options.json
+
+# Run the following in the Jupyter notebook UI (password: jupyter):
+#
+# ! spark-submit \
+#     --conf spark.mesos.gpus.max=40 \
+#     --conf spark.cores.max=40 \
+#     --conf spark.mesos.executor.gpus=1 \
+#     --conf spark.executor.cores=1 \
+#     --verbose \
+#     --class MockTaskRunner \
+#     https://infinity-artifacts.s3.amazonaws.com/scale-tests/dcos-spark-scala-tests-assembly-2.4.0-20190325.jar 5000 10
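The commented spark-submit above asks for 40 GPUs and 40 cores in total with one GPU and one core per executor, i.e. up to 40 single-GPU executors. A small sketch of that arithmetic, with the values taken from the comment:

    gpus_max, executor_gpus = 40, 1
    cores_max, executor_cores = 40, 1
    # Both limits allow 40 executors, so neither resource is the odd one out.
    max_executors = min(gpus_max // executor_gpus, cores_max // executor_cores)
    print(max_executors)  # 40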
diff --git a/scale-tests/kafka_cassandra_streaming_test.py b/scale-tests/kafka_cassandra_streaming_test.py
index 232a076c..f0fcd00d 100755
--- a/scale-tests/kafka_cassandra_streaming_test.py
+++ b/scale-tests/kafka_cassandra_streaming_test.py
@@ -15,6 +15,7 @@
Options:
--spark-executor-docker-image Docker image for Spark executors [default: ]
--jar hosted JAR URL
+ --group-role root-level group role the jobs run under; when set, submissions omit the per-dispatcher driver role [default: None]
--num-producers-per-kafka number of producers per Kafka cluster to create [default: 1]
--num-consumers-per-producer number of consumers for producer to create [default: 1]
--producer-number-of-words number of total words published by producers [default: 1]
@@ -45,16 +46,21 @@
logging.basicConfig(level=logging.INFO, format="%(message)s")
-DEFAULT_JAR = 'http://infinity-artifacts.s3.amazonaws.com/scale-tests/dcos-spark-scala-tests-assembly-20180523-fa29ab5.jar'
-PRODUCER_CLASS_NAME = 'KafkaRandomFeeder'
-CONSUMER_CLASS_NAME = 'KafkaWordCount'
-SPARK_PACKAGE_NAME = 'spark'
+DEFAULT_JAR = "http://infinity-artifacts.s3.amazonaws.com/scale-tests/dcos-spark-scala-tests-assembly-20180523-fa29ab5.jar"
+PRODUCER_CLASS_NAME = "KafkaRandomFeeder"
+CONSUMER_CLASS_NAME = "KafkaWordCount"
+SPARK_PACKAGE_NAME = "spark"
COMMON_CONF = [
- "--conf", "spark.mesos.containerizer=mesos",
- "--conf", "spark.mesos.driver.failoverTimeout=30",
- "--conf", "spark.port.maxRetries=32",
- "--conf", "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s",
- "--conf", "spark.scheduler.minRegisteredResourcesRatio=1.0"
+ "--conf",
+ "spark.mesos.containerizer=mesos",
+ "--conf",
+ "spark.mesos.driver.failoverTimeout=30",
+ "--conf",
+ "spark.port.maxRetries=32",
+ "--conf",
+ "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s",
+ "--conf",
+ "spark.scheduler.minRegisteredResourcesRatio=1.0",
]
@@ -66,38 +72,53 @@ def _install_package_cli(package_name):
def _service_endpoint_dns(package_name, service_name, endpoint_name):
cmd = "{package_name} --name={service_name} endpoints {endpoint_name}".format(
- package_name=package_name,
- service_name=service_name,
- endpoint_name=endpoint_name)
+ package_name=package_name, service_name=service_name, endpoint_name=endpoint_name
+ )
rt, stdout, _ = sdk_cmd.run_raw_cli(cmd)
    assert rt == 0, "Failed to get {} endpoints".format(endpoint_name)
return json.loads(stdout)["dns"]
-def _submit_producer(name,
- spark_executor_docker_image,
- jar,
- kafka_broker_dns,
- dispatcher,
- kafka_topics,
- number_of_words,
- words_per_second,
- spark_cores_max,
- spark_executor_cores,
- must_fail: bool):
- app_args = ["--appName", name,
- "--brokers", ",".join(kafka_broker_dns),
- "--topics", kafka_topics,
- "--numberOfWords", str(number_of_words),
- "--wordsPerSecond", str(words_per_second)]
+def _submit_producer(
+ name,
+ spark_executor_docker_image,
+ group_role,
+ jar,
+ kafka_broker_dns,
+ dispatcher,
+ kafka_topics,
+ number_of_words,
+ words_per_second,
+ spark_cores_max,
+ spark_executor_cores,
+ must_fail: bool,
+):
+ app_args = [
+ "--appName",
+ name,
+ "--brokers",
+ ",".join(kafka_broker_dns),
+ "--topics",
+ kafka_topics,
+ "--numberOfWords",
+ str(number_of_words),
+ "--wordsPerSecond",
+ str(words_per_second),
+ ]
if must_fail:
- app_args.extend(["--mustFailDueToInvalidArgument", ])
+ app_args.extend(["--mustFailDueToInvalidArgument"])
- app_config = ["--conf", "spark.cores.max={}".format(spark_cores_max),
- "--conf", "spark.executor.cores={}".format(spark_executor_cores),
- "--name", name,
- "--class", PRODUCER_CLASS_NAME]
+ app_config = [
+ "--conf",
+ "spark.cores.max={}".format(spark_cores_max),
+ "--conf",
+ "spark.executor.cores={}".format(spark_executor_cores),
+ "--name",
+ name,
+ "--class",
+ PRODUCER_CLASS_NAME,
+ ]
# `number_of_words == 0` means infinite stream, so we'd like to have it
# restarted in the case of failures.
@@ -105,47 +126,62 @@ def _submit_producer(name,
app_config.extend(["--supervise"])
if spark_executor_docker_image:
- app_config.extend([
- "--conf", "spark.mesos.executor.docker.image={}".format(spark_executor_docker_image)
- ])
+ app_config.extend(
+ ["--conf", "spark.mesos.executor.docker.image={}".format(spark_executor_docker_image)]
+ )
args = app_config + COMMON_CONF
+ driver_role = None if group_role else dispatcher["roles"]["executors"]
+
submission_id = spark_utils.submit_job(
app_url=jar,
app_args=" ".join(str(a) for a in app_args),
args=args,
verbose=False,
- service_name=dispatcher['service']['name'],
- driver_role=dispatcher['roles']['executors'],
- spark_user=dispatcher['service']['user'] if sdk_utils.is_strict_mode() else None,
- principal=dispatcher['service']['service_account'] if sdk_utils.is_strict_mode() else None)
+ service_name=dispatcher["service"]["name"],
+ driver_role=driver_role,
+ spark_user=dispatcher["service"]["user"] if sdk_utils.is_strict_mode() else None,
+ principal=dispatcher["service"]["service_account"] if sdk_utils.is_strict_mode() else None,
+ )
return submission_id
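The same pattern appears in _submit_consumer below: when a group role is in play, the submission carries no explicit driver role, so the job inherits the dispatcher's (group) role. A compact sketch of the selection, with an illustrative dispatcher entry:

    dispatcher = {"service": {"name": "spark-00"}, "roles": {"executors": "spark-00-executors-role"}}

    for group_role in [None, "data-services"]:
        driver_role = None if group_role else dispatcher["roles"]["executors"]
        # group_role=None             -> driver_role="spark-00-executors-role"
        # group_role="data-services"  -> driver_role=None (the dispatcher's group role applies)
        print(group_role, driver_role)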
-def _submit_consumer(name,
- spark_executor_docker_image,
- jar,
- kafka_broker_dns,
- cassandra_native_client_dns,
- dispatcher,
- kafka_topics,
- kafka_group_id,
- write_to_cassandra,
- batch_size_seconds,
- cassandra_keyspace,
- cassandra_table,
- spark_cores_max,
- spark_executor_cores,
- must_fail: bool):
- app_args = ["--appName", name,
- "--brokers", ",".join(kafka_broker_dns),
- "--topics", kafka_topics,
- "--groupId", kafka_group_id,
- "--batchSizeSeconds", str(batch_size_seconds),
- "--cassandraKeyspace", cassandra_keyspace,
- "--cassandraTable", cassandra_table]
+def _submit_consumer(
+ name,
+ spark_executor_docker_image,
+ group_role,
+ jar,
+ kafka_broker_dns,
+ cassandra_native_client_dns,
+ dispatcher,
+ kafka_topics,
+ kafka_group_id,
+ write_to_cassandra,
+ batch_size_seconds,
+ cassandra_keyspace,
+ cassandra_table,
+ spark_cores_max,
+ spark_executor_cores,
+ must_fail: bool,
+):
+ app_args = [
+ "--appName",
+ name,
+ "--brokers",
+ ",".join(kafka_broker_dns),
+ "--topics",
+ kafka_topics,
+ "--groupId",
+ kafka_group_id,
+ "--batchSizeSeconds",
+ str(batch_size_seconds),
+ "--cassandraKeyspace",
+ cassandra_keyspace,
+ "--cassandraTable",
+ cassandra_table,
+ ]
if must_fail:
app_args.extend(["--mustFailDueToInvalidArgument"])
@@ -153,40 +189,51 @@ def _submit_consumer(name,
if not write_to_cassandra:
app_args.extend(["--shouldNotWriteToCassandra"])
- cassandra_hosts = map(lambda x: x.split(':')[0], cassandra_native_client_dns)
- cassandra_port = cassandra_native_client_dns[0].split(':')[1]
-
- app_config = ["--supervise",
- "--conf", "spark.cores.max={}".format(spark_cores_max),
- "--conf", "spark.executor.cores={}".format(spark_executor_cores),
- "--conf", "spark.cassandra.connection.host={}".format(",".join(cassandra_hosts)),
- "--conf", "spark.cassandra.connection.port={}".format(cassandra_port),
- "--name", name,
- "--class", CONSUMER_CLASS_NAME]
+ cassandra_hosts = map(lambda x: x.split(":")[0], cassandra_native_client_dns)
+ cassandra_port = cassandra_native_client_dns[0].split(":")[1]
+
+ app_config = [
+ "--supervise",
+ "--conf",
+ "spark.cores.max={}".format(spark_cores_max),
+ "--conf",
+ "spark.executor.cores={}".format(spark_executor_cores),
+ "--conf",
+ "spark.cassandra.connection.host={}".format(",".join(cassandra_hosts)),
+ "--conf",
+ "spark.cassandra.connection.port={}".format(cassandra_port),
+ "--name",
+ name,
+ "--class",
+ CONSUMER_CLASS_NAME,
+ ]
if spark_executor_docker_image:
- app_config.extend([
- "--conf", "spark.mesos.executor.docker.image={}".format(spark_executor_docker_image)
- ])
+ app_config.extend(
+ ["--conf", "spark.mesos.executor.docker.image={}".format(spark_executor_docker_image)]
+ )
args = app_config + COMMON_CONF
+ driver_role = None if group_role else dispatcher["roles"]["executors"]
+
submission_id = spark_utils.submit_job(
app_url=jar,
app_args=" ".join(str(a) for a in app_args),
args=args,
verbose=False,
- service_name=dispatcher['service']['name'],
- driver_role=dispatcher['roles']['executors'],
- spark_user=dispatcher['service']['user'] if sdk_utils.is_strict_mode() else None,
- principal=dispatcher['service']['service_account'] if sdk_utils.is_strict_mode() else None)
+ service_name=dispatcher["service"]["name"],
+ driver_role=driver_role,
+ spark_user=dispatcher["service"]["user"] if sdk_utils.is_strict_mode() else None,
+ principal=dispatcher["service"]["service_account"] if sdk_utils.is_strict_mode() else None,
+ )
return submission_id
def append_submission(output_file: str, dispatcher: dict, submission_id: str):
with open(output_file, "a") as f:
- f.write("{},{}\n".format(dispatcher['service']['name'], submission_id))
+ f.write("{},{}\n".format(dispatcher["service"]["name"], submission_id))
def is_valid_cassandra_keyspace_name(keyspace_name: str) -> bool:
@@ -201,15 +248,12 @@ def __init__(self, dispatchers, num_jobs):
self.prepare()
-
def prepare(self):
raise NotImplementedError
-
def provide(self):
raise NotImplementedError
-
def report(self):
raise NotImplementedError
@@ -226,74 +270,78 @@ def prepare(self):
self.avg_num_jobs_per_dispatcher = self.num_jobs / self.num_dispatchers
self.max_num_jobs_per_dispatcher = math.ceil(self.avg_num_jobs_per_dispatcher)
- self.slots = mapcat(make_repeater(self.max_num_jobs_per_dispatcher),
- self.dispatchers)
-
+ self.slots = mapcat(make_repeater(self.max_num_jobs_per_dispatcher), self.dispatchers)
def provide(self):
return next(self.slots)
-
def report(self):
- log.info('Providing strategy: block')
- log.info('Average number of jobs per dispatcher: %s', self.avg_num_jobs_per_dispatcher)
- log.info('Will run at most %s jobs per dispatcher', self.max_num_jobs_per_dispatcher)
- log.info("\n%s dispatchers: \n%s\n",
- self.num_dispatchers, json.dumps(self.dispatchers, indent=2, sort_keys=True))
+ log.info("Providing strategy: block")
+ log.info("Average number of jobs per dispatcher: %s", self.avg_num_jobs_per_dispatcher)
+ log.info("Will run at most %s jobs per dispatcher", self.max_num_jobs_per_dispatcher)
+ log.info(
+ "\n%s dispatchers: \n%s\n",
+ self.num_dispatchers,
+ json.dumps(self.dispatchers, indent=2, sort_keys=True),
+ )
class DispatcherProvider(object):
"""Provides dispatchers for jobs in a given strategy.
"""
+
def __init__(self, dispatchers, num_jobs, strategy=BlockProvidingStrategy):
self.strategy = strategy(dispatchers, num_jobs)
-
def provide(self):
return self.strategy.provide()
-
def report(self):
return self.strategy.report()
def main(args):
with open(args[""]) as f:
- dispatchers = json.load(f)['spark']
+ dispatchers = json.load(f)["spark"]
with open(args[""]) as f:
infrastructure = json.loads(f.read())
- kafkas = infrastructure['kafka']
+ kafkas = infrastructure["kafka"]
# Assuming only 1 Cassandra cluster.
- cassandra = infrastructure['cassandra'][0]
-
- spark_executor_docker_image = args['--spark-executor-docker-image']
- jar = args["--jar"] if args["--jar"] else DEFAULT_JAR
- submissions_output_file = args[""]
- kafka_package_names = map(lambda kafka: kafka['package_name'], kafkas)
- cassandra_package_name = cassandra['package_name']
- cassandra_service_name = cassandra['service']['name']
- num_producers_per_kafka = int(args['--num-producers-per-kafka'])
- num_consumers_per_producer = int(args['--num-consumers-per-producer'])
- producer_must_fail = args['--producer-must-fail']
- producer_number_of_words = int(args['--producer-number-of-words'])
- producer_words_per_second = int(args['--producer-words-per-second'])
- producer_spark_cores_max = int(args['--producer-spark-cores-max'])
- producer_spark_executor_cores = int(args['--producer-spark-executor-cores'])
- consumer_must_fail = args['--consumer-must-fail']
- consumer_write_to_cassandra = args['--consumer-write-to-cassandra']
- consumer_batch_size_seconds = int(args['--consumer-batch-size-seconds'])
- consumer_spark_cores_max = int(args['--consumer-spark-cores-max'])
- consumer_spark_executor_cores = int(args['--consumer-spark-executor-cores'])
+ cassandra = infrastructure["cassandra"][0]
+
+ spark_executor_docker_image = args["--spark-executor-docker-image"]
+ jar = args["--jar"] if args["--jar"] else DEFAULT_JAR
+ group_role = args["--group-role"]
+ submissions_output_file = args[""]
+ kafka_package_names = map(lambda kafka: kafka["package_name"], kafkas)
+ cassandra_package_name = cassandra["package_name"]
+ cassandra_service_name = cassandra["service"]["name"]
+ num_producers_per_kafka = int(args["--num-producers-per-kafka"])
+ num_consumers_per_producer = int(args["--num-consumers-per-producer"])
+ producer_must_fail = args["--producer-must-fail"]
+ producer_number_of_words = int(args["--producer-number-of-words"])
+ producer_words_per_second = int(args["--producer-words-per-second"])
+ producer_spark_cores_max = int(args["--producer-spark-cores-max"])
+ producer_spark_executor_cores = int(args["--producer-spark-executor-cores"])
+ consumer_must_fail = args["--consumer-must-fail"]
+ consumer_write_to_cassandra = args["--consumer-write-to-cassandra"]
+ consumer_batch_size_seconds = int(args["--consumer-batch-size-seconds"])
+ consumer_spark_cores_max = int(args["--consumer-spark-cores-max"])
+ consumer_spark_executor_cores = int(args["--consumer-spark-executor-cores"])
num_kafkas = len(kafkas)
num_producers = num_kafkas * num_producers_per_kafka
num_consumers = num_producers * num_consumers_per_producer
num_jobs = num_producers + num_consumers
- log.info('Number of Kafka clusters: %s', num_kafkas)
- log.info('Total number of jobs: %s (%s producers, %s consumers)',
- num_jobs, num_producers, num_consumers)
+ log.info("Number of Kafka clusters: %s", num_kafkas)
+ log.info(
+ "Total number of jobs: %s (%s producers, %s consumers)",
+ num_jobs,
+ num_producers,
+ num_consumers,
+ )
dispatcher_provider = DispatcherProvider(dispatchers, num_jobs)
dispatcher_provider.report()
@@ -303,28 +351,34 @@ def main(args):
_install_package_cli(cassandra_package_name)
_install_package_cli(SPARK_PACKAGE_NAME)
- cassandra_native_client_dns = _service_endpoint_dns(cassandra_package_name, cassandra_service_name, "native-client")
+ cassandra_native_client_dns = _service_endpoint_dns(
+ cassandra_package_name, cassandra_service_name, "native-client"
+ )
for kafka_idx, kafka in enumerate(kafkas):
- kafka_package_name = kafka['package_name']
- kafka_service_name = kafka['service']['name']
- kafka_broker_dns = _service_endpoint_dns(kafka_package_name, kafka_service_name, 'broker')
+ kafka_package_name = kafka["package_name"]
+ kafka_service_name = kafka["service"]["name"]
+ kafka_broker_dns = _service_endpoint_dns(kafka_package_name, kafka_service_name, "broker")
- kafka_service_basename = kafka_service_name.split('/')[-1]
+ kafka_service_basename = kafka_service_name.split("/")[-1]
for producer_idx in range(0, num_producers_per_kafka):
dispatcher = dispatcher_provider.provide()
- producer_name = '{}-{}'.format(normalize_string(kafka_service_basename), producer_idx)
+ producer_name = "{}-{}".format(normalize_string(kafka_service_basename), producer_idx)
kafka_topics = producer_name
- producer_cassandra_keyspace = 'keyspace_{}'.format(normalize_string(producer_name))
+ producer_cassandra_keyspace = "keyspace_{}".format(normalize_string(producer_name))
if not is_valid_cassandra_keyspace_name(producer_cassandra_keyspace):
- raise ValueError('\'{}\' is not a valid Cassandra keyspace name'.format(
- producer_cassandra_keyspace))
+ raise ValueError(
+ "'{}' is not a valid Cassandra keyspace name".format(
+ producer_cassandra_keyspace
+ )
+ )
producer_submission_id = _submit_producer(
- '{}-k{:02d}-p{:02d}'.format(PRODUCER_CLASS_NAME, kafka_idx, producer_idx),
+ "{}-k{:02d}-p{:02d}".format(PRODUCER_CLASS_NAME, kafka_idx, producer_idx),
spark_executor_docker_image,
+ group_role,
jar,
kafka_broker_dns,
dispatcher,
@@ -333,23 +387,24 @@ def main(args):
producer_words_per_second,
producer_spark_cores_max,
producer_spark_executor_cores,
- producer_must_fail)
+ producer_must_fail,
+ )
- append_submission(
- submissions_output_file,
- dispatcher,
- producer_submission_id)
+ append_submission(submissions_output_file, dispatcher, producer_submission_id)
for consumer_idx in range(0, num_consumers_per_producer):
dispatcher = dispatcher_provider.provide()
- consumer_name = '{}-{}'.format(producer_name, consumer_idx)
+ consumer_name = "{}-{}".format(producer_name, consumer_idx)
consumer_kafka_group_id = consumer_name
- consumer_cassandra_table = 'table_{}'.format(consumer_idx)
+ consumer_cassandra_table = "table_{}".format(consumer_idx)
consumer_submission_id = _submit_consumer(
- '{}-k{:02d}-p{:02d}-c{:02d}'.format(CONSUMER_CLASS_NAME, kafka_idx, producer_idx, consumer_idx),
+ "{}-k{:02d}-p{:02d}-c{:02d}".format(
+ CONSUMER_CLASS_NAME, kafka_idx, producer_idx, consumer_idx
+ ),
spark_executor_docker_image,
+ group_role,
jar,
kafka_broker_dns,
cassandra_native_client_dns,
@@ -362,12 +417,10 @@ def main(args):
consumer_cassandra_table,
consumer_spark_cores_max,
consumer_spark_executor_cores,
- consumer_must_fail)
+ consumer_must_fail,
+ )
- append_submission(
- submissions_output_file,
- dispatcher,
- consumer_submission_id)
+ append_submission(submissions_output_file, dispatcher, consumer_submission_id)
if __name__ == "__main__":
diff --git a/scale-tests/list_service_tasks.sh b/scale-tests/list_service_tasks.sh
new file mode 100755
index 00000000..9d5bcdfc
--- /dev/null
+++ b/scale-tests/list_service_tasks.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+readonly SERVICE_NAME="${1}"
+
+case ${SERVICE_NAME} in
+ kafka) metric="kafka_app_info_version" ;;
+ cassandra) metric="org_apache_cassandra_metrics_Storage_Load" ;;
+ *) echo "Only 'kafka' and 'cassandra' are valid arguments. '${SERVICE_NAME}' given"; exit 1 ;;
+esac
+
+readonly CLUSTER_URL="$(dcos config show core.dcos_url)"
+
+curl --silent "${CLUSTER_URL}/service/monitoring/prometheus/api/v1/query?query=${metric}" \
+ -H "Authorization: token=$(dcos config show core.dcos_acs_token)" \
+ | jq -r '.data.result[].metric.task_name?'
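For scripting against the same endpoint, a rough Python equivalent of the curl/jq pipeline above is sketched below. The URL layout and token handling mirror the shell version; using the third-party requests library is an assumption:

    import subprocess
    import requests

    def service_task_names(metric: str) -> list:
        # Read cluster URL and token the same way the shell script does.
        cluster_url = subprocess.check_output(
            ["dcos", "config", "show", "core.dcos_url"], text=True
        ).strip()
        token = subprocess.check_output(
            ["dcos", "config", "show", "core.dcos_acs_token"], text=True
        ).strip()
        resp = requests.get(
            "{}/service/monitoring/prometheus/api/v1/query".format(cluster_url),
            params={"query": metric},
            headers={"Authorization": "token={}".format(token)},
        )
        # Same shape as jq '.data.result[].metric.task_name?'.
        return [r["metric"].get("task_name") for r in resp.json()["data"]["result"]]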
diff --git a/scale-tests/run.sh b/scale-tests/run.sh
index 1eec3b2a..e6adf38e 100755
--- a/scale-tests/run.sh
+++ b/scale-tests/run.sh
@@ -25,6 +25,10 @@ function usage () {
echo ' non-interactive \\'
}
+################################################################################
+# Parse and validate command-line arguments and parameter-file settings ########
+################################################################################
+
if [ "${#}" -lt 7 ]; then
echo -e "run.sh needs at least 7 arguments but was given ${#}\\n"
usage
@@ -66,19 +70,21 @@ function is_interactive () {
}
readonly AWS_ACCOUNT='Team 10'
-readonly CONTAINER_NAME="${TEST_NAME}"
+readonly CONTAINER_NAME="${TEST_NAME}-$(basename "${TEST_CONFIG}" .env)"
readonly CONTAINER_SSH_AGENT_EXPORTS=/tmp/ssh-agent-exports
readonly CONTAINER_SSH_KEY=/ssh/key
readonly CONTAINER_FINISHED_SETTING_UP_FILE=/tmp/finished-setting-up
-readonly IMAGE_NAME="mesosphere/dcos-commons:${TEST_NAME}"
+readonly IMAGE_NAME="mesosphere/dcos-commons:${CONTAINER_NAME}"
readonly SCALE_TESTS_DIRECTORY="scale-tests"
-readonly TEST_DIRECTORY="${SCALE_TESTS_DIRECTORY}/runs/${TEST_NAME}"
+readonly TEST_DIRECTORY="${SCALE_TESTS_DIRECTORY}/runs/${CONTAINER_NAME}"
readonly TEST_REPOSITORY_DIRECTORY="${SCALE_TESTS_DIRECTORY}/checkouts/${TEST_NAME}"
readonly TEST_S3_DIRECTORY_URL="s3://${TEST_S3_BUCKET}/${TEST_S3_FOLDER}/"
readonly LOGS_DIRECTORY="${TEST_DIRECTORY}/script_logs"
readonly LOG_FILE="${LOGS_DIRECTORY}/$(date +%Y%m%dT%H%M%SZ)_$(whoami).log"
readonly DCOS_CLI_REFRESH_INTERVAL_SEC=600 # 10 minutes.
+readonly GROUP_FILE_NAME="${TEST_REPOSITORY_DIRECTORY}/marathon_group.json"
+# shellcheck source=/dev/null
source "${TEST_CONFIG}"
mkdir -p "${TEST_DIRECTORY}"
@@ -92,7 +98,6 @@ fi
for boolean_option in SHOULD_INSTALL_INFRASTRUCTURE \
SHOULD_INSTALL_NON_GPU_DISPATCHERS \
SHOULD_INSTALL_GPU_DISPATCHERS \
- SHOULD_RUN_FAILING_STREAMING_JOBS \
SHOULD_RUN_FINITE_STREAMING_JOBS \
SHOULD_RUN_INFINITE_STREAMING_JOBS \
SHOULD_RUN_BATCH_JOBS \
@@ -119,63 +124,87 @@ function container_exec () {
declare -x AWS_PROFILE
eval "$(maws li "${AWS_ACCOUNT}")"
-readonly FINITE_NUM_PRODUCERS=$((KAFKA_CLUSTER_COUNT * FINITE_NUM_PRODUCERS_PER_KAFKA))
-readonly FINITE_NUM_CONSUMERS=$((FINITE_NUM_PRODUCERS * FINITE_NUM_CONSUMERS_PER_PRODUCER))
-readonly FINITE_NUM_JOBS=$((FINITE_NUM_PRODUCERS + FINITE_NUM_CONSUMERS))
-readonly INFINITE_NUM_PRODUCERS=$((KAFKA_CLUSTER_COUNT * INFINITE_NUM_PRODUCERS_PER_KAFKA))
-readonly INFINITE_NUM_CONSUMERS=$((INFINITE_NUM_PRODUCERS * INFINITE_NUM_CONSUMERS_PER_PRODUCER))
-readonly INFINITE_NUM_JOBS=$((INFINITE_NUM_PRODUCERS + INFINITE_NUM_CONSUMERS))
-readonly STREAMING_NUM_JOBS=$((FINITE_NUM_JOBS + INFINITE_NUM_JOBS))
-
-readonly NON_GPU_TOTAL_QUOTA_DRIVERS_CPUS=$((NON_GPU_NUM_DISPATCHERS * NON_GPU_QUOTA_DRIVERS_CPUS))
-readonly NON_GPU_TOTAL_QUOTA_DRIVERS_MEM=$((NON_GPU_NUM_DISPATCHERS * NON_GPU_QUOTA_DRIVERS_MEM))
-readonly NON_GPU_TOTAL_QUOTA_EXECUTORS_CPUS=$((NON_GPU_NUM_DISPATCHERS * NON_GPU_QUOTA_EXECUTORS_CPUS))
-readonly NON_GPU_TOTAL_QUOTA_EXECUTORS_MEM=$((NON_GPU_NUM_DISPATCHERS * NON_GPU_QUOTA_EXECUTORS_MEM))
-
-readonly GPU_TOTAL_QUOTA_DRIVERS_CPUS=$((GPU_NUM_DISPATCHERS * GPU_QUOTA_DRIVERS_CPUS))
-readonly GPU_TOTAL_QUOTA_DRIVERS_MEM=$((GPU_NUM_DISPATCHERS * GPU_QUOTA_DRIVERS_MEM))
-readonly GPU_TOTAL_QUOTA_DRIVERS_GPUS=$((GPU_NUM_DISPATCHERS * GPU_QUOTA_DRIVERS_GPUS))
-readonly GPU_TOTAL_QUOTA_EXECUTORS_CPUS=$((GPU_NUM_DISPATCHERS * GPU_QUOTA_EXECUTORS_CPUS))
-readonly GPU_TOTAL_QUOTA_EXECUTORS_MEM=$((GPU_NUM_DISPATCHERS * GPU_QUOTA_EXECUTORS_MEM))
-readonly GPU_TOTAL_QUOTA_EXECUTORS_GPUS=$((GPU_NUM_DISPATCHERS * GPU_QUOTA_EXECUTORS_GPUS))
-
-readonly NON_GPU_QUOTA_CPUS=$((NON_GPU_TOTAL_QUOTA_DRIVERS_CPUS + NON_GPU_TOTAL_QUOTA_EXECUTORS_CPUS))
-readonly NON_GPU_QUOTA_MEM=$((NON_GPU_TOTAL_QUOTA_DRIVERS_MEM + NON_GPU_TOTAL_QUOTA_EXECUTORS_MEM))
-readonly GPU_QUOTA_CPUS=$((GPU_TOTAL_QUOTA_DRIVERS_CPUS + GPU_TOTAL_QUOTA_EXECUTORS_CPUS))
-readonly GPU_QUOTA_MEM=$((GPU_TOTAL_QUOTA_DRIVERS_MEM + GPU_TOTAL_QUOTA_EXECUTORS_MEM))
+################################################################################
+# Compute derived job counts and quota totals, then print a pre-test report ####
+################################################################################
+
+readonly SPARK_TOTAL_DISPATCHERS=$((SPARK_NON_GPU_DISPATCHERS + SPARK_GPU_DISPATCHERS))
+readonly STREAMING_FINITE_PRODUCERS=$((KAFKA_CLUSTER_COUNT * STREAMING_FINITE_PRODUCERS_PER_KAFKA))
+readonly STREAMING_FINITE_CONSUMERS=$((STREAMING_FINITE_PRODUCERS * STREAMING_FINITE_CONSUMERS_PER_PRODUCER))
+readonly STREAMING_FINITE_JOBS=$((STREAMING_FINITE_PRODUCERS + STREAMING_FINITE_CONSUMERS))
+readonly STREAMING_INFINITE_PRODUCERS=$((KAFKA_CLUSTER_COUNT * STREAMING_INFINITE_PRODUCERS_PER_KAFKA))
+readonly STREAMING_INFINITE_CONSUMERS=$((STREAMING_INFINITE_PRODUCERS * STREAMING_INFINITE_CONSUMERS_PER_PRODUCER))
+readonly STREAMING_INFINITE_JOBS=$((STREAMING_INFINITE_PRODUCERS + STREAMING_INFINITE_CONSUMERS))
+readonly STREAMING_JOBS=$((STREAMING_FINITE_JOBS + STREAMING_INFINITE_JOBS))
+
+readonly SPARK_NON_GPU_TOTAL_QUOTA_DRIVERS_CPUS=$((SPARK_NON_GPU_DISPATCHERS * SPARK_NON_GPU_QUOTA_DRIVERS_CPUS))
+readonly SPARK_NON_GPU_TOTAL_QUOTA_DRIVERS_MEM=$((SPARK_NON_GPU_DISPATCHERS * SPARK_NON_GPU_QUOTA_DRIVERS_MEM))
+readonly SPARK_NON_GPU_TOTAL_QUOTA_EXECUTORS_CPUS=$((SPARK_NON_GPU_DISPATCHERS * SPARK_NON_GPU_QUOTA_EXECUTORS_CPUS))
+readonly SPARK_NON_GPU_TOTAL_QUOTA_EXECUTORS_MEM=$((SPARK_NON_GPU_DISPATCHERS * SPARK_NON_GPU_QUOTA_EXECUTORS_MEM))
+
+readonly SPARK_GPU_TOTAL_QUOTA_DRIVERS_CPUS=$((SPARK_GPU_DISPATCHERS * SPARK_GPU_QUOTA_DRIVERS_CPUS))
+readonly SPARK_GPU_TOTAL_QUOTA_DRIVERS_MEM=$((SPARK_GPU_DISPATCHERS * SPARK_GPU_QUOTA_DRIVERS_MEM))
+readonly SPARK_GPU_TOTAL_QUOTA_DRIVERS_GPUS=$((SPARK_GPU_DISPATCHERS * SPARK_GPU_QUOTA_DRIVERS_GPUS))
+readonly SPARK_GPU_TOTAL_QUOTA_EXECUTORS_CPUS=$((SPARK_GPU_DISPATCHERS * SPARK_GPU_QUOTA_EXECUTORS_CPUS))
+readonly SPARK_GPU_TOTAL_QUOTA_EXECUTORS_MEM=$((SPARK_GPU_DISPATCHERS * SPARK_GPU_QUOTA_EXECUTORS_MEM))
+readonly SPARK_GPU_TOTAL_QUOTA_EXECUTORS_GPUS=$((SPARK_GPU_DISPATCHERS * SPARK_GPU_QUOTA_EXECUTORS_GPUS))
+
+readonly SPARK_NON_GPU_QUOTA_CPUS=$((SPARK_NON_GPU_TOTAL_QUOTA_DRIVERS_CPUS + SPARK_NON_GPU_TOTAL_QUOTA_EXECUTORS_CPUS))
+readonly SPARK_NON_GPU_QUOTA_MEM=$((SPARK_NON_GPU_TOTAL_QUOTA_DRIVERS_MEM + SPARK_NON_GPU_TOTAL_QUOTA_EXECUTORS_MEM))
+readonly SPARK_GPU_QUOTA_CPUS=$((SPARK_GPU_TOTAL_QUOTA_DRIVERS_CPUS + SPARK_GPU_TOTAL_QUOTA_EXECUTORS_CPUS))
+readonly SPARK_GPU_QUOTA_MEM=$((SPARK_GPU_TOTAL_QUOTA_DRIVERS_MEM + SPARK_GPU_TOTAL_QUOTA_EXECUTORS_MEM))
+
+readonly TOTAL_QUOTA_CPUS=$((SPARK_NON_GPU_QUOTA_CPUS +
+ SPARK_GPU_QUOTA_CPUS +
+ ZOOKEEPER_CPUS +
+ KAFKA_CPUS +
+ CASSANDRA_CPUS +
+ DSENGINE_CPUS))
+readonly TOTAL_QUOTA_MEM=$((SPARK_NON_GPU_QUOTA_MEM +
+ SPARK_GPU_QUOTA_MEM +
+ ZOOKEEPER_MEM +
+ KAFKA_MEM +
+ CASSANDRA_MEM +
+ DSENGINE_MEM))
+readonly TOTAL_QUOTA_GPUS=$((SPARK_GPU_TOTAL_QUOTA_DRIVERS_GPUS +
+ SPARK_GPU_TOTAL_QUOTA_EXECUTORS_GPUS +
+ DSENGINE_GPUS))
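The group quota created further below is simply the sum of these per-workload figures. A small sketch of the same arithmetic with illustrative numbers (the real values come from the sourced ${TEST_CONFIG} file):

    # Illustrative per-workload CPU figures only; MEM and GPU follow the same pattern.
    spark_non_gpu_quota_cpus, spark_gpu_quota_cpus = 60, 20
    zookeeper_cpus, kafka_cpus, cassandra_cpus, dsengine_cpus = 3, 15, 15, 4

    total_quota_cpus = (
        spark_non_gpu_quota_cpus
        + spark_gpu_quota_cpus
        + zookeeper_cpus
        + kafka_cpus
        + cassandra_cpus
        + dsengine_cpus
    )
    print(total_quota_cpus)  # 117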
echo
echo "Test '${TEST_NAME}' parameters:"
echo
+echo "CLUSTER_URL '${CLUSTER_URL}'"
+echo
echo "KAFKA_CLUSTER_COUNT: ${KAFKA_CLUSTER_COUNT}"
echo "CASSANDRA_CLUSTER_COUNT: ${CASSANDRA_CLUSTER_COUNT}"
+echo "SPARK_TOTAL_DISPATCHERS: ${SPARK_TOTAL_DISPATCHERS} (non-GPU: ${SPARK_NON_GPU_DISPATCHERS}, GPU: ${SPARK_GPU_DISPATCHERS})"
echo
-echo "NON_GPU_NUM_DISPATCHERS: ${NON_GPU_NUM_DISPATCHERS}"
+echo "SPARK_NON_GPU_DISPATCHERS: ${SPARK_NON_GPU_DISPATCHERS}"
echo " Quota cpus/mem:"
echo -n " Each:"
-echo -n " driver ${NON_GPU_QUOTA_DRIVERS_CPUS}/${NON_GPU_QUOTA_DRIVERS_MEM},"
-echo " executor ${NON_GPU_QUOTA_EXECUTORS_CPUS}/${NON_GPU_QUOTA_EXECUTORS_MEM}"
+echo -n " driver ${SPARK_NON_GPU_QUOTA_DRIVERS_CPUS}/${SPARK_NON_GPU_QUOTA_DRIVERS_MEM},"
+echo " executor ${SPARK_NON_GPU_QUOTA_EXECUTORS_CPUS}/${SPARK_NON_GPU_QUOTA_EXECUTORS_MEM}"
echo -n " Total:"
-echo -n " driver ${NON_GPU_TOTAL_QUOTA_DRIVERS_CPUS}/${NON_GPU_TOTAL_QUOTA_DRIVERS_MEM},"
-echo " executor ${NON_GPU_TOTAL_QUOTA_EXECUTORS_CPUS}/${NON_GPU_TOTAL_QUOTA_EXECUTORS_MEM}"
+echo -n " driver ${SPARK_NON_GPU_TOTAL_QUOTA_DRIVERS_CPUS}/${SPARK_NON_GPU_TOTAL_QUOTA_DRIVERS_MEM},"
+echo " executor ${SPARK_NON_GPU_TOTAL_QUOTA_EXECUTORS_CPUS}/${SPARK_NON_GPU_TOTAL_QUOTA_EXECUTORS_MEM}"
echo
-echo "GPU_NUM_DISPATCHERS: ${GPU_NUM_DISPATCHERS}"
+echo "SPARK_GPU_DISPATCHERS: ${SPARK_GPU_DISPATCHERS}"
echo " Quota cpus/mem/gpus:"
echo -n " Each:"
-echo -n " driver ${GPU_QUOTA_DRIVERS_CPUS}/${GPU_QUOTA_DRIVERS_MEM}/${GPU_QUOTA_DRIVERS_GPUS:--},"
-echo " executor ${GPU_QUOTA_EXECUTORS_CPUS:--}/${GPU_QUOTA_EXECUTORS_MEM:--}/${GPU_QUOTA_EXECUTORS_GPUS:--}"
+echo -n " driver ${SPARK_GPU_QUOTA_DRIVERS_CPUS:-0}/${SPARK_GPU_QUOTA_DRIVERS_MEM:-0}/${SPARK_GPU_QUOTA_DRIVERS_GPUS:-0},"
+echo " executor ${SPARK_GPU_QUOTA_EXECUTORS_CPUS:-0}/${SPARK_GPU_QUOTA_EXECUTORS_MEM:-0}/${SPARK_GPU_QUOTA_EXECUTORS_GPUS:-0}"
echo -n " Total:"
-echo -n " driver ${GPU_TOTAL_QUOTA_DRIVERS_CPUS:--}/${GPU_TOTAL_QUOTA_DRIVERS_MEM:--}/${GPU_TOTAL_QUOTA_DRIVERS_GPUS:--},"
-echo " executor ${GPU_TOTAL_QUOTA_EXECUTORS_CPUS:--}/${GPU_TOTAL_QUOTA_EXECUTORS_MEM:--}/${GPU_TOTAL_QUOTA_EXECUTORS_GPUS:--}"
+echo -n " driver ${SPARK_GPU_TOTAL_QUOTA_DRIVERS_CPUS:-0}/${SPARK_GPU_TOTAL_QUOTA_DRIVERS_MEM:-0}/${SPARK_GPU_TOTAL_QUOTA_DRIVERS_GPUS:-0},"
+echo " executor ${SPARK_GPU_TOTAL_QUOTA_EXECUTORS_CPUS:-0}/${SPARK_GPU_TOTAL_QUOTA_EXECUTORS_MEM:-0}/${SPARK_GPU_TOTAL_QUOTA_EXECUTORS_GPUS:-0}"
echo
-echo "FINITE_NUM_JOBS: ${FINITE_NUM_JOBS}"
-echo "INFINITE_NUM_JOBS: ${INFINITE_NUM_JOBS}"
-echo "STREAMING_NUM_JOBS: ${STREAMING_NUM_JOBS}"
-echo "BATCH_SUBMITS_PER_MIN: ${BATCH_SUBMITS_PER_MIN}"
-echo "GPU_SUBMITS_PER_MIN: ${GPU_SUBMITS_PER_MIN}"
+echo "STREAMING_JOBS: ${STREAMING_JOBS} (finite: ${STREAMING_FINITE_JOBS}, infinite: ${STREAMING_INFINITE_JOBS})"
+echo "BATCH_MAX_NON_GPU_JOBS: ${BATCH_MAX_NON_GPU_JOBS}"
+echo "BATCH_SUBMITS_PER_MIN: ${BATCH_SUBMITS_PER_MIN}"
+echo "GPU_SUBMITS_PER_MIN: ${GPU_SUBMITS_PER_MIN}"
echo
-echo "Total CPU quota: $((NON_GPU_QUOTA_CPUS + GPU_QUOTA_CPUS))"
-echo "Total MEM quota: $((NON_GPU_QUOTA_MEM + GPU_QUOTA_MEM))"
+echo "Total CPU quota: ${TOTAL_QUOTA_CPUS}"
+echo "Total MEM quota: ${TOTAL_QUOTA_MEM}"
+echo "Total GPU quota: ${TOTAL_QUOTA_GPUS}"
echo
echo "Existing S3 artifacts for ${TEST_NAME}:"
@@ -189,11 +218,14 @@ case "${ANSWER}" in
* ) log 'Exiting...' && exit 0;;
esac
+################################################################################
+# Confirm per-step toggles when running interactively ##########################
+################################################################################
+
if is_interactive; then
for boolean_option in SHOULD_INSTALL_INFRASTRUCTURE \
SHOULD_INSTALL_NON_GPU_DISPATCHERS \
SHOULD_INSTALL_GPU_DISPATCHERS \
- SHOULD_RUN_FAILING_STREAMING_JOBS \
SHOULD_RUN_FINITE_STREAMING_JOBS \
SHOULD_RUN_INFINITE_STREAMING_JOBS \
SHOULD_RUN_BATCH_JOBS \
@@ -208,8 +240,12 @@ if is_interactive; then
done
fi
+################################################################################
+# Create Docker container for test if it doesn't exist yet #####################
+################################################################################
+
set +e
-docker inspect -f {{.State.Running}} "${CONTAINER_NAME}" > /dev/null 2>&1
+docker inspect -f '{{.State.Running}}' "${CONTAINER_NAME}" > /dev/null 2>&1
readonly container_running=$?
docker exec -it "${CONTAINER_NAME}" test -f "${CONTAINER_FINISHED_SETTING_UP_FILE}"
@@ -255,6 +291,21 @@ if [ ${container_running} -ne 0 ] || [ ${container_finished_setting_up} -ne 0 ];
container_exec \
ssh-add -k "${CONTAINER_SSH_KEY}"
+ container_exec \
+ curl "${DCOS_CLI_URL}" -o dcos
+
+ container_exec \
+ chmod +x ./dcos
+
+ container_exec \
+ mv dcos /usr/local/bin
+
+ container_exec \
+ which dcos
+
+ container_exec \
+ dcos
+
container_exec \
dcos cluster setup \
--insecure \
@@ -274,40 +325,87 @@ if [ ${container_running} -ne 0 ] || [ ${container_finished_setting_up} -ne 0 ];
container_exec \
dcos package install --yes dcos-enterprise-cli
- if [ -n "${ZOOKEEPER_PACKAGE_REPO}" ]; then
- container_exec \
- dcos package repo add --index=0 zk-aws "${ZOOKEEPER_PACKAGE_REPO}" || true
- fi
- if [ -n "${KAFKA_PACKAGE_REPO}" ]; then
- container_exec \
- dcos package repo add --index=0 kafka-aws "${KAFKA_PACKAGE_REPO}" || true
- fi
- if [ -n "${CASSANDRA_PACKAGE_REPO}" ]; then
- container_exec \
- dcos package repo add --index=0 cassandra-aws "${CASSANDRA_PACKAGE_REPO}" || true
+ container_exec \
+ touch "${CONTAINER_FINISHED_SETTING_UP_FILE}"
+fi
+
+################################################################################
+# Create package repository stubs if they're not there #########################
+################################################################################
+
+readonly dcos_package_repo_uris="$(container_exec 'bash -c "dcos package repo list --json | jq -r '.repositories[].uri'"')"
+
+for package_repo_envvar in ZOOKEEPER_PACKAGE_REPO \
+ KAFKA_PACKAGE_REPO \
+ CASSANDRA_PACKAGE_REPO \
+ SPARK_PACKAGE_REPO \
+ DSENGINE_PACKAGE_REPO; do
+ # Skip envvar if its value is empty.
+ if [ -z "${!package_repo_envvar}" ]; then
+ continue;
fi
- if [ -n "${SPARK_PACKAGE_REPO}" ]; then
+
+ # Add package repository stub if it's not already there.
+ if ! grep -qx "${!package_repo_envvar}" <<< "${dcos_package_repo_uris}"; then
+ # ZOOKEEPER_PACKAGE_REPO => zookeeper.
+ package_repo_name="$(awk '{s = tolower($0); sub(/_package_repo$/, "", s); print(s)}' <<< "${package_repo_envvar}")"
container_exec \
- dcos package repo add --index=0 spark-aws "${SPARK_PACKAGE_REPO}" || true
+ dcos package repo add --index=0 "${package_repo_name}" "${!package_repo_envvar}" || true
fi
+done
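The awk expression above lowercases the variable name and strips the _package_repo suffix (e.g. ZOOKEEPER_PACKAGE_REPO becomes zookeeper). The same transformation in Python, with an illustrative input:

    envvar = "ZOOKEEPER_PACKAGE_REPO"
    package_repo_name = envvar.lower()
    if package_repo_name.endswith("_package_repo"):
        package_repo_name = package_repo_name[: -len("_package_repo")]
    print(package_repo_name)  # "zookeeper"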
+
+################################################################################
+# Create Marathon group if it doesn't exist ####################################
+################################################################################
+
+if ! grep -qx "/${GROUP_NAME}" <<< "$(container_exec 'bash -c "dcos marathon group list --json | jq -r '.[].id'"')"; then
+ cat <<-EOF > "${GROUP_FILE_NAME}"
+ {
+ "id": "${GROUP_NAME}",
+ "enforceRole": true
+ }
+ EOF
container_exec \
- touch "${CONTAINER_FINISHED_SETTING_UP_FILE}"
+ dcos marathon group add "${GROUP_FILE_NAME}"
fi
+################################################################################
+# Create quota if it doesn't already exist #####################################
+################################################################################
+
+if ! grep -qx "${GROUP_NAME}" <<< "$(container_exec 'bash -c "dcos quota list --json | jq -r '.[].role'"')"; then
+ container_exec \
+ dcos quota create "${GROUP_NAME}" \
+ --cpu "${TOTAL_QUOTA_CPUS}" \
+ --mem "${TOTAL_QUOTA_MEM}" \
+ --gpu "${TOTAL_QUOTA_GPUS}"
+fi
+
+################################################################################
+# Install infrastructure #######################################################
+################################################################################
+
if [ "${SHOULD_INSTALL_INFRASTRUCTURE}" = true ]; then
log 'Installing infrastructure'
start_time=$(date +%s)
container_exec \
./scale-tests/setup_streaming.py "${TEST_DIRECTORY}/${INFRASTRUCTURE_OUTPUT_FILE}" \
--service-names-prefix "${SERVICE_NAMES_PREFIX}" \
- --kafka-zookeeper-config "${KAFKA_ZOOKEEPER_CONFIG}" \
+ --kafka-zookeeper-package-name "${ZOOKEEPER_PACKAGE_NAME}" \
+ --kafka-zookeeper-config "${ZOOKEEPER_CONFIG}" \
--kafka-cluster-count "${KAFKA_CLUSTER_COUNT}" \
+ --kafka-package-name "${KAFKA_PACKAGE_NAME}" \
+ --kafka-user "${KAFKA_USER}" \
--kafka-config "${KAFKA_CONFIG}" \
--cassandra-cluster-count "${CASSANDRA_CLUSTER_COUNT}" \
- --cassandra-config "${CASSANDRA_CONFIG}"
+ --cassandra-package-name "${CASSANDRA_PACKAGE_NAME}" \
+ --cassandra-user "${CASSANDRA_USER}" \
+ --cassandra-config "${CASSANDRA_CONFIG}" \
+ --dsengine-package-name "${DSENGINE_PACKAGE_NAME}" \
+ --dsengine-config "${DSENGINE_CONFIG}"
end_time=$(date +%s)
- runtime=$(($end_time - $start_time))
+ runtime=$((end_time - start_time))
log "Installed infrastructure in ${runtime} seconds"
log 'Uploading infrastructure file to S3'
@@ -319,55 +417,63 @@ else
log 'Skipping infrastructure installation'
fi
+################################################################################
+# Install non-GPU Spark dispatchers ############################################
+################################################################################
+
if [ "${SHOULD_INSTALL_NON_GPU_DISPATCHERS}" = true ]; then
log 'Installing non-GPU dispatchers'
start_time=$(date +%s)
container_exec \
./scale-tests/deploy-dispatchers.py \
- --quota-drivers-cpus "${NON_GPU_QUOTA_DRIVERS_CPUS}" \
- --quota-drivers-mem "${NON_GPU_QUOTA_DRIVERS_MEM}" \
- --quota-executors-cpus "${NON_GPU_QUOTA_EXECUTORS_CPUS}" \
- --quota-executors-mem "${NON_GPU_QUOTA_EXECUTORS_MEM}" \
- "${NON_GPU_NUM_DISPATCHERS}" \
+ --group-role "${GROUP_NAME}" \
+ --options-json "${SPARK_CONFIG}" \
+ --create-quotas false \
+ "${SPARK_NON_GPU_DISPATCHERS}" \
"${SERVICE_NAMES_PREFIX}" \
- "${TEST_DIRECTORY}/${NON_GPU_DISPATCHERS_OUTPUT_FILE}"
+ "${TEST_DIRECTORY}/${SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE}"
end_time=$(date +%s)
- runtime=$(($end_time - $start_time))
+ runtime=$((end_time - start_time))
log "Installed non-GPU dispatchers in ${runtime} seconds"
log 'Uploading non-GPU dispatcher list to S3'
container_exec \
aws s3 cp --acl public-read \
- "${TEST_DIRECTORY}/${NON_GPU_DISPATCHERS_OUTPUT_FILE}" \
+ "${TEST_DIRECTORY}/${SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE}" \
"${TEST_S3_DIRECTORY_URL}"
log 'Uploading non-GPU JSON dispatcher list to S3'
container_exec \
aws s3 cp --acl public-read \
- "${TEST_DIRECTORY}/${NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" \
+ "${TEST_DIRECTORY}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" \
"${TEST_S3_DIRECTORY_URL}"
else
log 'Skipping non-GPU dispatchers installation'
fi
+################################################################################
+# Install GPU Spark dispatchers ################################################
+################################################################################
+
if [ "${SHOULD_INSTALL_GPU_DISPATCHERS}" = true ]; then
log 'Installing GPU dispatchers'
start_time=$(date +%s)
container_exec \
./scale-tests/deploy-dispatchers.py \
- --quota-drivers-cpus "${GPU_QUOTA_DRIVERS_CPUS}" \
- --quota-drivers-mem "${GPU_QUOTA_DRIVERS_MEM}" \
- "${GPU_NUM_DISPATCHERS}" \
+ --group-role "${GROUP_NAME}" \
+ --options-json "${SPARK_CONFIG}" \
+ --create-quotas false \
+ "${SPARK_GPU_DISPATCHERS}" \
"${SERVICE_NAMES_PREFIX}gpu-" \
- "${TEST_DIRECTORY}/${GPU_DISPATCHERS_OUTPUT_FILE}"
+ "${TEST_DIRECTORY}/${SPARK_GPU_DISPATCHERS_OUTPUT_FILE}"
end_time=$(date +%s)
- runtime=$(($end_time - $start_time))
+ runtime=$((end_time - start_time))
log "Installed GPU dispatchers in ${runtime} seconds"
- if [ "${GPU_REMOVE_EXECUTORS_ROLES_QUOTAS}" = true ]; then
+ if [ "${SPARK_GPU_REMOVE_EXECUTORS_ROLES_QUOTAS}" = true ]; then
log 'Removing GPU executors roles quotas'
- last_gpu_index=$(($GPU_NUM_DISPATCHERS - 1))
- for i in $(seq 0 "${last_gpu_index}"); do
+ last_gpu_index=$((SPARK_GPU_DISPATCHERS - 1))
+ for i in $(seq 0 ${last_gpu_index}); do
container_exec \
dcos spark quota remove "${TEST_NAME}__gpu-spark-0${i}-executors-role"
done
@@ -376,25 +482,29 @@ if [ "${SHOULD_INSTALL_GPU_DISPATCHERS}" = true ]; then
log 'Uploading GPU dispatcher list to S3'
container_exec \
aws s3 cp --acl public-read \
- "${TEST_DIRECTORY}/${GPU_DISPATCHERS_OUTPUT_FILE}" \
+ "${TEST_DIRECTORY}/${SPARK_GPU_DISPATCHERS_OUTPUT_FILE}" \
"${TEST_S3_DIRECTORY_URL}"
log 'Uploading GPU JSON dispatcher list to S3'
container_exec \
aws s3 cp --acl public-read \
- "${TEST_DIRECTORY}/${GPU_DISPATCHERS_JSON_OUTPUT_FILE}" \
+ "${TEST_DIRECTORY}/${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" \
"${TEST_S3_DIRECTORY_URL}"
else
log 'Skipping GPU dispatchers installation'
fi
-if [[ -s ${TEST_DIRECTORY}/${NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE} && -s ${TEST_DIRECTORY}/${GPU_DISPATCHERS_JSON_OUTPUT_FILE} ]]; then
+################################################################################
+# Upload merged (non-GPU + GPU) Spark dispatcher list file #####################
+################################################################################
+
+if [[ -s ${TEST_DIRECTORY}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE} && -s ${TEST_DIRECTORY}/${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE} ]]; then
log 'Merging non-GPU and GPU dispatcher list files'
container_exec "\
jq -s \
'{spark: (.[0].spark + .[1].spark)}' \
- ${TEST_DIRECTORY}/${NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE} \
- ${TEST_DIRECTORY}/${GPU_DISPATCHERS_JSON_OUTPUT_FILE} \
+ ${TEST_DIRECTORY}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE} \
+ ${TEST_DIRECTORY}/${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE} \
> ${TEST_DIRECTORY}/${DISPATCHERS_JSON_OUTPUT_FILE} \
"
@@ -407,106 +517,83 @@ else
log 'Skipping merging of non-GPU and GPU dispatcher list files'
fi
-if [ "${SHOULD_RUN_FAILING_STREAMING_JOBS}" = true ]; then
- log 'Starting failing jobs'
- start_time=$(date +%s)
- container_exec \
- ./scale-tests/kafka_cassandra_streaming_test.py \
- "${TEST_DIRECTORY}/${NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" \
- "${TEST_DIRECTORY}/${INFRASTRUCTURE_OUTPUT_FILE}" \
- "${TEST_DIRECTORY}/${FAILING_SUBMISSIONS_OUTPUT_FILE}" \
- --spark-executor-docker-image \""${SPARK_EXECUTOR_DOCKER_IMAGE}"\" \
- --jar "${TEST_ASSEMBLY_JAR_URL}" \
- --num-producers-per-kafka "${FAILING_NUM_PRODUCERS_PER_KAFKA}" \
- --num-consumers-per-producer "${FAILING_NUM_CONSUMERS_PER_PRODUCER}" \
- --producer-must-fail \
- --producer-number-of-words "${FAILING_PRODUCER_NUMBER_OF_WORDS}" \
- --producer-words-per-second "${FAILING_PRODUCER_WORDS_PER_SECOND}" \
- --producer-spark-cores-max "${FAILING_PRODUCER_SPARK_CORES_MAX}" \
- --producer-spark-executor-cores "${FAILING_PRODUCER_SPARK_EXECUTOR_CORES}" \
- --consumer-must-fail \
- --consumer-write-to-cassandra \
- --consumer-batch-size-seconds "${FAILING_CONSUMER_BATCH_SIZE_SECONDS}" \
- --consumer-spark-cores-max "${FAILING_CONSUMER_SPARK_CORES_MAX}" \
- --consumer-spark-executor-cores "${FAILING_CONSUMER_SPARK_EXECUTOR_CORES}"
- end_time=$(date +%s)
- runtime=$(($end_time - $start_time))
- log "Started failing jobs in ${runtime} seconds"
-
- log 'Uploading failing jobs submissions file'
- container_exec \
- aws s3 cp --acl public-read \
- "${TEST_DIRECTORY}/${FAILING_SUBMISSIONS_OUTPUT_FILE}" \
- "${TEST_S3_DIRECTORY_URL}"
-else
- log 'Skipping running of failing streaming jobs'
-fi
+################################################################################
+# Run finite streaming jobs ####################################################
+################################################################################
if [ "${SHOULD_RUN_FINITE_STREAMING_JOBS}" = true ]; then
log 'Starting finite jobs. Consumers write to Cassandra'
start_time=$(date +%s)
container_exec \
./scale-tests/kafka_cassandra_streaming_test.py \
- "${TEST_DIRECTORY}/${NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" \
+ "${TEST_DIRECTORY}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" \
"${TEST_DIRECTORY}/${INFRASTRUCTURE_OUTPUT_FILE}" \
- "${TEST_DIRECTORY}/${FINITE_SUBMISSIONS_OUTPUT_FILE}" \
+ "${TEST_DIRECTORY}/${STREAMING_FINITE_SUBMISSIONS_OUTPUT_FILE}" \
--spark-executor-docker-image \""${SPARK_EXECUTOR_DOCKER_IMAGE}"\" \
--jar "${TEST_ASSEMBLY_JAR_URL}" \
- --num-producers-per-kafka "${FINITE_NUM_PRODUCERS_PER_KAFKA}" \
- --num-consumers-per-producer "${FINITE_NUM_CONSUMERS_PER_PRODUCER}" \
- --producer-number-of-words "${FINITE_PRODUCER_NUMBER_OF_WORDS}" \
- --producer-words-per-second "${FINITE_PRODUCER_WORDS_PER_SECOND}" \
- --producer-spark-cores-max "${FINITE_PRODUCER_SPARK_CORES_MAX}" \
- --producer-spark-executor-cores "${FINITE_PRODUCER_SPARK_EXECUTOR_CORES}" \
+ --num-producers-per-kafka "${STREAMING_FINITE_PRODUCERS_PER_KAFKA}" \
+ --num-consumers-per-producer "${STREAMING_FINITE_CONSUMERS_PER_PRODUCER}" \
+ --producer-number-of-words "${STREAMING_FINITE_PRODUCER_NUMBER_OF_WORDS}" \
+ --producer-words-per-second "${STREAMING_FINITE_PRODUCER_WORDS_PER_SECOND}" \
+ --producer-spark-cores-max "${STREAMING_FINITE_PRODUCER_SPARK_CORES_MAX}" \
+ --producer-spark-executor-cores "${STREAMING_FINITE_PRODUCER_SPARK_EXECUTOR_CORES}" \
--consumer-write-to-cassandra \
- --consumer-batch-size-seconds "${FINITE_CONSUMER_BATCH_SIZE_SECONDS}" \
- --consumer-spark-cores-max "${FINITE_CONSUMER_SPARK_CORES_MAX}" \
- --consumer-spark-executor-cores "${FINITE_CONSUMER_SPARK_EXECUTOR_CORES}"
+ --consumer-batch-size-seconds "${STREAMING_FINITE_CONSUMER_BATCH_SIZE_SECONDS}" \
+ --consumer-spark-cores-max "${STREAMING_FINITE_CONSUMER_SPARK_CORES_MAX}" \
+ --consumer-spark-executor-cores "${STREAMING_FINITE_CONSUMER_SPARK_EXECUTOR_CORES}"
end_time=$(date +%s)
- runtime=$(($end_time - $start_time))
+ runtime=$((end_time - start_time))
log "Started finite jobs in ${runtime} seconds"
log 'Uploading finite jobs submissions file'
container_exec \
aws s3 cp --acl public-read \
- "${TEST_DIRECTORY}/${FINITE_SUBMISSIONS_OUTPUT_FILE}" \
+ "${TEST_DIRECTORY}/${STREAMING_FINITE_SUBMISSIONS_OUTPUT_FILE}" \
"${TEST_S3_DIRECTORY_URL}"
else
log 'Skipping running of finite streaming jobs'
fi
+################################################################################
+# Run infinite streaming jobs ##################################################
+################################################################################
+
if [ "${SHOULD_RUN_INFINITE_STREAMING_JOBS}" = true ]; then
log 'Starting infinite jobs. Consumers do not write to Cassandra'
start_time=$(date +%s)
container_exec \
./scale-tests/kafka_cassandra_streaming_test.py \
- "${TEST_DIRECTORY}/${NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" \
+ "${TEST_DIRECTORY}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" \
"${TEST_DIRECTORY}/${INFRASTRUCTURE_OUTPUT_FILE}" \
- "${TEST_DIRECTORY}/${INFINITE_SUBMISSIONS_OUTPUT_FILE}" \
+ "${TEST_DIRECTORY}/${STREAMING_INFINITE_SUBMISSIONS_OUTPUT_FILE}" \
--spark-executor-docker-image \""${SPARK_EXECUTOR_DOCKER_IMAGE}"\" \
--jar "${TEST_ASSEMBLY_JAR_URL}" \
- --num-producers-per-kafka "${INFINITE_NUM_PRODUCERS_PER_KAFKA}" \
- --num-consumers-per-producer "${INFINITE_NUM_CONSUMERS_PER_PRODUCER}" \
+ --num-producers-per-kafka "${STREAMING_INFINITE_PRODUCERS_PER_KAFKA}" \
+ --num-consumers-per-producer "${STREAMING_INFINITE_CONSUMERS_PER_PRODUCER}" \
--producer-number-of-words 0 \
- --producer-words-per-second "${INFINITE_PRODUCER_WORDS_PER_SECOND}" \
- --producer-spark-cores-max "${INFINITE_PRODUCER_SPARK_CORES_MAX}" \
- --producer-spark-executor-cores "${INFINITE_PRODUCER_SPARK_EXECUTOR_CORES}" \
- --consumer-batch-size-seconds "${INFINITE_CONSUMER_BATCH_SIZE_SECONDS}" \
- --consumer-spark-cores-max "${INFINITE_CONSUMER_SPARK_CORES_MAX}" \
- --consumer-spark-executor-cores "${INFINITE_CONSUMER_SPARK_EXECUTOR_CORES}"
+ --producer-words-per-second "${STREAMING_INFINITE_PRODUCER_WORDS_PER_SECOND}" \
+ --producer-spark-cores-max "${STREAMING_INFINITE_PRODUCER_SPARK_CORES_MAX}" \
+ --producer-spark-executor-cores "${STREAMING_INFINITE_PRODUCER_SPARK_EXECUTOR_CORES}" \
+ --consumer-batch-size-seconds "${STREAMING_INFINITE_CONSUMER_BATCH_SIZE_SECONDS}" \
+ --consumer-spark-cores-max "${STREAMING_INFINITE_CONSUMER_SPARK_CORES_MAX}" \
+ --consumer-spark-executor-cores "${STREAMING_INFINITE_CONSUMER_SPARK_EXECUTOR_CORES}"
end_time=$(date +%s)
- runtime=$(($end_time - $start_time))
+ runtime=$((end_time - start_time))
log "Started infinite jobs in ${runtime} seconds"
log 'Uploading infinite jobs submissions file'
container_exec \
aws s3 cp --acl public-read \
- "${TEST_DIRECTORY}/${INFINITE_SUBMISSIONS_OUTPUT_FILE}" \
+ "${TEST_DIRECTORY}/${STREAMING_INFINITE_SUBMISSIONS_OUTPUT_FILE}" \
"${TEST_S3_DIRECTORY_URL}"
else
log 'Skipping running of infinite streaming jobs'
fi
+################################################################################
+# Run non-GPU batch jobs #######################################################
+################################################################################
+
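+# Batch jobs are submitted with --group-role so batch_test.py leaves spark.mesos.role unset and drivers fall under the enforced group role.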
if [ "${SHOULD_RUN_BATCH_JOBS}" = true ]; then
log 'Starting batch jobs'
start_time=$(date +%s)
@@ -516,21 +603,26 @@ if [ "${SHOULD_RUN_BATCH_JOBS}" = true ]; then
--dcos-username "${DCOS_USERNAME}" \
--dcos-password "${DCOS_PASSWORD}" \
--security "${SECURITY}" \
- --input-file-uri "${NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL}" \
+ --input-file-uri "${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL}" \
--script-cpus "${BATCH_SCRIPT_CPUS}" \
--script-mem "${BATCH_SCRIPT_MEM}" \
--spark-build-branch "${BATCH_SPARK_BUILD_BRANCH}" \
--script-args "\"\
- ${NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE} \
+ ${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE} \
--submits-per-min ${BATCH_SUBMITS_PER_MIN} \
+ --group-role ${GROUP_NAME} \
\""
end_time=$(date +%s)
- runtime=$(($end_time - $start_time))
+ runtime=$((end_time - start_time))
log "Started batch jobs in ${runtime} seconds"
else
log 'Skipping running of batch jobs'
fi
+################################################################################
+# Run GPU batch jobs ###########################################################
+################################################################################
+
if [ "${SHOULD_RUN_GPU_BATCH_JOBS}" = true ]; then
log 'Starting GPU batch jobs'
start_time=$(date +%s)
@@ -540,38 +632,47 @@ if [ "${SHOULD_RUN_GPU_BATCH_JOBS}" = true ]; then
--dcos-username "${DCOS_USERNAME}" \
--dcos-password "${DCOS_PASSWORD}" \
--security "${SECURITY}" \
- --input-file-uri "${GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL}" \
+ --input-file-uri "${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL}" \
--script-cpus "${GPU_SCRIPT_CPUS}" \
--script-mem "${GPU_SCRIPT_MEM}" \
--spark-build-branch "${GPU_SPARK_BUILD_BRANCH}" \
--script-args "\"\
- ${GPU_DISPATCHERS_JSON_OUTPUT_FILE} \
+ ${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE} \
--submits-per-min ${GPU_SUBMITS_PER_MIN} \
--docker-image ${GPU_DOCKER_IMAGE} \
- --max-num-dispatchers ${GPU_MAX_NUM_DISPATCHERS} \
+ --group-role ${GROUP_NAME} \
+ --max-num-dispatchers ${GPU_MAX_DISPATCHERS} \
--spark-cores-max ${GPU_SPARK_CORES_MAX} \
--spark-mesos-executor-gpus ${GPU_SPARK_MESOS_EXECUTOR_GPUS} \
--spark-mesos-max-gpus ${GPU_SPARK_MESOS_MAX_GPUS} \
\""
end_time=$(date +%s)
- runtime=$(($end_time - $start_time))
+ runtime=$((end_time - start_time))
log "Started GPU batch jobs in ${runtime} seconds"
else
log 'Skipping running of GPU batch jobs'
fi
+################################################################################
+# Uninstall infrastructure #####################################################
+################################################################################
+
if [ "${SHOULD_UNINSTALL_INFRASTRUCTURE_AT_THE_END}" = true ]; then
log 'Uninstalling infrastructure'
start_time=$(date +%s)
container_exec \
./scale-tests/setup_streaming.py "${TEST_DIRECTORY}/${INFRASTRUCTURE_OUTPUT_FILE}" --cleanup
end_time=$(date +%s)
- runtime=$(($end_time - $start_time))
+ runtime=$((end_time - start_time))
log "Uninstalled infrastructure in ${runtime} seconds"
else
log 'Skipping uninstalling of infrastructure'
fi
+################################################################################
+################################################################################
+################################################################################
+
log 'Uploading log file to S3'
container_exec \
aws s3 cp --acl public-read \
diff --git a/scale-tests/scale_tests_utils.py b/scale-tests/scale_tests_utils.py
index 8a4bb279..be8d2af5 100644
--- a/scale-tests/scale_tests_utils.py
+++ b/scale-tests/scale_tests_utils.py
@@ -22,9 +22,9 @@ def setup_security(service_name: str, linux_user: str) -> typing.Dict:
service_account = normalize_string("{}-service-account".format(service_name))
service_account_secret = "{}-service-account-secret".format(service_name)
- return sdk_security.setup_security(service_name,
- linux_user,
- service_account, service_account_secret)
+ return sdk_security.setup_security(
+ service_name, linux_user, service_account, service_account_secret
+ )
def get_strict_mode_options(service_account_info: typing.Dict) -> typing.Dict:
@@ -32,20 +32,14 @@ def get_strict_mode_options(service_account_info: typing.Dict) -> typing.Dict:
options = {}
if "linux_user" in service_account_info:
- user_options = {
- "service": {
- "user": service_account_info["linux_user"]
- }
-
- }
+ user_options = {"service": {"user": service_account_info["linux_user"]}}
options = sdk_install.merge_dictionaries(options, user_options)
-
if sdk_utils.is_strict_mode():
service_account_options = {
- 'service': {
- 'service_account': service_account_info["name"],
- 'service_account_secret': service_account_info["secret"],
+ "service": {
+ "service_account": service_account_info["name"],
+ "service_account_secret": service_account_info["secret"],
}
}
options = sdk_install.merge_dictionaries(options, service_account_options)
@@ -53,8 +47,9 @@ def get_strict_mode_options(service_account_info: typing.Dict) -> typing.Dict:
return options
-def get_service_options(service_name: str, service_account_info: typing.Dict,
- options: typing.Dict, config_path: str) -> typing.Dict:
+def get_service_options(
+ service_name: str, service_account_info: typing.Dict, options: typing.Dict, config_path: str
+) -> typing.Dict:
"""
Get the options for a service as a combination of other options.
"""
@@ -62,7 +57,7 @@ def get_service_options(service_name: str, service_account_info: typing.Dict,
config_options = {}
if config_path:
if os.path.isfile(config_path):
- with open(config_path, 'r') as fp:
+ with open(config_path, "r") as fp:
log.info("Reading options from %s", config_path)
config_options = json.load(fp)
else:
@@ -71,13 +66,19 @@ def get_service_options(service_name: str, service_account_info: typing.Dict,
else:
log.info("No options specified. Using defaults")
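+ # Marathon app ids are absolute paths, so normalize the service name to start with "/".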
+ if service_name[0] != "/":
+ service_name = "/" + service_name
# Always set the service name
service_name_options = {"service": {"name": service_name}}
- return merge_service_options([get_strict_mode_options(service_account_info),
- options,
- config_options,
- service_name_options, ])
+ return merge_service_options(
+ [
+ get_strict_mode_options(service_account_info),
+ options,
+ config_options,
+ service_name_options,
+ ]
+ )
def merge_service_options(options: typing.List[typing.Dict]) -> typing.Dict:
@@ -118,7 +119,7 @@ def mapcat(func, seqs):
def normalize_string(s: str) -> str:
- return s.replace("/", "__").replace('-', '_')
+ return s.replace("/", "__").replace("-", "_")
def make_repeater(n):
diff --git a/scale-tests/setup_streaming.py b/scale-tests/setup_streaming.py
index 10abc652..f30f3f42 100755
--- a/scale-tests/setup_streaming.py
+++ b/scale-tests/setup_streaming.py
@@ -8,6 +8,7 @@
* Kafka ZooKeeper is installed
* Kafka is installed
* Cassandra is installed
+* Data Science Engine is installed
Usage:
setup_streaming.py [options]
@@ -23,14 +24,21 @@
This is used for both Kafka and ZooKeeper [default: 0]
--kafka-package-name The package name to use for Kafka [default: confluent-kafka]
+ --kafka-user user for the Kafka installation [default: nobody]
--kafka-config path to the config.json for the Kafka installation
--kafka-zookeeper-package-name The package name to use for Kafka ZooKeeper [default: confluent-zookeeper]
+ --kafka-zookeeper-user user for the Kafka ZooKeeper installation [default: nobody]
--kafka-zookeeper-config path to the config.json for the Kafka ZooKeeper installation
--cassandra-cluster-count The number of Cassandra clusters to install [default: 0]
--cassandra-package-name The package name to use for Cassandra [default: cassandra]
+ --cassandra-user user for the Cassandra installation [default: nobody]
--cassandra-config path to the config.json for the Cassandra installation
+
+ --dsengine-package-name The package name to use for Data Science Engine [default: data-science-engine]
+ --dsengine-user user for the Data Science Engine installation [default: nobody]
+ --dsengine-config path to the config.json for the Data Science Engine installation
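+
+ A single Data Science Engine service is installed per run (default package
+ "data-science-engine", run as "nobody"); use --dsengine-config to customize it.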
"""
import json
import logging
@@ -63,6 +71,7 @@ def install_package(package_name: str,
index: int,
service_task_count: int,
config_path: str,
+ user: str = None,
additional_options: dict = None) -> dict:
if package_name.startswith("beta-"):
basename = package_name[len("beta-"):]
@@ -71,7 +80,18 @@ def install_package(package_name: str,
service_name = "{}{}-{:0>2}".format(service_prefix, basename, index)
- service_account_info = scale_tests_utils.setup_security(service_name, "nobody")
+ if not user:
+ user = "nobody"
+
+ service_account_info = scale_tests_utils.setup_security(service_name, user)
+
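+ # Grant the Data Science Engine service account the extra permissions it needs:
+ # launching tasks under its app_id, running tasks as root (master and agent),
+ # and using a role named after the service account.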
+ if "data-science-engine" in package_name:
+ for permission in [
+ "dcos:mesos:master:task:app_id:{}".format(service_name),
+ "dcos:mesos:agent:task:user:root",
+ "dcos:mesos:master:task:user:root",
+ "dcos:mesos:master:task:role:{}".format(service_account_info['name'])]:
+ grant_permission(service_account_info['name'], permission, "create")
service_options = scale_tests_utils.get_service_options(service_name, service_account_info, additional_options, config_path)
@@ -126,13 +146,14 @@ def get_expected_task_count(service_options: dict) -> int:
kafka_zookeeper_package_name = args["--kafka-zookeeper-package-name"]
kafka_zookeeper_service_prefix = args["--service-names-prefix"]
+ kafka_zookeeper_user = args.get("--kafka-zookeeper-user", "")
kafka_zookeeper_config = args.get("--kafka-zookeeper-config", "")
services = []
for i in range(kafka_cluster_count):
services.append(install_package(kafka_zookeeper_package_name,
kafka_zookeeper_service_prefix, i, get_expected_task_count,
- kafka_zookeeper_config))
+ user=kafka_zookeeper_user, config_path=kafka_zookeeper_config))
return services
@@ -151,6 +172,7 @@ def get_expected_task_count(service_options: dict) -> int:
kafka_package_name = args["--kafka-package-name"]
kafka_service_prefix = args["--service-names-prefix"]
+ kafka_user = args.get("--kafka-user", "")
kafka_config = args.get("--kafka-config", "")
services = []
@@ -169,7 +191,7 @@ def get_expected_task_count(service_options: dict) -> int:
services.append(install_package(kafka_package_name, kafka_service_prefix, i,
get_expected_task_count, kafka_config,
- additional_options=service_options))
+ user=kafka_user, additional_options=service_options))
return services
@@ -188,12 +210,37 @@ def get_expected_task_count(service_options: dict) -> int:
cassandra_package_name = args["--cassandra-package-name"]
cassandra_service_prefix = args["--service-names-prefix"]
+ cassandra_user = args.get("--cassandra-user", "")
cassandra_config = args.get("--cassandra-config", "")
services = []
for i in range(cassandra_cluster_count):
services.append(install_package(cassandra_package_name, cassandra_service_prefix, i,
- get_expected_task_count, cassandra_config))
+ get_expected_task_count, user=cassandra_user, config_path=cassandra_config))
+
+ return services
+
+def install_dsengine(args: dict) -> list:
+ """
+ Install the Data Science Engine service(s) as defined by the arguments
+ """
+ def get_expected_task_count(service_options: dict) -> int:
+ return _get_pod_count(service_options, "notebooks", 1)
+
+ # The Data Science Engine cluster count is currently hardcoded; the other install_* helpers read theirs from a CLI option.
+ dse_cluster_count = 1
+
+ if not dse_cluster_count:
+ return []
+
+ dsengine_package_name = args["--dsengine-package-name"]
+ dsengine_service_prefix = args["--service-names-prefix"]
+ dsengine_user = args.get("--dsengine-user", "")
+ dsengine_config = args.get("--dsengine-config", "")
+
+ services = []
+ for i in range(dse_cluster_count):
+ services.append(install_package(dsengine_package_name, dsengine_service_prefix, i,
+ get_expected_task_count, user=dsengine_user, config_path=dsengine_config))
return services
@@ -203,6 +250,7 @@ def install(args):
services["zookeeper"] = install_zookeeper(args)
services["kafka"] = install_kafka(args, services["zookeeper"])
services["cassandra"] = install_cassandra(args)
+ services["dsengine"] = install_dsengine(args)
for k, v in services.items():
log.info("%s service(s): %s", k, v)
@@ -236,6 +284,13 @@ def cleanup(args):
log.info("Removing service accounts and secrets")
sdk_security.cleanup_security(service_name, s["service_account_info"])
+def grant_permission(service_account: str, acl: str, action: str, description: str = None) -> None:
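+ """Grant `action` on `acl` to `service_account` via the `security org users grant` CLI command."""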
+ cmd = "security org users grant {} {} {}".format(service_account, acl, action)
+
+ if description:
+ cmd = "{} --description '{}'".format(cmd, description)
+
+ sdk_cmd.run_cli(cmd=cmd, print_output=True)
def main(args):
if "--cleanup" in args and args["--cleanup"]:
diff --git a/spark-testing/spark_utils.py b/spark-testing/spark_utils.py
index 3a5e9d01..9111dcb5 100644
--- a/spark-testing/spark_utils.py
+++ b/spark-testing/spark_utils.py
@@ -14,7 +14,6 @@
import sdk_utils
import spark_s3
-import dcos_utils
DCOS_SPARK_TEST_JAR_PATH_ENV = "DCOS_SPARK_TEST_JAR_PATH"
DCOS_SPARK_TEST_JAR_PATH = os.getenv(DCOS_SPARK_TEST_JAR_PATH_ENV, None)
@@ -27,7 +26,9 @@
MESOS_SPARK_TEST_JAR_URL = os.getenv(MESOS_SPARK_TEST_JAR_URL_ENV, None)
SPARK_SERVICE_ACCOUNT = os.getenv("SPARK_SERVICE_ACCOUNT", "spark-service-acct")
-SPARK_SERVICE_ACCOUNT_SECRET = os.getenv("SPARK_SERVICE_ACCOUNT_SECRET", "spark-service-acct-secret")
+SPARK_SERVICE_ACCOUNT_SECRET = os.getenv(
+ "SPARK_SERVICE_ACCOUNT_SECRET", "spark-service-acct-secret"
+)
SPARK_SERVICE_NAME = os.getenv("SPARK_SERVICE_NAME", "spark")
FOLDERED_SPARK_SERVICE_NAME = "/path/to/" + SPARK_SERVICE_NAME
@@ -46,27 +47,33 @@
SPARK_PACKAGE_NAME = os.getenv("SPARK_PACKAGE_NAME", "spark")
SPARK_EXAMPLES = "http://downloads.mesosphere.com/spark/assets/spark-examples_2.11-2.4.0.jar"
-start_agent_cmd = "sudo systemctl start dcos-mesos-slave"
-stop_agent_cmd = "sudo systemctl stop dcos-mesos-slave"
-check_agent_cmd = "sudo systemctl is-active dcos-mesos-slave"
-
def _check_tests_assembly():
if not DCOS_SPARK_TEST_JAR_URL and not os.path.exists(DCOS_SPARK_TEST_JAR_PATH):
- raise Exception('''Missing URL or path to file dcos-spark-scala-tests-assembly-[...].jar:
+ raise Exception(
+ """Missing URL or path to file dcos-spark-scala-tests-assembly-[...].jar:
- No URL: {}={}
- - File not found: {}={}'''.format(
- DCOS_SPARK_TEST_JAR_URL_ENV, DCOS_SPARK_TEST_JAR_URL,
- DCOS_SPARK_TEST_JAR_PATH_ENV, DCOS_SPARK_TEST_JAR_PATH))
+ - File not found: {}={}""".format(
+ DCOS_SPARK_TEST_JAR_URL_ENV,
+ DCOS_SPARK_TEST_JAR_URL,
+ DCOS_SPARK_TEST_JAR_PATH_ENV,
+ DCOS_SPARK_TEST_JAR_PATH,
+ )
+ )
def _check_mesos_integration_tests_assembly():
if not MESOS_SPARK_TEST_JAR_URL and not os.path.exists(MESOS_SPARK_TEST_JAR_PATH):
- raise Exception('''Missing URL or path to file mesos-spark-integration-tests-assembly-[...].jar:
+ raise Exception(
+ """Missing URL or path to file mesos-spark-integration-tests-assembly-[...].jar:
- No URL: {}={}
- - File not found: {}={}'''.format(
- MESOS_SPARK_TEST_JAR_URL_ENV, MESOS_SPARK_TEST_JAR_URL,
- MESOS_SPARK_TEST_JAR_PATH_ENV, MESOS_SPARK_TEST_JAR_PATH))
+ - File not found: {}={}""".format(
+ MESOS_SPARK_TEST_JAR_URL_ENV,
+ MESOS_SPARK_TEST_JAR_URL,
+ MESOS_SPARK_TEST_JAR_PATH_ENV,
+ MESOS_SPARK_TEST_JAR_PATH,
+ )
+ )
def hdfs_enabled():
@@ -77,7 +84,9 @@ def kafka_enabled():
return os.environ.get("KAFKA_ENABLED") != "false"
-def require_spark(service_name=SPARK_SERVICE_NAME, additional_options={}, zk='spark_mesos_dispatcher'):
+def require_spark(
+ service_name=SPARK_SERVICE_NAME, additional_options={}, zk="spark_mesos_dispatcher"
+):
teardown_spark(service_name, zk)
sdk_install.install(
@@ -85,34 +94,31 @@ def require_spark(service_name=SPARK_SERVICE_NAME, additional_options={}, zk='sp
service_name,
0,
additional_options=get_spark_options(service_name, additional_options),
- wait_for_deployment=False, # no deploy plan
- insert_strict_options=False) # lacks principal + secret_name options
+ wait_for_deployment=False, # no deploy plan
+ insert_strict_options=False,
+ ) # lacks principal + secret_name options
# wait for dispatcher to be reachable over HTTP
- sdk_cmd.service_request('GET', service_name, '', timeout_seconds=300)
+ sdk_cmd.service_request("GET", service_name, "", timeout_seconds=300)
# Note: zk may be customized in spark via 'spark.deploy.zookeeper.dir'
-def teardown_spark(service_name=SPARK_SERVICE_NAME, zk='spark_mesos_dispatcher'):
+def teardown_spark(service_name=SPARK_SERVICE_NAME, zk="spark_mesos_dispatcher"):
sdk_install.uninstall(
SPARK_PACKAGE_NAME,
service_name,
- role=re.escape('*'),
- service_account='spark-service-acct',
- zk=zk)
+ role=re.escape("*"),
+ service_account="spark-service-acct",
+ zk=zk,
+ )
- if not sdk_utils.dcos_version_less_than('1.10'):
+ if not sdk_utils.dcos_version_less_than("1.10"):
# On 1.10+, sdk_uninstall doesn't run janitor. However Spark always needs it for ZK cleanup.
- sdk_install.retried_run_janitor(service_name, re.escape('*'), 'spark-service-acct', zk)
+ sdk_install.retried_run_janitor(service_name, re.escape("*"), "spark-service-acct", zk)
def get_spark_options(service_name, additional_options):
- options = {
- "service": {
- "user": SPARK_USER,
- "name": service_name
- }
- }
+ options = {"service": {"user": SPARK_USER, "name": service_name}}
if SPARK_DOCKER_USER is not None:
options["service"]["docker_user"] = SPARK_DOCKER_USER
@@ -146,48 +152,51 @@ def run_tests(app_url, app_args, expected_output, service_name=SPARK_SERVICE_NAM
try:
check_job_output(driver_id, expected_output)
except TimeoutError:
- log.error("Timed out waiting for job output, will attempt to cleanup and kill driver: {}".format(driver_id))
+ log.error(
+ "Timed out waiting for job output, will attempt to cleanup and kill driver: {}".format(
+ driver_id
+ )
+ )
raise
finally:
kill_driver(driver_id, service_name=service_name)
def submit_job(
- app_url,
- app_args,
- service_name=SPARK_SERVICE_NAME,
- args=[],
- spark_user=None,
- driver_role=SPARK_DRIVER_ROLE,
- verbose=True,
- principal=SPARK_SERVICE_ACCOUNT,
- use_cli=True):
+ app_url,
+ app_args,
+ service_name=SPARK_SERVICE_NAME,
+ args=[],
+ spark_user=None,
+ driver_role=SPARK_DRIVER_ROLE,
+ verbose=True,
+ principal=SPARK_SERVICE_ACCOUNT,
+ use_cli=True,
+):
conf_args = args.copy()
+ # Don't overwrite spark.mesos.role when running under enforce_role.
+ # We expect the caller to pass `driver_role` as `None` in that case.
if driver_role:
- conf_args += ['--conf', 'spark.mesos.role={}'.format(driver_role)]
+ conf_args += ["--conf", "spark.mesos.role={}".format(driver_role)]
if SPARK_DOCKER_USER is not None:
- conf_args += ['--conf', 'spark.mesos.executor.docker.parameters=user={}'.format(SPARK_DOCKER_USER)]
+ conf_args += [
+ "--conf",
+ "spark.mesos.executor.docker.parameters=user={}".format(SPARK_DOCKER_USER),
+ ]
- if not list(filter(lambda x: "spark.driver.memory=" in x, conf_args)):
- conf_args += ['--conf', 'spark.driver.memory=2g']
+ if not list(filter(lambda x: x.startswith("spark.driver.memory="), conf_args)):
+ conf_args += ["--conf", "spark.driver.memory=2g"]
if sdk_utils.is_strict_mode():
- conf_args += [
- '--conf spark.mesos.principal={}'.format(principal)
- ]
+ conf_args += ["--conf spark.mesos.principal={}".format(principal)]
if spark_user is not None:
- conf_args += [
- '--conf spark.mesos.driverEnv.SPARK_USER={}'.format(spark_user)
- ]
-
- if not list(filter(lambda x: "spark.mesos.containerizer=" in x, conf_args)):
- conf_args += ['--conf', 'spark.mesos.containerizer=mesos']
+ conf_args += ["--conf spark.mesos.driverEnv.SPARK_USER={}".format(spark_user)]
- submit_args = ' '.join([' '.join(conf_args), app_url, app_args])
+ submit_args = " ".join([" ".join(conf_args), app_url, app_args])
verbose_flag = "--verbose" if verbose else ""
result = None
@@ -195,14 +204,21 @@ def submit_job(
stdout = sdk_cmd.svc_cli(
SPARK_PACKAGE_NAME,
service_name,
- 'run {} --submit-args="{}"'.format(verbose_flag, submit_args))
+ 'run {} --submit-args="{}"'.format(verbose_flag, submit_args),
+ )
result = re.search(r"Submission id: (\S+)", stdout)
else:
- docker_cmd = "sudo docker run --net=host -ti {} bin/spark-submit {}".format(SPARK_DOCKER_IMAGE, submit_args)
+ docker_cmd = "sudo docker run --net=host -ti {} bin/spark-submit {}".format(
+ SPARK_DOCKER_IMAGE, submit_args
+ )
ssh_opts = "--option UserKnownHostsFile=/dev/null --option StrictHostKeyChecking=no"
log.info("Running Docker command on leader: {}".format(docker_cmd))
- _, stdout, stderr = sdk_cmd.run_raw_cli("node ssh --master-proxy --leader --user={} {} '{}'".format(sdk_cmd.LINUX_USER, ssh_opts, docker_cmd))
+ _, stdout, stderr = sdk_cmd.run_raw_cli(
+ "node ssh --master-proxy --leader --user={} {} '{}'".format(
+ sdk_cmd.LINUX_USER, ssh_opts, docker_cmd
+ )
+ )
result = re.search(r'"submissionId" : "(\S+)"', stdout)
if not result:
@@ -211,7 +227,7 @@ def submit_job(
def check_job_output(task_id, expected_output):
- log.info('Waiting for task id={} to complete'.format(task_id))
+ log.info("Waiting for task id={} to complete".format(task_id))
shakedown.wait_for_task_completion(task_id, timeout_sec=JOB_WAIT_TIMEOUT_SECONDS)
stdout = _task_log(task_id)
@@ -222,28 +238,11 @@ def check_job_output(task_id, expected_output):
raise Exception("{} not found in stdout".format(expected_output))
-# Reads the logs and matches each line for specified regular expressions.
-# Returns a map of [regular expression] -> [list of matched line numbers]
-def log_matches(task_id, filename, expressions):
- output = _task_log(task_id, filename)
- matched_lines = {}
- for exp in expressions:
- matched_lines[exp] = []
- for line_number, line in enumerate(output.splitlines()):
- for exp in expressions:
- if re.search(exp, line):
- matched_lines[exp].append(line_number)
- return matched_lines
-
-
-@retrying.retry(
- wait_fixed=5000,
- stop_max_delay=600 * 1000,
- retry_on_result=lambda res: not res)
+@retrying.retry(wait_fixed=5000, stop_max_delay=600 * 1000, retry_on_result=lambda res: not res)
def wait_for_running_job_output(task_id, expected_line):
stdout = sdk_cmd.run_cli("task log --lines=1000 {}".format(task_id))
result = expected_line in stdout
- log.info('Checking for {} in STDOUT:\n{}\nResult: {}'.format(expected_line, stdout, result))
+ log.info("Checking for {} in STDOUT:\n{}\nResult: {}".format(expected_line, stdout, result))
return result
@@ -287,82 +286,85 @@ def kill_driver(driver_id, service_name=SPARK_SERVICE_NAME):
def _task_log(task_id, filename=None):
- return sdk_cmd.run_cli("task log --completed --lines=1000 {}".format(task_id) + \
- ("" if filename is None else " {}".format(filename)))
+ return sdk_cmd.run_cli(
+ "task log --completed --lines=1000 {}".format(task_id)
+ + ("" if filename is None else " {}".format(filename))
+ )
def grant_user_permissions(user, role="*", service_account=SPARK_SERVICE_ACCOUNT):
log.info(f"Adding user permissions for Marathon. User: {user}")
sdk_security.grant_permissions(
- linux_user=user,
- role_name="slave_public",
- service_account_name="dcos_marathon"
+ linux_user=user, role_name="slave_public", service_account_name="dcos_marathon"
)
log.info(f"Adding user permissions for {service_account}. User: {user}, role: {role}")
sdk_security.grant_permissions(
- linux_user=user,
- role_name=role,
- service_account_name=service_account
+ linux_user=user, role_name=role, service_account_name=service_account
)
def revoke_user_permissions(user, role="*", service_account=SPARK_SERVICE_ACCOUNT):
log.info(f"Revoking user permissions for Marathon. User: {user}")
sdk_security.grant_permissions(
- linux_user=user,
- role_name="slave_public",
- service_account_name="dcos_marathon"
+ linux_user=user, role_name="slave_public", service_account_name="dcos_marathon"
)
log.info(f"Revoking user permissions for {service_account}. User: {user}, role: {role}")
sdk_security.revoke_permissions(
- linux_user=user,
- role_name=role,
- service_account_name=service_account
+ linux_user=user, role_name=role, service_account_name=service_account
)
def _escape_service_name(service_name):
app_id = "/{}".format(service_name.lstrip("/"))
# double-encoded (why?)
- return urllib.parse.quote(
- urllib.parse.quote(app_id, safe=''),
- safe=''
- )
+ return urllib.parse.quote(urllib.parse.quote(app_id, safe=""), safe="")
def grant_launch_task_permission(service_name, service_account_name=SPARK_SERVICE_ACCOUNT):
- log.info(f"Granting launch task permission to service account: {service_account_name}, service: {service_name}")
+ log.info(
+ f"Granting launch task permission to service account: {service_account_name}, service: {service_name}"
+ )
app_id = _escape_service_name(service_name)
- sdk_security._grant(service_account_name,
- "dcos:mesos:master:task:app_id:{}".format(app_id),
- description="Spark drivers may execute Mesos tasks",
- action="create")
+ sdk_security._grant(
+ service_account_name,
+ "dcos:mesos:master:task:app_id:{}".format(app_id),
+ description="Spark drivers may execute Mesos tasks",
+ action="create",
+ )
def revoke_launch_task_permission(service_name, service_account_name=SPARK_SERVICE_ACCOUNT):
- log.info(f"Revoking launch task permission to service account: {service_account_name}, service: {service_name}")
+ log.info(
+ f"Revoking launch task permission to service account: {service_account_name}, service: {service_name}"
+ )
app_id = _escape_service_name(service_name)
- sdk_security._revoke(service_account_name,
- "dcos:mesos:master:task:app_id:{}".format(app_id),
- description="Spark drivers may execute Mesos tasks",
- action="create")
+ sdk_security._revoke(
+ service_account_name,
+ "dcos:mesos:master:task:app_id:{}".format(app_id),
+ description="Spark drivers may execute Mesos tasks",
+ action="create",
+ )
-def spark_security_session(users=[SPARK_USER], service_names=[SPARK_SERVICE_NAME, FOLDERED_SPARK_SERVICE_NAME]):
- '''
+def spark_security_session(
+ users=[SPARK_USER], service_names=[SPARK_SERVICE_NAME, FOLDERED_SPARK_SERVICE_NAME]
+):
+ """
Spark strict mode setup is slightly different from dcos-commons, so can't use sdk_security::security_session.
Differences:
(1) the role is "*", (2) the driver itself is a framework and needs permission to execute tasks.
- '''
- role = '*'
+ """
+ role = "*"
service_account = SPARK_SERVICE_ACCOUNT
secret = SPARK_SERVICE_ACCOUNT_SECRET
def setup_security():
- log.info('Setting up strict-mode security for Spark')
- sdk_security.create_service_account(service_account_name=service_account, service_account_secret=secret)
+ log.info("Setting up strict-mode security for Spark")
+ sdk_security.create_service_account(
+ service_account_name=service_account, service_account_secret=secret
+ )
for user in users:
grant_user_permissions(user, role, service_account)
@@ -370,10 +372,10 @@ def setup_security():
for service_name in service_names:
grant_launch_task_permission(service_name)
- log.info('Finished setting up strict-mode security for Spark')
+ log.info("Finished setting up strict-mode security for Spark")
def cleanup_security():
- log.info('Cleaning up strict-mode security for Spark')
+ log.info("Cleaning up strict-mode security for Spark")
for user in users:
revoke_user_permissions(user, role, service_account)
@@ -381,7 +383,7 @@ def cleanup_security():
# TODO: improve security setup/teardown to make it more fine-grained (allow different service names/accts/users)
# tracking issue: https://jira.mesosphere.com/browse/DCOS-50933
sdk_security.delete_service_account(service_account, secret)
- log.info('Finished cleaning up strict-mode security for Spark')
+ log.info("Finished cleaning up strict-mode security for Spark")
try:
if not sdk_utils.is_open_dcos():
@@ -393,33 +395,3 @@ def cleanup_security():
finally:
if sdk_utils.is_strict_mode():
cleanup_security()
-
-
-def restart_task_agent_and_verify_state(host_ip, task, expected_state):
- dcos_utils.agent_ssh(host_ip, stop_agent_cmd)
- _check_agent_status(host_ip, "inactive")
- dcos_utils.agent_ssh(host_ip, start_agent_cmd)
- _check_agent_status(host_ip, "active")
- _wait_for_task_status(task["id"], expected_state)
-
-
-@retrying.retry(
- wait_fixed=5000,
- stop_max_delay=120 * 1000,
- retry_on_result=lambda res: not res)
-def _check_agent_status(host_ip, expected_status):
- status = dcos_utils.agent_ssh(host_ip, check_agent_cmd)
- log.info(f"Checking status of agent at host {host_ip}, expected: {expected_status}, actual: {status}")
- return expected_status == status
-
-
-@retrying.retry(
- wait_fixed=5000,
- stop_max_delay=120 * 1000,
- retry_on_result=lambda res: not res)
-def _wait_for_task_status(task_id, expected_state):
- completed = expected_state != "TASK_RUNNING"
- task = shakedown.get_task(task_id, completed=completed)
- assert task is not None
- log.info(f"Checking task state for '{task_id}', expected: {expected_state}, actual: {task['state']}")
- return expected_state == task["state"]
diff --git a/testing/sdk_install.py b/testing/sdk_install.py
index b591f248..f6cb043c 100644
--- a/testing/sdk_install.py
+++ b/testing/sdk_install.py
@@ -1,10 +1,10 @@
-'''Utilities relating to installing services
+"""Utilities relating to installing services
************************************************************************
FOR THE TIME BEING WHATEVER MODIFICATIONS ARE APPLIED TO THIS FILE
SHOULD ALSO BE APPLIED TO sdk_install IN ANY OTHER PARTNER REPOS
************************************************************************
-'''
+"""
import collections
import logging
import time
@@ -27,86 +27,98 @@
TIMEOUT_SECONDS = 15 * 60
-'''List of services which are currently installed via install().
-Used by post-test diagnostics to retrieve stuff from currently running services.'''
+"""List of services which are currently installed via install().
+Used by post-test diagnostics to retrieve stuff from currently running services."""
_installed_service_names = set([])
def get_installed_service_names() -> set:
- '''Returns the a set of service names which had been installed via sdk_install in this session.'''
+ """Returns the a set of service names which had been installed via sdk_install in this session."""
return _installed_service_names
-@retrying.retry(stop_max_attempt_number=3,
- retry_on_exception=lambda e: isinstance(e, dcos.errors.DCOSException))
+@retrying.retry(
+ stop_max_attempt_number=3, retry_on_exception=lambda e: isinstance(e, dcos.errors.DCOSException)
+)
def _retried_install_impl(
- package_name,
- service_name,
- expected_running_tasks,
- options={},
- package_version=None,
- timeout_seconds=TIMEOUT_SECONDS,
- install_cli=True):
- '''Cleaned up version of shakedown's package_install().'''
+ package_name,
+ service_name,
+ expected_running_tasks,
+ options={},
+ package_version=None,
+ timeout_seconds=TIMEOUT_SECONDS,
+ install_cli=True,
+):
+ """Cleaned up version of shakedown's package_install()."""
package_manager = dcos.packagemanager.PackageManager(dcos.cosmos.get_cosmos_url())
pkg = package_manager.get_package_version(package_name, package_version)
if package_version is None:
# Get the resolved version for logging below
- package_version = 'auto:{}'.format(pkg.version())
+ package_version = "auto:{}".format(pkg.version())
- log.info('Installing package={} service={} with options={} version={}'.format(
- package_name, service_name, options, package_version))
+ log.info(
+ "Installing package={} service={} with options={} version={}".format(
+ package_name, service_name, options, package_version
+ )
+ )
# Trigger package install, but only if it's not already installed.
# We expect upstream to have confirmed that it wasn't already installed beforehand.
if sdk_marathon.app_exists(service_name):
- log.info('Marathon app={} exists, skipping package install call'.format(service_name))
+ log.info("Marathon app={} exists, skipping package install call".format(service_name))
else:
package_manager.install_app(pkg, options)
# Install CLI while package starts to install
if install_cli and pkg.cli_definition():
- log.info('Installing CLI for package={}'.format(package_name))
+ log.info("Installing CLI for package={}".format(package_name))
dcos.subcommand.install(pkg)
# Wait for expected tasks to come up
if expected_running_tasks > 0:
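+ # Use an absolute service name (leading "/") so services installed under a group are matched when waiting for tasks.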
+ if service_name[0] != "/":
+ service_name = "/" + service_name
shakedown.wait_for_service_tasks_running(
- service_name, expected_running_tasks, timeout_seconds)
+ service_name, expected_running_tasks, timeout_seconds
+ )
# Wait for completed marathon deployment
- app_id = pkg.marathon_json(options).get('id')
+ app_id = pkg.marathon_json(options).get("id")
shakedown.deployment_wait(timeout_seconds, app_id)
def install(
- package_name,
- service_name,
- expected_running_tasks,
- additional_options={},
- package_version=None,
- timeout_seconds=TIMEOUT_SECONDS,
- wait_for_deployment=True,
- insert_strict_options=True,
- install_cli=True):
+ package_name,
+ service_name,
+ expected_running_tasks,
+ additional_options={},
+ package_version=None,
+ timeout_seconds=TIMEOUT_SECONDS,
+ wait_for_deployment=True,
+ insert_strict_options=True,
+ install_cli=True,
+):
start = time.time()
# If the package is already installed at this point, fail immediately.
if sdk_marathon.app_exists(service_name):
- raise dcos.errors.DCOSException('Service is already installed: {}'.format(service_name))
+ raise dcos.errors.DCOSException("Service is already installed: {}".format(service_name))
if insert_strict_options and sdk_utils.is_strict_mode():
# strict mode requires correct principal and secret to perform install.
# see also: sdk_security.py
- options = merge_dictionaries({
- 'service': {
- 'service_account': 'service-acct',
- 'principal': 'service-acct',
- 'service_account_secret': 'secret',
- 'secret_name': 'secret'
- }
- }, additional_options)
+ options = merge_dictionaries(
+ {
+ "service": {
+ "service_account": "service-acct",
+ "principal": "service-acct",
+ "service_account_secret": "secret",
+ "secret_name": "secret",
+ }
+ },
+ additional_options,
+ )
else:
options = additional_options
@@ -118,7 +130,8 @@ def install(
options,
package_version,
timeout_seconds,
- install_cli)
+ install_cli,
+ )
# 2. Wait for the scheduler to be idle (as implied by deploy plan completion and suppressed bit)
# This should be skipped ONLY when it's known that the scheduler will be stuck in an incomplete
@@ -126,12 +139,18 @@ def install(
if wait_for_deployment:
# this can take a while, default is 15 minutes. for example with HDFS, we can hit the expected
# total task count via FINISHED tasks, without actually completing deployment
- log.info('Waiting for package={} service={} to finish deployment plan...'.format(
- package_name, service_name))
+ log.info(
+ "Waiting for package={} service={} to finish deployment plan...".format(
+ package_name, service_name
+ )
+ )
sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds)
- log.info('Installed package={} service={} after {}'.format(
- package_name, service_name, shakedown.pretty_duration(time.time() - start)))
+ log.info(
+ "Installed package={} service={} after {}".format(
+ package_name, service_name, shakedown.pretty_duration(time.time() - start)
+ )
+ )
global _installed_service_names
_installed_service_names.add(service_name)
@@ -139,46 +158,52 @@ def install(
def run_janitor(service_name, role, service_account, znode):
if role is None:
- role = sdk_utils.get_deslashed_service_name(service_name) + '-role'
+ role = sdk_utils.get_deslashed_service_name(service_name) + "-role"
if service_account is None:
- service_account = service_name + '-principal'
+ service_account = service_name + "-principal"
if znode is None:
znode = sdk_utils.get_zk_path(service_name)
- auth_token = sdk_cmd.run_cli('config show core.dcos_acs_token', print_output=False).strip()
- exhibitor_url = sdk_cmd.run_cli('config show core.dcos_url', print_output=False).strip() + '/exhibitor/'
-
- cmd_list = ["sudo", "docker", "run", "mesosphere/janitor", "/janitor.py",
- "-r", role,
- "-p", service_account,
- "-z", znode,
- "--auth_token={}".format(auth_token),
- "-e", exhibitor_url]
+ auth_token = sdk_cmd.run_cli("config show core.dcos_acs_token", print_output=False).strip()
+
+ cmd_list = [
+ "sudo",
+ "docker",
+ "run",
+ "mesosphere/janitor",
+ "/janitor.py",
+ "-r",
+ role,
+ "-p",
+ service_account,
+ "-z",
+ znode,
+ "--auth_token={}".format(auth_token),
+ ]
cmd = " ".join(cmd_list)
sdk_cmd.master_ssh(cmd)
-@retrying.retry(stop_max_attempt_number=5,
- wait_fixed=5000,
- retry_on_exception=lambda e: isinstance(e, Exception))
+@retrying.retry(
+ stop_max_attempt_number=5,
+ wait_fixed=5000,
+ retry_on_exception=lambda e: isinstance(e, Exception),
+)
def retried_run_janitor(*args, **kwargs):
run_janitor(*args, **kwargs)
-@retrying.retry(stop_max_attempt_number=5,
- wait_fixed=5000,
- retry_on_exception=lambda e: isinstance(e, Exception))
+@retrying.retry(
+ stop_max_attempt_number=5,
+ wait_fixed=5000,
+ retry_on_exception=lambda e: isinstance(e, Exception),
+)
def retried_uninstall_package_and_wait(*args, **kwargs):
shakedown.uninstall_package_and_wait(*args, **kwargs)
-def uninstall(
- package_name,
- service_name,
- role=None,
- service_account=None,
- zk=None):
+def uninstall(package_name, service_name, role=None, service_account=None, zk=None):
start = time.time()
global _installed_service_names
@@ -187,55 +212,57 @@ def uninstall(
except KeyError:
pass # allow tests to 'uninstall' up-front
- log.info('Uninstalling {}'.format(service_name))
+ log.info("Uninstalling {}".format(service_name))
try:
retried_uninstall_package_and_wait(package_name, service_name=service_name)
except Exception:
- log.info('Got exception when uninstalling {}'.format(service_name))
+ log.info("Got exception when uninstalling {}".format(service_name))
log.info(traceback.format_exc())
raise
finally:
- log.info('Reserved resources post uninstall:')
+ log.info("Reserved resources post uninstall:")
sdk_utils.list_reserved_resources()
cleanup_start = time.time()
try:
- if sdk_utils.dcos_version_less_than('1.10'):
- log.info('Janitoring {}'.format(service_name))
+ if sdk_utils.dcos_version_less_than("1.10"):
+ log.info("Janitoring {}".format(service_name))
retried_run_janitor(service_name, role, service_account, zk)
else:
- log.info('Waiting for Marathon app to be removed {}'.format(service_name))
+ log.info("Waiting for Marathon app to be removed {}".format(service_name))
sdk_marathon.retried_wait_for_deployment_and_app_removal(
- sdk_marathon.get_app_id(service_name), timeout=TIMEOUT_SECONDS)
+ sdk_marathon.get_app_id(service_name), timeout=TIMEOUT_SECONDS
+ )
except Exception:
- log.info('Got exception when cleaning up {}'.format(service_name))
+ log.info("Got exception when cleaning up {}".format(service_name))
log.info(traceback.format_exc())
raise
finally:
- log.info('Reserved resources post cleanup:')
+ log.info("Reserved resources post cleanup:")
sdk_utils.list_reserved_resources()
finish = time.time()
log.info(
- 'Uninstalled {} after pkg({}) + cleanup({}) = total({})'.format(
+ "Uninstalled {} after pkg({}) + cleanup({}) = total({})".format(
service_name,
shakedown.pretty_duration(cleanup_start - start),
shakedown.pretty_duration(finish - cleanup_start),
- shakedown.pretty_duration(finish - start)))
+ shakedown.pretty_duration(finish - start),
+ )
+ )
def merge_dictionaries(dict1, dict2):
- if (not isinstance(dict2, dict)):
+ if not isinstance(dict2, dict):
return dict1
ret = {}
for k, v in dict1.items():
ret[k] = v
for k, v in dict2.items():
- if (k in dict1 and isinstance(dict1[k], dict)
- and isinstance(dict2[k], collections.Mapping)):
+ if k in dict1 and isinstance(dict1[k], dict) and isinstance(dict2[k], collections.Mapping):
ret[k] = merge_dictionaries(dict1[k], dict2[k])
else:
ret[k] = dict2[k]
diff --git a/testing/sdk_marathon.py b/testing/sdk_marathon.py
index a4fb65ff..551a4409 100644
--- a/testing/sdk_marathon.py
+++ b/testing/sdk_marathon.py
@@ -89,7 +89,9 @@ def wait_for_response():
def is_app_running(app: dict) -> bool:
- return app['tasksStaged'] == 0 and app['tasksUnhealthy'] == 0 and app['tasksRunning'] > 0
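+ # Treat missing task counters as zero; only tasksRunning must be present and positive.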
+ return ('tasksStaged' not in app or app['tasksStaged'] == 0) and \
+ ('tasksUnhealthy' not in app or app['tasksUnhealthy'] == 0) and \
+ ('tasksRunning' in app and app['tasksRunning'] > 0)
def wait_for_deployment_and_app_running(app_name: str, timeout: int):
diff --git a/testing/sdk_security.py b/testing/sdk_security.py
index b1176218..dbca6328 100644
--- a/testing/sdk_security.py
+++ b/testing/sdk_security.py
@@ -1,9 +1,9 @@
-'''
+"""
************************************************************************
FOR THE TIME BEING WHATEVER MODIFICATIONS ARE APPLIED TO THIS FILE
SHOULD ALSO BE APPLIED TO sdk_security IN ANY OTHER PARTNER REPOS
************************************************************************
-'''
+"""
import logging
import os
import tempfile
@@ -31,9 +31,9 @@ def install_enterprise_cli(force=False):
cmd = "package install --yes --cli dcos-enterprise-cli"
- @retrying.retry(stop_max_attempt_number=3,
- wait_fixed=2000,
- retry_on_result=lambda result: result)
+ @retrying.retry(
+ stop_max_attempt_number=3, wait_fixed=2000, retry_on_result=lambda result: result
+ )
def _install_impl():
rc, stdout, stderr = sdk_cmd.run_raw_cli(cmd)
if rc:
@@ -47,103 +47,149 @@ def _install_impl():
raise RuntimeError("Failed to install the dcos-enterprise-cli: {}".format(repr(e)))
-def _grant(user: str, acl: str, description: str, action: str="create") -> None:
- log.info('Granting permission to {user} for {acl}/{action} ({description})'.format(
- user=user, acl=acl, action=action, description=description))
+def _grant(user: str, acl: str, description: str, action: str = "create") -> None:
+ log.info(
+ "Granting permission to {user} for {acl}/{action} ({description})".format(
+ user=user, acl=acl, action=action, description=description
+ )
+ )
# Create the ACL
r = sdk_cmd.cluster_request(
- 'PUT', '/acs/api/v1/acls/{acl}'.format(acl=acl),
+ "PUT",
+ "/acs/api/v1/acls/{acl}".format(acl=acl),
raise_on_error=False,
- json={'description': description})
+ json={"description": description},
+ )
# 201=created, 409=already exists
- assert r.status_code in [201, 409, ], '{} failed {}: {}'.format(r.url, r.status_code, r.text)
+ assert r.status_code in [201, 409], "{} failed {}: {}".format(r.url, r.status_code, r.text)
# Assign the user to the ACL
r = sdk_cmd.cluster_request(
- 'PUT', '/acs/api/v1/acls/{acl}/users/{user}/{action}'.format(acl=acl, user=user, action=action),
- raise_on_error=False)
+ "PUT",
+ "/acs/api/v1/acls/{acl}/users/{user}/{action}".format(acl=acl, user=user, action=action),
+ raise_on_error=False,
+ )
# 204=success, 409=already exists
- assert r.status_code in [204, 409, ], '{} failed {}: {}'.format(r.url, r.status_code, r.text)
+ assert r.status_code in [204, 409], "{} failed {}: {}".format(r.url, r.status_code, r.text)
-def _revoke(user: str, acl: str, description: str, action: str="create") -> None:
+def _revoke(user: str, acl: str, description: str, action: str = "create") -> None:
# TODO(kwood): INFINITY-2065 - implement security cleanup
log.info("Want to delete {user}+{acl}".format(user=user, acl=acl))
-def get_permissions(service_account_name: str, role: str, linux_user: str) -> typing.List[dict]:
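+# Role-scoped permissions (framework registration, reservations, volumes) are split out
+# from the principal/user-scoped ones so they can be granted once per role.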
+def get_role_permissions(service_account_name: str, role: str) -> typing.List[dict]:
return [
# registration permissions
{
- 'user': service_account_name,
- 'acl': "dcos:mesos:master:framework:role:{}".format(role),
- 'description': "Service {} may register with the Mesos master with role={}".format(
- service_account_name, role),
+ "user": service_account_name,
+ "acl": "dcos:mesos:master:framework:role:{}".format(role),
+ "description": "Service {} may register with the Mesos master with role={}".format(
+ service_account_name, role
+ ),
+ },
+ # resource permissions
+ {
+ "user": service_account_name,
+ "acl": "dcos:mesos:master:reservation:role:{}".format(role),
+ "description": "Service {} may reserve Mesos resources with role={}".format(
+ service_account_name, role
+ ),
+ },
+ # volume permissions
+ {
+ "user": service_account_name,
+ "acl": "dcos:mesos:master:volume:role:{}".format(role),
+ "description": "Service {} may create Mesos volumes with role={}".format(
+ service_account_name, role
+ ),
},
+ ]
+
+def get_permissions(service_account_name: str, linux_user: str) -> typing.List[dict]:
+ return [
# task execution permissions
{
- 'user': service_account_name,
- 'acl': "dcos:mesos:master:task:user:{}".format(linux_user),
- 'description': "Service {} may execute Mesos tasks as user={}".format(
- service_account_name, linux_user)
+ "user": service_account_name,
+ "acl": "dcos:mesos:master:task:user:{}".format(linux_user),
+ "description": "Service {} may execute Mesos tasks as user={}".format(
+ service_account_name, linux_user
+ ),
},
-
# XXX 1.10 currently requires this mesos:agent permission as well as
# mesos:task permission. unclear if this will be ongoing requirement.
# See DCOS-15682
{
- 'user': service_account_name,
- 'acl': "dcos:mesos:agent:task:user:{}".format(linux_user),
- 'description': "Service {} may execute Mesos tasks as user={}".format(
- service_account_name, linux_user)
+ "user": service_account_name,
+ "acl": "dcos:mesos:agent:task:user:{}".format(linux_user),
+ "description": "Service {} may execute Mesos tasks as user={}".format(
+ service_account_name, linux_user
+ ),
},
-
# resource permissions
{
- 'user': service_account_name,
- 'acl': "dcos:mesos:master:reservation:role:{}".format(role),
- 'description': "Service {} may reserve Mesos resources with role={}".format(
- service_account_name, role)
- },
- {
- 'user': service_account_name,
- 'acl': "dcos:mesos:master:reservation:principal:{}".format(service_account_name),
- 'description': "Service {} may reserve Mesos resources with principal={}".format(
- service_account_name, service_account_name),
- 'action': "delete",
+ "user": service_account_name,
+ "acl": "dcos:mesos:master:reservation:principal:{}".format(service_account_name),
+ "description": "Service {} may reserve Mesos resources with principal={}".format(
+ service_account_name, service_account_name
+ ),
+ "action": "delete",
},
-
# volume permissions
{
- 'user': service_account_name,
- 'acl': "dcos:mesos:master:volume:role:{}".format(role),
- 'description': "Service {} may create Mesos volumes with role={}".format(
- service_account_name, role)
+ "user": service_account_name,
+ "acl": "dcos:mesos:master:volume:principal:{}".format(service_account_name),
+ "description": "Service {} may create Mesos volumes with principal={}".format(
+ service_account_name, service_account_name
+ ),
+ "action": "delete",
},
- {
- 'user': service_account_name,
- 'acl': "dcos:mesos:master:volume:principal:{}".format(service_account_name),
- 'description': "Service {} may create Mesos volumes with principal={}".format(
- service_account_name, service_account_name),
- 'action': "delete",
- }]
+ ]
-def grant_permissions(linux_user: str, role_name: str, service_account_name: str) -> None:
+def grant_permissions(
+ linux_user: str,
+ role_name: str = "",
+ role_list: typing.List[str] = [],
+ service_account_name: str = "",
+) -> None:
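+ # role_name preserves the original single-role signature; role_list grants the same
+ # role-scoped permissions for every role in the list (e.g. group roles plus per-service roles).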
log.info("Granting permissions to {account}".format(account=service_account_name))
- permissions = get_permissions(service_account_name, role_name, linux_user)
+
+ permissions = get_permissions(service_account_name, linux_user)
+
+ if not role_list:
+ role_list = [role_name]
+
+ for role in role_list:
+ permissions += get_role_permissions(service_account_name, role)
+
for permission in permissions:
_grant(**permission)
+
log.info("Permission setup completed for {account}".format(account=service_account_name))
-def revoke_permissions(linux_user: str, role_name: str, service_account_name: str) -> None:
+def revoke_permissions(
+ linux_user: str,
+ service_account_name: str,
+ role_name: str = "",
+ role_list: typing.List[str] = [],
+) -> None:
log.info("Revoking permissions to {account}".format(account=service_account_name))
- permissions = get_permissions(service_account_name, role_name, linux_user)
+
+ permissions = get_permissions(service_account_name, linux_user)
+
+ if not role_list:
+ role_list = [role_name]
+
+ for role in role_list:
+ permissions += get_role_permissions(service_account_name, role)
+
for permission in permissions:
_revoke(**permission)
+
log.info("Permission cleanup completed for {account}".format(account=service_account_name))
@@ -153,36 +199,49 @@ def create_service_account(service_account_name: str, service_account_secret: st
"""
install_enterprise_cli()
- log.info('Creating service account for account={account} secret={secret}'.format(
- account=service_account_name,
- secret=service_account_secret))
+ log.info(
+ "Creating service account for account={account} secret={secret}".format(
+ account=service_account_name, secret=service_account_secret
+ )
+ )
if service_account_secret == service_account_name:
log.warning("Values for service_account_name and service_account_secret are the same.")
- log.info('Remove any existing service account and/or secret')
+ log.info("Remove any existing service account and/or secret")
delete_service_account(service_account_name, service_account_secret)
with tempfile.TemporaryDirectory() as tmp_dir:
private_key_file = os.path.join(tmp_dir, "private-key.pem")
public_key_file = os.path.join(tmp_dir, "public-key.pem")
- log.info('Create keypair')
- sdk_cmd.run_cli('security org service-accounts keypair {} {}'.format(private_key_file, public_key_file))
+ log.info("Create keypair")
+ sdk_cmd.run_cli(
+ "security org service-accounts keypair {} {}".format(private_key_file, public_key_file)
+ )
- log.info('Create service account')
- sdk_cmd.run_cli('security org service-accounts create -p {public_key} '
- '-d "Service account for integration tests" "{account}"'.format(public_key=public_key_file,
- account=service_account_name))
+ log.info("Create service account")
+ sdk_cmd.run_cli(
+ "security org service-accounts create -p {public_key} "
+ '-d "Service account for integration tests" "{account}"'.format(
+ public_key=public_key_file, account=service_account_name
+ )
+ )
- log.info('Create secret')
+ log.info("Create secret")
sdk_cmd.run_cli(
'security secrets create-sa-secret --strict "{private_key}" "{account}" "{secret}"'.format(
- private_key=private_key_file, account=service_account_name, secret=service_account_secret))
+ private_key=private_key_file,
+ account=service_account_name,
+ secret=service_account_secret,
+ )
+ )
- log.info('Service account created for account={account} secret={secret}'.format(
- account=service_account_name,
- secret=service_account_secret))
+ log.info(
+ "Service account created for account={account} secret={secret}".format(
+ account=service_account_name, secret=service_account_secret
+ )
+ )
def delete_service_account(service_account_name: str, service_account_secret: str) -> None:
@@ -204,31 +263,43 @@ def delete_secret(secret: str) -> None:
def _get_role_list(service_name: str) -> typing.List[str]:
- # TODO: spark_utils uses:
- # app_id_encoded = urllib.parse.quote(
- # urllib.parse.quote(app_id, safe=''),
- # safe=''
- # )
role_basename = service_name.replace("/", "__")
- return [
+
+ roles = []
+
+ # Grant a role for each group component in the service path.
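+ # e.g. for service_name "dev/spark-00" this adds the group role "dev" ahead of the per-service roles.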
+ path_components = service_name.split("/")
+ role_path = ""
+ for component in path_components[:-1]:
+ if role_path != "":
+ role_path += "__"
+ role_path += component
+
+ roles.append(role_path)
+
+ return roles + [
"{}-role".format(role_basename),
"slave_public%252F{}-role".format(role_basename),
]
-def setup_security(service_name: str,
- linux_user: str="nobody",
- service_account: str="service-acct",
- service_account_secret: str="secret") -> dict:
+def setup_security(
+ service_name: str,
+ linux_user: str = "nobody",
+ service_account: str = "service-acct",
+ service_account_secret: str = "secret",
+) -> dict:
- create_service_account(service_account_name=service_account,
- service_account_secret=service_account_secret)
+ create_service_account(
+ service_account_name=service_account, service_account_secret=service_account_secret
+ )
- service_account_info = {"name": service_account,
- "secret": service_account_secret,
- "linux_user": linux_user,
- "roles": []
- }
+ service_account_info = {
+ "name": service_account,
+ "secret": service_account_secret,
+ "linux_user": linux_user,
+ "roles": [],
+ }
if not sdk_utils.is_strict_mode():
log.info("Skipping strict-mode security setup on non-strict cluster")
@@ -238,20 +309,18 @@ def setup_security(service_name: str,
service_account_info["roles"] = _get_role_list(service_name)
- for role_name in service_account_info["roles"]:
- grant_permissions(
- linux_user=linux_user,
- role_name=role_name,
- service_account_name=service_account
- )
+ grant_permissions(
+ linux_user=linux_user,
+ role_list=service_account_info["roles"],
+ service_account_name=service_account,
+ )
log.info("Finished setting up strict-mode security")
return service_account_info
-def cleanup_security(service_name: str,
- service_account_info: typing.Dict) -> None:
+def cleanup_security(service_name: str, service_account_info: typing.Dict) -> None:
service_account = service_account_info.get("name", "service-acct")
service_account_secret = service_account_info.get("secret", "secret")
@@ -262,19 +331,18 @@ def cleanup_security(service_name: str,
roles = service_account_info.get("roles", _get_role_list(service_name))
linux_user = service_account_info.get("linux_user", "nobody")
- for role_name in roles:
- revoke_permissions(
- linux_user=linux_user,
- role_name=role_name,
- service_account_name=service_account
- )
+ revoke_permissions(
+ linux_user=linux_user, role_list=roles, service_account_name=service_account
+ )
delete_service_account(service_account, service_account_secret)
log.info("Finished cleaning up strict-mode security")
-def security_session(framework_name: str, service_account: str="service-acct", secret: str="secret") -> None:
+def security_session(
+ framework_name: str, service_account: str = "service-acct", secret: str = "secret"
+) -> None:
"""Create a service account and configure permissions for strict-mode tests.
This should generally be used as a fixture in a framework's conftest.py:
@@ -286,9 +354,11 @@ def configure_security(configure_universe):
try:
is_strict = sdk_utils.is_strict_mode()
if is_strict:
- service_account_info = setup_security(service_name=framework_name,
- service_account=service_account,
- service_account_secret=secret)
+ service_account_info = setup_security(
+ service_name=framework_name,
+ service_account=service_account,
+ service_account_secret=secret,
+ )
yield
finally:
if is_strict:
@@ -297,23 +367,31 @@ def configure_security(configure_universe):
def openssl_ciphers():
return set(
- check_output(['openssl', 'ciphers',
- 'ALL:eNULL']).decode('utf-8').rstrip().split(':'))
-
-
-def is_cipher_enabled(service_name: str,
- task_name: str,
- cipher: str,
- endpoint: str,
- openssl_timeout: str = '1') -> bool:
- @retrying.retry(stop_max_attempt_number=3,
- wait_fixed=2000,
- retry_on_result=lambda result: 'Failed to enter mount namespace' in result)
+ check_output(["openssl", "ciphers", "ALL:eNULL"]).decode("utf-8").rstrip().split(":")
+ )
+
+
+def is_cipher_enabled(
+ service_name: str, task_name: str, cipher: str, endpoint: str, openssl_timeout: str = "1"
+) -> bool:
+ @retrying.retry(
+ stop_max_attempt_number=3,
+ wait_fixed=2000,
+ retry_on_result=lambda result: "Failed to enter mount namespace" in result,
+ )
def run_openssl_command() -> str:
- command = ' '.join([
- 'timeout', openssl_timeout,
- 'openssl', 's_client', '-cipher', cipher, '-connect', endpoint
- ])
+ command = " ".join(
+ [
+ "timeout",
+ openssl_timeout,
+ "openssl",
+ "s_client",
+ "-cipher",
+ cipher,
+ "-connect",
+ endpoint,
+ ]
+ )
_, output = sdk_cmd.service_task_exec(service_name, task_name, command, True)
return output