diff --git a/scale-tests/batch_test.py b/scale-tests/batch_test.py index 1862b5a0..e085708f 100755 --- a/scale-tests/batch_test.py +++ b/scale-tests/batch_test.py @@ -10,6 +10,7 @@ Options: --docker-image docker image to run on executors + --group-role root-level group to apply quotas against (e.g. '/dev') [default: None] --max-num-dispatchers maximum number of dispatchers to use from dispatchers file --submits-per-min number of jobs to submit per minute [default: 1] --spark-cores-max max executor cores per job [default: 1] @@ -24,13 +25,15 @@ """ +from docopt import docopt +from threading import Thread + import json import logging +import os import random import sys import time -from docopt import docopt -from threading import Thread import typing import sdk_utils @@ -48,9 +51,10 @@ logging.basicConfig( - format='[%(asctime)s|%(name)s|%(levelname)s]: %(message)s', + format="[%(asctime)s|%(name)s|%(levelname)s]: %(message)s", level=logging.INFO, - stream=sys.stdout) + stream=sys.stdout, +) log = logging.getLogger(__name__) MONTE_CARLO_APP_URL = "https://raw.githubusercontent.com/mesosphere/spark-build/master/scale-tests/apps/monte-carlo-portfolio.py" @@ -76,15 +80,19 @@ def _get_duration() -> int: def _get_gpu_user_conf(args): def _verify_required_args(): - if not (args["--spark-mesos-max-gpus"] and - args["--spark-mesos-executor-gpus"] and - args["--docker-image"]): - log.error(""" + if not ( + args["--spark-mesos-max-gpus"] + and args["--spark-mesos-executor-gpus"] + and args["--docker-image"] + ): + log.error( + """ Missing required arguments for running gpu jobs. Please include: --spark-mesos-max-gpus --spark-mesos-executor-gpus --docker-image - """) + """ + ) _verify_required_args() @@ -92,38 +100,61 @@ def _verify_required_args(): # This is due to memory being divvied up and allocated to each GPU device. 
memory_multiplier = 20 memory = int(args["--spark-mesos-executor-gpus"]) * memory_multiplier - return ["--conf", "spark.driver.memory={}g".format(str(memory)), - "--conf", "spark.executor.memory={}g".format(str(memory)), - "--conf", "spark.mesos.gpus.max={}".format(args["--spark-mesos-max-gpus"]), - "--conf", "spark.mesos.executor.gpus={}".format(args["--spark-mesos-executor-gpus"]), - "--conf", "spark.mesos.executor.docker.image={}".format(args["--docker-image"]), - "--conf", "spark.mesos.executor.docker.forcePullImage=false" - ] - - -def submit_job(app_url: str, app_args: str, dispatcher: typing.Dict, duration: int, config: typing.List[str]): + return [ + "--conf", + "spark.driver.memory={}g".format(str(memory)), + "--conf", + "spark.executor.memory={}g".format(str(memory)), + "--conf", + "spark.mesos.gpus.max={}".format(args["--spark-mesos-max-gpus"]), + "--conf", + "spark.mesos.executor.gpus={}".format(args["--spark-mesos-executor-gpus"]), + "--conf", + "spark.mesos.executor.docker.image={}".format(args["--docker-image"]), + "--conf", + "spark.mesos.executor.docker.forcePullImage=false", + ] + + +def submit_job( + app_url: str, + app_args: str, + dispatcher: typing.Dict, + duration: int, + config: typing.List[str], + group_role: str, +): dispatcher_name = dispatcher["service"]["name"] log.info("Submitting job to dispatcher: %s, with duration: %s min.", dispatcher_name, duration) + driver_role = None if group_role else dispatcher["roles"]["executors"] + spark_utils.submit_job( service_name=dispatcher_name, app_url=app_url, app_args=app_args, verbose=False, args=config, - driver_role=dispatcher["roles"]["executors"], + driver_role=driver_role, spark_user=dispatcher["service"]["user"] if sdk_utils.is_strict_mode() else None, - principal=dispatcher["service"]["service_account"] if sdk_utils.is_strict_mode() else None) + principal=dispatcher["service"]["service_account"] if sdk_utils.is_strict_mode() else None, + ) -def submit_loop(app_url: str, submits_per_min: int, dispatchers: typing.List[typing.Dict], user_conf: typing.List[str]): +def submit_loop( + app_url: str, + submits_per_min: int, + dispatchers: typing.List[typing.Dict], + user_conf: typing.List[str], + group_role: str, +): sec_between_submits = 60 / submits_per_min log.info("sec_between_submits: %s", sec_between_submits) num_dispatchers = len(dispatchers) log.info("num_dispatchers: %s", num_dispatchers) dispatcher_index = 0 - while(True): + while True: duration = _get_duration() if app_url == MONTE_CARLO_APP_URL: @@ -131,7 +162,17 @@ def submit_loop(app_url: str, submits_per_min: int, dispatchers: typing.List[typ else: app_args = "550 3" # 550 images in 3 batches - t = Thread(target=submit_job, args=(app_url, app_args, dispatchers[dispatcher_index], duration, user_conf)) + t = Thread( + target=submit_job, + args=( + app_url, + app_args, + dispatchers[dispatcher_index], + duration, + user_conf, + group_role, + ), + ) t.start() dispatcher_index = (dispatcher_index + 1) % num_dispatchers log.info("sleeping %s sec.", sec_between_submits) @@ -151,35 +192,57 @@ def submit_loop(app_url: str, submits_per_min: int, dispatchers: typing.List[typ if end <= len(dispatchers): dispatchers = dispatchers[0:end] else: - log.warning(""" + log.warning( + """ Specified --max-num-dispatchers is greater than actual dispatcher count in {}. Using list of dispatchers from file instead. 
- """.format(args[""])) - - user_conf = ["--conf", "spark.cores.max={}".format(args["--spark-cores-max"]), - "--conf", "spark.executor.cores={}".format(args["--spark-executor-cores"]), - "--conf", "spark.mesos.containerizer={}".format(args["--spark-mesos-containerizer"]), - "--conf", "spark.port.maxRetries={}".format(args["--spark-port-max-retries"]), - "--conf", "spark.mesos.driver.failoverTimeout={}".format(args["--spark-mesos-driver-failover-timeout"]) - ] + """.format( + args[""] + ) + ) + + user_conf = [ + "--conf", + "spark.cores.max={}".format(args["--spark-cores-max"]), + "--conf", + "spark.executor.cores={}".format(args["--spark-executor-cores"]), + "--conf", + "spark.mesos.containerizer={}".format(args["--spark-mesos-containerizer"]), + "--conf", + "spark.port.maxRetries={}".format(args["--spark-port-max-retries"]), + "--conf", + "spark.mesos.driver.failoverTimeout={}".format( + args["--spark-mesos-driver-failover-timeout"] + ), + ] if args["--spark-mesos-executor-gpus"]: user_conf += _get_gpu_user_conf(args) MEMORY_MULTIPLIER = 20 memory = int(args["--spark-mesos-executor-gpus"]) * MEMORY_MULTIPLIER - user_conf += ["--conf", "spark.driver.memory={}g".format(str(memory)), - "--conf", "spark.executor.memory={}g".format(str(memory)), - "--conf", "spark.mesos.gpus.max={}".format(args["--spark-mesos-max-gpus"]), - "--conf", "spark.mesos.executor.gpus={}".format(args["--spark-mesos-executor-gpus"]), - "--conf", "spark.mesos.executor.docker.image={}".format(args["--docker-image"]), - "--conf", "spark.mesos.executor.docker.forcePullImage=false" - ] + user_conf += [ + "--conf", + "spark.driver.memory={}g".format(str(memory)), + "--conf", + "spark.executor.memory={}g".format(str(memory)), + "--conf", + "spark.mesos.gpus.max={}".format(args["--spark-mesos-max-gpus"]), + "--conf", + "spark.mesos.executor.gpus={}".format(args["--spark-mesos-executor-gpus"]), + "--conf", + "spark.mesos.executor.docker.image={}".format(args["--docker-image"]), + "--conf", + "spark.mesos.executor.docker.forcePullImage=false", + ] app_url = GPU_IMAGE_RECOGNITION_APP_URL else: app_url = MONTE_CARLO_APP_URL if args["--spark-mesos-driver-labels"] is not None: - user_conf += ["--conf", "spark.mesos.driver.labels={}".format(args["--spark-mesos-driver-labels"])] + user_conf += [ + "--conf", + "spark.mesos.driver.labels={}".format(args["--spark-mesos-driver-labels"]), + ] if not args["--no-supervise"]: user_conf += ["--supervise"] @@ -188,4 +251,6 @@ def submit_loop(app_url: str, submits_per_min: int, dispatchers: typing.List[typ end = int(args["--max-num-dispatchers"]) dispatchers = dispatchers[0:end] - submit_loop(app_url, int(args["--submits-per-min"]), dispatchers, user_conf) + group_role = args["--group-role"] + + submit_loop(app_url, int(args["--submits-per-min"]), dispatchers, user_conf, group_role) diff --git a/scale-tests/configs/2020-05-14-mwt25dr.env b/scale-tests/configs/2020-05-14-mwt25dr.env new file mode 100644 index 00000000..e3253a76 --- /dev/null +++ b/scale-tests/configs/2020-05-14-mwt25dr.env @@ -0,0 +1,168 @@ +# Depends on: +# - TEST_NAME +# - TEST_S3_BUCKET +# - TEST_S3_FOLDER + +# Workload configuration ####################################################### +# +# Total CPU quota: 88 +# Total MEM quota: 200000 +# Total GPU quota: 40 + +CLUSTER_URL="https://mw25dry.scaletesting.mesosphe.re/" +SECURITY="strict" + +DCOS_CLI_URL="https://downloads.dcos.io/cli/releases/binaries/dcos/linux/x86-64/latest/dcos" + +# Test configuration ########################################################### + 
+SHOULD_INSTALL_INFRASTRUCTURE=true +SHOULD_INSTALL_NON_GPU_DISPATCHERS=true +SHOULD_INSTALL_GPU_DISPATCHERS=false +SHOULD_RUN_FINITE_STREAMING_JOBS=true +SHOULD_RUN_INFINITE_STREAMING_JOBS=true +SHOULD_RUN_BATCH_JOBS=true +SHOULD_RUN_GPU_BATCH_JOBS=false +SHOULD_UNINSTALL_INFRASTRUCTURE_AT_THE_END=false + +# Infrastructure configuration ################################################# + +GROUP_NAME="${TEST_NAME}" + +SERVICE_NAMES_PREFIX="${TEST_NAME}/" +INFRASTRUCTURE_OUTPUT_FILE="infrastructure.json" + +KAFKA_CLUSTER_COUNT=1 +CASSANDRA_CLUSTER_COUNT=1 + +ZOOKEEPER_CPUS=10 +ZOOKEEPER_MEM=20000 +ZOOKEEPER_CONFIG='scale-tests/configs/kafka-zookeeper-options.json' +# Note: empty package repo values will default to latest Universe packages. +ZOOKEEPER_PACKAGE_REPO= +# 2.7.0-5.1.2e from the Universe. + +KAFKA_CPUS=10 +KAFKA_MEM=20000 +KAFKA_CONFIG='scale-tests/configs/kafka-options.json' +# Note: empty package repo values will default to latest Universe packages. +KAFKA_PACKAGE_REPO= +# 2.9.0-5.4.0 from the Universe. + +CASSANDRA_CPUS=10 +CASSANDRA_MEM=20000 +CASSANDRA_CONFIG='scale-tests/configs/cassandra-options.json' +# Note: empty package repo values will default to latest Universe packages. +CASSANDRA_PACKAGE_REPO= +# 2.9.0-3.11.6 from the Universe. + +# DSEngine configuration ####################################################### + +DSENGINE_CPUS=10 +DSENGINE_MEM=20000 +DSENGINE_GPUS=40 +DSENGINE_PACKAGE_REPO= + +# Spark configuration ########################################################## + +SPARK_CONFIG='scale-tests/configs/spark-options.json' + +# Note: empty package repo values will default to latest Universe packages. +# Spark version 2.10.0-2.4.5 +SPARK_PACKAGE_REPO=https://infinity-artifacts.s3.amazonaws.com/permanent/spark/2.10.0-2.4.5/stub-universe-spark.json + +# Note: leaving the Spark executor Docker image empty so that executors inherit +# the image used for dispatchers. +SPARK_EXECUTOR_DOCKER_IMAGE= + +# Non-GPU Spark dispatchers configuration ###################################### + +# Not currently used. +BATCH_MAX_NON_GPU_JOBS=30 + +SPARK_NON_GPU_DISPATCHERS=3 +SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE="non-gpu-dispatchers.out" +# Note: this name is built internally by the deploy-dispatchers.py script. +SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json" +# Note: driver resources used per dispatcher (1 dispatcher will be able to run +# 8 drivers since each driver requires 1 CPU). +SPARK_NON_GPU_QUOTA_DRIVERS_CPUS=8 +SPARK_NON_GPU_QUOTA_DRIVERS_MEM=20000 +# Note: executor resources used per job (1 driver will run 1 job). +SPARK_NON_GPU_QUOTA_EXECUTORS_CPUS=8 +SPARK_NON_GPU_QUOTA_EXECUTORS_MEM=20000 + +# GPU Spark dispatchers configuration ########################################## + +# Not currently used. +BATCH_MAX_GPU_JOBS=2 + +SPARK_GPU_DISPATCHERS=0 +SPARK_GPU_DISPATCHERS_OUTPUT_FILE="gpu-dispatchers.out" +SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json" # NOTE: this name is built internally by the deploy-dispatchers.py script. 
+SPARK_GPU_QUOTA_DRIVERS_CPUS= +SPARK_GPU_QUOTA_DRIVERS_MEM= +SPARK_GPU_QUOTA_DRIVERS_GPUS= +SPARK_GPU_QUOTA_EXECUTORS_CPUS= +SPARK_GPU_QUOTA_EXECUTORS_MEM= +SPARK_GPU_QUOTA_EXECUTORS_GPUS= + +# Common streaming jobs configuration ########################################## + +TEST_ASSEMBLY_JAR_URL='http://infinity-artifacts.s3.amazonaws.com/scale-tests/dcos-spark-scala-tests-assembly-2.4.0-20190325.jar' +DISPATCHERS_JSON_OUTPUT_FILE="all-dispatchers.json" + +# Finite streaming jobs configuration ########################################## + +STREAMING_FINITE_SUBMISSIONS_OUTPUT_FILE="finite-submissions.out" +STREAMING_FINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 3 dispatchers -> 3 producers. +STREAMING_FINITE_CONSUMERS_PER_PRODUCER=1 # 3 producers -> 3 consumers. +# 3 producers + 3 consumers = 6 total finite streaming jobs +STREAMING_FINITE_PRODUCER_NUMBER_OF_WORDS=7692 +STREAMING_FINITE_PRODUCER_WORDS_PER_SECOND=1 +# 7692 words / 1 word per second -> ~2h runtime. +STREAMING_FINITE_PRODUCER_SPARK_CORES_MAX=2 +STREAMING_FINITE_PRODUCER_SPARK_EXECUTOR_CORES=2 +STREAMING_FINITE_CONSUMER_BATCH_SIZE_SECONDS=10 +STREAMING_FINITE_CONSUMER_SPARK_CORES_MAX=1 +STREAMING_FINITE_CONSUMER_SPARK_EXECUTOR_CORES=1 + +# Infinite streaming jobs configuration ######################################## + +STREAMING_INFINITE_SUBMISSIONS_OUTPUT_FILE="infinite-submissions.out" +STREAMING_INFINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 3 dispatchers -> 3 producers. +STREAMING_INFINITE_CONSUMERS_PER_PRODUCER=1 # 3 producers -> 3 consumers. +# 3 producers + 3 consumers = 6 total infinite streaming jobs +STREAMING_INFINITE_PRODUCER_NUMBER_OF_WORDS=0 +STREAMING_INFINITE_PRODUCER_WORDS_PER_SECOND=1 +STREAMING_INFINITE_PRODUCER_SPARK_CORES_MAX=2 +STREAMING_INFINITE_PRODUCER_SPARK_EXECUTOR_CORES=2 +STREAMING_INFINITE_CONSUMER_BATCH_SIZE_SECONDS=10 +STREAMING_INFINITE_CONSUMER_SPARK_CORES_MAX=1 +STREAMING_INFINITE_CONSUMER_SPARK_EXECUTOR_CORES=1 + +# Batch jobs configuration ##################################################### + +SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" + +BATCH_APP_ID="/${SERVICE_NAMES_PREFIX}batch-workload" +BATCH_SCRIPT_CPUS=6 +BATCH_SCRIPT_MEM=12288 +BATCH_SUBMITS_PER_MIN=13 +# TODO: update to master for the next MWT. 
+BATCH_SPARK_BUILD_BRANCH="dcos-58437-deploy-workloads-under-role-enforced-group" + +# Batch GPU jobs configuration ################################################# + +SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" + +GPU_APP_ID="/${SERVICE_NAMES_PREFIX}gpu-batch-workload" +GPU_SCRIPT_CPUS=2 +GPU_SCRIPT_MEM=4096 +GPU_DOCKER_IMAGE='samvantran/spark-dcos-gpu:metrics' +GPU_SUBMITS_PER_MIN=5 +GPU_MAX_DISPATCHERS=${SPARK_GPU_DISPATCHERS} +GPU_SPARK_CORES_MAX=4 +GPU_SPARK_MESOS_EXECUTOR_GPUS=4 +GPU_SPARK_MESOS_MAX_GPUS=4 +GPU_SPARK_BUILD_BRANCH=master diff --git a/scale-tests/configs/2020-05-20-mwt25.env b/scale-tests/configs/2020-05-20-mwt25.env new file mode 100644 index 00000000..5395a411 --- /dev/null +++ b/scale-tests/configs/2020-05-20-mwt25.env @@ -0,0 +1,168 @@ +# Depends on: +# - TEST_NAME +# - TEST_S3_BUCKET +# - TEST_S3_FOLDER + +# Workload configuration ####################################################### +# +# Total CPU quota: 2290 +# Total MEM quota: 4580000 +# Total GPU quota: 40 + +CLUSTER_URL="https://mw25dry.scaletesting.mesosphe.re/" +SECURITY="strict" + +DCOS_CLI_URL="https://downloads.dcos.io/cli/releases/binaries/dcos/linux/x86-64/latest/dcos" + +# Test configuration ########################################################### + +SHOULD_INSTALL_INFRASTRUCTURE=true +SHOULD_INSTALL_NON_GPU_DISPATCHERS=true +SHOULD_INSTALL_GPU_DISPATCHERS=false +SHOULD_RUN_FINITE_STREAMING_JOBS=true +SHOULD_RUN_INFINITE_STREAMING_JOBS=true +SHOULD_RUN_BATCH_JOBS=true +SHOULD_RUN_GPU_BATCH_JOBS=false +SHOULD_UNINSTALL_INFRASTRUCTURE_AT_THE_END=false + +# Infrastructure configuration ################################################# + +GROUP_NAME="${TEST_NAME}" + +SERVICE_NAMES_PREFIX="${TEST_NAME}/" +INFRASTRUCTURE_OUTPUT_FILE="infrastructure.json" + +KAFKA_CLUSTER_COUNT=1 +CASSANDRA_CLUSTER_COUNT=1 + +ZOOKEEPER_CPUS=10 +ZOOKEEPER_MEM=20000 +ZOOKEEPER_CONFIG='scale-tests/configs/kafka-zookeeper-options.json' +# Note: empty package repo values will default to latest Universe packages. +ZOOKEEPER_PACKAGE_REPO= +# 2.7.0-5.1.2e from the Universe. + +KAFKA_CPUS=10 +KAFKA_MEM=20000 +KAFKA_CONFIG='scale-tests/configs/kafka-options.json' +# Note: empty package repo values will default to latest Universe packages. +KAFKA_PACKAGE_REPO= +# 2.9.0-5.4.0 from the Universe. + +CASSANDRA_CPUS=10 +CASSANDRA_MEM=20000 +CASSANDRA_CONFIG='scale-tests/configs/cassandra-options.json' +# Note: empty package repo values will default to latest Universe packages. +CASSANDRA_PACKAGE_REPO= +# 2.9.0-3.11.6 from the Universe. + +# DSEngine configuration ####################################################### + +DSENGINE_CPUS=10 +DSENGINE_MEM=20000 +DSENGINE_GPUS=40 +DSENGINE_PACKAGE_REPO= + +# Spark configuration ########################################################## + +SPARK_CONFIG='scale-tests/configs/spark-options.json' + +# Note: empty package repo values will default to latest Universe packages. +# Spark version 2.10.0-2.4.5 +SPARK_PACKAGE_REPO=https://infinity-artifacts.s3.amazonaws.com/permanent/spark/2.10.0-2.4.5/stub-universe-spark.json + +# Note: leaving the Spark executor Docker image empty so that executors inherit +# the image used for dispatchers. +SPARK_EXECUTOR_DOCKER_IMAGE= + +# Non-GPU Spark dispatchers configuration ###################################### + +# Not currently used. 
+BATCH_MAX_NON_GPU_JOBS=1000 + +SPARK_NON_GPU_DISPATCHERS=50 +SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE="non-gpu-dispatchers.out" +# Note: this name is built internally by the deploy-dispatchers.py script. +SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json" +# Note: driver resources used per dispatcher (1 dispatcher will be able to run +# 20 drivers since each driver requires 1 CPU). +SPARK_NON_GPU_QUOTA_DRIVERS_CPUS=20 +SPARK_NON_GPU_QUOTA_DRIVERS_MEM=50000 +# Note: executor resources used per job (1 driver will run 1 job). +SPARK_NON_GPU_QUOTA_EXECUTORS_CPUS=25 +SPARK_NON_GPU_QUOTA_EXECUTORS_MEM=40000 + +# GPU Spark dispatchers configuration ########################################## + +# Not currently used. +BATCH_MAX_GPU_JOBS=10 + +SPARK_GPU_DISPATCHERS=0 +SPARK_GPU_DISPATCHERS_OUTPUT_FILE="gpu-dispatchers.out" +SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json" # NOTE: this name is built internally by the deploy-dispatchers.py script. +SPARK_GPU_QUOTA_DRIVERS_CPUS= +SPARK_GPU_QUOTA_DRIVERS_MEM= +SPARK_GPU_QUOTA_DRIVERS_GPUS= +SPARK_GPU_QUOTA_EXECUTORS_CPUS= +SPARK_GPU_QUOTA_EXECUTORS_MEM= +SPARK_GPU_QUOTA_EXECUTORS_GPUS= +SPARK_GPU_REMOVE_EXECUTORS_ROLES_QUOTAS=true +# Common streaming jobs configuration ########################################## + +TEST_ASSEMBLY_JAR_URL='http://infinity-artifacts.s3.amazonaws.com/scale-tests/dcos-spark-scala-tests-assembly-2.4.0-20190325.jar' +DISPATCHERS_JSON_OUTPUT_FILE="all-dispatchers.json" + +# Finite streaming jobs configuration ########################################## + +STREAMING_FINITE_SUBMISSIONS_OUTPUT_FILE="finite-submissions.out" +STREAMING_FINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 50 dispatchers -> 50 producers. +STREAMING_FINITE_CONSUMERS_PER_PRODUCER=1 # 50 producers -> 50 consumers. +# 50 producers + 50 consumers = 100 total finite streaming jobs +STREAMING_FINITE_PRODUCER_NUMBER_OF_WORDS=7692 +STREAMING_FINITE_PRODUCER_WORDS_PER_SECOND=1 +# 7692 words / 1 word per second -> ~2h runtime. +STREAMING_FINITE_PRODUCER_SPARK_CORES_MAX=2 +STREAMING_FINITE_PRODUCER_SPARK_EXECUTOR_CORES=2 +STREAMING_FINITE_CONSUMER_BATCH_SIZE_SECONDS=10 +STREAMING_FINITE_CONSUMER_SPARK_CORES_MAX=1 +STREAMING_FINITE_CONSUMER_SPARK_EXECUTOR_CORES=1 + +# Infinite streaming jobs configuration ######################################## + +STREAMING_INFINITE_SUBMISSIONS_OUTPUT_FILE="infinite-submissions.out" +STREAMING_INFINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 50 dispatchers -> 50 producers. +STREAMING_INFINITE_CONSUMERS_PER_PRODUCER=1 # 50 producers -> 50 consumers. +# 50 producers + 50 consumers = 100 total infinite streaming jobs +STREAMING_INFINITE_PRODUCER_NUMBER_OF_WORDS=0 +STREAMING_INFINITE_PRODUCER_WORDS_PER_SECOND=1 +STREAMING_INFINITE_PRODUCER_SPARK_CORES_MAX=2 +STREAMING_INFINITE_PRODUCER_SPARK_EXECUTOR_CORES=2 +STREAMING_INFINITE_CONSUMER_BATCH_SIZE_SECONDS=10 +STREAMING_INFINITE_CONSUMER_SPARK_CORES_MAX=1 +STREAMING_INFINITE_CONSUMER_SPARK_EXECUTOR_CORES=1 + +# Batch jobs configuration ##################################################### + +SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" + +BATCH_APP_ID="/${SERVICE_NAMES_PREFIX}batch-workload" +BATCH_SCRIPT_CPUS=6 +BATCH_SCRIPT_MEM=12288 +BATCH_SUBMITS_PER_MIN=13 +# TODO: update to master for the next MWT. 
+BATCH_SPARK_BUILD_BRANCH="dcos-58437-deploy-workloads-under-role-enforced-group" + +# Batch GPU jobs configuration ################################################# + +SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" + +GPU_APP_ID="/${SERVICE_NAMES_PREFIX}gpu-batch-workload" +GPU_SCRIPT_CPUS=2 +GPU_SCRIPT_MEM=4096 +GPU_DOCKER_IMAGE='samvantran/spark-dcos-gpu:metrics' +GPU_SUBMITS_PER_MIN=5 +GPU_MAX_DISPATCHERS=${SPARK_GPU_DISPATCHERS} +GPU_SPARK_CORES_MAX=4 +GPU_SPARK_MESOS_EXECUTOR_GPUS=4 +GPU_SPARK_MESOS_MAX_GPUS=4 +GPU_SPARK_BUILD_BRANCH=master diff --git a/scale-tests/configs/2020-09-14-mwt26dr.env b/scale-tests/configs/2020-09-14-mwt26dr.env new file mode 100644 index 00000000..87bd43bb --- /dev/null +++ b/scale-tests/configs/2020-09-14-mwt26dr.env @@ -0,0 +1,176 @@ +# Depends on: +# - TEST_NAME +# - TEST_S3_BUCKET +# - TEST_S3_FOLDER + +# Workload configuration ####################################################### +# +# Total CPU quota: 88 +# Total MEM quota: 200000 +# Total GPU quota: 40 + +CLUSTER_URL="https://mwt26-dry.scaletesting.mesosphe.re/" +SECURITY="strict" + +DCOS_CLI_URL="https://downloads.dcos.io/cli/releases/binaries/dcos/linux/x86-64/latest/dcos" + +# Test configuration ########################################################### + +SHOULD_INSTALL_INFRASTRUCTURE=true +SHOULD_INSTALL_NON_GPU_DISPATCHERS=true +SHOULD_INSTALL_GPU_DISPATCHERS=false +SHOULD_RUN_FINITE_STREAMING_JOBS=true +SHOULD_RUN_INFINITE_STREAMING_JOBS=true +SHOULD_RUN_BATCH_JOBS=true +SHOULD_RUN_GPU_BATCH_JOBS=false +SHOULD_UNINSTALL_INFRASTRUCTURE_AT_THE_END=false + +# Infrastructure configuration ################################################# + +GROUP_NAME="${TEST_NAME}" + +SERVICE_NAMES_PREFIX="${TEST_NAME}/" +INFRASTRUCTURE_OUTPUT_FILE="infrastructure.json" + +KAFKA_CLUSTER_COUNT=1 +CASSANDRA_CLUSTER_COUNT=1 + +ZOOKEEPER_CPUS=10 +ZOOKEEPER_MEM=20000 +ZOOKEEPER_CONFIG='scale-tests/configs/kafka-zookeeper-options.json' +# Note: empty package repo values will default to latest Universe packages. +ZOOKEEPER_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/confluent-zookeeper/assets/2.8.0-5.5.1-beta/stub-universe-confluent-zookeeper.json' +# 2.8.0-5.5.1-beta from the Universe. + +KAFKA_CPUS=10 +KAFKA_MEM=20000 +KAFKA_CONFIG='scale-tests/configs/kafka-options.json' +KAFKA_USER='root' +# Note: empty package repo values will default to latest Universe packages. +KAFKA_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/confluent-kafka/assets/2.10.0-5.5.1-beta/stub-universe-confluent-kafka.json' +# 2.10.0-5.5.1-beta from the Universe. +KAFKA_PACKAGE_NAME='beta-confluent-kafka' + +CASSANDRA_CPUS=10 +CASSANDRA_MEM=20000 +CASSANDRA_CONFIG='scale-tests/configs/cassandra-options.json' +CASSANDRA_USER='root' +# Note: empty package repo values will default to latest Universe packages. +CASSANDRA_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/cassandra/assets/2.10.0-3.11.6-beta/stub-universe-cassandra.json' +CASSANDRA_PACKAGE_NAME='beta-cassandra' +# 2.10.0-3.11.6-beta from the Universe. 
+ +# DSEngine configuration ####################################################### + +DSENGINE_CPUS=10 +DSENGINE_MEM=20000 +DSENGINE_GPUS=40 +DSENGINE_CONFIG='scale-tests/configs/dsengine-options.json' +# Note: empty package repo values will default to latest Universe packages. +DSENGINE_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/data-science-engine/assets/2.1.0-beta/stub-universe-data-science-engine.json' +DSENGINE_PACKAGE_NAME='beta-data-science-engine' +# 2.1.0-beta from the Universe. + +# Spark configuration ########################################################## + +SPARK_CONFIG='scale-tests/configs/spark-options.json' + +# Note: empty package repo values will default to latest Universe packages. +# Spark version 2.11.0-2.4.6 +SPARK_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/spark/2.11.0-2.4.6/stub-universe-spark.json' + +# Note: leaving the Spark executor Docker image empty so that executors inherit +# the image used for dispatchers. +SPARK_EXECUTOR_DOCKER_IMAGE= + +# Non-GPU Spark dispatchers configuration ###################################### + +# Not currently used. +BATCH_MAX_NON_GPU_JOBS=30 + +SPARK_NON_GPU_DISPATCHERS=3 +SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE="non-gpu-dispatchers.out" +# Note: this name is built internally by the deploy-dispatchers.py script. +SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json" +# Note: driver resources used per dispatcher (1 dispatcher will be able to run +# 8 drivers since each driver requires 1 CPU). +SPARK_NON_GPU_QUOTA_DRIVERS_CPUS=8 +SPARK_NON_GPU_QUOTA_DRIVERS_MEM=20000 +# Note: executor resources used per job (1 driver will run 1 job). +SPARK_NON_GPU_QUOTA_EXECUTORS_CPUS=8 +SPARK_NON_GPU_QUOTA_EXECUTORS_MEM=20000 + +# GPU Spark dispatchers configuration ########################################## + +# Not currently used. +BATCH_MAX_GPU_JOBS=2 + +SPARK_GPU_DISPATCHERS=0 +SPARK_GPU_DISPATCHERS_OUTPUT_FILE="gpu-dispatchers.out" +SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json" # NOTE: this name is built internally by the deploy-dispatchers.py script. +SPARK_GPU_QUOTA_DRIVERS_CPUS= +SPARK_GPU_QUOTA_DRIVERS_MEM= +SPARK_GPU_QUOTA_DRIVERS_GPUS= +SPARK_GPU_QUOTA_EXECUTORS_CPUS= +SPARK_GPU_QUOTA_EXECUTORS_MEM= +SPARK_GPU_QUOTA_EXECUTORS_GPUS= + +# Common streaming jobs configuration ########################################## + +TEST_ASSEMBLY_JAR_URL='http://infinity-artifacts.s3.amazonaws.com/scale-tests/dcos-spark-scala-tests-assembly-2.4.0-20190325.jar' +DISPATCHERS_JSON_OUTPUT_FILE="all-dispatchers.json" + +# Finite streaming jobs configuration ########################################## + +STREAMING_FINITE_SUBMISSIONS_OUTPUT_FILE="finite-submissions.out" +STREAMING_FINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 3 dispatchers -> 3 producers. +STREAMING_FINITE_CONSUMERS_PER_PRODUCER=1 # 3 producers -> 3 consumers. +# 3 producers + 3 consumers = 6 total finite streaming jobs +STREAMING_FINITE_PRODUCER_NUMBER_OF_WORDS=7692 +STREAMING_FINITE_PRODUCER_WORDS_PER_SECOND=1 +# 7692 words / 1 word per second -> ~2h runtime. 
+STREAMING_FINITE_PRODUCER_SPARK_CORES_MAX=2 +STREAMING_FINITE_PRODUCER_SPARK_EXECUTOR_CORES=2 +STREAMING_FINITE_CONSUMER_BATCH_SIZE_SECONDS=10 +STREAMING_FINITE_CONSUMER_SPARK_CORES_MAX=1 +STREAMING_FINITE_CONSUMER_SPARK_EXECUTOR_CORES=1 + +# Infinite streaming jobs configuration ######################################## + +STREAMING_INFINITE_SUBMISSIONS_OUTPUT_FILE="infinite-submissions.out" +STREAMING_INFINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 3 dispatchers -> 3 producers. +STREAMING_INFINITE_CONSUMERS_PER_PRODUCER=1 # 3 producers -> 3 consumers. +# 3 producers + 3 consumers = 6 total infinite streaming jobs +STREAMING_INFINITE_PRODUCER_NUMBER_OF_WORDS=0 +STREAMING_INFINITE_PRODUCER_WORDS_PER_SECOND=1 +STREAMING_INFINITE_PRODUCER_SPARK_CORES_MAX=2 +STREAMING_INFINITE_PRODUCER_SPARK_EXECUTOR_CORES=2 +STREAMING_INFINITE_CONSUMER_BATCH_SIZE_SECONDS=10 +STREAMING_INFINITE_CONSUMER_SPARK_CORES_MAX=1 +STREAMING_INFINITE_CONSUMER_SPARK_EXECUTOR_CORES=1 + +# Batch jobs configuration ##################################################### + +SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" + +BATCH_APP_ID="/${SERVICE_NAMES_PREFIX}batch-workload" +BATCH_SCRIPT_CPUS=6 +BATCH_SCRIPT_MEM=12288 +BATCH_SUBMITS_PER_MIN=13 +# TODO: update to master for the next MWT. +BATCH_SPARK_BUILD_BRANCH="dcos-58437-deploy-workloads-under-role-enforced-group" + +# Batch GPU jobs configuration ################################################# + +SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" + +GPU_APP_ID="/${SERVICE_NAMES_PREFIX}gpu-batch-workload" +GPU_SCRIPT_CPUS=2 +GPU_SCRIPT_MEM=4096 +GPU_DOCKER_IMAGE='samvantran/spark-dcos-gpu:metrics' +GPU_SUBMITS_PER_MIN=5 +GPU_MAX_DISPATCHERS=${SPARK_GPU_DISPATCHERS} +GPU_SPARK_CORES_MAX=4 +GPU_SPARK_MESOS_EXECUTOR_GPUS=4 +GPU_SPARK_MESOS_MAX_GPUS=4 +GPU_SPARK_BUILD_BRANCH=master diff --git a/scale-tests/configs/2020-10-01-mwt26run.env b/scale-tests/configs/2020-10-01-mwt26run.env new file mode 100644 index 00000000..89eb4292 --- /dev/null +++ b/scale-tests/configs/2020-10-01-mwt26run.env @@ -0,0 +1,176 @@ +# Depends on: +# - TEST_NAME +# - TEST_S3_BUCKET +# - TEST_S3_FOLDER + +# Workload configuration ####################################################### +# +# Total CPU quota: 2290 +# Total MEM quota: 4580000 +# Total GPU quota: 40 + +CLUSTER_URL="https://mwt26.scaletesting.mesosphe.re" +SECURITY="strict" + +DCOS_CLI_URL="https://downloads.dcos.io/cli/releases/binaries/dcos/linux/x86-64/latest/dcos" + +# Test configuration ########################################################### + +SHOULD_INSTALL_INFRASTRUCTURE=true +SHOULD_INSTALL_NON_GPU_DISPATCHERS=true +SHOULD_INSTALL_GPU_DISPATCHERS=false +SHOULD_RUN_FINITE_STREAMING_JOBS=true +SHOULD_RUN_INFINITE_STREAMING_JOBS=true +SHOULD_RUN_BATCH_JOBS=true +SHOULD_RUN_GPU_BATCH_JOBS=false +SHOULD_UNINSTALL_INFRASTRUCTURE_AT_THE_END=false + +# Infrastructure configuration ################################################# + +GROUP_NAME="${TEST_NAME}" + +SERVICE_NAMES_PREFIX="${TEST_NAME}/" +INFRASTRUCTURE_OUTPUT_FILE="infrastructure.json" + +KAFKA_CLUSTER_COUNT=1 +CASSANDRA_CLUSTER_COUNT=1 + +ZOOKEEPER_CPUS=10 +ZOOKEEPER_MEM=20000 +ZOOKEEPER_CONFIG='scale-tests/configs/kafka-zookeeper-options.json' +# Note: empty package repo values will default to latest Universe packages. 
+ZOOKEEPER_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/confluent-zookeeper/assets/2.8.0-5.5.1-beta/stub-universe-confluent-zookeeper.json' +# 2.8.0-5.5.1-beta from the Universe. + +KAFKA_CPUS=10 +KAFKA_MEM=20000 +KAFKA_CONFIG='scale-tests/configs/kafka-options.json' +KAFKA_USER='root' +# Note: empty package repo values will default to latest Universe packages. +KAFKA_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/confluent-kafka/assets/2.10.0-5.5.1-beta/stub-universe-confluent-kafka.json' +# 2.10.0-5.5.1-beta from the Universe. +KAFKA_PACKAGE_NAME='beta-confluent-kafka' + +CASSANDRA_CPUS=10 +CASSANDRA_MEM=20000 +CASSANDRA_CONFIG='scale-tests/configs/cassandra-options.json' +CASSANDRA_USER='root' +# Note: empty package repo values will default to latest Universe packages. +CASSANDRA_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/cassandra/assets/2.10.0-3.11.6-beta/stub-universe-cassandra.json' +CASSANDRA_PACKAGE_NAME='beta-cassandra' +# 2.10.0-3.11.6-beta from the Universe. + +# DSEngine configuration ####################################################### + +DSENGINE_CPUS=10 +DSENGINE_MEM=20000 +DSENGINE_GPUS=40 +DSENGINE_CONFIG='scale-tests/configs/dsengine-options.json' +# Note: empty package repo values will default to latest Universe packages. +DSENGINE_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/data-science-engine/assets/2.1.0-beta/stub-universe-data-science-engine.json' +DSENGINE_PACKAGE_NAME='beta-data-science-engine' +# 2.1.0-beta from the Universe. + +# Spark configuration ########################################################## + +SPARK_CONFIG='scale-tests/configs/spark-options.json' + +# Note: empty package repo values will default to latest Universe packages. +# Spark version 2.11.0-2.4.6 +SPARK_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/spark/2.11.0-2.4.6/stub-universe-spark.json' + +# Note: leaving the Spark executor Docker image empty so that executors inherit +# the image used for dispatchers. +SPARK_EXECUTOR_DOCKER_IMAGE= + +# Non-GPU Spark dispatchers configuration ###################################### + +# Not currently used. +BATCH_MAX_NON_GPU_JOBS=1000 + +SPARK_NON_GPU_DISPATCHERS=50 +SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE="non-gpu-dispatchers.out" +# Note: this name is built internally by the deploy-dispatchers.py script. +SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json" +# Note: driver resources used per dispatcher (1 dispatcher will be able to run +# 20 drivers since each driver requires 1 CPU). +SPARK_NON_GPU_QUOTA_DRIVERS_CPUS=20 +SPARK_NON_GPU_QUOTA_DRIVERS_MEM=50000 +# Note: executor resources used per job (1 driver will run 1 job). +SPARK_NON_GPU_QUOTA_EXECUTORS_CPUS=25 +SPARK_NON_GPU_QUOTA_EXECUTORS_MEM=40000 + +# GPU Spark dispatchers configuration ########################################## + +# Not currently used. +BATCH_MAX_GPU_JOBS=10 + +SPARK_GPU_DISPATCHERS=0 +SPARK_GPU_DISPATCHERS_OUTPUT_FILE="gpu-dispatchers.out" +SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json" # NOTE: this name is built internally by the deploy-dispatchers.py script. 
+SPARK_GPU_QUOTA_DRIVERS_CPUS= +SPARK_GPU_QUOTA_DRIVERS_MEM= +SPARK_GPU_QUOTA_DRIVERS_GPUS= +SPARK_GPU_QUOTA_EXECUTORS_CPUS= +SPARK_GPU_QUOTA_EXECUTORS_MEM= +SPARK_GPU_QUOTA_EXECUTORS_GPUS= +SPARK_GPU_REMOVE_EXECUTORS_ROLES_QUOTAS=true +# Common streaming jobs configuration ########################################## + +TEST_ASSEMBLY_JAR_URL='http://infinity-artifacts.s3.amazonaws.com/scale-tests/dcos-spark-scala-tests-assembly-2.4.0-20190325.jar' +DISPATCHERS_JSON_OUTPUT_FILE="all-dispatchers.json" + +# Finite streaming jobs configuration ########################################## + +STREAMING_FINITE_SUBMISSIONS_OUTPUT_FILE="finite-submissions.out" +STREAMING_FINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 50 dispatchers -> 50 producers. +STREAMING_FINITE_CONSUMERS_PER_PRODUCER=1 # 50 producers -> 50 consumers. +# 50 producers + 50 consumers = 100 total finite streaming jobs +STREAMING_FINITE_PRODUCER_NUMBER_OF_WORDS=7692 +STREAMING_FINITE_PRODUCER_WORDS_PER_SECOND=1 +# 7692 words / 1 word per second -> ~2h runtime. +STREAMING_FINITE_PRODUCER_SPARK_CORES_MAX=2 +STREAMING_FINITE_PRODUCER_SPARK_EXECUTOR_CORES=2 +STREAMING_FINITE_CONSUMER_BATCH_SIZE_SECONDS=10 +STREAMING_FINITE_CONSUMER_SPARK_CORES_MAX=1 +STREAMING_FINITE_CONSUMER_SPARK_EXECUTOR_CORES=1 + +# Infinite streaming jobs configuration ######################################## + +STREAMING_INFINITE_SUBMISSIONS_OUTPUT_FILE="infinite-submissions.out" +STREAMING_INFINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 50 dispatchers -> 50 producers. +STREAMING_INFINITE_CONSUMERS_PER_PRODUCER=1 # 50 producers -> 50 consumers. +# 50 producers + 50 consumers = 100 total infinite streaming jobs +STREAMING_INFINITE_PRODUCER_NUMBER_OF_WORDS=0 +STREAMING_INFINITE_PRODUCER_WORDS_PER_SECOND=1 +STREAMING_INFINITE_PRODUCER_SPARK_CORES_MAX=2 +STREAMING_INFINITE_PRODUCER_SPARK_EXECUTOR_CORES=2 +STREAMING_INFINITE_CONSUMER_BATCH_SIZE_SECONDS=10 +STREAMING_INFINITE_CONSUMER_SPARK_CORES_MAX=1 +STREAMING_INFINITE_CONSUMER_SPARK_EXECUTOR_CORES=1 + +# Batch jobs configuration ##################################################### + +SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" + +BATCH_APP_ID="/${SERVICE_NAMES_PREFIX}batch-workload" +BATCH_SCRIPT_CPUS=6 +BATCH_SCRIPT_MEM=12288 +BATCH_SUBMITS_PER_MIN=13 +# TODO: update to master for the next MWT. 
+BATCH_SPARK_BUILD_BRANCH="mwt-26" + +# Batch GPU jobs configuration ################################################# + +SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" + +GPU_APP_ID="/${SERVICE_NAMES_PREFIX}gpu-batch-workload" +GPU_SCRIPT_CPUS=2 +GPU_SCRIPT_MEM=4096 +GPU_DOCKER_IMAGE='samvantran/spark-dcos-gpu:metrics' +GPU_SUBMITS_PER_MIN=5 +GPU_MAX_DISPATCHERS=${SPARK_GPU_DISPATCHERS} +GPU_SPARK_CORES_MAX=4 +GPU_SPARK_MESOS_EXECUTOR_GPUS=4 +GPU_SPARK_MESOS_MAX_GPUS=4 +GPU_SPARK_BUILD_BRANCH=master diff --git a/scale-tests/configs/2020-10-20-mwt27dr.env b/scale-tests/configs/2020-10-20-mwt27dr.env new file mode 100644 index 00000000..acf97bd5 --- /dev/null +++ b/scale-tests/configs/2020-10-20-mwt27dr.env @@ -0,0 +1,177 @@ +# Depends on: +# - TEST_NAME +# - TEST_S3_BUCKET +# - TEST_S3_FOLDER + +# Workload configuration ####################################################### +# +# Total CPU quota: 88 +# Total MEM quota: 200000 +# Total GPU quota: 40 + +CLUSTER_URL="https://mwt27-dry.scaletesting.mesosphe.re/" +SECURITY="strict" + +DCOS_CLI_URL="https://downloads.dcos.io/cli/releases/binaries/dcos/linux/x86-64/latest/dcos" + +# Test configuration ########################################################### + +SHOULD_INSTALL_INFRASTRUCTURE=true +SHOULD_INSTALL_NON_GPU_DISPATCHERS=true +SHOULD_INSTALL_GPU_DISPATCHERS=false +SHOULD_RUN_FINITE_STREAMING_JOBS=true +SHOULD_RUN_INFINITE_STREAMING_JOBS=true +SHOULD_RUN_BATCH_JOBS=true +SHOULD_RUN_GPU_BATCH_JOBS=false +SHOULD_UNINSTALL_INFRASTRUCTURE_AT_THE_END=false + +# Infrastructure configuration ################################################# + +GROUP_NAME="${TEST_NAME}" + +SERVICE_NAMES_PREFIX="${TEST_NAME}/" +INFRASTRUCTURE_OUTPUT_FILE="infrastructure.json" + +KAFKA_CLUSTER_COUNT=1 +CASSANDRA_CLUSTER_COUNT=1 + +ZOOKEEPER_CPUS=10 +ZOOKEEPER_MEM=20000 +ZOOKEEPER_CONFIG='scale-tests/configs/kafka-zookeeper-options.json' +# Note: empty package repo values will default to latest Universe packages. +ZOOKEEPER_PACKAGE_REPO= +ZOOKEEPER_PACKAGE_NAME='beta-confluent-zookeeper' +# 2.8.0-5.5.1-beta from the Universe. + +KAFKA_CPUS=10 +KAFKA_MEM=20000 +KAFKA_CONFIG='scale-tests/configs/kafka-options.json' +KAFKA_USER='root' +# Note: empty package repo values will default to latest Universe packages. +KAFKA_PACKAGE_REPO= +# 2.10.0-5.5.1-3-beta from the Universe. +KAFKA_PACKAGE_NAME='beta-confluent-kafka' + +CASSANDRA_CPUS=10 +CASSANDRA_MEM=20000 +CASSANDRA_CONFIG='scale-tests/configs/cassandra-options.json' +CASSANDRA_USER='root' +# Note: empty package repo values will default to latest Universe packages. +CASSANDRA_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/cassandra/assets/2.10.0-3.11.6-beta/stub-universe-cassandra.json' +CASSANDRA_PACKAGE_NAME='beta-cassandra' +# 2.10.0-3.11.6-beta from the Universe. + +# DSEngine configuration ####################################################### + +DSENGINE_CPUS=10 +DSENGINE_MEM=20000 +DSENGINE_GPUS=40 +DSENGINE_CONFIG='scale-tests/configs/dsengine-options.json' +# Note: empty package repo values will default to latest Universe packages. +DSENGINE_PACKAGE_REPO= +DSENGINE_PACKAGE_NAME='beta-data-science-engine' +# 2.1.0-beta from the Universe. 
+ +# Spark configuration ########################################################## + +SPARK_CONFIG='scale-tests/configs/spark-options.json' + +# Note: empty package repo values will default to latest Universe packages. +# Spark version 2.11.0-2.4.6 +SPARK_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/spark/2.11.0-2.4.6/stub-universe-spark.json' + +# Note: leaving the Spark executor Docker image empty so that executors inherit +# the image used for dispatchers. +SPARK_EXECUTOR_DOCKER_IMAGE= + +# Non-GPU Spark dispatchers configuration ###################################### + +# Not currently used. +BATCH_MAX_NON_GPU_JOBS=30 + +SPARK_NON_GPU_DISPATCHERS=3 +SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE="non-gpu-dispatchers.out" +# Note: this name is built internally by the deploy-dispatchers.py script. +SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json" +# Note: driver resources used per dispatcher (1 dispatcher will be able to run +# 8 drivers since each driver requires 1 CPU). +SPARK_NON_GPU_QUOTA_DRIVERS_CPUS=8 +SPARK_NON_GPU_QUOTA_DRIVERS_MEM=20000 +# Note: executor resources used per job (1 driver will run 1 job). +SPARK_NON_GPU_QUOTA_EXECUTORS_CPUS=8 +SPARK_NON_GPU_QUOTA_EXECUTORS_MEM=20000 + +# GPU Spark dispatchers configuration ########################################## + +# Not currently used. +BATCH_MAX_GPU_JOBS=2 + +SPARK_GPU_DISPATCHERS=0 +SPARK_GPU_DISPATCHERS_OUTPUT_FILE="gpu-dispatchers.out" +SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json" # NOTE: this name is built internally by the deploy-dispatchers.py script. +SPARK_GPU_QUOTA_DRIVERS_CPUS= +SPARK_GPU_QUOTA_DRIVERS_MEM= +SPARK_GPU_QUOTA_DRIVERS_GPUS= +SPARK_GPU_QUOTA_EXECUTORS_CPUS= +SPARK_GPU_QUOTA_EXECUTORS_MEM= +SPARK_GPU_QUOTA_EXECUTORS_GPUS= + +# Common streaming jobs configuration ########################################## + +TEST_ASSEMBLY_JAR_URL='http://infinity-artifacts.s3.amazonaws.com/scale-tests/dcos-spark-scala-tests-assembly-2.4.0-20190325.jar' +DISPATCHERS_JSON_OUTPUT_FILE="all-dispatchers.json" + +# Finite streaming jobs configuration ########################################## + +STREAMING_FINITE_SUBMISSIONS_OUTPUT_FILE="finite-submissions.out" +STREAMING_FINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 3 dispatchers -> 3 producers. +STREAMING_FINITE_CONSUMERS_PER_PRODUCER=1 # 3 producers -> 3 consumers. +# 3 producers + 3 consumers = 6 total finite streaming jobs +STREAMING_FINITE_PRODUCER_NUMBER_OF_WORDS=7692 +STREAMING_FINITE_PRODUCER_WORDS_PER_SECOND=1 +# 7692 words / 1 word per second -> ~2h runtime. +STREAMING_FINITE_PRODUCER_SPARK_CORES_MAX=2 +STREAMING_FINITE_PRODUCER_SPARK_EXECUTOR_CORES=2 +STREAMING_FINITE_CONSUMER_BATCH_SIZE_SECONDS=10 +STREAMING_FINITE_CONSUMER_SPARK_CORES_MAX=1 +STREAMING_FINITE_CONSUMER_SPARK_EXECUTOR_CORES=1 + +# Infinite streaming jobs configuration ######################################## + +STREAMING_INFINITE_SUBMISSIONS_OUTPUT_FILE="infinite-submissions.out" +STREAMING_INFINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 3 dispatchers -> 3 producers. +STREAMING_INFINITE_CONSUMERS_PER_PRODUCER=1 # 3 producers -> 3 consumers. 
+# 3 producers + 3 consumers = 6 total infinite streaming jobs +STREAMING_INFINITE_PRODUCER_NUMBER_OF_WORDS=0 +STREAMING_INFINITE_PRODUCER_WORDS_PER_SECOND=1 +STREAMING_INFINITE_PRODUCER_SPARK_CORES_MAX=2 +STREAMING_INFINITE_PRODUCER_SPARK_EXECUTOR_CORES=2 +STREAMING_INFINITE_CONSUMER_BATCH_SIZE_SECONDS=10 +STREAMING_INFINITE_CONSUMER_SPARK_CORES_MAX=1 +STREAMING_INFINITE_CONSUMER_SPARK_EXECUTOR_CORES=1 + +# Batch jobs configuration ##################################################### + +SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" + +BATCH_APP_ID="/${SERVICE_NAMES_PREFIX}batch-workload" +BATCH_SCRIPT_CPUS=6 +BATCH_SCRIPT_MEM=12288 +BATCH_SUBMITS_PER_MIN=13 +# TODO: update to master for the next MWT. +BATCH_SPARK_BUILD_BRANCH="dcos-58437-deploy-workloads-under-role-enforced-group" + +# Batch GPU jobs configuration ################################################# + +SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" + +GPU_APP_ID="/${SERVICE_NAMES_PREFIX}gpu-batch-workload" +GPU_SCRIPT_CPUS=2 +GPU_SCRIPT_MEM=4096 +GPU_DOCKER_IMAGE='samvantran/spark-dcos-gpu:metrics' +GPU_SUBMITS_PER_MIN=5 +GPU_MAX_DISPATCHERS=${SPARK_GPU_DISPATCHERS} +GPU_SPARK_CORES_MAX=4 +GPU_SPARK_MESOS_EXECUTOR_GPUS=4 +GPU_SPARK_MESOS_MAX_GPUS=4 +GPU_SPARK_BUILD_BRANCH=master diff --git a/scale-tests/configs/2020-10-22-mwt27run.env b/scale-tests/configs/2020-10-22-mwt27run.env new file mode 100644 index 00000000..80ca6576 --- /dev/null +++ b/scale-tests/configs/2020-10-22-mwt27run.env @@ -0,0 +1,177 @@ +# Depends on: +# - TEST_NAME +# - TEST_S3_BUCKET +# - TEST_S3_FOLDER + +# Workload configuration ####################################################### +# +# Total CPU quota: 2290 +# Total MEM quota: 4580000 +# Total GPU quota: 40 + +CLUSTER_URL="https://mwt27.scaletesting.mesosphe.re" +SECURITY="strict" + +DCOS_CLI_URL="https://downloads.dcos.io/cli/releases/binaries/dcos/linux/x86-64/latest/dcos" + +# Test configuration ########################################################### + +SHOULD_INSTALL_INFRASTRUCTURE=true +SHOULD_INSTALL_NON_GPU_DISPATCHERS=true +SHOULD_INSTALL_GPU_DISPATCHERS=false +SHOULD_RUN_FINITE_STREAMING_JOBS=true +SHOULD_RUN_INFINITE_STREAMING_JOBS=true +SHOULD_RUN_BATCH_JOBS=true +SHOULD_RUN_GPU_BATCH_JOBS=false +SHOULD_UNINSTALL_INFRASTRUCTURE_AT_THE_END=false + +# Infrastructure configuration ################################################# + +GROUP_NAME="${TEST_NAME}" + +SERVICE_NAMES_PREFIX="${TEST_NAME}/" +INFRASTRUCTURE_OUTPUT_FILE="infrastructure.json" + +KAFKA_CLUSTER_COUNT=1 +CASSANDRA_CLUSTER_COUNT=1 + +ZOOKEEPER_CPUS=10 +ZOOKEEPER_MEM=20000 +ZOOKEEPER_CONFIG='scale-tests/configs/kafka-zookeeper-options.json' +# Note: empty package repo values will default to latest Universe packages. +ZOOKEEPER_PACKAGE_REPO= +ZOOKEEPER_PACKAGE_NAME='beta-confluent-zookeeper' +# 2.8.0-5.5.1-beta from the Universe. + +KAFKA_CPUS=10 +KAFKA_MEM=20000 +KAFKA_CONFIG='scale-tests/configs/kafka-options.json' +KAFKA_USER='root' +# Note: empty package repo values will default to latest Universe packages. +KAFKA_PACKAGE_REPO= +# 2.10.0-5.5.1-3-beta from the Universe. 
+KAFKA_PACKAGE_NAME='beta-confluent-kafka' + +CASSANDRA_CPUS=10 +CASSANDRA_MEM=20000 +CASSANDRA_CONFIG='scale-tests/configs/cassandra-options.json' +CASSANDRA_USER='root' +# Note: empty package repo values will default to latest Universe packages. +CASSANDRA_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/cassandra/assets/2.10.0-3.11.6-beta/stub-universe-cassandra.json' +CASSANDRA_PACKAGE_NAME='beta-cassandra' +# 2.10.0-3.11.6-beta from the Universe. + +# DSEngine configuration ####################################################### + +DSENGINE_CPUS=10 +DSENGINE_MEM=20000 +DSENGINE_GPUS=40 +DSENGINE_CONFIG='scale-tests/configs/dsengine-options.json' +# Note: empty package repo values will default to latest Universe packages. +DSENGINE_PACKAGE_REPO= +DSENGINE_PACKAGE_NAME='beta-data-science-engine' +# 2.1.0-beta from the Universe. + +# Spark configuration ########################################################## + +SPARK_CONFIG='scale-tests/configs/spark-options.json' + +# Note: empty package repo values will default to latest Universe packages. +# Spark version 2.11.0-2.4.6 +SPARK_PACKAGE_REPO='https://universe-converter.mesosphere.com/transform?url=https://infinity-artifacts.s3.amazonaws.com/permanent/spark/2.11.0-2.4.6/stub-universe-spark.json' + +# Note: leaving the Spark executor Docker image empty so that executors inherit +# the image used for dispatchers. +SPARK_EXECUTOR_DOCKER_IMAGE= + +# Non-GPU Spark dispatchers configuration ###################################### + +# Not currently used. +BATCH_MAX_NON_GPU_JOBS=1000 + +SPARK_NON_GPU_DISPATCHERS=50 +SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE="non-gpu-dispatchers.out" +# Note: this name is built internally by the deploy-dispatchers.py script. +SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json" +# Note: driver resources used per dispatcher (1 dispatcher will be able to run +# 20 drivers since each driver requires 1 CPU). +SPARK_NON_GPU_QUOTA_DRIVERS_CPUS=20 +SPARK_NON_GPU_QUOTA_DRIVERS_MEM=50000 +# Note: executor resources used per job (1 driver will run 1 job). +SPARK_NON_GPU_QUOTA_EXECUTORS_CPUS=25 +SPARK_NON_GPU_QUOTA_EXECUTORS_MEM=40000 + +# GPU Spark dispatchers configuration ########################################## + +# Not currently used. +BATCH_MAX_GPU_JOBS=10 + +SPARK_GPU_DISPATCHERS=0 +SPARK_GPU_DISPATCHERS_OUTPUT_FILE="gpu-dispatchers.out" +SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE="${SPARK_GPU_DISPATCHERS_OUTPUT_FILE}-dispatchers.json" # NOTE: this name is built internally by the deploy-dispatchers.py script. +SPARK_GPU_QUOTA_DRIVERS_CPUS= +SPARK_GPU_QUOTA_DRIVERS_MEM= +SPARK_GPU_QUOTA_DRIVERS_GPUS= +SPARK_GPU_QUOTA_EXECUTORS_CPUS= +SPARK_GPU_QUOTA_EXECUTORS_MEM= +SPARK_GPU_QUOTA_EXECUTORS_GPUS= +SPARK_GPU_REMOVE_EXECUTORS_ROLES_QUOTAS=true +# Common streaming jobs configuration ########################################## + +TEST_ASSEMBLY_JAR_URL='http://infinity-artifacts.s3.amazonaws.com/scale-tests/dcos-spark-scala-tests-assembly-2.4.0-20190325.jar' +DISPATCHERS_JSON_OUTPUT_FILE="all-dispatchers.json" + +# Finite streaming jobs configuration ########################################## + +STREAMING_FINITE_SUBMISSIONS_OUTPUT_FILE="finite-submissions.out" +STREAMING_FINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 50 dispatchers -> 50 producers. +STREAMING_FINITE_CONSUMERS_PER_PRODUCER=1 # 50 producers -> 50 consumers. 
+# 50 producers + 50 consumers = 100 total finite streaming jobs +STREAMING_FINITE_PRODUCER_NUMBER_OF_WORDS=7692 +STREAMING_FINITE_PRODUCER_WORDS_PER_SECOND=1 +# 7692 words / 1 word per second -> ~2h runtime. +STREAMING_FINITE_PRODUCER_SPARK_CORES_MAX=2 +STREAMING_FINITE_PRODUCER_SPARK_EXECUTOR_CORES=2 +STREAMING_FINITE_CONSUMER_BATCH_SIZE_SECONDS=10 +STREAMING_FINITE_CONSUMER_SPARK_CORES_MAX=1 +STREAMING_FINITE_CONSUMER_SPARK_EXECUTOR_CORES=1 + +# Infinite streaming jobs configuration ######################################## + +STREAMING_INFINITE_SUBMISSIONS_OUTPUT_FILE="infinite-submissions.out" +STREAMING_INFINITE_PRODUCERS_PER_KAFKA="${SPARK_NON_GPU_DISPATCHERS}" # 1 Kafka and 50 dispatchers -> 50 producers. +STREAMING_INFINITE_CONSUMERS_PER_PRODUCER=1 # 50 producers -> 50 consumers. +# 50 producers + 50 consumers = 100 total infinite streaming jobs +STREAMING_INFINITE_PRODUCER_NUMBER_OF_WORDS=0 +STREAMING_INFINITE_PRODUCER_WORDS_PER_SECOND=1 +STREAMING_INFINITE_PRODUCER_SPARK_CORES_MAX=2 +STREAMING_INFINITE_PRODUCER_SPARK_EXECUTOR_CORES=2 +STREAMING_INFINITE_CONSUMER_BATCH_SIZE_SECONDS=10 +STREAMING_INFINITE_CONSUMER_SPARK_CORES_MAX=1 +STREAMING_INFINITE_CONSUMER_SPARK_EXECUTOR_CORES=1 + +# Batch jobs configuration ##################################################### + +SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" + +BATCH_APP_ID="/${SERVICE_NAMES_PREFIX}batch-workload" +BATCH_SCRIPT_CPUS=6 +BATCH_SCRIPT_MEM=12288 +BATCH_SUBMITS_PER_MIN=13 +# TODO: update to master for the next MWT. +BATCH_SPARK_BUILD_BRANCH="mwt-27" + +# Batch GPU jobs configuration ################################################# + +SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL="https://${TEST_S3_BUCKET}.s3.amazonaws.com/${TEST_S3_FOLDER}/${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" + +GPU_APP_ID="/${SERVICE_NAMES_PREFIX}gpu-batch-workload" +GPU_SCRIPT_CPUS=2 +GPU_SCRIPT_MEM=4096 +GPU_DOCKER_IMAGE='samvantran/spark-dcos-gpu:metrics' +GPU_SUBMITS_PER_MIN=5 +GPU_MAX_DISPATCHERS=${SPARK_GPU_DISPATCHERS} +GPU_SPARK_CORES_MAX=4 +GPU_SPARK_MESOS_EXECUTOR_GPUS=4 +GPU_SPARK_MESOS_MAX_GPUS=4 +GPU_SPARK_BUILD_BRANCH=master diff --git a/scale-tests/configs/cassandra-options.json b/scale-tests/configs/cassandra-options.json index e1038b2b..de7d1e78 100644 --- a/scale-tests/configs/cassandra-options.json +++ b/scale-tests/configs/cassandra-options.json @@ -1,9 +1,16 @@ { "service": { "virtual_network_enabled": true, - "virtual_network_name": "calico" + "virtual_network_name": "calico", + "user": "root" }, "nodes": { - "count": 5 + "count": 5, + "external_volume": { + "enabled": true, + "portworx_volume_options": "size=100", + "volume_name": "", + "driver_name": "pxd" + } } } diff --git a/scale-tests/configs/dsengine-options.json b/scale-tests/configs/dsengine-options.json new file mode 100644 index 00000000..2707bb01 --- /dev/null +++ b/scale-tests/configs/dsengine-options.json @@ -0,0 +1,37 @@ +{ + "service": { + "user_id": 0, + "group_id": 0, + "cmd": "/usr/local/bin/start.sh ${CONDA_DIR}/bin/jupyter lab --notebook-dir=\"${JUPYTER_NOTEBOOK_DIR}\" --allow-root", + "virtual_network_enabled": true, + "virtual_network_name": "calico", + "gpu": { + "enabled": true, + "gpus": 1 + }, + "jupyter_notebook_type": "TensorFlow-2.1.0" + }, + "storage": { + "local_persistence": { + "enabled": false + }, + "external_volume": { + "enabled": true, + "driver_name": "pxd", + "volume_driver_options": "size=100", + "volume_path": 
"jupyter_data" + } + }, + "spark": { + "spark_mesos_role": "data-services__data-science-engine-00-role", + "spark_mesos_principal": "data_services__data_science_engine_00_service_account", + "spark_mesos_secret": "data-services/data-science-engine-00-service-account-secret", + "spark_cores_max": 4, + "spark_driver_cores": 2, + "spark_executor_cores": 1, + "spark_mesos_gpus_max": 40, + "spark_mesos_executor_gpus": 7, + "spark_driver_memory": "2g", + "spark_executor_memory": "6g" + } +} \ No newline at end of file diff --git a/scale-tests/configs/kafka-options.json b/scale-tests/configs/kafka-options.json index 80fea6f4..5d302e84 100644 --- a/scale-tests/configs/kafka-options.json +++ b/scale-tests/configs/kafka-options.json @@ -1,10 +1,17 @@ { "service": { "virtual_network_enabled": true, - "virtual_network_name": "calico" + "virtual_network_name": "calico", + "user": "root" }, "brokers": { "count": 5, - "disk_type": "ROOT" + "disk_type": "ROOT", + "external_volume": { + "enabled": true, + "volume_name": "KafkaVolume", + "volume_driver_options": "size=100", + "volume_path": "kafka-broker-data" + } } } diff --git a/scale-tests/configs/spark-options.json b/scale-tests/configs/spark-options.json new file mode 100644 index 00000000..d4ac3d92 --- /dev/null +++ b/scale-tests/configs/spark-options.json @@ -0,0 +1,7 @@ +{ + "service": { + "role": "data-services", + "enforce_role": true, + "virtual_network_enabled": true + } +} diff --git a/scale-tests/deploy-dispatchers.py b/scale-tests/deploy-dispatchers.py index 33a4c990..537cec68 100755 --- a/scale-tests/deploy-dispatchers.py +++ b/scale-tests/deploy-dispatchers.py @@ -31,6 +31,7 @@ --quota-executors-gpus number of GPUs to use for executors quota [default: 0] --quota-executors-mem amount of memory (mb) to use per executors quota [default: 1524.0] --role Mesos role registered by dispatcher [default: *] + --group-role TODO: description [default: None] --ucr-containerizer launch using the Universal Container Runtime [default: True] --user user to run dispatcher service as [default: root] @@ -57,9 +58,10 @@ logging.basicConfig( - format='[%(asctime)s|%(name)s|%(levelname)s]: %(message)s', + format="[%(asctime)s|%(name)s|%(levelname)s]: %(message)s", level=logging.INFO, - stream=sys.stdout) + stream=sys.stdout, +) log = logging.getLogger(__name__) @@ -73,25 +75,21 @@ # file. -def create_quota( - role_name: str, - quota: typing.Dict -): +def create_quota(role_name: str, quota: typing.Dict): """ Create quota for the specified role. 
""" existing_quotas = sdk_cmd.get_json_output("spark quota list --json", print_output=False) # remove existing quotas matching name - if role_name in [x['role'] for x in existing_quotas.get('infos', [])]: + if role_name in [x["role"] for x in existing_quotas.get("infos", [])]: rc, _, _ = sdk_cmd.run_raw_cli("spark quota remove {}".format(role_name)) assert rc == 0, "Error removing quota" - cmd_list = ["spark", "quota", "create"] - for r in ["cpus", "mem", "gpus", ]: + for r in ["cpus", "mem", "gpus"]: if r in quota: - cmd_list.extend(["-{}".format(r[0]), quota[r],]) + cmd_list.extend(["-{}".format(r[0]), quota[r]]) cmd_list.append(role_name) @@ -117,10 +115,13 @@ def setup_role(service_name: str, role_base: str, quota: typing.Dict) -> str: return role_name -def setup_spark_security(service_name: str, - drivers_role: str, - executors_role: str, - service_account_info: typing.Dict): +def setup_spark_security( + service_name: str, + group_role: str, + drivers_role: str, + executors_role: str, + service_account_info: typing.Dict, +): """ In strict mode, additional permissions are required for Spark. @@ -134,48 +135,56 @@ def setup_spark_security(service_name: str, linux_user = service_account_info.get("linux_user", "nobody") service_account = service_account_info["name"] - for role_name in [drivers_role, executors_role]: + if group_role: sdk_security.grant_permissions( - linux_user=linux_user, - role_name=role_name, - service_account_name=service_account, + linux_user=linux_user, role_name=group_role, service_account_name=service_account ) + else: + for role_name in [drivers_role, executors_role]: + sdk_security.grant_permissions( + linux_user=linux_user, role_name=role_name, service_account_name=service_account + ) # TODO: Is this required? app_id = "/{}".format(service_name) - app_id = urllib.parse.quote( - urllib.parse.quote(app_id, safe=''), - safe='' + app_id = urllib.parse.quote(urllib.parse.quote(app_id, safe=""), safe="") + + sdk_security._grant( + service_account_info["name"], + "dcos:mesos:master:task:app_id:{}".format(app_id), + description="Spark drivers may execute Mesos tasks", + action="create", ) - sdk_security._grant(service_account_info["name"], - "dcos:mesos:master:task:app_id:{}".format(app_id), - description="Spark drivers may execute Mesos tasks", - action="create") if linux_user == "root": log.info("Marathon must be able to launch tasks as root") - sdk_security._grant("dcos_marathon", - "dcos:mesos:master:task:user:root", - description="Root Marathon may launch tasks as root", - action="create") + sdk_security._grant( + "dcos_marathon", + "dcos:mesos:master:task:user:root", + description="Root Marathon may launch tasks as root", + action="create", + ) return -def install_package(package_name: str, - service_prefix: str, - index: int, - linux_user: str, - service_task_count: int, - config_path: str, - additional_options: typing.Dict = None, - quota_options: typing.Dict = None) -> typing.Dict: +def install_package( + package_name: str, + service_prefix: str, + index: int, + linux_user: str, + service_task_count: int, + group_role: str, + config_path: str, + additional_options: typing.Dict = None, + quota_options: typing.Dict = None, +) -> typing.Dict: """ Deploy a single dispatcher with the specified index. 
""" if package_name.startswith("beta-"): - basename = package_name[len("beta-"):] + basename = package_name[len("beta-") :] else: basename = package_name @@ -183,15 +192,22 @@ def install_package(package_name: str, service_account_info = scale_tests_utils.setup_security(service_name, linux_user) - drivers_role = setup_role(service_name, "drivers", quota_options) - executors_role = setup_role(service_name, "executors", quota_options) + service_options = scale_tests_utils.get_service_options( + service_name, service_account_info, additional_options, config_path + ) - setup_spark_security(service_name, drivers_role, executors_role, service_account_info) + if group_role: + setup_spark_security(service_name, group_role, None, None, service_account_info) - service_options = scale_tests_utils.get_service_options(service_name, service_account_info, additional_options, config_path) + service_options["service"]["role"] = group_role + roles = {"drivers": None, "executors": None} + else: + drivers_role = setup_role(service_name, "drivers", quota_options) + executors_role = setup_role(service_name, "executors", quota_options) + setup_spark_security(service_name, drivers_role, executors_role, service_account_info) - # install dispatcher with appropriate role - service_options["service"]["role"] = drivers_role + service_options["service"]["role"] = drivers_role + roles = {"drivers": drivers_role, "executors": executors_role} expected_task_count = service_task_count(service_options) log.info("Expected task count: %s", expected_task_count) @@ -202,47 +218,62 @@ def install_package(package_name: str, package_name, service_name, expected_task_count, - timeout_seconds=60*60, + timeout_seconds=60 * 60, additional_options=service_options, wait_for_deployment=False, insert_strict_options=False, - install_cli=False) + install_cli=False, + ) - return {"package_name": package_name, - "roles": {"drivers": drivers_role, "executors": executors_role}, - "service_account_info": service_account_info, - **service_options} + return { + "package_name": package_name, + "roles": roles, + "service_account_info": service_account_info, + **service_options, + } def deploy_dispatchers( num_dispatchers: int, service_name_base: str, + group_role: str, output_file: str, linux_user: str, options: typing.Dict, - quota_options: typing.Dict + quota_options: typing.Dict, ) -> typing.Dict: """ Deploy the required number of dispatchers and store their information to a text file. 
""" + def deploy_dispatcher(index: int) -> dict: - return install_package('spark', - service_name_base, - index, - linux_user, - lambda x: 0, - None, - options, - quota_options) + return install_package( + "spark", + service_name_base, + index, + linux_user, + lambda x: 0, + group_role, + None, + options, + quota_options, + ) with ThreadPoolExecutor(max_workers=MAX_THREADPOOL_WORKERS) as executor: dispatchers = list(executor.map(deploy_dispatcher, range(num_dispatchers))) - with open(output_file, 'w') as outfile: + with open(output_file, "w") as outfile: for dispatcher in dispatchers: - outfile.write('{},{},{}\n'.format(dispatcher['service']['name'], - dispatcher['roles']['drivers'], - dispatcher['roles']['executors'])) + if group_role: + outfile.write("{},{}\n".format(dispatcher["service"]["name"], group_role)) + else: + outfile.write( + "{},{},{}\n".format( + dispatcher["service"]["name"], + dispatcher["roles"]["drivers"], + dispatcher["roles"]["executors"], + ) + ) outfile.flush() return dispatchers @@ -260,21 +291,19 @@ def get_default_options(arguments: dict) -> dict: "spark-history-server-url": arguments["--history-service"] or "", "UCR_containerizer": ast.literal_eval(arguments.get("--ucr-containerizer", True)), "virtual_network_enabled": True, - "virtual_network_name": "dcos" + "virtual_network_name": "dcos", }, "security": { "kerberos": { "enabled": ast.literal_eval(arguments.get("--enable-kerberos", False)), "kdc": { "hostname": arguments["--kdc-hostname"] or "", - "port": int(arguments["--kdc-port"]) + "port": int(arguments["--kdc-port"]), }, - "realm": arguments["--kerberos-realm"] or "" + "realm": arguments["--kerberos-realm"] or "", } }, - "hdfs": { - "config-url": arguments["--hdfs-config"] or "" - } + "hdfs": {"config-url": arguments["--hdfs-config"] or ""}, } return options @@ -283,12 +312,12 @@ def get_quota_options(arguments: typing.Dict) -> typing.Dict: """ Move the quota options from the command line arguments to a dict. 
""" - create_quotas = ast.literal_eval(arguments.get("--create-quotas", True)) + create_quotas = arguments.get("--create-quotas", True) if not create_quotas: return {} - resources = ["cpus", "mem", "gpus", ] - targets = ["drivers", "executors", ] + resources = ["cpus", "mem", "gpus"] + targets = ["drivers", "executors"] quota_options = {} for t in targets: @@ -302,19 +331,19 @@ def get_quota_options(arguments: typing.Dict) -> typing.Dict: def install(args): - options_file = args['--options-json'] + options_file = args["--options-json"] if options_file: if not os.path.isfile(options_file): # TODO: Replace with logging log.error("The specified file does not exist: %s", options_file) sys.exit(1) - options = json.load(open(options_file, 'r')) + options = json.load(open(options_file, "r")) else: options = get_default_options(args) - if args['--package-repo']: - sdk_repository.add_stub_universe_urls([args['--package-repo']]) + if args["--package-repo"]: + sdk_repository.add_stub_universe_urls([args["--package-repo"]]) rc, _, _ = sdk_cmd.run_raw_cli("package install spark --cli --yes") assert rc == 0, "Error installing spark CLI" @@ -324,12 +353,14 @@ def install(args): services = {} services["spark"] = deploy_dispatchers( - num_dispatchers=int(args['']), - service_name_base=args[''], - output_file=args[''], + num_dispatchers=int(args[""]), + service_name_base=args[""], + group_role=args["--group-role"], + output_file=args[""], linux_user=args["--user"], options=options, - quota_options=quota_options) + quota_options=quota_options, + ) output_filename = "{}-dispatchers.json".format(args[""]) with open(output_filename, "w") as fp: diff --git a/scale-tests/deploy_dsengine.sh b/scale-tests/deploy_dsengine.sh new file mode 100755 index 00000000..a34aebcc --- /dev/null +++ b/scale-tests/deploy_dsengine.sh @@ -0,0 +1,93 @@ +#!/usr/bin/env bash + +set -x + +create_service_account() { + SERVICE_ACCOUNT="${1}" + SECRET_NAME="${SERVICE_ACCOUNT}-secret" + + dcos security org service-accounts delete "${SERVICE_ACCOUNT}" + dcos security secrets delete "${SECRET_NAME}" + + dcos security org service-accounts keypair private.pem public.pem + dcos security org service-accounts create \ + -p public.pem \ + -d "Service account for ${SERVICE_ACCOUNT}" "${SERVICE_ACCOUNT}" + dcos security secrets create-sa-secret \ + --strict private.pem \ + "${SERVICE_ACCOUNT}" \ + "${SECRET_NAME}" + + rm -f private.pem public.pem +} + +grant_permissions() { + SERVICE_ACCOUNT="${1}" + + echo "Granting permissions to Service Account ${SERVICE_ACCOUNT}" + + dcos security org users grant "${SERVICE_ACCOUNT}" \ + "dcos:mesos:master:task:user:nobody" \ + create + dcos security org users grant "${SERVICE_ACCOUNT}" \ + "dcos:mesos:agent:task:user:nobody" \ + create + dcos security org users grant "${SERVICE_ACCOUNT}" \ + "dcos:mesos:master:task:user:root" \ + create + dcos security org users grant "${SERVICE_ACCOUNT}" \ + "dcos:mesos:agent:task:user:root" \ + create + dcos security org users grant "${SERVICE_ACCOUNT}" \ + "dcos:secrets:list:default:__dcos_base64__hdfs_jupyter_keytab" \ + read +} + +grant_spark_permissions() { + SERVICE_ACCOUNT=$1 + echo "Granting Spark permissions to Jupyter Service Account ${SERVICE_ACCOUNT}" + + dcos security org users grant "${SERVICE_ACCOUNT}" \ + "dcos:mesos:master:framework:role:${SERVICE_ACCOUNT}" \ + create + dcos security org users grant "${SERVICE_ACCOUNT}" \ + "dcos:mesos:master:reservation:role:${SERVICE_ACCOUNT}" \ + create + dcos security org users grant "${SERVICE_ACCOUNT}" \ + 
"dcos:mesos:master:reservation:principal:${SERVICE_ACCOUNT}" \ + delete + dcos security org users grant "${SERVICE_ACCOUNT}" \ + "dcos:mesos:master:volume:role:${SERVICE_ACCOUNT}" \ + create + dcos security org users grant "${SERVICE_ACCOUNT}" \ + "dcos:mesos:master:volume:principal:${SERVICE_ACCOUNT}" \ + delete + + dcos security org users grant "${SERVICE_ACCOUNT}" \ + "dcos:mesos:master:task:role:${SERVICE_ACCOUNT}" \ + create + dcos security org users grant "${SERVICE_ACCOUNT}" \ + "dcos:mesos:master:task:principal:${SERVICE_ACCOUNT}" \ + create + dcos security org users grant "${SERVICE_ACCOUNT}" \ + "dcos:mesos:master:task:app_id:data-services/jupyter" \ + create +} +create_service_account data_services__jupyter + +grant_permissions data_services__jupyter +grant_spark_permissions data_services__jupyter + +dcos package install --yes data-science-engine \ + --options=scale-tests/configs/dsengine-options.json + +# Run the following in the Jupyter notebook UI (password: jupyter): +# +# ! spark-submit \ + # --conf spark.mesos.gpus.max=40 \ + # --conf spark.cores.max=40 \ + # --conf spark.mesos.executor.gpus=1 \ + # --conf spark.executor.cores=1 \ + # --verbose \ + # --class MockTaskRunner \ + # https://infinity-artifacts.s3.amazonaws.com/scale-tests/dcos-spark-scala-tests-assembly-2.4.0-20190325.jar 5000 10 diff --git a/scale-tests/kafka_cassandra_streaming_test.py b/scale-tests/kafka_cassandra_streaming_test.py index 232a076c..f0fcd00d 100755 --- a/scale-tests/kafka_cassandra_streaming_test.py +++ b/scale-tests/kafka_cassandra_streaming_test.py @@ -15,6 +15,7 @@ Options: --spark-executor-docker-image Docker image for Spark executors [default: ] --jar hosted JAR URL + --group-role TODO: description [default: None] --num-producers-per-kafka number of producers per Kafka cluster to create [default: 1] --num-consumers-per-producer number of consumers for producer to create [default: 1] --producer-number-of-words number of total words published by producers [default: 1] @@ -45,16 +46,21 @@ logging.basicConfig(level=logging.INFO, format="%(message)s") -DEFAULT_JAR = 'http://infinity-artifacts.s3.amazonaws.com/scale-tests/dcos-spark-scala-tests-assembly-20180523-fa29ab5.jar' -PRODUCER_CLASS_NAME = 'KafkaRandomFeeder' -CONSUMER_CLASS_NAME = 'KafkaWordCount' -SPARK_PACKAGE_NAME = 'spark' +DEFAULT_JAR = "http://infinity-artifacts.s3.amazonaws.com/scale-tests/dcos-spark-scala-tests-assembly-20180523-fa29ab5.jar" +PRODUCER_CLASS_NAME = "KafkaRandomFeeder" +CONSUMER_CLASS_NAME = "KafkaWordCount" +SPARK_PACKAGE_NAME = "spark" COMMON_CONF = [ - "--conf", "spark.mesos.containerizer=mesos", - "--conf", "spark.mesos.driver.failoverTimeout=30", - "--conf", "spark.port.maxRetries=32", - "--conf", "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s", - "--conf", "spark.scheduler.minRegisteredResourcesRatio=1.0" + "--conf", + "spark.mesos.containerizer=mesos", + "--conf", + "spark.mesos.driver.failoverTimeout=30", + "--conf", + "spark.port.maxRetries=32", + "--conf", + "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s", + "--conf", + "spark.scheduler.minRegisteredResourcesRatio=1.0", ] @@ -66,38 +72,53 @@ def _install_package_cli(package_name): def _service_endpoint_dns(package_name, service_name, endpoint_name): cmd = "{package_name} --name={service_name} endpoints {endpoint_name}".format( - package_name=package_name, - service_name=service_name, - endpoint_name=endpoint_name) + package_name=package_name, service_name=service_name, endpoint_name=endpoint_name + ) rt, stdout, _ = 
sdk_cmd.run_raw_cli(cmd) assert rt == 0, "Failed to get {endpoint_name} endpoints" return json.loads(stdout)["dns"] -def _submit_producer(name, - spark_executor_docker_image, - jar, - kafka_broker_dns, - dispatcher, - kafka_topics, - number_of_words, - words_per_second, - spark_cores_max, - spark_executor_cores, - must_fail: bool): - app_args = ["--appName", name, - "--brokers", ",".join(kafka_broker_dns), - "--topics", kafka_topics, - "--numberOfWords", str(number_of_words), - "--wordsPerSecond", str(words_per_second)] +def _submit_producer( + name, + spark_executor_docker_image, + group_role, + jar, + kafka_broker_dns, + dispatcher, + kafka_topics, + number_of_words, + words_per_second, + spark_cores_max, + spark_executor_cores, + must_fail: bool, +): + app_args = [ + "--appName", + name, + "--brokers", + ",".join(kafka_broker_dns), + "--topics", + kafka_topics, + "--numberOfWords", + str(number_of_words), + "--wordsPerSecond", + str(words_per_second), + ] if must_fail: - app_args.extend(["--mustFailDueToInvalidArgument", ]) + app_args.extend(["--mustFailDueToInvalidArgument"]) - app_config = ["--conf", "spark.cores.max={}".format(spark_cores_max), - "--conf", "spark.executor.cores={}".format(spark_executor_cores), - "--name", name, - "--class", PRODUCER_CLASS_NAME] + app_config = [ + "--conf", + "spark.cores.max={}".format(spark_cores_max), + "--conf", + "spark.executor.cores={}".format(spark_executor_cores), + "--name", + name, + "--class", + PRODUCER_CLASS_NAME, + ] # `number_of_words == 0` means infinite stream, so we'd like to have it # restarted in the case of failures. @@ -105,47 +126,62 @@ def _submit_producer(name, app_config.extend(["--supervise"]) if spark_executor_docker_image: - app_config.extend([ - "--conf", "spark.mesos.executor.docker.image={}".format(spark_executor_docker_image) - ]) + app_config.extend( + ["--conf", "spark.mesos.executor.docker.image={}".format(spark_executor_docker_image)] + ) args = app_config + COMMON_CONF + driver_role = None if group_role else dispatcher["roles"]["executors"] + submission_id = spark_utils.submit_job( app_url=jar, app_args=" ".join(str(a) for a in app_args), args=args, verbose=False, - service_name=dispatcher['service']['name'], - driver_role=dispatcher['roles']['executors'], - spark_user=dispatcher['service']['user'] if sdk_utils.is_strict_mode() else None, - principal=dispatcher['service']['service_account'] if sdk_utils.is_strict_mode() else None) + service_name=dispatcher["service"]["name"], + driver_role=driver_role, + spark_user=dispatcher["service"]["user"] if sdk_utils.is_strict_mode() else None, + principal=dispatcher["service"]["service_account"] if sdk_utils.is_strict_mode() else None, + ) return submission_id -def _submit_consumer(name, - spark_executor_docker_image, - jar, - kafka_broker_dns, - cassandra_native_client_dns, - dispatcher, - kafka_topics, - kafka_group_id, - write_to_cassandra, - batch_size_seconds, - cassandra_keyspace, - cassandra_table, - spark_cores_max, - spark_executor_cores, - must_fail: bool): - app_args = ["--appName", name, - "--brokers", ",".join(kafka_broker_dns), - "--topics", kafka_topics, - "--groupId", kafka_group_id, - "--batchSizeSeconds", str(batch_size_seconds), - "--cassandraKeyspace", cassandra_keyspace, - "--cassandraTable", cassandra_table] +def _submit_consumer( + name, + spark_executor_docker_image, + group_role, + jar, + kafka_broker_dns, + cassandra_native_client_dns, + dispatcher, + kafka_topics, + kafka_group_id, + write_to_cassandra, + batch_size_seconds, + 
cassandra_keyspace, + cassandra_table, + spark_cores_max, + spark_executor_cores, + must_fail: bool, +): + app_args = [ + "--appName", + name, + "--brokers", + ",".join(kafka_broker_dns), + "--topics", + kafka_topics, + "--groupId", + kafka_group_id, + "--batchSizeSeconds", + str(batch_size_seconds), + "--cassandraKeyspace", + cassandra_keyspace, + "--cassandraTable", + cassandra_table, + ] if must_fail: app_args.extend(["--mustFailDueToInvalidArgument"]) @@ -153,40 +189,51 @@ def _submit_consumer(name, if not write_to_cassandra: app_args.extend(["--shouldNotWriteToCassandra"]) - cassandra_hosts = map(lambda x: x.split(':')[0], cassandra_native_client_dns) - cassandra_port = cassandra_native_client_dns[0].split(':')[1] - - app_config = ["--supervise", - "--conf", "spark.cores.max={}".format(spark_cores_max), - "--conf", "spark.executor.cores={}".format(spark_executor_cores), - "--conf", "spark.cassandra.connection.host={}".format(",".join(cassandra_hosts)), - "--conf", "spark.cassandra.connection.port={}".format(cassandra_port), - "--name", name, - "--class", CONSUMER_CLASS_NAME] + cassandra_hosts = map(lambda x: x.split(":")[0], cassandra_native_client_dns) + cassandra_port = cassandra_native_client_dns[0].split(":")[1] + + app_config = [ + "--supervise", + "--conf", + "spark.cores.max={}".format(spark_cores_max), + "--conf", + "spark.executor.cores={}".format(spark_executor_cores), + "--conf", + "spark.cassandra.connection.host={}".format(",".join(cassandra_hosts)), + "--conf", + "spark.cassandra.connection.port={}".format(cassandra_port), + "--name", + name, + "--class", + CONSUMER_CLASS_NAME, + ] if spark_executor_docker_image: - app_config.extend([ - "--conf", "spark.mesos.executor.docker.image={}".format(spark_executor_docker_image) - ]) + app_config.extend( + ["--conf", "spark.mesos.executor.docker.image={}".format(spark_executor_docker_image)] + ) args = app_config + COMMON_CONF + driver_role = None if group_role else dispatcher["roles"]["executors"] + submission_id = spark_utils.submit_job( app_url=jar, app_args=" ".join(str(a) for a in app_args), args=args, verbose=False, - service_name=dispatcher['service']['name'], - driver_role=dispatcher['roles']['executors'], - spark_user=dispatcher['service']['user'] if sdk_utils.is_strict_mode() else None, - principal=dispatcher['service']['service_account'] if sdk_utils.is_strict_mode() else None) + service_name=dispatcher["service"]["name"], + driver_role=driver_role, + spark_user=dispatcher["service"]["user"] if sdk_utils.is_strict_mode() else None, + principal=dispatcher["service"]["service_account"] if sdk_utils.is_strict_mode() else None, + ) return submission_id def append_submission(output_file: str, dispatcher: dict, submission_id: str): with open(output_file, "a") as f: - f.write("{},{}\n".format(dispatcher['service']['name'], submission_id)) + f.write("{},{}\n".format(dispatcher["service"]["name"], submission_id)) def is_valid_cassandra_keyspace_name(keyspace_name: str) -> bool: @@ -201,15 +248,12 @@ def __init__(self, dispatchers, num_jobs): self.prepare() - def prepare(self): raise NotImplementedError - def provide(self): raise NotImplementedError - def report(self): raise NotImplementedError @@ -226,74 +270,78 @@ def prepare(self): self.avg_num_jobs_per_dispatcher = self.num_jobs / self.num_dispatchers self.max_num_jobs_per_dispatcher = math.ceil(self.avg_num_jobs_per_dispatcher) - self.slots = mapcat(make_repeater(self.max_num_jobs_per_dispatcher), - self.dispatchers) - + self.slots = 
mapcat(make_repeater(self.max_num_jobs_per_dispatcher), self.dispatchers) def provide(self): return next(self.slots) - def report(self): - log.info('Providing strategy: block') - log.info('Average number of jobs per dispatcher: %s', self.avg_num_jobs_per_dispatcher) - log.info('Will run at most %s jobs per dispatcher', self.max_num_jobs_per_dispatcher) - log.info("\n%s dispatchers: \n%s\n", - self.num_dispatchers, json.dumps(self.dispatchers, indent=2, sort_keys=True)) + log.info("Providing strategy: block") + log.info("Average number of jobs per dispatcher: %s", self.avg_num_jobs_per_dispatcher) + log.info("Will run at most %s jobs per dispatcher", self.max_num_jobs_per_dispatcher) + log.info( + "\n%s dispatchers: \n%s\n", + self.num_dispatchers, + json.dumps(self.dispatchers, indent=2, sort_keys=True), + ) class DispatcherProvider(object): """Provides dispatchers for jobs in a given strategy. """ + def __init__(self, dispatchers, num_jobs, strategy=BlockProvidingStrategy): self.strategy = strategy(dispatchers, num_jobs) - def provide(self): return self.strategy.provide() - def report(self): return self.strategy.report() def main(args): with open(args[""]) as f: - dispatchers = json.load(f)['spark'] + dispatchers = json.load(f)["spark"] with open(args[""]) as f: infrastructure = json.loads(f.read()) - kafkas = infrastructure['kafka'] + kafkas = infrastructure["kafka"] # Assuming only 1 Cassandra cluster. - cassandra = infrastructure['cassandra'][0] - - spark_executor_docker_image = args['--spark-executor-docker-image'] - jar = args["--jar"] if args["--jar"] else DEFAULT_JAR - submissions_output_file = args[""] - kafka_package_names = map(lambda kafka: kafka['package_name'], kafkas) - cassandra_package_name = cassandra['package_name'] - cassandra_service_name = cassandra['service']['name'] - num_producers_per_kafka = int(args['--num-producers-per-kafka']) - num_consumers_per_producer = int(args['--num-consumers-per-producer']) - producer_must_fail = args['--producer-must-fail'] - producer_number_of_words = int(args['--producer-number-of-words']) - producer_words_per_second = int(args['--producer-words-per-second']) - producer_spark_cores_max = int(args['--producer-spark-cores-max']) - producer_spark_executor_cores = int(args['--producer-spark-executor-cores']) - consumer_must_fail = args['--consumer-must-fail'] - consumer_write_to_cassandra = args['--consumer-write-to-cassandra'] - consumer_batch_size_seconds = int(args['--consumer-batch-size-seconds']) - consumer_spark_cores_max = int(args['--consumer-spark-cores-max']) - consumer_spark_executor_cores = int(args['--consumer-spark-executor-cores']) + cassandra = infrastructure["cassandra"][0] + + spark_executor_docker_image = args["--spark-executor-docker-image"] + jar = args["--jar"] if args["--jar"] else DEFAULT_JAR + group_role = args["--group-role"] + submissions_output_file = args[""] + kafka_package_names = map(lambda kafka: kafka["package_name"], kafkas) + cassandra_package_name = cassandra["package_name"] + cassandra_service_name = cassandra["service"]["name"] + num_producers_per_kafka = int(args["--num-producers-per-kafka"]) + num_consumers_per_producer = int(args["--num-consumers-per-producer"]) + producer_must_fail = args["--producer-must-fail"] + producer_number_of_words = int(args["--producer-number-of-words"]) + producer_words_per_second = int(args["--producer-words-per-second"]) + producer_spark_cores_max = int(args["--producer-spark-cores-max"]) + producer_spark_executor_cores = int(args["--producer-spark-executor-cores"]) 
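Note: the BlockProvidingStrategy refactored above hands dispatchers out in contiguous blocks — each dispatcher is repeated ceil(num_jobs / num_dispatchers) times and provide() simply pulls the next slot. A minimal standalone sketch of that allocation, using itertools in place of the repo's mapcat/make_repeater helpers (the dispatcher names and counts below are illustrative only):

    import itertools
    import math

    def block_slots(dispatchers, num_jobs):
        # Mirrors BlockProvidingStrategy: at most ceil(num_jobs / len(dispatchers))
        # jobs are assigned to each dispatcher, in order.
        max_jobs_per_dispatcher = math.ceil(num_jobs / len(dispatchers))
        return itertools.chain.from_iterable(
            itertools.repeat(d, max_jobs_per_dispatcher) for d in dispatchers
        )

    # 3 dispatchers, 7 jobs -> spark-00 x3, spark-01 x3, spark-02 x1.
    slots = block_slots(["spark-00", "spark-01", "spark-02"], num_jobs=7)
    assignments = [next(slots) for _ in range(7)]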
+ consumer_must_fail = args["--consumer-must-fail"] + consumer_write_to_cassandra = args["--consumer-write-to-cassandra"] + consumer_batch_size_seconds = int(args["--consumer-batch-size-seconds"]) + consumer_spark_cores_max = int(args["--consumer-spark-cores-max"]) + consumer_spark_executor_cores = int(args["--consumer-spark-executor-cores"]) num_kafkas = len(kafkas) num_producers = num_kafkas * num_producers_per_kafka num_consumers = num_producers * num_consumers_per_producer num_jobs = num_producers + num_consumers - log.info('Number of Kafka clusters: %s', num_kafkas) - log.info('Total number of jobs: %s (%s producers, %s consumers)', - num_jobs, num_producers, num_consumers) + log.info("Number of Kafka clusters: %s", num_kafkas) + log.info( + "Total number of jobs: %s (%s producers, %s consumers)", + num_jobs, + num_producers, + num_consumers, + ) dispatcher_provider = DispatcherProvider(dispatchers, num_jobs) dispatcher_provider.report() @@ -303,28 +351,34 @@ def main(args): _install_package_cli(cassandra_package_name) _install_package_cli(SPARK_PACKAGE_NAME) - cassandra_native_client_dns = _service_endpoint_dns(cassandra_package_name, cassandra_service_name, "native-client") + cassandra_native_client_dns = _service_endpoint_dns( + cassandra_package_name, cassandra_service_name, "native-client" + ) for kafka_idx, kafka in enumerate(kafkas): - kafka_package_name = kafka['package_name'] - kafka_service_name = kafka['service']['name'] - kafka_broker_dns = _service_endpoint_dns(kafka_package_name, kafka_service_name, 'broker') + kafka_package_name = kafka["package_name"] + kafka_service_name = kafka["service"]["name"] + kafka_broker_dns = _service_endpoint_dns(kafka_package_name, kafka_service_name, "broker") - kafka_service_basename = kafka_service_name.split('/')[-1] + kafka_service_basename = kafka_service_name.split("/")[-1] for producer_idx in range(0, num_producers_per_kafka): dispatcher = dispatcher_provider.provide() - producer_name = '{}-{}'.format(normalize_string(kafka_service_basename), producer_idx) + producer_name = "{}-{}".format(normalize_string(kafka_service_basename), producer_idx) kafka_topics = producer_name - producer_cassandra_keyspace = 'keyspace_{}'.format(normalize_string(producer_name)) + producer_cassandra_keyspace = "keyspace_{}".format(normalize_string(producer_name)) if not is_valid_cassandra_keyspace_name(producer_cassandra_keyspace): - raise ValueError('\'{}\' is not a valid Cassandra keyspace name'.format( - producer_cassandra_keyspace)) + raise ValueError( + "'{}' is not a valid Cassandra keyspace name".format( + producer_cassandra_keyspace + ) + ) producer_submission_id = _submit_producer( - '{}-k{:02d}-p{:02d}'.format(PRODUCER_CLASS_NAME, kafka_idx, producer_idx), + "{}-k{:02d}-p{:02d}".format(PRODUCER_CLASS_NAME, kafka_idx, producer_idx), spark_executor_docker_image, + group_role, jar, kafka_broker_dns, dispatcher, @@ -333,23 +387,24 @@ def main(args): producer_words_per_second, producer_spark_cores_max, producer_spark_executor_cores, - producer_must_fail) + producer_must_fail, + ) - append_submission( - submissions_output_file, - dispatcher, - producer_submission_id) + append_submission(submissions_output_file, dispatcher, producer_submission_id) for consumer_idx in range(0, num_consumers_per_producer): dispatcher = dispatcher_provider.provide() - consumer_name = '{}-{}'.format(producer_name, consumer_idx) + consumer_name = "{}-{}".format(producer_name, consumer_idx) consumer_kafka_group_id = consumer_name - consumer_cassandra_table = 
'table_{}'.format(consumer_idx) + consumer_cassandra_table = "table_{}".format(consumer_idx) consumer_submission_id = _submit_consumer( - '{}-k{:02d}-p{:02d}-c{:02d}'.format(CONSUMER_CLASS_NAME, kafka_idx, producer_idx, consumer_idx), + "{}-k{:02d}-p{:02d}-c{:02d}".format( + CONSUMER_CLASS_NAME, kafka_idx, producer_idx, consumer_idx + ), spark_executor_docker_image, + group_role, jar, kafka_broker_dns, cassandra_native_client_dns, @@ -362,12 +417,10 @@ def main(args): consumer_cassandra_table, consumer_spark_cores_max, consumer_spark_executor_cores, - consumer_must_fail) + consumer_must_fail, + ) - append_submission( - submissions_output_file, - dispatcher, - consumer_submission_id) + append_submission(submissions_output_file, dispatcher, consumer_submission_id) if __name__ == "__main__": diff --git a/scale-tests/list_service_tasks.sh b/scale-tests/list_service_tasks.sh new file mode 100755 index 00000000..9d5bcdfc --- /dev/null +++ b/scale-tests/list_service_tasks.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -euo pipefail + +readonly SERVICE_NAME="${1}" + +case ${SERVICE_NAME} in + kafka) metric="kafka_app_info_version" ;; + cassandra) metric="org_apache_cassandra_metrics_Storage_Load" ;; + *) echo "Only 'kafka' and 'cassandra' are valid arguments. '${SERVICE_NAME}' given"; exit 1 ;; +esac + +readonly CLUSTER_URL="$(dcos config show core.dcos_url)" + +curl --silent "${CLUSTER_URL}/service/monitoring/prometheus/api/v1/query?query=${metric}" \ + -H "Authorization: token=$(dcos config show core.dcos_acs_token)" \ + | jq -r '.data.result[].metric.task_name?' diff --git a/scale-tests/run.sh b/scale-tests/run.sh index 1eec3b2a..e6adf38e 100755 --- a/scale-tests/run.sh +++ b/scale-tests/run.sh @@ -25,6 +25,10 @@ function usage () { echo ' non-interactive \\' } +################################################################################ +# Parse and validate command line and parameter file parameters ################ +################################################################################ + if [ "${#}" -lt 7 ]; then echo -e "run.sh needs at least 7 arguments but was given ${#}\\n" usage @@ -66,19 +70,21 @@ function is_interactive () { } readonly AWS_ACCOUNT='Team 10' -readonly CONTAINER_NAME="${TEST_NAME}" +readonly CONTAINER_NAME="${TEST_NAME}-$(basename "${TEST_CONFIG}" .env)" readonly CONTAINER_SSH_AGENT_EXPORTS=/tmp/ssh-agent-exports readonly CONTAINER_SSH_KEY=/ssh/key readonly CONTAINER_FINISHED_SETTING_UP_FILE=/tmp/finished-setting-up -readonly IMAGE_NAME="mesosphere/dcos-commons:${TEST_NAME}" +readonly IMAGE_NAME="mesosphere/dcos-commons:${CONTAINER_NAME}" readonly SCALE_TESTS_DIRECTORY="scale-tests" -readonly TEST_DIRECTORY="${SCALE_TESTS_DIRECTORY}/runs/${TEST_NAME}" +readonly TEST_DIRECTORY="${SCALE_TESTS_DIRECTORY}/runs/${CONTAINER_NAME}" readonly TEST_REPOSITORY_DIRECTORY="${SCALE_TESTS_DIRECTORY}/checkouts/${TEST_NAME}" readonly TEST_S3_DIRECTORY_URL="s3://${TEST_S3_BUCKET}/${TEST_S3_FOLDER}/" readonly LOGS_DIRECTORY="${TEST_DIRECTORY}/script_logs" readonly LOG_FILE="${LOGS_DIRECTORY}/$(date +%Y%m%dT%H%M%SZ)_$(whoami).log" readonly DCOS_CLI_REFRESH_INTERVAL_SEC=600 # 10 minutes. 
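Note: the new list_service_tasks.sh helper above resolves running Kafka/Cassandra task names by querying the cluster's Prometheus endpoint for a per-service marker metric and extracting the task_name labels. A rough Python equivalent of that query is sketched below, assuming the caller supplies the cluster URL and ACS token (e.g. from `dcos config show`):

    import requests

    METRIC_BY_SERVICE = {
        "kafka": "kafka_app_info_version",
        "cassandra": "org_apache_cassandra_metrics_Storage_Load",
    }

    def list_service_tasks(cluster_url: str, acs_token: str, service_name: str):
        # Same query and field the shell script extracts with jq
        # (.data.result[].metric.task_name?).
        metric = METRIC_BY_SERVICE[service_name]
        response = requests.get(
            "{}/service/monitoring/prometheus/api/v1/query".format(cluster_url.rstrip("/")),
            params={"query": metric},
            headers={"Authorization": "token={}".format(acs_token)},
        )
        response.raise_for_status()
        results = response.json().get("data", {}).get("result", [])
        return [r["metric"].get("task_name") for r in results]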
+readonly GROUP_FILE_NAME="${TEST_REPOSITORY_DIRECTORY}/marathon_group.json" +# shellcheck source=/dev/null source "${TEST_CONFIG}" mkdir -p "${TEST_DIRECTORY}" @@ -92,7 +98,6 @@ fi for boolean_option in SHOULD_INSTALL_INFRASTRUCTURE \ SHOULD_INSTALL_NON_GPU_DISPATCHERS \ SHOULD_INSTALL_GPU_DISPATCHERS \ - SHOULD_RUN_FAILING_STREAMING_JOBS \ SHOULD_RUN_FINITE_STREAMING_JOBS \ SHOULD_RUN_INFINITE_STREAMING_JOBS \ SHOULD_RUN_BATCH_JOBS \ @@ -119,63 +124,87 @@ function container_exec () { declare -x AWS_PROFILE eval "$(maws li "${AWS_ACCOUNT}")" -readonly FINITE_NUM_PRODUCERS=$((KAFKA_CLUSTER_COUNT * FINITE_NUM_PRODUCERS_PER_KAFKA)) -readonly FINITE_NUM_CONSUMERS=$((FINITE_NUM_PRODUCERS * FINITE_NUM_CONSUMERS_PER_PRODUCER)) -readonly FINITE_NUM_JOBS=$((FINITE_NUM_PRODUCERS + FINITE_NUM_CONSUMERS)) -readonly INFINITE_NUM_PRODUCERS=$((KAFKA_CLUSTER_COUNT * INFINITE_NUM_PRODUCERS_PER_KAFKA)) -readonly INFINITE_NUM_CONSUMERS=$((INFINITE_NUM_PRODUCERS * INFINITE_NUM_CONSUMERS_PER_PRODUCER)) -readonly INFINITE_NUM_JOBS=$((INFINITE_NUM_PRODUCERS + INFINITE_NUM_CONSUMERS)) -readonly STREAMING_NUM_JOBS=$((FINITE_NUM_JOBS + INFINITE_NUM_JOBS)) - -readonly NON_GPU_TOTAL_QUOTA_DRIVERS_CPUS=$((NON_GPU_NUM_DISPATCHERS * NON_GPU_QUOTA_DRIVERS_CPUS)) -readonly NON_GPU_TOTAL_QUOTA_DRIVERS_MEM=$((NON_GPU_NUM_DISPATCHERS * NON_GPU_QUOTA_DRIVERS_MEM)) -readonly NON_GPU_TOTAL_QUOTA_EXECUTORS_CPUS=$((NON_GPU_NUM_DISPATCHERS * NON_GPU_QUOTA_EXECUTORS_CPUS)) -readonly NON_GPU_TOTAL_QUOTA_EXECUTORS_MEM=$((NON_GPU_NUM_DISPATCHERS * NON_GPU_QUOTA_EXECUTORS_MEM)) - -readonly GPU_TOTAL_QUOTA_DRIVERS_CPUS=$((GPU_NUM_DISPATCHERS * GPU_QUOTA_DRIVERS_CPUS)) -readonly GPU_TOTAL_QUOTA_DRIVERS_MEM=$((GPU_NUM_DISPATCHERS * GPU_QUOTA_DRIVERS_MEM)) -readonly GPU_TOTAL_QUOTA_DRIVERS_GPUS=$((GPU_NUM_DISPATCHERS * GPU_QUOTA_DRIVERS_GPUS)) -readonly GPU_TOTAL_QUOTA_EXECUTORS_CPUS=$((GPU_NUM_DISPATCHERS * GPU_QUOTA_EXECUTORS_CPUS)) -readonly GPU_TOTAL_QUOTA_EXECUTORS_MEM=$((GPU_NUM_DISPATCHERS * GPU_QUOTA_EXECUTORS_MEM)) -readonly GPU_TOTAL_QUOTA_EXECUTORS_GPUS=$((GPU_NUM_DISPATCHERS * GPU_QUOTA_EXECUTORS_GPUS)) - -readonly NON_GPU_QUOTA_CPUS=$((NON_GPU_TOTAL_QUOTA_DRIVERS_CPUS + NON_GPU_TOTAL_QUOTA_EXECUTORS_CPUS)) -readonly NON_GPU_QUOTA_MEM=$((NON_GPU_TOTAL_QUOTA_DRIVERS_MEM + NON_GPU_TOTAL_QUOTA_EXECUTORS_MEM)) -readonly GPU_QUOTA_CPUS=$((GPU_TOTAL_QUOTA_DRIVERS_CPUS + GPU_TOTAL_QUOTA_EXECUTORS_CPUS)) -readonly GPU_QUOTA_MEM=$((GPU_TOTAL_QUOTA_DRIVERS_MEM + GPU_TOTAL_QUOTA_EXECUTORS_MEM)) +################################################################################ +# Calculate a few things and present a pre-test report ######################### +################################################################################ + +readonly SPARK_TOTAL_DISPATCHERS=$((SPARK_NON_GPU_DISPATCHERS + SPARK_GPU_DISPATCHERS)) +readonly STREAMING_FINITE_PRODUCERS=$((KAFKA_CLUSTER_COUNT * STREAMING_FINITE_PRODUCERS_PER_KAFKA)) +readonly STREAMING_FINITE_CONSUMERS=$((STREAMING_FINITE_PRODUCERS * STREAMING_FINITE_CONSUMERS_PER_PRODUCER)) +readonly STREAMING_FINITE_JOBS=$((STREAMING_FINITE_PRODUCERS + STREAMING_FINITE_CONSUMERS)) +readonly STREAMING_INFINITE_PRODUCERS=$((KAFKA_CLUSTER_COUNT * STREAMING_INFINITE_PRODUCERS_PER_KAFKA)) +readonly STREAMING_INFINITE_CONSUMERS=$((STREAMING_INFINITE_PRODUCERS * STREAMING_INFINITE_CONSUMERS_PER_PRODUCER)) +readonly STREAMING_INFINITE_JOBS=$((STREAMING_INFINITE_PRODUCERS + STREAMING_INFINITE_CONSUMERS)) +readonly STREAMING_JOBS=$((STREAMING_FINITE_JOBS + STREAMING_INFINITE_JOBS)) + +readonly 
SPARK_NON_GPU_TOTAL_QUOTA_DRIVERS_CPUS=$((SPARK_NON_GPU_DISPATCHERS * SPARK_NON_GPU_QUOTA_DRIVERS_CPUS)) +readonly SPARK_NON_GPU_TOTAL_QUOTA_DRIVERS_MEM=$((SPARK_NON_GPU_DISPATCHERS * SPARK_NON_GPU_QUOTA_DRIVERS_MEM)) +readonly SPARK_NON_GPU_TOTAL_QUOTA_EXECUTORS_CPUS=$((SPARK_NON_GPU_DISPATCHERS * SPARK_NON_GPU_QUOTA_EXECUTORS_CPUS)) +readonly SPARK_NON_GPU_TOTAL_QUOTA_EXECUTORS_MEM=$((SPARK_NON_GPU_DISPATCHERS * SPARK_NON_GPU_QUOTA_EXECUTORS_MEM)) + +readonly SPARK_GPU_TOTAL_QUOTA_DRIVERS_CPUS=$((SPARK_GPU_DISPATCHERS * SPARK_GPU_QUOTA_DRIVERS_CPUS)) +readonly SPARK_GPU_TOTAL_QUOTA_DRIVERS_MEM=$((SPARK_GPU_DISPATCHERS * SPARK_GPU_QUOTA_DRIVERS_MEM)) +readonly SPARK_GPU_TOTAL_QUOTA_DRIVERS_GPUS=$((SPARK_GPU_DISPATCHERS * SPARK_GPU_QUOTA_DRIVERS_GPUS)) +readonly SPARK_GPU_TOTAL_QUOTA_EXECUTORS_CPUS=$((SPARK_GPU_DISPATCHERS * SPARK_GPU_QUOTA_EXECUTORS_CPUS)) +readonly SPARK_GPU_TOTAL_QUOTA_EXECUTORS_MEM=$((SPARK_GPU_DISPATCHERS * SPARK_GPU_QUOTA_EXECUTORS_MEM)) +readonly SPARK_GPU_TOTAL_QUOTA_EXECUTORS_GPUS=$((SPARK_GPU_DISPATCHERS * SPARK_GPU_QUOTA_EXECUTORS_GPUS)) + +readonly SPARK_NON_GPU_QUOTA_CPUS=$((SPARK_NON_GPU_TOTAL_QUOTA_DRIVERS_CPUS + SPARK_NON_GPU_TOTAL_QUOTA_EXECUTORS_CPUS)) +readonly SPARK_NON_GPU_QUOTA_MEM=$((SPARK_NON_GPU_TOTAL_QUOTA_DRIVERS_MEM + SPARK_NON_GPU_TOTAL_QUOTA_EXECUTORS_MEM)) +readonly SPARK_GPU_QUOTA_CPUS=$((SPARK_GPU_TOTAL_QUOTA_DRIVERS_CPUS + SPARK_GPU_TOTAL_QUOTA_EXECUTORS_CPUS)) +readonly SPARK_GPU_QUOTA_MEM=$((SPARK_GPU_TOTAL_QUOTA_DRIVERS_MEM + SPARK_GPU_TOTAL_QUOTA_EXECUTORS_MEM)) + +readonly TOTAL_QUOTA_CPUS=$((SPARK_NON_GPU_QUOTA_CPUS + + SPARK_GPU_QUOTA_CPUS + + ZOOKEEPER_CPUS + + KAFKA_CPUS + + CASSANDRA_CPUS + + DSENGINE_CPUS)) +readonly TOTAL_QUOTA_MEM=$((SPARK_NON_GPU_QUOTA_MEM + + SPARK_GPU_QUOTA_MEM + + ZOOKEEPER_MEM + + KAFKA_MEM + + CASSANDRA_MEM + + DSENGINE_MEM)) +readonly TOTAL_QUOTA_GPUS=$((SPARK_GPU_TOTAL_QUOTA_DRIVERS_GPUS + + SPARK_GPU_TOTAL_QUOTA_EXECUTORS_GPUS + + DSENGINE_GPUS)) echo echo "Test '${TEST_NAME}' parameters:" echo +echo "CLUSTER_URL '${CLUSTER_URL}'" +echo echo "KAFKA_CLUSTER_COUNT: ${KAFKA_CLUSTER_COUNT}" echo "CASSANDRA_CLUSTER_COUNT: ${CASSANDRA_CLUSTER_COUNT}" +echo "SPARK_TOTAL_DISPATCHERS: ${SPARK_TOTAL_DISPATCHERS} (non-GPU: ${SPARK_NON_GPU_DISPATCHERS}, GPU: ${SPARK_GPU_DISPATCHERS})" echo -echo "NON_GPU_NUM_DISPATCHERS: ${NON_GPU_NUM_DISPATCHERS}" +echo "SPARK_NON_GPU_DISPATCHERS: ${SPARK_NON_GPU_DISPATCHERS}" echo " Quota cpus/mem:" echo -n " Each:" -echo -n " driver ${NON_GPU_QUOTA_DRIVERS_CPUS}/${NON_GPU_QUOTA_DRIVERS_MEM}," -echo " executor ${NON_GPU_QUOTA_EXECUTORS_CPUS}/${NON_GPU_QUOTA_EXECUTORS_MEM}" +echo -n " driver ${SPARK_NON_GPU_QUOTA_DRIVERS_CPUS}/${SPARK_NON_GPU_QUOTA_DRIVERS_MEM}," +echo " executor ${SPARK_NON_GPU_QUOTA_EXECUTORS_CPUS}/${SPARK_NON_GPU_QUOTA_EXECUTORS_MEM}" echo -n " Total:" -echo -n " driver ${NON_GPU_TOTAL_QUOTA_DRIVERS_CPUS}/${NON_GPU_TOTAL_QUOTA_DRIVERS_MEM}," -echo " executor ${NON_GPU_TOTAL_QUOTA_EXECUTORS_CPUS}/${NON_GPU_TOTAL_QUOTA_EXECUTORS_MEM}" +echo -n " driver ${SPARK_NON_GPU_TOTAL_QUOTA_DRIVERS_CPUS}/${SPARK_NON_GPU_TOTAL_QUOTA_DRIVERS_MEM}," +echo " executor ${SPARK_NON_GPU_TOTAL_QUOTA_EXECUTORS_CPUS}/${SPARK_NON_GPU_TOTAL_QUOTA_EXECUTORS_MEM}" echo -echo "GPU_NUM_DISPATCHERS: ${GPU_NUM_DISPATCHERS}" +echo "SPARK_GPU_DISPATCHERS: ${SPARK_GPU_DISPATCHERS}" echo " Quota cpus/mem/gpus:" echo -n " Each:" -echo -n " driver ${GPU_QUOTA_DRIVERS_CPUS}/${GPU_QUOTA_DRIVERS_MEM}/${GPU_QUOTA_DRIVERS_GPUS:--}," -echo " executor 
${GPU_QUOTA_EXECUTORS_CPUS:--}/${GPU_QUOTA_EXECUTORS_MEM:--}/${GPU_QUOTA_EXECUTORS_GPUS:--}" +echo -n " driver ${SPARK_GPU_QUOTA_DRIVERS_CPUS:-0}/${SPARK_GPU_QUOTA_DRIVERS_MEM:-0}/${SPARK_GPU_QUOTA_DRIVERS_GPUS:-0}," +echo " executor ${SPARK_GPU_QUOTA_EXECUTORS_CPUS:-0}/${SPARK_GPU_QUOTA_EXECUTORS_MEM:-0}/${SPARK_GPU_QUOTA_EXECUTORS_GPUS:-0}" echo -n " Total:" -echo -n " driver ${GPU_TOTAL_QUOTA_DRIVERS_CPUS:--}/${GPU_TOTAL_QUOTA_DRIVERS_MEM:--}/${GPU_TOTAL_QUOTA_DRIVERS_GPUS:--}," -echo " executor ${GPU_TOTAL_QUOTA_EXECUTORS_CPUS:--}/${GPU_TOTAL_QUOTA_EXECUTORS_MEM:--}/${GPU_TOTAL_QUOTA_EXECUTORS_GPUS:--}" +echo -n " driver ${SPARK_GPU_TOTAL_QUOTA_DRIVERS_CPUS:-0}/${SPARK_GPU_TOTAL_QUOTA_DRIVERS_MEM:-0}/${SPARK_GPU_TOTAL_QUOTA_DRIVERS_GPUS:-0}," +echo " executor ${SPARK_GPU_TOTAL_QUOTA_EXECUTORS_CPUS:-0}/${SPARK_GPU_TOTAL_QUOTA_EXECUTORS_MEM:-0}/${SPARK_GPU_TOTAL_QUOTA_EXECUTORS_GPUS:-0}" echo -echo "FINITE_NUM_JOBS: ${FINITE_NUM_JOBS}" -echo "INFINITE_NUM_JOBS: ${INFINITE_NUM_JOBS}" -echo "STREAMING_NUM_JOBS: ${STREAMING_NUM_JOBS}" -echo "BATCH_SUBMITS_PER_MIN: ${BATCH_SUBMITS_PER_MIN}" -echo "GPU_SUBMITS_PER_MIN: ${GPU_SUBMITS_PER_MIN}" +echo "STREAMING_JOBS: ${STREAMING_JOBS} (finite: ${STREAMING_FINITE_JOBS}, infinite: ${STREAMING_INFINITE_JOBS})" +echo "BATCH_MAX_NON_GPU_JOBS: ${BATCH_MAX_NON_GPU_JOBS}" +echo "BATCH_SUBMITS_PER_MIN: ${BATCH_SUBMITS_PER_MIN}" +echo "GPU_SUBMITS_PER_MIN: ${GPU_SUBMITS_PER_MIN}" echo -echo "Total CPU quota: $((NON_GPU_QUOTA_CPUS + GPU_QUOTA_CPUS))" -echo "Total MEM quota: $((NON_GPU_QUOTA_MEM + GPU_QUOTA_MEM))" +echo "Total CPU quota: ${TOTAL_QUOTA_CPUS}" +echo "Total MEM quota: ${TOTAL_QUOTA_MEM}" +echo "Total GPU quota: ${TOTAL_QUOTA_GPUS}" echo echo "Existing S3 artifacts for ${TEST_NAME}:" @@ -189,11 +218,14 @@ case "${ANSWER}" in * ) log 'Exiting...' && exit 0;; esac +################################################################################ +# Set a few more parameters #################################################### +################################################################################ + if is_interactive; then for boolean_option in SHOULD_INSTALL_INFRASTRUCTURE \ SHOULD_INSTALL_NON_GPU_DISPATCHERS \ SHOULD_INSTALL_GPU_DISPATCHERS \ - SHOULD_RUN_FAILING_STREAMING_JOBS \ SHOULD_RUN_FINITE_STREAMING_JOBS \ SHOULD_RUN_INFINITE_STREAMING_JOBS \ SHOULD_RUN_BATCH_JOBS \ @@ -208,8 +240,12 @@ if is_interactive; then done fi +################################################################################ +# Create Docker container for test if it doesn't exist yet ##################### +################################################################################ + set +e -docker inspect -f {{.State.Running}} "${CONTAINER_NAME}" > /dev/null 2>&1 +docker inspect -f '{{.State.Running}}' "${CONTAINER_NAME}" > /dev/null 2>&1 readonly container_running=$? 
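Note: with per-dispatcher quotas disabled, run.sh now derives one group-level quota: the per-dispatcher driver/executor allocations are multiplied by the dispatcher counts, summed with the ZooKeeper/Kafka/Cassandra/Data Science Engine allowances, and applied further down with a single `dcos quota create "${GROUP_NAME}"` call. A small sketch of that arithmetic (the cfg values stand in for the .env parameters):

    def group_quota(cfg: dict) -> dict:
        # Mirrors the TOTAL_QUOTA_* arithmetic: SPARK_*_QUOTA_CPUS/MEM are the
        # per-dispatcher driver+executor quotas multiplied by the dispatcher count.
        non_gpu, gpu = cfg["SPARK_NON_GPU_DISPATCHERS"], cfg["SPARK_GPU_DISPATCHERS"]
        cpus = (
            non_gpu * (cfg["SPARK_NON_GPU_QUOTA_DRIVERS_CPUS"] + cfg["SPARK_NON_GPU_QUOTA_EXECUTORS_CPUS"])
            + gpu * (cfg["SPARK_GPU_QUOTA_DRIVERS_CPUS"] + cfg["SPARK_GPU_QUOTA_EXECUTORS_CPUS"])
            + cfg["ZOOKEEPER_CPUS"] + cfg["KAFKA_CPUS"] + cfg["CASSANDRA_CPUS"] + cfg["DSENGINE_CPUS"]
        )
        mem = (
            non_gpu * (cfg["SPARK_NON_GPU_QUOTA_DRIVERS_MEM"] + cfg["SPARK_NON_GPU_QUOTA_EXECUTORS_MEM"])
            + gpu * (cfg["SPARK_GPU_QUOTA_DRIVERS_MEM"] + cfg["SPARK_GPU_QUOTA_EXECUTORS_MEM"])
            + cfg["ZOOKEEPER_MEM"] + cfg["KAFKA_MEM"] + cfg["CASSANDRA_MEM"] + cfg["DSENGINE_MEM"]
        )
        gpus = (
            gpu * (cfg["SPARK_GPU_QUOTA_DRIVERS_GPUS"] + cfg["SPARK_GPU_QUOTA_EXECUTORS_GPUS"])
            + cfg["DSENGINE_GPUS"]
        )
        return {"cpu": cpus, "mem": mem, "gpu": gpus}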
docker exec -it "${CONTAINER_NAME}" test -f "${CONTAINER_FINISHED_SETTING_UP_FILE}" @@ -255,6 +291,21 @@ if [ ${container_running} -ne 0 ] || [ ${container_finished_setting_up} -ne 0 ]; container_exec \ ssh-add -k "${CONTAINER_SSH_KEY}" + container_exec \ + curl "${DCOS_CLI_URL}" -o dcos + + container_exec \ + chmod +x ./dcos + + container_exec \ + mv dcos /usr/local/bin + + container_exec \ + which dcos + + container_exec \ + dcos + container_exec \ dcos cluster setup \ --insecure \ @@ -274,40 +325,87 @@ if [ ${container_running} -ne 0 ] || [ ${container_finished_setting_up} -ne 0 ]; container_exec \ dcos package install --yes dcos-enterprise-cli - if [ -n "${ZOOKEEPER_PACKAGE_REPO}" ]; then - container_exec \ - dcos package repo add --index=0 zk-aws "${ZOOKEEPER_PACKAGE_REPO}" || true - fi - if [ -n "${KAFKA_PACKAGE_REPO}" ]; then - container_exec \ - dcos package repo add --index=0 kafka-aws "${KAFKA_PACKAGE_REPO}" || true - fi - if [ -n "${CASSANDRA_PACKAGE_REPO}" ]; then - container_exec \ - dcos package repo add --index=0 cassandra-aws "${CASSANDRA_PACKAGE_REPO}" || true + container_exec \ + touch "${CONTAINER_FINISHED_SETTING_UP_FILE}" +fi + +################################################################################ +# Create package repository stubs if they're not there ######################### +################################################################################ + +readonly dcos_package_repo_uris="$(container_exec 'bash -c "dcos package repo list --json | jq -r '.repositories[].uri'"')" + +for package_repo_envvar in ZOOKEEPER_PACKAGE_REPO \ + KAFKA_PACKAGE_REPO \ + CASSANDRA_PACKAGE_REPO \ + SPARK_PACKAGE_REPO \ + DSENGINE_PACKAGE_REPO; do + # Skip envvar if its value is empty. + if [ -z "${!package_repo_envvar}" ]; then + continue; fi - if [ -n "${SPARK_PACKAGE_REPO}" ]; then + + # Add package repository stub if it's not already there. + if ! grep -qx "${!package_repo_envvar}" <<< "${dcos_package_repo_uris}"; then + # ZOOKEEPER_PACKAGE_REPO => zookeeper. + package_repo_name="$(awk '{s = tolower($0); sub(/_package_repo$/, "", s); print(s)}' <<< "${package_repo_envvar}")" container_exec \ - dcos package repo add --index=0 spark-aws "${SPARK_PACKAGE_REPO}" || true + dcos package repo add --index=0 "${package_repo_name}" "${!package_repo_envvar}" || true fi +done + +################################################################################ +# Create Marathon group if it doesn't exist #################################### +################################################################################ + +if ! grep -qx "/${GROUP_NAME}" <<< "$(container_exec 'bash -c "dcos marathon group list --json | jq -r '.[].id'"')"; then + cat <<-EOF > "${GROUP_FILE_NAME}" + { + "id": "${GROUP_NAME}", + "enforceRole": true + } + EOF container_exec \ - touch "${CONTAINER_FINISHED_SETTING_UP_FILE}" + dcos marathon group add "${GROUP_FILE_NAME}" fi +################################################################################ +# Create quota if it doesn't already exist ##################################### +################################################################################ + +if ! 
grep -qx "${GROUP_NAME}" <<< "$(container_exec 'bash -c "dcos quota list --json | jq -r '.[].role'"')"; then + container_exec \ + dcos quota create "${GROUP_NAME}" \ + --cpu "${TOTAL_QUOTA_CPUS}" \ + --mem "${TOTAL_QUOTA_MEM}" \ + --gpu "${TOTAL_QUOTA_GPUS}" +fi + +################################################################################ +# Install infrastructure ####################################################### +################################################################################ + if [ "${SHOULD_INSTALL_INFRASTRUCTURE}" = true ]; then log 'Installing infrastructure' start_time=$(date +%s) container_exec \ ./scale-tests/setup_streaming.py "${TEST_DIRECTORY}/${INFRASTRUCTURE_OUTPUT_FILE}" \ --service-names-prefix "${SERVICE_NAMES_PREFIX}" \ - --kafka-zookeeper-config "${KAFKA_ZOOKEEPER_CONFIG}" \ + --kafka-zookeeper-package-name "${ZOOKEEPER_PACKAGE_NAME}" \ + --kafka-zookeeper-config "${ZOOKEEPER_CONFIG}" \ --kafka-cluster-count "${KAFKA_CLUSTER_COUNT}" \ + --kafka-package-name "${KAFKA_PACKAGE_NAME}" \ + --kafka-user "${KAFKA_USER}" \ --kafka-config "${KAFKA_CONFIG}" \ --cassandra-cluster-count "${CASSANDRA_CLUSTER_COUNT}" \ - --cassandra-config "${CASSANDRA_CONFIG}" + --cassandra-package-name "${CASSANDRA_PACKAGE_NAME}" \ + --cassandra-user "${CASSANDRA_USER}" \ + --cassandra-config "${CASSANDRA_CONFIG}" \ + --dsengine-package-name "${DSENGINE_PACKAGE_NAME}" \ + --dsengine-config "${DSENGINE_CONFIG}" end_time=$(date +%s) - runtime=$(($end_time - $start_time)) + runtime=$((end_time - start_time)) log "Installed infrastructure in ${runtime} seconds" log 'Uploading infrastructure file to S3' @@ -319,55 +417,63 @@ else log 'Skipping infrastructure installation' fi +################################################################################ +# Install non-GPU Spark dispatchers ############################################ +################################################################################ + if [ "${SHOULD_INSTALL_NON_GPU_DISPATCHERS}" = true ]; then log 'Installing non-GPU dispatchers' start_time=$(date +%s) container_exec \ ./scale-tests/deploy-dispatchers.py \ - --quota-drivers-cpus "${NON_GPU_QUOTA_DRIVERS_CPUS}" \ - --quota-drivers-mem "${NON_GPU_QUOTA_DRIVERS_MEM}" \ - --quota-executors-cpus "${NON_GPU_QUOTA_EXECUTORS_CPUS}" \ - --quota-executors-mem "${NON_GPU_QUOTA_EXECUTORS_MEM}" \ - "${NON_GPU_NUM_DISPATCHERS}" \ + --group-role "${GROUP_NAME}" \ + --options-json "${SPARK_CONFIG}" \ + --create-quotas false \ + "${SPARK_NON_GPU_DISPATCHERS}" \ "${SERVICE_NAMES_PREFIX}" \ - "${TEST_DIRECTORY}/${NON_GPU_DISPATCHERS_OUTPUT_FILE}" + "${TEST_DIRECTORY}/${SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE}" end_time=$(date +%s) - runtime=$(($end_time - $start_time)) + runtime=$((end_time - start_time)) log "Installed non-GPU dispatchers in ${runtime} seconds" log 'Uploading non-GPU dispatcher list to S3' container_exec \ aws s3 cp --acl public-read \ - "${TEST_DIRECTORY}/${NON_GPU_DISPATCHERS_OUTPUT_FILE}" \ + "${TEST_DIRECTORY}/${SPARK_NON_GPU_DISPATCHERS_OUTPUT_FILE}" \ "${TEST_S3_DIRECTORY_URL}" log 'Uploading non-GPU JSON dispatcher list to S3' container_exec \ aws s3 cp --acl public-read \ - "${TEST_DIRECTORY}/${NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" \ + "${TEST_DIRECTORY}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" \ "${TEST_S3_DIRECTORY_URL}" else log 'Skipping non-GPU dispatchers installation' fi +################################################################################ +# Install GPU Spark dispatchers 
################################################ +################################################################################ + if [ "${SHOULD_INSTALL_GPU_DISPATCHERS}" = true ]; then log 'Installing GPU dispatchers' start_time=$(date +%s) container_exec \ ./scale-tests/deploy-dispatchers.py \ - --quota-drivers-cpus "${GPU_QUOTA_DRIVERS_CPUS}" \ - --quota-drivers-mem "${GPU_QUOTA_DRIVERS_MEM}" \ - "${GPU_NUM_DISPATCHERS}" \ + --group-role "${GROUP_NAME}" \ + --options-json "${SPARK_CONFIG}" \ + --create-quotas false \ + "${SPARK_GPU_DISPATCHERS}" \ "${SERVICE_NAMES_PREFIX}gpu-" \ - "${TEST_DIRECTORY}/${GPU_DISPATCHERS_OUTPUT_FILE}" + "${TEST_DIRECTORY}/${SPARK_GPU_DISPATCHERS_OUTPUT_FILE}" end_time=$(date +%s) - runtime=$(($end_time - $start_time)) + runtime=$((end_time - start_time)) log "Installed GPU dispatchers in ${runtime} seconds" - if [ "${GPU_REMOVE_EXECUTORS_ROLES_QUOTAS}" = true ]; then + if [ "${SPARK_GPU_REMOVE_EXECUTORS_ROLES_QUOTAS}" = true ]; then log 'Removing GPU executors roles quotas' - last_gpu_index=$(($GPU_NUM_DISPATCHERS - 1)) - for i in $(seq 0 "${last_gpu_index}"); do + last_gpu_index=$((SPARK_GPU_DISPATCHERS - 1)) + for i in $(seq 0 ${last_gpu_index}); do container_exec \ dcos spark quota remove "${TEST_NAME}__gpu-spark-0${i}-executors-role" done @@ -376,25 +482,29 @@ if [ "${SHOULD_INSTALL_GPU_DISPATCHERS}" = true ]; then log 'Uploading GPU dispatcher list to S3' container_exec \ aws s3 cp --acl public-read \ - "${TEST_DIRECTORY}/${GPU_DISPATCHERS_OUTPUT_FILE}" \ + "${TEST_DIRECTORY}/${SPARK_GPU_DISPATCHERS_OUTPUT_FILE}" \ "${TEST_S3_DIRECTORY_URL}" log 'Uploading GPU JSON dispatcher list to S3' container_exec \ aws s3 cp --acl public-read \ - "${TEST_DIRECTORY}/${GPU_DISPATCHERS_JSON_OUTPUT_FILE}" \ + "${TEST_DIRECTORY}/${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" \ "${TEST_S3_DIRECTORY_URL}" else log 'Skipping GPU dispatchers installation' fi -if [[ -s ${TEST_DIRECTORY}/${NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE} && -s ${TEST_DIRECTORY}/${GPU_DISPATCHERS_JSON_OUTPUT_FILE} ]]; then +################################################################################ +# Upload merged (non-GPU + GPU) Spark dispatcher list file ##################### +################################################################################ + +if [[ -s ${TEST_DIRECTORY}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE} && -s ${TEST_DIRECTORY}/${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE} ]]; then log 'Merging non-GPU and GPU dispatcher list files' container_exec "\ jq -s \ '{spark: (.[0].spark + .[1].spark)}' \ - ${TEST_DIRECTORY}/${NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE} \ - ${TEST_DIRECTORY}/${GPU_DISPATCHERS_JSON_OUTPUT_FILE} \ + ${TEST_DIRECTORY}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE} \ + ${TEST_DIRECTORY}/${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE} \ > ${TEST_DIRECTORY}/${DISPATCHERS_JSON_OUTPUT_FILE} \ " @@ -407,106 +517,83 @@ else log 'Skipping merging of non-GPU and GPU dispatcher list files' fi -if [ "${SHOULD_RUN_FAILING_STREAMING_JOBS}" = true ]; then - log 'Starting failing jobs' - start_time=$(date +%s) - container_exec \ - ./scale-tests/kafka_cassandra_streaming_test.py \ - "${TEST_DIRECTORY}/${NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" \ - "${TEST_DIRECTORY}/${INFRASTRUCTURE_OUTPUT_FILE}" \ - "${TEST_DIRECTORY}/${FAILING_SUBMISSIONS_OUTPUT_FILE}" \ - --spark-executor-docker-image \""${SPARK_EXECUTOR_DOCKER_IMAGE}"\" \ - --jar "${TEST_ASSEMBLY_JAR_URL}" \ - --num-producers-per-kafka "${FAILING_NUM_PRODUCERS_PER_KAFKA}" \ - --num-consumers-per-producer 
"${FAILING_NUM_CONSUMERS_PER_PRODUCER}" \ - --producer-must-fail \ - --producer-number-of-words "${FAILING_PRODUCER_NUMBER_OF_WORDS}" \ - --producer-words-per-second "${FAILING_PRODUCER_WORDS_PER_SECOND}" \ - --producer-spark-cores-max "${FAILING_PRODUCER_SPARK_CORES_MAX}" \ - --producer-spark-executor-cores "${FAILING_PRODUCER_SPARK_EXECUTOR_CORES}" \ - --consumer-must-fail \ - --consumer-write-to-cassandra \ - --consumer-batch-size-seconds "${FAILING_CONSUMER_BATCH_SIZE_SECONDS}" \ - --consumer-spark-cores-max "${FAILING_CONSUMER_SPARK_CORES_MAX}" \ - --consumer-spark-executor-cores "${FAILING_CONSUMER_SPARK_EXECUTOR_CORES}" - end_time=$(date +%s) - runtime=$(($end_time - $start_time)) - log "Started failing jobs in ${runtime} seconds" - - log 'Uploading failing jobs submissions file' - container_exec \ - aws s3 cp --acl public-read \ - "${TEST_DIRECTORY}/${FAILING_SUBMISSIONS_OUTPUT_FILE}" \ - "${TEST_S3_DIRECTORY_URL}" -else - log 'Skipping running of failing streaming jobs' -fi +################################################################################ +# Run finite streaming jobs #################################################### +################################################################################ if [ "${SHOULD_RUN_FINITE_STREAMING_JOBS}" = true ]; then log 'Starting finite jobs. Consumers write to Cassandra' start_time=$(date +%s) container_exec \ ./scale-tests/kafka_cassandra_streaming_test.py \ - "${TEST_DIRECTORY}/${NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" \ + "${TEST_DIRECTORY}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" \ "${TEST_DIRECTORY}/${INFRASTRUCTURE_OUTPUT_FILE}" \ - "${TEST_DIRECTORY}/${FINITE_SUBMISSIONS_OUTPUT_FILE}" \ + "${TEST_DIRECTORY}/${STREAMING_FINITE_SUBMISSIONS_OUTPUT_FILE}" \ --spark-executor-docker-image \""${SPARK_EXECUTOR_DOCKER_IMAGE}"\" \ --jar "${TEST_ASSEMBLY_JAR_URL}" \ - --num-producers-per-kafka "${FINITE_NUM_PRODUCERS_PER_KAFKA}" \ - --num-consumers-per-producer "${FINITE_NUM_CONSUMERS_PER_PRODUCER}" \ - --producer-number-of-words "${FINITE_PRODUCER_NUMBER_OF_WORDS}" \ - --producer-words-per-second "${FINITE_PRODUCER_WORDS_PER_SECOND}" \ - --producer-spark-cores-max "${FINITE_PRODUCER_SPARK_CORES_MAX}" \ - --producer-spark-executor-cores "${FINITE_PRODUCER_SPARK_EXECUTOR_CORES}" \ + --num-producers-per-kafka "${STREAMING_FINITE_PRODUCERS_PER_KAFKA}" \ + --num-consumers-per-producer "${STREAMING_FINITE_CONSUMERS_PER_PRODUCER}" \ + --producer-number-of-words "${STREAMING_FINITE_PRODUCER_NUMBER_OF_WORDS}" \ + --producer-words-per-second "${STREAMING_FINITE_PRODUCER_WORDS_PER_SECOND}" \ + --producer-spark-cores-max "${STREAMING_FINITE_PRODUCER_SPARK_CORES_MAX}" \ + --producer-spark-executor-cores "${STREAMING_FINITE_PRODUCER_SPARK_EXECUTOR_CORES}" \ --consumer-write-to-cassandra \ - --consumer-batch-size-seconds "${FINITE_CONSUMER_BATCH_SIZE_SECONDS}" \ - --consumer-spark-cores-max "${FINITE_CONSUMER_SPARK_CORES_MAX}" \ - --consumer-spark-executor-cores "${FINITE_CONSUMER_SPARK_EXECUTOR_CORES}" + --consumer-batch-size-seconds "${STREAMING_FINITE_CONSUMER_BATCH_SIZE_SECONDS}" \ + --consumer-spark-cores-max "${STREAMING_FINITE_CONSUMER_SPARK_CORES_MAX}" \ + --consumer-spark-executor-cores "${STREAMING_FINITE_CONSUMER_SPARK_EXECUTOR_CORES}" end_time=$(date +%s) - runtime=$(($end_time - $start_time)) + runtime=$((end_time - start_time)) log "Started finite jobs in ${runtime} seconds" log 'Uploading finite jobs submissions file' container_exec \ aws s3 cp --acl public-read \ - "${TEST_DIRECTORY}/${FINITE_SUBMISSIONS_OUTPUT_FILE}" \ 
+ "${TEST_DIRECTORY}/${STREAMING_FINITE_SUBMISSIONS_OUTPUT_FILE}" \ "${TEST_S3_DIRECTORY_URL}" else log 'Skipping running of finite streaming jobs' fi +################################################################################ +# Run infinite streaming jobs ################################################## +################################################################################ + if [ "${SHOULD_RUN_INFINITE_STREAMING_JOBS}" = true ]; then log 'Starting infinite jobs. Consumers do not write to Cassandra' start_time=$(date +%s) container_exec \ ./scale-tests/kafka_cassandra_streaming_test.py \ - "${TEST_DIRECTORY}/${NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" \ + "${TEST_DIRECTORY}/${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE}" \ "${TEST_DIRECTORY}/${INFRASTRUCTURE_OUTPUT_FILE}" \ - "${TEST_DIRECTORY}/${INFINITE_SUBMISSIONS_OUTPUT_FILE}" \ + "${TEST_DIRECTORY}/${STREAMING_INFINITE_SUBMISSIONS_OUTPUT_FILE}" \ --spark-executor-docker-image \""${SPARK_EXECUTOR_DOCKER_IMAGE}"\" \ --jar "${TEST_ASSEMBLY_JAR_URL}" \ - --num-producers-per-kafka "${INFINITE_NUM_PRODUCERS_PER_KAFKA}" \ - --num-consumers-per-producer "${INFINITE_NUM_CONSUMERS_PER_PRODUCER}" \ + --num-producers-per-kafka "${STREAMING_INFINITE_PRODUCERS_PER_KAFKA}" \ + --num-consumers-per-producer "${STREAMING_INFINITE_CONSUMERS_PER_PRODUCER}" \ --producer-number-of-words 0 \ - --producer-words-per-second "${INFINITE_PRODUCER_WORDS_PER_SECOND}" \ - --producer-spark-cores-max "${INFINITE_PRODUCER_SPARK_CORES_MAX}" \ - --producer-spark-executor-cores "${INFINITE_PRODUCER_SPARK_EXECUTOR_CORES}" \ - --consumer-batch-size-seconds "${INFINITE_CONSUMER_BATCH_SIZE_SECONDS}" \ - --consumer-spark-cores-max "${INFINITE_CONSUMER_SPARK_CORES_MAX}" \ - --consumer-spark-executor-cores "${INFINITE_CONSUMER_SPARK_EXECUTOR_CORES}" + --producer-words-per-second "${STREAMING_INFINITE_PRODUCER_WORDS_PER_SECOND}" \ + --producer-spark-cores-max "${STREAMING_INFINITE_PRODUCER_SPARK_CORES_MAX}" \ + --producer-spark-executor-cores "${STREAMING_INFINITE_PRODUCER_SPARK_EXECUTOR_CORES}" \ + --consumer-batch-size-seconds "${STREAMING_INFINITE_CONSUMER_BATCH_SIZE_SECONDS}" \ + --consumer-spark-cores-max "${STREAMING_INFINITE_CONSUMER_SPARK_CORES_MAX}" \ + --consumer-spark-executor-cores "${STREAMING_INFINITE_CONSUMER_SPARK_EXECUTOR_CORES}" end_time=$(date +%s) - runtime=$(($end_time - $start_time)) + runtime=$((end_time - start_time)) log "Started infinite jobs in ${runtime} seconds" log 'Uploading infinite jobs submissions file' container_exec \ aws s3 cp --acl public-read \ - "${TEST_DIRECTORY}/${INFINITE_SUBMISSIONS_OUTPUT_FILE}" \ + "${TEST_DIRECTORY}/${STREAMING_INFINITE_SUBMISSIONS_OUTPUT_FILE}" \ "${TEST_S3_DIRECTORY_URL}" else log 'Skipping running of infinite streaming jobs' fi +################################################################################ +# Run non-GPU batch jobs ####################################################### +################################################################################ + if [ "${SHOULD_RUN_BATCH_JOBS}" = true ]; then log 'Starting batch jobs' start_time=$(date +%s) @@ -516,21 +603,26 @@ if [ "${SHOULD_RUN_BATCH_JOBS}" = true ]; then --dcos-username "${DCOS_USERNAME}" \ --dcos-password "${DCOS_PASSWORD}" \ --security "${SECURITY}" \ - --input-file-uri "${NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL}" \ + --input-file-uri "${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL}" \ --script-cpus "${BATCH_SCRIPT_CPUS}" \ --script-mem "${BATCH_SCRIPT_MEM}" \ --spark-build-branch "${BATCH_SPARK_BUILD_BRANCH}" \ 
--script-args "\"\ - ${NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE} \ + ${SPARK_NON_GPU_DISPATCHERS_JSON_OUTPUT_FILE} \ --submits-per-min ${BATCH_SUBMITS_PER_MIN} \ + --group-role ${GROUP_NAME} \ \"" end_time=$(date +%s) - runtime=$(($end_time - $start_time)) + runtime=$((end_time - start_time)) log "Started batch jobs in ${runtime} seconds" else log 'Skipping running of batch jobs' fi +################################################################################ +# Run GPU batch jobs ########################################################### +################################################################################ + if [ "${SHOULD_RUN_GPU_BATCH_JOBS}" = true ]; then log 'Starting GPU batch jobs' start_time=$(date +%s) @@ -540,38 +632,47 @@ if [ "${SHOULD_RUN_GPU_BATCH_JOBS}" = true ]; then --dcos-username "${DCOS_USERNAME}" \ --dcos-password "${DCOS_PASSWORD}" \ --security "${SECURITY}" \ - --input-file-uri "${GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL}" \ + --input-file-uri "${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE_URL}" \ --script-cpus "${GPU_SCRIPT_CPUS}" \ --script-mem "${GPU_SCRIPT_MEM}" \ --spark-build-branch "${GPU_SPARK_BUILD_BRANCH}" \ --script-args "\"\ - ${GPU_DISPATCHERS_JSON_OUTPUT_FILE} \ + ${SPARK_GPU_DISPATCHERS_JSON_OUTPUT_FILE} \ --submits-per-min ${GPU_SUBMITS_PER_MIN} \ --docker-image ${GPU_DOCKER_IMAGE} \ - --max-num-dispatchers ${GPU_MAX_NUM_DISPATCHERS} \ + --group-role ${GROUP_NAME} \ + --max-num-dispatchers ${GPU_MAX_DISPATCHERS} \ --spark-cores-max ${GPU_SPARK_CORES_MAX} \ --spark-mesos-executor-gpus ${GPU_SPARK_MESOS_EXECUTOR_GPUS} \ --spark-mesos-max-gpus ${GPU_SPARK_MESOS_MAX_GPUS} \ \"" end_time=$(date +%s) - runtime=$(($end_time - $start_time)) + runtime=$((end_time - start_time)) log "Started GPU batch jobs in ${runtime} seconds" else log 'Skipping running of GPU batch jobs' fi +################################################################################ +# Uninstall infrastructure ##################################################### +################################################################################ + if [ "${SHOULD_UNINSTALL_INFRASTRUCTURE_AT_THE_END}" = true ]; then log 'Uninstalling infrastructure' start_time=$(date +%s) container_exec \ ./scale-tests/setup_streaming.py "${TEST_DIRECTORY}/${INFRASTRUCTURE_OUTPUT_FILE}" --cleanup end_time=$(date +%s) - runtime=$(($end_time - $start_time)) + runtime=$((end_time - start_time)) log "Uninstalled infrastructure in ${runtime} seconds" else log 'Skipping uninstalling of infrastructure' fi +################################################################################ +################################################################################ +################################################################################ + log 'Uploading log file to S3' container_exec \ aws s3 cp --acl public-read \ diff --git a/scale-tests/scale_tests_utils.py b/scale-tests/scale_tests_utils.py index 8a4bb279..be8d2af5 100644 --- a/scale-tests/scale_tests_utils.py +++ b/scale-tests/scale_tests_utils.py @@ -22,9 +22,9 @@ def setup_security(service_name: str, linux_user: str) -> typing.Dict: service_account = normalize_string("{}-service-account".format(service_name)) service_account_secret = "{}-service-account-secret".format(service_name) - return sdk_security.setup_security(service_name, - linux_user, - service_account, service_account_secret) + return sdk_security.setup_security( + service_name, linux_user, service_account, service_account_secret + ) def 
get_strict_mode_options(service_account_info: typing.Dict) -> typing.Dict: @@ -32,20 +32,14 @@ def get_strict_mode_options(service_account_info: typing.Dict) -> typing.Dict: options = {} if "linux_user" in service_account_info: - user_options = { - "service": { - "user": service_account_info["linux_user"] - } - - } + user_options = {"service": {"user": service_account_info["linux_user"]}} options = sdk_install.merge_dictionaries(options, user_options) - if sdk_utils.is_strict_mode(): service_account_options = { - 'service': { - 'service_account': service_account_info["name"], - 'service_account_secret': service_account_info["secret"], + "service": { + "service_account": service_account_info["name"], + "service_account_secret": service_account_info["secret"], } } options = sdk_install.merge_dictionaries(options, service_account_options) @@ -53,8 +47,9 @@ def get_strict_mode_options(service_account_info: typing.Dict) -> typing.Dict: return options -def get_service_options(service_name: str, service_account_info: typing.Dict, - options: typing.Dict, config_path: str) -> typing.Dict: +def get_service_options( + service_name: str, service_account_info: typing.Dict, options: typing.Dict, config_path: str +) -> typing.Dict: """ Get the options for a service as a combination of other options. """ @@ -62,7 +57,7 @@ def get_service_options(service_name: str, service_account_info: typing.Dict, config_options = {} if config_path: if os.path.isfile(config_path): - with open(config_path, 'r') as fp: + with open(config_path, "r") as fp: log.info("Reading options from %s", config_path) config_options = json.load(fp) else: @@ -71,13 +66,19 @@ def get_service_options(service_name: str, service_account_info: typing.Dict, else: log.info("No options specified. Using defaults") + if service_name[0] != "/": + service_name = "/" + service_name # Always set the service name service_name_options = {"service": {"name": service_name}} - return merge_service_options([get_strict_mode_options(service_account_info), - options, - config_options, - service_name_options, ]) + return merge_service_options( + [ + get_strict_mode_options(service_account_info), + options, + config_options, + service_name_options, + ] + ) def merge_service_options(options: typing.List[typing.Dict]) -> typing.Dict: @@ -118,7 +119,7 @@ def mapcat(func, seqs): def normalize_string(s: str) -> str: - return s.replace("/", "__").replace('-', '_') + return s.replace("/", "__").replace("-", "_") def make_repeater(n): diff --git a/scale-tests/setup_streaming.py b/scale-tests/setup_streaming.py index 10abc652..f30f3f42 100755 --- a/scale-tests/setup_streaming.py +++ b/scale-tests/setup_streaming.py @@ -8,6 +8,7 @@ * Kafka ZooKeeper is installed * Kafka is installed * Cassandra is installed +* Data Science Engine is installed Usage: setup_streaming.py [options] @@ -23,14 +24,21 @@ This is used for both Kafka and ZooKeeper [default: 0] --kafka-package-name The package name to use for Kafka [default: confluent-kafka] + --kafka-user user for the Kafka installation [default: nobody] --kafka-config path to the config.json for the Kafka installation --kafka-zookeeper-package-name The package name to use for Kafka ZooKeeper [default: confluent-zookeeper] + --kafka-zookeeper-user user for the Kafka ZooKeeper installation [default: nobody] --kafka-zookeeper-config path to the config.json for the Kafka ZooKeeper installation --cassandra-cluster-count The number of Cassandra clusters to install [default: 0] --cassandra-package-name The package name to use for 
Cassandra [default: cassandra] + --cassandra-user user for the Cassandra installation [default: nobody] --cassandra-config path to the config.json for the Cassandra installation + + --dsengine-package-name The package name to use for Data Science Engine [default: data-science-engine] + --dsengine-user user for the Data Science Engine installation [default: nobody] + --dsengine-config path to the config.json for the Data Science Engine installation """ import json import logging @@ -63,6 +71,7 @@ def install_package(package_name: str, index: int, service_task_count: int, config_path: str, + user: str = None, additional_options: dict = None) -> dict: if package_name.startswith("beta-"): basename = package_name[len("beta-"):] @@ -71,7 +80,18 @@ def install_package(package_name: str, service_name = "{}{}-{:0>2}".format(service_prefix, basename, index) - service_account_info = scale_tests_utils.setup_security(service_name, "nobody") + if not user: + user = "nobody" + + service_account_info = scale_tests_utils.setup_security(service_name, user) + + if "data-science-engine" in package_name: + for permission in [ + "dcos:mesos:master:task:app_id:{}".format(service_name), + "dcos:mesos:agent:task:user:root", + "dcos:mesos:master:task:user:root", + "dcos:mesos:master:task:role:{}".format(service_account_info['name'])]: + grant_permission(service_account_info['name'], permission, "create") service_options = scale_tests_utils.get_service_options(service_name, service_account_info, additional_options, config_path) @@ -126,13 +146,14 @@ def get_expected_task_count(service_options: dict) -> int: kafka_zookeeper_package_name = args["--kafka-zookeeper-package-name"] kafka_zookeeper_service_prefix = args["--service-names-prefix"] + kafka_zookeeper_user = args.get("--kafka-zookeeper-user", "") kafka_zookeeper_config = args.get("--kafka-zookeeper-config", "") services = [] for i in range(kafka_cluster_count): services.append(install_package(kafka_zookeeper_package_name, kafka_zookeeper_service_prefix, i, get_expected_task_count, - kafka_zookeeper_config)) + user = kafka_zookeeper_user, config_path=kafka_zookeeper_config)) return services @@ -151,6 +172,7 @@ def get_expected_task_count(service_options: dict) -> int: kafka_package_name = args["--kafka-package-name"] kafka_service_prefix = args["--service-names-prefix"] + kafka_user = args.get("--kafka-user", "") kafka_config = args.get("--kafka-config", "") services = [] @@ -169,7 +191,7 @@ def get_expected_task_count(service_options: dict) -> int: services.append(install_package(kafka_package_name, kafka_service_prefix, i, get_expected_task_count, kafka_config, - additional_options=service_options)) + user = kafka_user, additional_options=service_options)) return services @@ -188,12 +210,37 @@ def get_expected_task_count(service_options: dict) -> int: cassandra_package_name = args["--cassandra-package-name"] cassandra_service_prefix = args["--service-names-prefix"] + cassandra_user = args.get("--cassandra-user", "") cassandra_config = args.get("--cassandra-config", "") services = [] for i in range(cassandra_cluster_count): services.append(install_package(cassandra_package_name, cassandra_service_prefix, i, - get_expected_task_count, cassandra_config)) + get_expected_task_count, user = cassandra_user, config_path=cassandra_config)) + + return services + +def install_dsengine(args: dict) -> list: + """ + Install the Data Science Engine service(s) as defined by the arguments + """ + def get_expected_task_count(service_options: dict) -> int: + return 
_get_pod_count(service_options, "notebooks", 1) + + dse_cluster_count = 1 + + if not dse_cluster_count: + return [] + + dsengine_package_name = args["--dsengine-package-name"] + dsengine_service_prefix = args["--service-names-prefix"] + dsengine_user = args.get("--dsengine-user", "") + dsengine_config = args.get("--dsengine-config", "") + + services = [] + for i in range(dse_cluster_count): + services.append(install_package(dsengine_package_name, dsengine_service_prefix, i, + get_expected_task_count, user=dsengine_user,config_path=dsengine_config)) return services @@ -203,6 +250,7 @@ def install(args): services["zookeeper"] = install_zookeeper(args) services["kafka"] = install_kafka(args, services["zookeeper"]) services["cassandra"] = install_cassandra(args) + services["dsengine"] = install_dsengine(args) for k, v in services.items(): log.info("%s service(s): %s", k, v) @@ -236,6 +284,13 @@ def cleanup(args): log.info("Removing service accounts and secrets") sdk_security.cleanup_security(service_name, s["service_account_info"]) +def grant_permission(service_account: str, acl: str, action: str, description = None) -> None: + cmd = "security org users grant {} {} {}".format(service_account, acl, action) + + if description: + cmd = "{} --description '{}'".format(cmd, description) + + sdk_cmd.run_cli(cmd=cmd, print_output=True) def main(args): if "--cleanup" in args and args["--cleanup"]: diff --git a/spark-testing/spark_utils.py b/spark-testing/spark_utils.py index 3a5e9d01..9111dcb5 100644 --- a/spark-testing/spark_utils.py +++ b/spark-testing/spark_utils.py @@ -14,7 +14,6 @@ import sdk_utils import spark_s3 -import dcos_utils DCOS_SPARK_TEST_JAR_PATH_ENV = "DCOS_SPARK_TEST_JAR_PATH" DCOS_SPARK_TEST_JAR_PATH = os.getenv(DCOS_SPARK_TEST_JAR_PATH_ENV, None) @@ -27,7 +26,9 @@ MESOS_SPARK_TEST_JAR_URL = os.getenv(MESOS_SPARK_TEST_JAR_URL_ENV, None) SPARK_SERVICE_ACCOUNT = os.getenv("SPARK_SERVICE_ACCOUNT", "spark-service-acct") -SPARK_SERVICE_ACCOUNT_SECRET = os.getenv("SPARK_SERVICE_ACCOUNT_SECRET", "spark-service-acct-secret") +SPARK_SERVICE_ACCOUNT_SECRET = os.getenv( + "SPARK_SERVICE_ACCOUNT_SECRET", "spark-service-acct-secret" +) SPARK_SERVICE_NAME = os.getenv("SPARK_SERVICE_NAME", "spark") FOLDERED_SPARK_SERVICE_NAME = "/path/to/" + SPARK_SERVICE_NAME @@ -46,27 +47,33 @@ SPARK_PACKAGE_NAME = os.getenv("SPARK_PACKAGE_NAME", "spark") SPARK_EXAMPLES = "http://downloads.mesosphere.com/spark/assets/spark-examples_2.11-2.4.0.jar" -start_agent_cmd = "sudo systemctl start dcos-mesos-slave" -stop_agent_cmd = "sudo systemctl stop dcos-mesos-slave" -check_agent_cmd = "sudo systemctl is-active dcos-mesos-slave" - def _check_tests_assembly(): if not DCOS_SPARK_TEST_JAR_URL and not os.path.exists(DCOS_SPARK_TEST_JAR_PATH): - raise Exception('''Missing URL or path to file dcos-spark-scala-tests-assembly-[...].jar: + raise Exception( + """Missing URL or path to file dcos-spark-scala-tests-assembly-[...].jar: - No URL: {}={} - - File not found: {}={}'''.format( - DCOS_SPARK_TEST_JAR_URL_ENV, DCOS_SPARK_TEST_JAR_URL, - DCOS_SPARK_TEST_JAR_PATH_ENV, DCOS_SPARK_TEST_JAR_PATH)) + - File not found: {}={}""".format( + DCOS_SPARK_TEST_JAR_URL_ENV, + DCOS_SPARK_TEST_JAR_URL, + DCOS_SPARK_TEST_JAR_PATH_ENV, + DCOS_SPARK_TEST_JAR_PATH, + ) + ) def _check_mesos_integration_tests_assembly(): if not MESOS_SPARK_TEST_JAR_URL and not os.path.exists(MESOS_SPARK_TEST_JAR_PATH): - raise Exception('''Missing URL or path to file mesos-spark-integration-tests-assembly-[...].jar: + raise Exception( + """Missing URL or path to 
file mesos-spark-integration-tests-assembly-[...].jar: - No URL: {}={} - - File not found: {}={}'''.format( - MESOS_SPARK_TEST_JAR_URL_ENV, MESOS_SPARK_TEST_JAR_URL, - MESOS_SPARK_TEST_JAR_PATH_ENV, MESOS_SPARK_TEST_JAR_PATH)) + - File not found: {}={}""".format( + MESOS_SPARK_TEST_JAR_URL_ENV, + MESOS_SPARK_TEST_JAR_URL, + MESOS_SPARK_TEST_JAR_PATH_ENV, + MESOS_SPARK_TEST_JAR_PATH, + ) + ) def hdfs_enabled(): @@ -77,7 +84,9 @@ def kafka_enabled(): return os.environ.get("KAFKA_ENABLED") != "false" -def require_spark(service_name=SPARK_SERVICE_NAME, additional_options={}, zk='spark_mesos_dispatcher'): +def require_spark( + service_name=SPARK_SERVICE_NAME, additional_options={}, zk="spark_mesos_dispatcher" +): teardown_spark(service_name, zk) sdk_install.install( @@ -85,34 +94,31 @@ def require_spark(service_name=SPARK_SERVICE_NAME, additional_options={}, zk='sp service_name, 0, additional_options=get_spark_options(service_name, additional_options), - wait_for_deployment=False, # no deploy plan - insert_strict_options=False) # lacks principal + secret_name options + wait_for_deployment=False, # no deploy plan + insert_strict_options=False, + ) # lacks principal + secret_name options # wait for dispatcher to be reachable over HTTP - sdk_cmd.service_request('GET', service_name, '', timeout_seconds=300) + sdk_cmd.service_request("GET", service_name, "", timeout_seconds=300) # Note: zk may be customized in spark via 'spark.deploy.zookeeper.dir' -def teardown_spark(service_name=SPARK_SERVICE_NAME, zk='spark_mesos_dispatcher'): +def teardown_spark(service_name=SPARK_SERVICE_NAME, zk="spark_mesos_dispatcher"): sdk_install.uninstall( SPARK_PACKAGE_NAME, service_name, - role=re.escape('*'), - service_account='spark-service-acct', - zk=zk) + role=re.escape("*"), + service_account="spark-service-acct", + zk=zk, + ) - if not sdk_utils.dcos_version_less_than('1.10'): + if not sdk_utils.dcos_version_less_than("1.10"): # On 1.10+, sdk_uninstall doesn't run janitor. However Spark always needs it for ZK cleanup. - sdk_install.retried_run_janitor(service_name, re.escape('*'), 'spark-service-acct', zk) + sdk_install.retried_run_janitor(service_name, re.escape("*"), "spark-service-acct", zk) def get_spark_options(service_name, additional_options): - options = { - "service": { - "user": SPARK_USER, - "name": service_name - } - } + options = {"service": {"user": SPARK_USER, "name": service_name}} if SPARK_DOCKER_USER is not None: options["service"]["docker_user"] = SPARK_DOCKER_USER @@ -146,48 +152,51 @@ def run_tests(app_url, app_args, expected_output, service_name=SPARK_SERVICE_NAM try: check_job_output(driver_id, expected_output) except TimeoutError: - log.error("Timed out waiting for job output, will attempt to cleanup and kill driver: {}".format(driver_id)) + log.error( + "Timed out waiting for job output, will attempt to cleanup and kill driver: {}".format( + driver_id + ) + ) raise finally: kill_driver(driver_id, service_name=service_name) def submit_job( - app_url, - app_args, - service_name=SPARK_SERVICE_NAME, - args=[], - spark_user=None, - driver_role=SPARK_DRIVER_ROLE, - verbose=True, - principal=SPARK_SERVICE_ACCOUNT, - use_cli=True): + app_url, + app_args, + service_name=SPARK_SERVICE_NAME, + args=[], + spark_user=None, + driver_role=SPARK_DRIVER_ROLE, + verbose=True, + principal=SPARK_SERVICE_ACCOUNT, + use_cli=True, +): conf_args = args.copy() + # Don't overwrite spark.mesos.role in case of running under enforce_role. + # We're expecting the caller to pass `driver_role` as `None` in that case. 
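    # (Editor's sketch, hedged: a caller submitting against a dispatcher that
    # runs under an enforced group role would therefore call this roughly as
    # below, leaving spark.mesos.role to the dispatcher's group rather than
    # setting it per driver. The dispatcher name, app URL and extra conf are
    # made-up placeholders.)
    #
    #     spark_utils.submit_job(
    #         service_name="dev-spark-dispatcher-00",
    #         app_url="https://example.com/jobs/word-count.py",
    #         app_args="100 10",
    #         args=["--conf", "spark.cores.max=4"],
    #         driver_role=None,  # enforce_role: do not set spark.mesos.role
    #     )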
if driver_role: - conf_args += ['--conf', 'spark.mesos.role={}'.format(driver_role)] + conf_args += ["--conf", "spark.mesos.role={}".format(driver_role)] if SPARK_DOCKER_USER is not None: - conf_args += ['--conf', 'spark.mesos.executor.docker.parameters=user={}'.format(SPARK_DOCKER_USER)] + conf_args += [ + "--conf", + "spark.mesos.executor.docker.parameters=user={}".format(SPARK_DOCKER_USER), + ] - if not list(filter(lambda x: "spark.driver.memory=" in x, conf_args)): - conf_args += ['--conf', 'spark.driver.memory=2g'] + if not list(filter(lambda x: x.startswith("spark.driver.memory="), conf_args)): + conf_args += ["--conf", "spark.driver.memory=2g"] if sdk_utils.is_strict_mode(): - conf_args += [ - '--conf spark.mesos.principal={}'.format(principal) - ] + conf_args += ["--conf spark.mesos.principal={}".format(principal)] if spark_user is not None: - conf_args += [ - '--conf spark.mesos.driverEnv.SPARK_USER={}'.format(spark_user) - ] - - if not list(filter(lambda x: "spark.mesos.containerizer=" in x, conf_args)): - conf_args += ['--conf', 'spark.mesos.containerizer=mesos'] + conf_args += ["--conf spark.mesos.driverEnv.SPARK_USER={}".format(spark_user)] - submit_args = ' '.join([' '.join(conf_args), app_url, app_args]) + submit_args = " ".join([" ".join(conf_args), app_url, app_args]) verbose_flag = "--verbose" if verbose else "" result = None @@ -195,14 +204,21 @@ def submit_job( stdout = sdk_cmd.svc_cli( SPARK_PACKAGE_NAME, service_name, - 'run {} --submit-args="{}"'.format(verbose_flag, submit_args)) + 'run {} --submit-args="{}"'.format(verbose_flag, submit_args), + ) result = re.search(r"Submission id: (\S+)", stdout) else: - docker_cmd = "sudo docker run --net=host -ti {} bin/spark-submit {}".format(SPARK_DOCKER_IMAGE, submit_args) + docker_cmd = "sudo docker run --net=host -ti {} bin/spark-submit {}".format( + SPARK_DOCKER_IMAGE, submit_args + ) ssh_opts = "--option UserKnownHostsFile=/dev/null --option StrictHostKeyChecking=no" log.info("Running Docker command on leader: {}".format(docker_cmd)) - _, stdout, stderr = sdk_cmd.run_raw_cli("node ssh --master-proxy --leader --user={} {} '{}'".format(sdk_cmd.LINUX_USER, ssh_opts, docker_cmd)) + _, stdout, stderr = sdk_cmd.run_raw_cli( + "node ssh --master-proxy --leader --user={} {} '{}'".format( + sdk_cmd.LINUX_USER, ssh_opts, docker_cmd + ) + ) result = re.search(r'"submissionId" : "(\S+)"', stdout) if not result: @@ -211,7 +227,7 @@ def submit_job( def check_job_output(task_id, expected_output): - log.info('Waiting for task id={} to complete'.format(task_id)) + log.info("Waiting for task id={} to complete".format(task_id)) shakedown.wait_for_task_completion(task_id, timeout_sec=JOB_WAIT_TIMEOUT_SECONDS) stdout = _task_log(task_id) @@ -222,28 +238,11 @@ def check_job_output(task_id, expected_output): raise Exception("{} not found in stdout".format(expected_output)) -# Reads the logs and matches each line for specified regular expressions. 
-# Returns a map of [regular expression] -> [list of matched line numbers] -def log_matches(task_id, filename, expressions): - output = _task_log(task_id, filename) - matched_lines = {} - for exp in expressions: - matched_lines[exp] = [] - for line_number, line in enumerate(output.splitlines()): - for exp in expressions: - if re.search(exp, line): - matched_lines[exp].append(line_number) - return matched_lines - - -@retrying.retry( - wait_fixed=5000, - stop_max_delay=600 * 1000, - retry_on_result=lambda res: not res) +@retrying.retry(wait_fixed=5000, stop_max_delay=600 * 1000, retry_on_result=lambda res: not res) def wait_for_running_job_output(task_id, expected_line): stdout = sdk_cmd.run_cli("task log --lines=1000 {}".format(task_id)) result = expected_line in stdout - log.info('Checking for {} in STDOUT:\n{}\nResult: {}'.format(expected_line, stdout, result)) + log.info("Checking for {} in STDOUT:\n{}\nResult: {}".format(expected_line, stdout, result)) return result @@ -287,82 +286,85 @@ def kill_driver(driver_id, service_name=SPARK_SERVICE_NAME): def _task_log(task_id, filename=None): - return sdk_cmd.run_cli("task log --completed --lines=1000 {}".format(task_id) + \ - ("" if filename is None else " {}".format(filename))) + return sdk_cmd.run_cli( + "task log --completed --lines=1000 {}".format(task_id) + + ("" if filename is None else " {}".format(filename)) + ) def grant_user_permissions(user, role="*", service_account=SPARK_SERVICE_ACCOUNT): log.info(f"Adding user permissions for Marathon. User: {user}") sdk_security.grant_permissions( - linux_user=user, - role_name="slave_public", - service_account_name="dcos_marathon" + linux_user=user, role_name="slave_public", service_account_name="dcos_marathon" ) log.info(f"Adding user permissions for {service_account}. User: {user}, role: {role}") sdk_security.grant_permissions( - linux_user=user, - role_name=role, - service_account_name=service_account + linux_user=user, role_name=role, service_account_name=service_account ) def revoke_user_permissions(user, role="*", service_account=SPARK_SERVICE_ACCOUNT): log.info(f"Revoking user permissions for Marathon. User: {user}") sdk_security.grant_permissions( - linux_user=user, - role_name="slave_public", - service_account_name="dcos_marathon" + linux_user=user, role_name="slave_public", service_account_name="dcos_marathon" ) log.info(f"Revoking user permissions for {service_account}. User: {user}, role: {role}") sdk_security.revoke_permissions( - linux_user=user, - role_name=role, - service_account_name=service_account + linux_user=user, role_name=role, service_account_name=service_account ) def _escape_service_name(service_name): app_id = "/{}".format(service_name.lstrip("/")) # double-encoded (why?) 
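    # (Editor's note, hedged: the double quoting is presumably needed because the
    # app id ends up inside a "dcos:mesos:master:task:app_id:<id>" ACL, where "/"
    # is the ACL path separator and must arrive pre-encoded; quoting twice turns
    # "/dev/spark" into "%252Fdev%252Fspark", which matches the
    # "slave_public%252F..." role strings used elsewhere in this patch.)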
- return urllib.parse.quote( - urllib.parse.quote(app_id, safe=''), - safe='' - ) + return urllib.parse.quote(urllib.parse.quote(app_id, safe=""), safe="") def grant_launch_task_permission(service_name, service_account_name=SPARK_SERVICE_ACCOUNT): - log.info(f"Granting launch task permission to service account: {service_account_name}, service: {service_name}") + log.info( + f"Granting launch task permission to service account: {service_account_name}, service: {service_name}" + ) app_id = _escape_service_name(service_name) - sdk_security._grant(service_account_name, - "dcos:mesos:master:task:app_id:{}".format(app_id), - description="Spark drivers may execute Mesos tasks", - action="create") + sdk_security._grant( + service_account_name, + "dcos:mesos:master:task:app_id:{}".format(app_id), + description="Spark drivers may execute Mesos tasks", + action="create", + ) def revoke_launch_task_permission(service_name, service_account_name=SPARK_SERVICE_ACCOUNT): - log.info(f"Revoking launch task permission to service account: {service_account_name}, service: {service_name}") + log.info( + f"Revoking launch task permission to service account: {service_account_name}, service: {service_name}" + ) app_id = _escape_service_name(service_name) - sdk_security._revoke(service_account_name, - "dcos:mesos:master:task:app_id:{}".format(app_id), - description="Spark drivers may execute Mesos tasks", - action="create") + sdk_security._revoke( + service_account_name, + "dcos:mesos:master:task:app_id:{}".format(app_id), + description="Spark drivers may execute Mesos tasks", + action="create", + ) -def spark_security_session(users=[SPARK_USER], service_names=[SPARK_SERVICE_NAME, FOLDERED_SPARK_SERVICE_NAME]): - ''' +def spark_security_session( + users=[SPARK_USER], service_names=[SPARK_SERVICE_NAME, FOLDERED_SPARK_SERVICE_NAME] +): + """ Spark strict mode setup is slightly different from dcos-commons, so can't use sdk_security::security_session. Differences: (1) the role is "*", (2) the driver itself is a framework and needs permission to execute tasks. 
- ''' - role = '*' + """ + role = "*" service_account = SPARK_SERVICE_ACCOUNT secret = SPARK_SERVICE_ACCOUNT_SECRET def setup_security(): - log.info('Setting up strict-mode security for Spark') - sdk_security.create_service_account(service_account_name=service_account, service_account_secret=secret) + log.info("Setting up strict-mode security for Spark") + sdk_security.create_service_account( + service_account_name=service_account, service_account_secret=secret + ) for user in users: grant_user_permissions(user, role, service_account) @@ -370,10 +372,10 @@ def setup_security(): for service_name in service_names: grant_launch_task_permission(service_name) - log.info('Finished setting up strict-mode security for Spark') + log.info("Finished setting up strict-mode security for Spark") def cleanup_security(): - log.info('Cleaning up strict-mode security for Spark') + log.info("Cleaning up strict-mode security for Spark") for user in users: revoke_user_permissions(user, role, service_account) @@ -381,7 +383,7 @@ def cleanup_security(): # TODO: improve security setup/teardown to make it more fine-grained (allow different service names/accts/users) # tracking issue: https://jira.mesosphere.com/browse/DCOS-50933 sdk_security.delete_service_account(service_account, secret) - log.info('Finished cleaning up strict-mode security for Spark') + log.info("Finished cleaning up strict-mode security for Spark") try: if not sdk_utils.is_open_dcos(): @@ -393,33 +395,3 @@ def cleanup_security(): finally: if sdk_utils.is_strict_mode(): cleanup_security() - - -def restart_task_agent_and_verify_state(host_ip, task, expected_state): - dcos_utils.agent_ssh(host_ip, stop_agent_cmd) - _check_agent_status(host_ip, "inactive") - dcos_utils.agent_ssh(host_ip, start_agent_cmd) - _check_agent_status(host_ip, "active") - _wait_for_task_status(task["id"], expected_state) - - -@retrying.retry( - wait_fixed=5000, - stop_max_delay=120 * 1000, - retry_on_result=lambda res: not res) -def _check_agent_status(host_ip, expected_status): - status = dcos_utils.agent_ssh(host_ip, check_agent_cmd) - log.info(f"Checking status of agent at host {host_ip}, expected: {expected_status}, actual: {status}") - return expected_status == status - - -@retrying.retry( - wait_fixed=5000, - stop_max_delay=120 * 1000, - retry_on_result=lambda res: not res) -def _wait_for_task_status(task_id, expected_state): - completed = expected_state != "TASK_RUNNING" - task = shakedown.get_task(task_id, completed=completed) - assert task is not None - log.info(f"Checking task state for '{task_id}', expected: {expected_state}, actual: {task['state']}") - return expected_state == task["state"] diff --git a/testing/sdk_install.py b/testing/sdk_install.py index b591f248..f6cb043c 100644 --- a/testing/sdk_install.py +++ b/testing/sdk_install.py @@ -1,10 +1,10 @@ -'''Utilities relating to installing services +"""Utilities relating to installing services ************************************************************************ FOR THE TIME BEING WHATEVER MODIFICATIONS ARE APPLIED TO THIS FILE SHOULD ALSO BE APPLIED TO sdk_install IN ANY OTHER PARTNER REPOS ************************************************************************ -''' +""" import collections import logging import time @@ -27,86 +27,98 @@ TIMEOUT_SECONDS = 15 * 60 -'''List of services which are currently installed via install(). -Used by post-test diagnostics to retrieve stuff from currently running services.''' +"""List of services which are currently installed via install(). 
+Used by post-test diagnostics to retrieve stuff from currently running services.""" _installed_service_names = set([]) def get_installed_service_names() -> set: - '''Returns the a set of service names which had been installed via sdk_install in this session.''' + """Returns the a set of service names which had been installed via sdk_install in this session.""" return _installed_service_names -@retrying.retry(stop_max_attempt_number=3, - retry_on_exception=lambda e: isinstance(e, dcos.errors.DCOSException)) +@retrying.retry( + stop_max_attempt_number=3, retry_on_exception=lambda e: isinstance(e, dcos.errors.DCOSException) +) def _retried_install_impl( - package_name, - service_name, - expected_running_tasks, - options={}, - package_version=None, - timeout_seconds=TIMEOUT_SECONDS, - install_cli=True): - '''Cleaned up version of shakedown's package_install().''' + package_name, + service_name, + expected_running_tasks, + options={}, + package_version=None, + timeout_seconds=TIMEOUT_SECONDS, + install_cli=True, +): + """Cleaned up version of shakedown's package_install().""" package_manager = dcos.packagemanager.PackageManager(dcos.cosmos.get_cosmos_url()) pkg = package_manager.get_package_version(package_name, package_version) if package_version is None: # Get the resolved version for logging below - package_version = 'auto:{}'.format(pkg.version()) + package_version = "auto:{}".format(pkg.version()) - log.info('Installing package={} service={} with options={} version={}'.format( - package_name, service_name, options, package_version)) + log.info( + "Installing package={} service={} with options={} version={}".format( + package_name, service_name, options, package_version + ) + ) # Trigger package install, but only if it's not already installed. # We expect upstream to have confirmed that it wasn't already installed beforehand. if sdk_marathon.app_exists(service_name): - log.info('Marathon app={} exists, skipping package install call'.format(service_name)) + log.info("Marathon app={} exists, skipping package install call".format(service_name)) else: package_manager.install_app(pkg, options) # Install CLI while package starts to install if install_cli and pkg.cli_definition(): - log.info('Installing CLI for package={}'.format(package_name)) + log.info("Installing CLI for package={}".format(package_name)) dcos.subcommand.install(pkg) # Wait for expected tasks to come up if expected_running_tasks > 0: + if service_name[0] != "/": + service_name = "/" + service_name shakedown.wait_for_service_tasks_running( - service_name, expected_running_tasks, timeout_seconds) + service_name, expected_running_tasks, timeout_seconds + ) # Wait for completed marathon deployment - app_id = pkg.marathon_json(options).get('id') + app_id = pkg.marathon_json(options).get("id") shakedown.deployment_wait(timeout_seconds, app_id) def install( - package_name, - service_name, - expected_running_tasks, - additional_options={}, - package_version=None, - timeout_seconds=TIMEOUT_SECONDS, - wait_for_deployment=True, - insert_strict_options=True, - install_cli=True): + package_name, + service_name, + expected_running_tasks, + additional_options={}, + package_version=None, + timeout_seconds=TIMEOUT_SECONDS, + wait_for_deployment=True, + insert_strict_options=True, + install_cli=True, +): start = time.time() # If the package is already installed at this point, fail immediately. 
if sdk_marathon.app_exists(service_name): - raise dcos.errors.DCOSException('Service is already installed: {}'.format(service_name)) + raise dcos.errors.DCOSException("Service is already installed: {}".format(service_name)) if insert_strict_options and sdk_utils.is_strict_mode(): # strict mode requires correct principal and secret to perform install. # see also: sdk_security.py - options = merge_dictionaries({ - 'service': { - 'service_account': 'service-acct', - 'principal': 'service-acct', - 'service_account_secret': 'secret', - 'secret_name': 'secret' - } - }, additional_options) + options = merge_dictionaries( + { + "service": { + "service_account": "service-acct", + "principal": "service-acct", + "service_account_secret": "secret", + "secret_name": "secret", + } + }, + additional_options, + ) else: options = additional_options @@ -118,7 +130,8 @@ def install( options, package_version, timeout_seconds, - install_cli) + install_cli, + ) # 2. Wait for the scheduler to be idle (as implied by deploy plan completion and suppressed bit) # This should be skipped ONLY when it's known that the scheduler will be stuck in an incomplete @@ -126,12 +139,18 @@ def install( if wait_for_deployment: # this can take a while, default is 15 minutes. for example with HDFS, we can hit the expected # total task count via FINISHED tasks, without actually completing deployment - log.info('Waiting for package={} service={} to finish deployment plan...'.format( - package_name, service_name)) + log.info( + "Waiting for package={} service={} to finish deployment plan...".format( + package_name, service_name + ) + ) sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds) - log.info('Installed package={} service={} after {}'.format( - package_name, service_name, shakedown.pretty_duration(time.time() - start))) + log.info( + "Installed package={} service={} after {}".format( + package_name, service_name, shakedown.pretty_duration(time.time() - start) + ) + ) global _installed_service_names _installed_service_names.add(service_name) @@ -139,46 +158,52 @@ def install( def run_janitor(service_name, role, service_account, znode): if role is None: - role = sdk_utils.get_deslashed_service_name(service_name) + '-role' + role = sdk_utils.get_deslashed_service_name(service_name) + "-role" if service_account is None: - service_account = service_name + '-principal' + service_account = service_name + "-principal" if znode is None: znode = sdk_utils.get_zk_path(service_name) - auth_token = sdk_cmd.run_cli('config show core.dcos_acs_token', print_output=False).strip() - exhibitor_url = sdk_cmd.run_cli('config show core.dcos_url', print_output=False).strip() + '/exhibitor/' - - cmd_list = ["sudo", "docker", "run", "mesosphere/janitor", "/janitor.py", - "-r", role, - "-p", service_account, - "-z", znode, - "--auth_token={}".format(auth_token), - "-e", exhibitor_url] + auth_token = sdk_cmd.run_cli("config show core.dcos_acs_token", print_output=False).strip() + + cmd_list = [ + "sudo", + "docker", + "run", + "mesosphere/janitor", + "/janitor.py", + "-r", + role, + "-p", + service_account, + "-z", + znode, + "--auth_token={}".format(auth_token), + ] cmd = " ".join(cmd_list) sdk_cmd.master_ssh(cmd) -@retrying.retry(stop_max_attempt_number=5, - wait_fixed=5000, - retry_on_exception=lambda e: isinstance(e, Exception)) +@retrying.retry( + stop_max_attempt_number=5, + wait_fixed=5000, + retry_on_exception=lambda e: isinstance(e, Exception), +) def retried_run_janitor(*args, **kwargs): run_janitor(*args, **kwargs) 
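Editor's note: run_janitor's wrapper above and retried_uninstall_package_and_wait below share one shape: a thin wrapper decorated with retrying.retry, so call sites keep a plain function call while transient failures are retried. A minimal sketch of the two retry flavours this patch relies on (retry-on-exception for side effects, retry-on-result for polling, as in wait_for_running_job_output); flaky_cleanup and is_ready are made-up placeholders:

    import retrying

    def flaky_cleanup() -> None:
        """Placeholder for a side-effecting call that may raise transiently."""

    def is_ready() -> bool:
        """Placeholder for a cheap status probe."""
        return True

    # Retry a side effect while it keeps raising (the retried_run_janitor settings).
    @retrying.retry(
        stop_max_attempt_number=5,
        wait_fixed=5000,
        retry_on_exception=lambda e: isinstance(e, Exception),
    )
    def retried_flaky_cleanup() -> None:
        flaky_cleanup()

    # Poll a condition until it returns truthy (the wait_for_running_job_output settings).
    @retrying.retry(
        wait_fixed=5000,
        stop_max_delay=600 * 1000,
        retry_on_result=lambda res: not res,
    )
    def wait_until_ready() -> bool:
        return is_ready()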
-@retrying.retry(stop_max_attempt_number=5, - wait_fixed=5000, - retry_on_exception=lambda e: isinstance(e, Exception)) +@retrying.retry( + stop_max_attempt_number=5, + wait_fixed=5000, + retry_on_exception=lambda e: isinstance(e, Exception), +) def retried_uninstall_package_and_wait(*args, **kwargs): shakedown.uninstall_package_and_wait(*args, **kwargs) -def uninstall( - package_name, - service_name, - role=None, - service_account=None, - zk=None): +def uninstall(package_name, service_name, role=None, service_account=None, zk=None): start = time.time() global _installed_service_names @@ -187,55 +212,57 @@ def uninstall( except KeyError: pass # allow tests to 'uninstall' up-front - log.info('Uninstalling {}'.format(service_name)) + log.info("Uninstalling {}".format(service_name)) try: retried_uninstall_package_and_wait(package_name, service_name=service_name) except Exception: - log.info('Got exception when uninstalling {}'.format(service_name)) + log.info("Got exception when uninstalling {}".format(service_name)) log.info(traceback.format_exc()) raise finally: - log.info('Reserved resources post uninstall:') + log.info("Reserved resources post uninstall:") sdk_utils.list_reserved_resources() cleanup_start = time.time() try: - if sdk_utils.dcos_version_less_than('1.10'): - log.info('Janitoring {}'.format(service_name)) + if sdk_utils.dcos_version_less_than("1.10"): + log.info("Janitoring {}".format(service_name)) retried_run_janitor(service_name, role, service_account, zk) else: - log.info('Waiting for Marathon app to be removed {}'.format(service_name)) + log.info("Waiting for Marathon app to be removed {}".format(service_name)) sdk_marathon.retried_wait_for_deployment_and_app_removal( - sdk_marathon.get_app_id(service_name), timeout=TIMEOUT_SECONDS) + sdk_marathon.get_app_id(service_name), timeout=TIMEOUT_SECONDS + ) except Exception: - log.info('Got exception when cleaning up {}'.format(service_name)) + log.info("Got exception when cleaning up {}".format(service_name)) log.info(traceback.format_exc()) raise finally: - log.info('Reserved resources post cleanup:') + log.info("Reserved resources post cleanup:") sdk_utils.list_reserved_resources() finish = time.time() log.info( - 'Uninstalled {} after pkg({}) + cleanup({}) = total({})'.format( + "Uninstalled {} after pkg({}) + cleanup({}) = total({})".format( service_name, shakedown.pretty_duration(cleanup_start - start), shakedown.pretty_duration(finish - cleanup_start), - shakedown.pretty_duration(finish - start))) + shakedown.pretty_duration(finish - start), + ) + ) def merge_dictionaries(dict1, dict2): - if (not isinstance(dict2, dict)): + if not isinstance(dict2, dict): return dict1 ret = {} for k, v in dict1.items(): ret[k] = v for k, v in dict2.items(): - if (k in dict1 and isinstance(dict1[k], dict) - and isinstance(dict2[k], collections.Mapping)): + if k in dict1 and isinstance(dict1[k], dict) and isinstance(dict2[k], collections.Mapping): ret[k] = merge_dictionaries(dict1[k], dict2[k]) else: ret[k] = dict2[k] diff --git a/testing/sdk_marathon.py b/testing/sdk_marathon.py index a4fb65ff..551a4409 100644 --- a/testing/sdk_marathon.py +++ b/testing/sdk_marathon.py @@ -89,7 +89,9 @@ def wait_for_response(): def is_app_running(app: dict) -> bool: - return app['tasksStaged'] == 0 and app['tasksUnhealthy'] == 0 and app['tasksRunning'] > 0 + return ('tasksStaged' not in app or app['tasksStaged'] == 0) and \ + ('tasksUnhealthy' not in app or app['tasksUnhealthy'] == 0) and \ + ('tasksRunning' in app and app['tasksRunning'] > 0) def 
wait_for_deployment_and_app_running(app_name: str, timeout: int): diff --git a/testing/sdk_security.py b/testing/sdk_security.py index b1176218..dbca6328 100644 --- a/testing/sdk_security.py +++ b/testing/sdk_security.py @@ -1,9 +1,9 @@ -''' +""" ************************************************************************ FOR THE TIME BEING WHATEVER MODIFICATIONS ARE APPLIED TO THIS FILE SHOULD ALSO BE APPLIED TO sdk_security IN ANY OTHER PARTNER REPOS ************************************************************************ -''' +""" import logging import os import tempfile @@ -31,9 +31,9 @@ def install_enterprise_cli(force=False): cmd = "package install --yes --cli dcos-enterprise-cli" - @retrying.retry(stop_max_attempt_number=3, - wait_fixed=2000, - retry_on_result=lambda result: result) + @retrying.retry( + stop_max_attempt_number=3, wait_fixed=2000, retry_on_result=lambda result: result + ) def _install_impl(): rc, stdout, stderr = sdk_cmd.run_raw_cli(cmd) if rc: @@ -47,103 +47,149 @@ def _install_impl(): raise RuntimeError("Failed to install the dcos-enterprise-cli: {}".format(repr(e))) -def _grant(user: str, acl: str, description: str, action: str="create") -> None: - log.info('Granting permission to {user} for {acl}/{action} ({description})'.format( - user=user, acl=acl, action=action, description=description)) +def _grant(user: str, acl: str, description: str, action: str = "create") -> None: + log.info( + "Granting permission to {user} for {acl}/{action} ({description})".format( + user=user, acl=acl, action=action, description=description + ) + ) # Create the ACL r = sdk_cmd.cluster_request( - 'PUT', '/acs/api/v1/acls/{acl}'.format(acl=acl), + "PUT", + "/acs/api/v1/acls/{acl}".format(acl=acl), raise_on_error=False, - json={'description': description}) + json={"description": description}, + ) # 201=created, 409=already exists - assert r.status_code in [201, 409, ], '{} failed {}: {}'.format(r.url, r.status_code, r.text) + assert r.status_code in [201, 409], "{} failed {}: {}".format(r.url, r.status_code, r.text) # Assign the user to the ACL r = sdk_cmd.cluster_request( - 'PUT', '/acs/api/v1/acls/{acl}/users/{user}/{action}'.format(acl=acl, user=user, action=action), - raise_on_error=False) + "PUT", + "/acs/api/v1/acls/{acl}/users/{user}/{action}".format(acl=acl, user=user, action=action), + raise_on_error=False, + ) # 204=success, 409=already exists - assert r.status_code in [204, 409, ], '{} failed {}: {}'.format(r.url, r.status_code, r.text) + assert r.status_code in [204, 409], "{} failed {}: {}".format(r.url, r.status_code, r.text) -def _revoke(user: str, acl: str, description: str, action: str="create") -> None: +def _revoke(user: str, acl: str, description: str, action: str = "create") -> None: # TODO(kwood): INFINITY-2065 - implement security cleanup log.info("Want to delete {user}+{acl}".format(user=user, acl=acl)) -def get_permissions(service_account_name: str, role: str, linux_user: str) -> typing.List[dict]: +def get_role_permissions(service_account_name: str, role: str) -> typing.List[dict]: return [ # registration permissions { - 'user': service_account_name, - 'acl': "dcos:mesos:master:framework:role:{}".format(role), - 'description': "Service {} may register with the Mesos master with role={}".format( - service_account_name, role), + "user": service_account_name, + "acl": "dcos:mesos:master:framework:role:{}".format(role), + "description": "Service {} may register with the Mesos master with role={}".format( + service_account_name, role + ), + }, + # resource permissions 
+ { + "user": service_account_name, + "acl": "dcos:mesos:master:reservation:role:{}".format(role), + "description": "Service {} may reserve Mesos resources with role={}".format( + service_account_name, role + ), + }, + # volume permissions + { + "user": service_account_name, + "acl": "dcos:mesos:master:volume:role:{}".format(role), + "description": "Service {} may create Mesos volumes with role={}".format( + service_account_name, role + ), }, + ] + +def get_permissions(service_account_name: str, linux_user: str) -> typing.List[dict]: + return [ # task execution permissions { - 'user': service_account_name, - 'acl': "dcos:mesos:master:task:user:{}".format(linux_user), - 'description': "Service {} may execute Mesos tasks as user={}".format( - service_account_name, linux_user) + "user": service_account_name, + "acl": "dcos:mesos:master:task:user:{}".format(linux_user), + "description": "Service {} may execute Mesos tasks as user={}".format( + service_account_name, linux_user + ), }, - # XXX 1.10 currently requires this mesos:agent permission as well as # mesos:task permission. unclear if this will be ongoing requirement. # See DCOS-15682 { - 'user': service_account_name, - 'acl': "dcos:mesos:agent:task:user:{}".format(linux_user), - 'description': "Service {} may execute Mesos tasks as user={}".format( - service_account_name, linux_user) + "user": service_account_name, + "acl": "dcos:mesos:agent:task:user:{}".format(linux_user), + "description": "Service {} may execute Mesos tasks as user={}".format( + service_account_name, linux_user + ), }, - # resource permissions { - 'user': service_account_name, - 'acl': "dcos:mesos:master:reservation:role:{}".format(role), - 'description': "Service {} may reserve Mesos resources with role={}".format( - service_account_name, role) - }, - { - 'user': service_account_name, - 'acl': "dcos:mesos:master:reservation:principal:{}".format(service_account_name), - 'description': "Service {} may reserve Mesos resources with principal={}".format( - service_account_name, service_account_name), - 'action': "delete", + "user": service_account_name, + "acl": "dcos:mesos:master:reservation:principal:{}".format(service_account_name), + "description": "Service {} may reserve Mesos resources with principal={}".format( + service_account_name, service_account_name + ), + "action": "delete", }, - # volume permissions { - 'user': service_account_name, - 'acl': "dcos:mesos:master:volume:role:{}".format(role), - 'description': "Service {} may create Mesos volumes with role={}".format( - service_account_name, role) + "user": service_account_name, + "acl": "dcos:mesos:master:volume:principal:{}".format(service_account_name), + "description": "Service {} may create Mesos volumes with principal={}".format( + service_account_name, service_account_name + ), + "action": "delete", }, - { - 'user': service_account_name, - 'acl': "dcos:mesos:master:volume:principal:{}".format(service_account_name), - 'description': "Service {} may create Mesos volumes with principal={}".format( - service_account_name, service_account_name), - 'action': "delete", - }] + ] -def grant_permissions(linux_user: str, role_name: str, service_account_name: str) -> None: +def grant_permissions( + linux_user: str, + role_name: str = "", + role_list: typing.List[str] = [], + service_account_name: str = "", +) -> None: log.info("Granting permissions to {account}".format(account=service_account_name)) - permissions = get_permissions(service_account_name, role_name, linux_user) + + permissions = 
get_permissions(service_account_name, linux_user) + + if not role_list: + role_list = [role_name] + + for role in role_list: + permissions += get_role_permissions(service_account_name, role) + for permission in permissions: _grant(**permission) + log.info("Permission setup completed for {account}".format(account=service_account_name)) -def revoke_permissions(linux_user: str, role_name: str, service_account_name: str) -> None: +def revoke_permissions( + linux_user: str, + service_account_name: str, + role_name: str = "", + role_list: typing.List[str] = [], +) -> None: log.info("Revoking permissions to {account}".format(account=service_account_name)) - permissions = get_permissions(service_account_name, role_name, linux_user) + + permissions = get_permissions(service_account_name, linux_user) + + if not role_list: + role_list = [role_name] + + for role in role_list: + permissions += get_role_permissions(service_account_name, role) + for permission in permissions: _revoke(**permission) + log.info("Permission cleanup completed for {account}".format(account=service_account_name)) @@ -153,36 +199,49 @@ def create_service_account(service_account_name: str, service_account_secret: st """ install_enterprise_cli() - log.info('Creating service account for account={account} secret={secret}'.format( - account=service_account_name, - secret=service_account_secret)) + log.info( + "Creating service account for account={account} secret={secret}".format( + account=service_account_name, secret=service_account_secret + ) + ) if service_account_secret == service_account_name: log.warning("Values for service_account_name and service_account_secret are the same.") - log.info('Remove any existing service account and/or secret') + log.info("Remove any existing service account and/or secret") delete_service_account(service_account_name, service_account_secret) with tempfile.TemporaryDirectory() as tmp_dir: private_key_file = os.path.join(tmp_dir, "private-key.pem") public_key_file = os.path.join(tmp_dir, "public-key.pem") - log.info('Create keypair') - sdk_cmd.run_cli('security org service-accounts keypair {} {}'.format(private_key_file, public_key_file)) + log.info("Create keypair") + sdk_cmd.run_cli( + "security org service-accounts keypair {} {}".format(private_key_file, public_key_file) + ) - log.info('Create service account') - sdk_cmd.run_cli('security org service-accounts create -p {public_key} ' - '-d "Service account for integration tests" "{account}"'.format(public_key=public_key_file, - account=service_account_name)) + log.info("Create service account") + sdk_cmd.run_cli( + "security org service-accounts create -p {public_key} " + '-d "Service account for integration tests" "{account}"'.format( + public_key=public_key_file, account=service_account_name + ) + ) - log.info('Create secret') + log.info("Create secret") sdk_cmd.run_cli( 'security secrets create-sa-secret --strict "{private_key}" "{account}" "{secret}"'.format( - private_key=private_key_file, account=service_account_name, secret=service_account_secret)) + private_key=private_key_file, + account=service_account_name, + secret=service_account_secret, + ) + ) - log.info('Service account created for account={account} secret={secret}'.format( - account=service_account_name, - secret=service_account_secret)) + log.info( + "Service account created for account={account} secret={secret}".format( + account=service_account_name, secret=service_account_secret + ) + ) def delete_service_account(service_account_name: str, service_account_secret: str) -> 
None: @@ -204,31 +263,43 @@ def delete_secret(secret: str) -> None: def _get_role_list(service_name: str) -> typing.List[str]: - # TODO: spark_utils uses: - # app_id_encoded = urllib.parse.quote( - # urllib.parse.quote(app_id, safe=''), - # safe='' - # ) role_basename = service_name.replace("/", "__") - return [ + + roles = [] + + # Grant roles on each group components. + path_components = service_name.split("/") + role_path = "" + for component in path_components[:-1]: + if role_path != "": + role_path += "__" + role_path += component + + roles.append(role_path) + + return roles + [ "{}-role".format(role_basename), "slave_public%252F{}-role".format(role_basename), ] -def setup_security(service_name: str, - linux_user: str="nobody", - service_account: str="service-acct", - service_account_secret: str="secret") -> dict: +def setup_security( + service_name: str, + linux_user: str = "nobody", + service_account: str = "service-acct", + service_account_secret: str = "secret", +) -> dict: - create_service_account(service_account_name=service_account, - service_account_secret=service_account_secret) + create_service_account( + service_account_name=service_account, service_account_secret=service_account_secret + ) - service_account_info = {"name": service_account, - "secret": service_account_secret, - "linux_user": linux_user, - "roles": [] - } + service_account_info = { + "name": service_account, + "secret": service_account_secret, + "linux_user": linux_user, + "roles": [], + } if not sdk_utils.is_strict_mode(): log.info("Skipping strict-mode security setup on non-strict cluster") @@ -238,20 +309,18 @@ def setup_security(service_name: str, service_account_info["roles"] = _get_role_list(service_name) - for role_name in service_account_info["roles"]: - grant_permissions( - linux_user=linux_user, - role_name=role_name, - service_account_name=service_account - ) + grant_permissions( + linux_user=linux_user, + role_list=service_account_info["roles"], + service_account_name=service_account, + ) log.info("Finished setting up strict-mode security") return service_account_info -def cleanup_security(service_name: str, - service_account_info: typing.Dict) -> None: +def cleanup_security(service_name: str, service_account_info: typing.Dict) -> None: service_account = service_account_info.get("name", "service-acct") service_account_secret = service_account_info.get("secret", "secret") @@ -262,19 +331,18 @@ def cleanup_security(service_name: str, roles = service_account_info.get("roles", _get_role_list(service_name)) linux_user = service_account_info.get("linux_user", "nobody") - for role_name in roles: - revoke_permissions( - linux_user=linux_user, - role_name=role_name, - service_account_name=service_account - ) + revoke_permissions( + linux_user=linux_user, role_list=roles, service_account_name=service_account + ) delete_service_account(service_account, service_account_secret) log.info("Finished cleaning up strict-mode security") -def security_session(framework_name: str, service_account: str="service-acct", secret: str="secret") -> None: +def security_session( + framework_name: str, service_account: str = "service-acct", secret: str = "secret" +) -> None: """Create a service account and configure permissions for strict-mode tests. 
This should generally be used as a fixture in a framework's conftest.py: @@ -286,9 +354,11 @@ def configure_security(configure_universe): try: is_strict = sdk_utils.is_strict_mode() if is_strict: - service_account_info = setup_security(service_name=framework_name, - service_account=service_account, - service_account_secret=secret) + service_account_info = setup_security( + service_name=framework_name, + service_account=service_account, + service_account_secret=secret, + ) yield finally: if is_strict: @@ -297,23 +367,31 @@ def configure_security(configure_universe): def openssl_ciphers(): return set( - check_output(['openssl', 'ciphers', - 'ALL:eNULL']).decode('utf-8').rstrip().split(':')) - - -def is_cipher_enabled(service_name: str, - task_name: str, - cipher: str, - endpoint: str, - openssl_timeout: str = '1') -> bool: - @retrying.retry(stop_max_attempt_number=3, - wait_fixed=2000, - retry_on_result=lambda result: 'Failed to enter mount namespace' in result) + check_output(["openssl", "ciphers", "ALL:eNULL"]).decode("utf-8").rstrip().split(":") + ) + + +def is_cipher_enabled( + service_name: str, task_name: str, cipher: str, endpoint: str, openssl_timeout: str = "1" +) -> bool: + @retrying.retry( + stop_max_attempt_number=3, + wait_fixed=2000, + retry_on_result=lambda result: "Failed to enter mount namespace" in result, + ) def run_openssl_command() -> str: - command = ' '.join([ - 'timeout', openssl_timeout, - 'openssl', 's_client', '-cipher', cipher, '-connect', endpoint - ]) + command = " ".join( + [ + "timeout", + openssl_timeout, + "openssl", + "s_client", + "-cipher", + cipher, + "-connect", + endpoint, + ] + ) _, output = sdk_cmd.service_task_exec(service_name, task_name, command, True) return output
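Editor's note: to make the reworked setup_security()/security_session() concrete, here is a hedged sketch of the conftest.py wiring the docstring above describes; the fixture names mirror the hunk context and the framework name "spark" is a placeholder:

    # conftest.py (sketch) -- strict-mode security for a framework's test session.
    import pytest

    import sdk_security

    @pytest.fixture
    def configure_security(configure_universe):
        # security_session() is a generator: on strict clusters it creates the
        # service account, grants permissions for every role returned by
        # _get_role_list(), yields, and cleans everything up in its finally block.
        yield from sdk_security.security_session(framework_name="spark")

With the group-aware _get_role_list() above, a service name such as dev/dispatcher-00 (made up, no leading slash) now yields the roles dev, dev__dispatcher-00-role and slave_public%252Fdev__dispatcher-00-role, so a single grant_permissions(role_list=...) call grants the framework/reservation/volume ACLs per role, while the task-user ACLs are granted once rather than once per role.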