This repository was archived by the owner on Dec 4, 2024. It is now read-only.

[DCOS-58437] Deploy workloads under role-enforced group. #550

Open · wants to merge 37 commits into master

Commits (37)
ef3c10d
Add visual separations for the script's sections.
mpereira Sep 11, 2019
70579ca
Make container name be dependent on test parameter file name.
mpereira Sep 11, 2019
9564118
Check package repos every time, create Marathon group.
mpereira Sep 11, 2019
287a9b9
Use container name for docker image tag and run directory.
mpereira Sep 12, 2019
04717bd
Automatically format some files with Black.
mpereira Oct 7, 2019
9fbca5b
Parameterize group file name, run commands in container.
mpereira Oct 7, 2019
0e856a2
Run commands in the container.
mpereira Oct 7, 2019
d9a7e96
Quote this.
mpereira Oct 7, 2019
585beac
Make deploy-dispatchers.py script support group roles.
mpereira Oct 7, 2019
c4d870f
Show CLUSTER_URL in pre-run report.
mpereira Oct 7, 2019
cd95b27
Add group_role support to streaming workload deploy script.
mpereira Oct 7, 2019
5c1c912
Make sure service name is prefixed with a slash.
mpereira Oct 7, 2019
f40351a
Add group_role support to the batch_test.py script.
mpereira Oct 7, 2019
0b25ba5
Add roles and permissions required for group role enforcement.
mpereira Oct 7, 2019
0a2f2d1
Add spark-options.json file.
mpereira Oct 7, 2019
2bd9407
Typo.
mpereira Oct 8, 2019
73b5824
Add CLI parameter description.
mpereira Oct 9, 2019
bf0fcf0
Remove failing jobs stuff.
mpereira Oct 9, 2019
0025ef4
Fix shellcheck warning.
mpereira Oct 9, 2019
50b56ef
Variable renames, shellcheck fixes, DSEngine, total cpu/mem/gpu.
mpereira Oct 9, 2019
b849f5d
GROUP_NAME should be coming with no slash prefix.
mpereira Oct 9, 2019
2f1075b
Create quota if it doesn't exist.
mpereira Oct 9, 2019
222cbe2
Install recent DC/OS CLI.
mpereira Oct 10, 2019
464a17b
Fix quoting.
mpereira Oct 10, 2019
af9129e
Should be group name here.
mpereira Oct 10, 2019
c2daf5e
Make revoke_permissions() also take role list.
mpereira Oct 10, 2019
65b3fd9
Fix indentation.
mpereira Oct 10, 2019
bec6c9f
Make DC/OS CLI binary a parameter.
mpereira Oct 10, 2019
5b34f05
Don't break out of loop, just skip to the next element.
mpereira Oct 10, 2019
3cd62ec
This was breaking.
mpereira Oct 10, 2019
b9fa2ec
Add script to list service tasks.
mpereira Oct 10, 2019
1931abb
service_options was being used before being set.
mpereira Oct 10, 2019
956200c
Fix environment variable name.
mpereira Oct 10, 2019
976a775
Install Jupyter.
alexeygorobets Oct 10, 2019
5d25c89
Improve DSEngine workload deployment script and options.
mpereira Oct 10, 2019
78904b7
rename dsengine options to dsengine-options.jso
alexeygorobets Oct 11, 2019
275da0f
rename dsengine options to dsengine-options.json
alexeygorobets Oct 11, 2019
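Net effect of the commit series, per the commit list above: the scale-test entry points (`batch_test.py`, `deploy-dispatchers.py`, and the streaming/DSEngine deploy scripts) gain a `--group-role` option so workloads can be deployed under a single role-enforced Marathon group governed by a quota, instead of per-dispatcher roles. The core decision rule from the `batch_test.py` diff below, restated as a standalone sketch (the wrapper function itself is hypothetical, not code from the PR):

```python
def pick_driver_role(group_role, dispatcher):
    # When a role-enforced group is in play, leave the driver role unset so
    # the dispatcher's enforced group role applies; otherwise keep the
    # pre-existing per-dispatcher executor role.
    return None if group_role else dispatcher["roles"]["executors"]
```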
scale-tests/batch_test.py: 149 changes (107 additions, 42 deletions)
@@ -10,6 +10,7 @@

Options:
--docker-image <img> docker image to run on executors
--group-role <group-role> root-level group to apply quotas against (e.g. '/dev') [default: None]
--max-num-dispatchers <n> maximum number of dispatchers to use from dispatchers file
--submits-per-min <n> number of jobs to submit per minute [default: 1]
--spark-cores-max <n> max executor cores per job [default: 1]
@@ -24,13 +25,15 @@
"""


from docopt import docopt
from threading import Thread

import json
import logging
import os
import random
import sys
import time
from docopt import docopt
from threading import Thread
import typing

import sdk_utils
@@ -48,9 +51,10 @@


logging.basicConfig(
format='[%(asctime)s|%(name)s|%(levelname)s]: %(message)s',
format="[%(asctime)s|%(name)s|%(levelname)s]: %(message)s",
level=logging.INFO,
stream=sys.stdout)
stream=sys.stdout,
)

log = logging.getLogger(__name__)
MONTE_CARLO_APP_URL = "https://raw.githubusercontent.com/mesosphere/spark-build/master/scale-tests/apps/monte-carlo-portfolio.py"
@@ -76,62 +80,99 @@ def _get_duration() -> int:

def _get_gpu_user_conf(args):
def _verify_required_args():
if not (args["--spark-mesos-max-gpus"] and
args["--spark-mesos-executor-gpus"] and
args["--docker-image"]):
log.error("""
if not (
args["--spark-mesos-max-gpus"]
and args["--spark-mesos-executor-gpus"]
and args["--docker-image"]
):
log.error(
"""
Missing required arguments for running gpu jobs. Please include:
--spark-mesos-max-gpus
--spark-mesos-executor-gpus
--docker-image
""")
"""
)

_verify_required_args()

# Based on testing, 20gb per GPU is needed to run the job successfully.
# This is due to memory being divvied up and allocated to each GPU device.
memory_multiplier = 20
memory = int(args["--spark-mesos-executor-gpus"]) * memory_multiplier
return ["--conf", "spark.driver.memory={}g".format(str(memory)),
"--conf", "spark.executor.memory={}g".format(str(memory)),
"--conf", "spark.mesos.gpus.max={}".format(args["--spark-mesos-max-gpus"]),
"--conf", "spark.mesos.executor.gpus={}".format(args["--spark-mesos-executor-gpus"]),
"--conf", "spark.mesos.executor.docker.image={}".format(args["--docker-image"]),
"--conf", "spark.mesos.executor.docker.forcePullImage=false"
]


def submit_job(app_url: str, app_args: str, dispatcher: typing.Dict, duration: int, config: typing.List[str]):
return [
"--conf",
"spark.driver.memory={}g".format(str(memory)),
"--conf",
"spark.executor.memory={}g".format(str(memory)),
"--conf",
"spark.mesos.gpus.max={}".format(args["--spark-mesos-max-gpus"]),
"--conf",
"spark.mesos.executor.gpus={}".format(args["--spark-mesos-executor-gpus"]),
"--conf",
"spark.mesos.executor.docker.image={}".format(args["--docker-image"]),
"--conf",
"spark.mesos.executor.docker.forcePullImage=false",
]


def submit_job(
app_url: str,
app_args: str,
dispatcher: typing.Dict,
duration: int,
config: typing.List[str],
group_role: str,
):
dispatcher_name = dispatcher["service"]["name"]
log.info("Submitting job to dispatcher: %s, with duration: %s min.", dispatcher_name, duration)

driver_role = None if group_role else dispatcher["roles"]["executors"]

spark_utils.submit_job(
service_name=dispatcher_name,
app_url=app_url,
app_args=app_args,
verbose=False,
args=config,
driver_role=dispatcher["roles"]["executors"],
driver_role=driver_role,
spark_user=dispatcher["service"]["user"] if sdk_utils.is_strict_mode() else None,
principal=dispatcher["service"]["service_account"] if sdk_utils.is_strict_mode() else None)
principal=dispatcher["service"]["service_account"] if sdk_utils.is_strict_mode() else None,
)


def submit_loop(app_url: str, submits_per_min: int, dispatchers: typing.List[typing.Dict], user_conf: typing.List[str]):
def submit_loop(
app_url: str,
submits_per_min: int,
dispatchers: typing.List[typing.Dict],
user_conf: typing.List[str],
group_role: str,
):
sec_between_submits = 60 / submits_per_min
log.info("sec_between_submits: %s", sec_between_submits)
num_dispatchers = len(dispatchers)
log.info("num_dispatchers: %s", num_dispatchers)

dispatcher_index = 0
while(True):
while True:
duration = _get_duration()

if app_url == MONTE_CARLO_APP_URL:
app_args = "100000 {}".format(str(duration * 30)) # about 30 iterations per min.
else:
app_args = "550 3" # 550 images in 3 batches

t = Thread(target=submit_job, args=(app_url, app_args, dispatchers[dispatcher_index], duration, user_conf))
t = Thread(
target=submit_job,
args=(
app_url,
app_args,
dispatchers[dispatcher_index],
duration,
user_conf,
group_role,
),
)
t.start()
dispatcher_index = (dispatcher_index + 1) % num_dispatchers
log.info("sleeping %s sec.", sec_between_submits)
@@ -151,35 +192,57 @@ def submit_loop(app_url: str, submits_per_min: int, dispatchers: typing.List[typ
if end <= len(dispatchers):
dispatchers = dispatchers[0:end]
else:
log.warning("""
log.warning(
"""
Specified --max-num-dispatchers is greater than actual dispatcher count in {}.
Using list of dispatchers from file instead.
""".format(args["<dispatcher_file>"]))

user_conf = ["--conf", "spark.cores.max={}".format(args["--spark-cores-max"]),
"--conf", "spark.executor.cores={}".format(args["--spark-executor-cores"]),
"--conf", "spark.mesos.containerizer={}".format(args["--spark-mesos-containerizer"]),
"--conf", "spark.port.maxRetries={}".format(args["--spark-port-max-retries"]),
"--conf", "spark.mesos.driver.failoverTimeout={}".format(args["--spark-mesos-driver-failover-timeout"])
]
""".format(
args["<dispatcher_file>"]
)
)

user_conf = [
"--conf",
"spark.cores.max={}".format(args["--spark-cores-max"]),
"--conf",
"spark.executor.cores={}".format(args["--spark-executor-cores"]),
"--conf",
"spark.mesos.containerizer={}".format(args["--spark-mesos-containerizer"]),
"--conf",
"spark.port.maxRetries={}".format(args["--spark-port-max-retries"]),
"--conf",
"spark.mesos.driver.failoverTimeout={}".format(
args["--spark-mesos-driver-failover-timeout"]
),
]

if args["--spark-mesos-executor-gpus"]:
user_conf += _get_gpu_user_conf(args)
MEMORY_MULTIPLIER = 20
memory = int(args["--spark-mesos-executor-gpus"]) * MEMORY_MULTIPLIER
user_conf += ["--conf", "spark.driver.memory={}g".format(str(memory)),
"--conf", "spark.executor.memory={}g".format(str(memory)),
"--conf", "spark.mesos.gpus.max={}".format(args["--spark-mesos-max-gpus"]),
"--conf", "spark.mesos.executor.gpus={}".format(args["--spark-mesos-executor-gpus"]),
"--conf", "spark.mesos.executor.docker.image={}".format(args["--docker-image"]),
"--conf", "spark.mesos.executor.docker.forcePullImage=false"
]
user_conf += [
"--conf",
"spark.driver.memory={}g".format(str(memory)),
"--conf",
"spark.executor.memory={}g".format(str(memory)),
"--conf",
"spark.mesos.gpus.max={}".format(args["--spark-mesos-max-gpus"]),
"--conf",
"spark.mesos.executor.gpus={}".format(args["--spark-mesos-executor-gpus"]),
"--conf",
"spark.mesos.executor.docker.image={}".format(args["--docker-image"]),
"--conf",
"spark.mesos.executor.docker.forcePullImage=false",
]
app_url = GPU_IMAGE_RECOGNITION_APP_URL
else:
app_url = MONTE_CARLO_APP_URL

if args["--spark-mesos-driver-labels"] is not None:
user_conf += ["--conf", "spark.mesos.driver.labels={}".format(args["--spark-mesos-driver-labels"])]
user_conf += [
"--conf",
"spark.mesos.driver.labels={}".format(args["--spark-mesos-driver-labels"]),
]

if not args["--no-supervise"]:
user_conf += ["--supervise"]
@@ -188,4 +251,6 @@ def submit_loop(app_url: str, submits_per_min: int, dispatchers: typing.List[typ
end = int(args["--max-num-dispatchers"])
dispatchers = dispatchers[0:end]

submit_loop(app_url, int(args["--submits-per-min"]), dispatchers, user_conf)
group_role = args["--group-role"]

submit_loop(app_url, int(args["--submits-per-min"]), dispatchers, user_conf, group_role)
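One docopt subtlety worth flagging against the diff above: `[default: None]` in a usage string produces the literal string `"None"`, not Python's `None`, so `args["--group-role"]` is truthy even when the flag is omitted. A defensive normalization (a hypothetical helper, not part of this PR) would look like:

```python
def normalize_group_role(raw):
    # docopt parses "[default: None]" into the string "None"; treat it and
    # the empty string as "no group role requested".
    return None if raw in (None, "", "None") else raw

group_role = normalize_group_role(args["--group-role"])
```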
scale-tests/configs/dsengine-options.json: 17 changes (17 additions, 0 deletions)
@@ -0,0 +1,17 @@
{
"service": {
"name": "data-services/jupyter",
"service_account": "data_services__jupyter",
"service_account_secret": "data_services__jupyter-secret",
"gpu": {
"enabled": true
},
"virtual_network_enabled": true,
"virtual_network_name": "dcos"
},
"spark": {
"spark_mesos_role": "data_services__jupyter",
"spark_mesos_principal": "data_services__jupyter",
"spark_mesos_secret": "data_services__jupyter-secret"
}
}
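The account, secret, and role names above all follow one pattern: the service path with dashes mapped to underscores and the path separator doubled, so `data-services/jupyter` becomes `data_services__jupyter`. A sketch of that convention; it is inferred from this file, not stated anywhere in the PR:

```python
def role_for_service(service_path: str) -> str:
    # Inferred from dsengine-options.json:
    # "data-services/jupyter" -> "data_services__jupyter".
    return service_path.strip("/").replace("-", "_").replace("/", "__")

assert role_for_service("data-services/jupyter") == "data_services__jupyter"
```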
scale-tests/configs/spark-options.json: 7 changes (7 additions, 0 deletions)
@@ -0,0 +1,7 @@
{
"service": {
"role": "data-services",
"enforce_role": true,
"virtual_network_enabled": true
}
}
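`enforce_role: true` ties the service to its top-level group's role, which is what lets a single quota on the `data-services` group govern everything deployed beneath it (compare the "Check package repos every time, create Marathon group" and "Create quota if it doesn't exist" commits). Creating the matching role-enforced Marathon group could look like the sketch below; the cluster URL, token handling, and group id are assumptions, and `enforceRole` requires a recent Marathon:

```python
import json
import urllib.request

CLUSTER_URL = "https://dcos.example.com"  # assumption: your cluster URL
ACS_TOKEN = "..."                         # assumption: a valid DC/OS auth token

# Top-level Marathon groups with enforceRole=true force every service in the
# group to run under the group's role ("data-services" here), so one Mesos
# quota on that role caps the whole group.
group = {"id": "/data-services", "enforceRole": True}

request = urllib.request.Request(
    CLUSTER_URL + "/marathon/v2/groups",
    data=json.dumps(group).encode("utf-8"),
    headers={
        "Content-Type": "application/json",
        "Authorization": "token=" + ACS_TOKEN,
    },
    method="POST",
)
urllib.request.urlopen(request)
```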