Feat/byo-xgboost #51

Open · wants to merge 6 commits into master
3 changes: 3 additions & 0 deletions containers/docker-mlops/.Dockerignore
@@ -0,0 +1,3 @@
Dockerfile
Dockerimage
README.md
27 changes: 27 additions & 0 deletions containers/docker-mlops/Dockerfile
@@ -0,0 +1,27 @@
FROM python:3.6-slim-stretch


RUN apt-get -y update && apt-get install -y --no-install-recommends \
    nginx \
    ca-certificates \
    g++ \
    git \
    && rm -rf /var/lib/apt/lists/*

# Set some environment variables. PYTHONUNBUFFERED keeps Python from buffering our standard
# output stream, which means that logs can be delivered to the user quickly. PYTHONDONTWRITEBYTECODE
# keeps Python from writing the .pyc files which are unnecessary in this case. We also update
# PATH so that the train and serve programs are found when the container is invoked.

ENV PYTHONUNBUFFERED=TRUE
ENV PYTHONDONTWRITEBYTECODE=TRUE
ENV PATH="/opt/program:${PATH}"

# Install the Python packages needed at serving time.
RUN pip install flask gevent gunicorn future
RUN pip install numpy==1.17.3 pandas==0.25.2 scipy==1.3.1 scikit-learn==0.22.2 xgboost==1.0.2 shap==0.35.0 && rm -rf /root/.cache
RUN apt-get -y purge --auto-remove git

# Set up the program in the image
COPY xgboost /opt/program
WORKDIR /opt/program
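
Because /opt/program is on the PATH, SageMaker can start the container with the bare command serve. A minimal local smoke test of the serving stack, assuming the image has been built with the illustrative name byo-xgboost and that a local ./model directory containing xgboost-model.pkl is mounted where SageMaker would place the model:

import os
import subprocess
import time
import urllib.request

model_dir = os.path.abspath("model")  # assumed to contain xgboost-model.pkl

# Start the container the way SageMaker would, mounting a local model directory.
server = subprocess.Popen([
    "docker", "run", "--rm", "-p", "8080:8080",
    "-v", f"{model_dir}:/opt/ml/model",
    "byo-xgboost", "serve",  # image name is illustrative
])

time.sleep(5)  # give nginx and gunicorn a moment to come up

# SageMaker's health check: GET /ping should return 200 once the model loads.
with urllib.request.urlopen("http://localhost:8080/ping") as resp:
    print("ping status:", resp.status)

server.terminate()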
53 changes: 53 additions & 0 deletions containers/docker-mlops/build_and_push.sh
@@ -0,0 +1,53 @@
#!/usr/bin/env bash

# This script shows how to build the Docker image and push it to ECR to be ready for use
# by SageMaker.

# The argument to this script is the image name. This will be used as the image on the local
# machine and combined with the account and region to form the repository name for ECR.
image=$1

if [ "$image" == "" ]
then
echo "Usage: $0 <image-name>"
exit 1
fi

chmod +x xgboost/train
chmod +x xgboost/serve

# Get the account number associated with the current IAM credentials
account=$(aws sts get-caller-identity --query Account --output text)

if [ $? -ne 0 ]
then
    exit 255
fi


# Get the region defined in the current configuration (default to us-west-2 if none defined)
region=$(aws configure get region)
region=${region:-us-west-2}


fullname="${account}.dkr.ecr.${region}.amazonaws.com/${image}:latest"

# If the repository doesn't exist in ECR, create it.

aws ecr describe-repositories --repository-names "${image}" > /dev/null 2>&1

if [ $? -ne 0 ]
then
    aws ecr create-repository --repository-name "${image}" > /dev/null
fi

# Get the login command from ECR and execute it directly. Note that `aws ecr get-login`
# exists only in AWS CLI v1; with CLI v2 the equivalent is
# `aws ecr get-login-password --region ${region} | docker login --username AWS --password-stdin ${account}.dkr.ecr.${region}.amazonaws.com`.
$(aws ecr get-login --region ${region} --no-include-email)

# Build the docker image locally with the image name and then push it to ECR
# with the full name.

docker build -t ${image} .
docker tag ${image} ${fullname}

docker push ${fullname}
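
The script takes a single argument, the image name, which becomes both the local Docker tag and the ECR repository name. An example invocation (the image name is illustrative): ./build_and_push.sh byo-xgboost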
56 changes: 56 additions & 0 deletions containers/docker-mlops/requirements.txt
@@ -0,0 +1,56 @@
awscli
bcrypt==3.1.7
boto3
botocore
cached-property==1.5.1
certifi==2019.9.11
cffi==1.13.1
chardet==3.0.4
click==6.7
cryptography==2.8
docker==3.7.3
docker-compose==1.24.1
docker-pycreds==0.4.0
dockerpty==0.4.1
docopt==0.6.2
docutils==0.15.2
entrypoints==0.3
fabric==2.5.0
flake8==3.7.8
Flask==1.1.1
future==0.17.1
idna==2.7
invoke==1.3.0
itsdangerous==1.1.0
Jinja2==2.10.3
jmespath==0.9.4
joblib==0.14.0
jsonschema==2.6.0
MarkupSafe==1.1.1
mccabe==0.6.1
numpy==1.17.3
pandas==0.25.2
paramiko==2.4.3
pathlib2==2.3.5
protobuf==3.10.0
protobuf3-to-dict==0.1.5
pyasn1==0.4.7
pycodestyle==2.5.0
pycparser==2.19
pyflakes==2.1.1
PyNaCl==1.3.0
python-dateutil==2.8.0
pytz==2019.3
PyYAML==3.13
requests==2.20.1
s3transfer
sagify>=0.18.0
scikit-learn==0.21.3
scipy==1.3.1
six==1.11.0
texttable==0.9.1
urllib3==1.24.3
websocket-client==0.56.0
Werkzeug==0.16.0
xgboost==1.0.2
shap==0.35.0
38 changes: 38 additions & 0 deletions containers/docker-mlops/xgboost/nginx.conf
@@ -0,0 +1,38 @@
worker_processes 1;
daemon off; # Prevent forking


pid /tmp/nginx.pid;
error_log /var/log/nginx/error.log;

events {
    # defaults
}

http {
    include /etc/nginx/mime.types;
    default_type application/octet-stream;
    access_log /var/log/nginx/access.log combined;

    upstream gunicorn {
        server unix:/tmp/gunicorn.sock;
    }

    server {
        listen 8080 deferred;
        client_max_body_size 5m;

        keepalive_timeout 5;

        location ~ ^/(ping|invocations) {
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header Host $http_host;
            proxy_redirect off;
            proxy_pass http://gunicorn;
        }

        location / {
            return 404 "{}";
        }
    }
}
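
This config makes nginx the sole listener on port 8080, the port SageMaker expects, proxying only the two paths in the SageMaker hosting contract, /ping and /invocations, to gunicorn over a unix socket and returning an empty JSON 404 for everything else.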
113 changes: 113 additions & 0 deletions containers/docker-mlops/xgboost/predictor.py
@@ -0,0 +1,113 @@
# This file implements a Flask server for inference. It is the file you would modify
# to implement the scoring for your own algorithm.

from __future__ import print_function

import os
import pickle
from io import StringIO

import flask

import pandas as pd
import xgboost  # imported so the pickled model object can be deserialized
import shap

prefix = "/opt/ml/"
model_path = os.path.join(prefix, "model")

# A singleton for holding the model. This simply loads the model and holds it.
# It has a predict function that does a prediction based on the model and the input data.


class ScoringService(object):
    model = None  # Where we keep the model when it's loaded

    @classmethod
    def get_model(cls):
        """Get the model object for this instance, loading it if it's not already loaded."""
        if cls.model is None:
            with open(os.path.join(model_path, "xgboost-model.pkl"), "rb") as inp:
                cls.model = pickle.load(inp)
        return cls.model

    @classmethod
    def predict(cls, input):
        """For the input, do the predictions and return them.

        Args:
            input (a pandas dataframe): The data on which to do the predictions. There will be
                one prediction per row in the dataframe"""
        clf = cls.get_model()
        return clf.predict_proba(input)

    @classmethod
    def shap(cls, input):
        """For the input, compute the SHAP values and return them.

        Args:
            input (a pandas dataframe): The data to explain. There will be one row of
                per-feature SHAP values per row in the dataframe"""
        clf = cls.get_model()
        # Return the SHAP values from a TreeExplainer built on the model
        explainer = shap.TreeExplainer(clf)
        return explainer.shap_values(input)


# The flask app for serving predictions
app = flask.Flask(__name__)


@app.route("/ping", methods=["GET"])
def ping():
"""Determine if the container is working and healthy. In this sample container, we declare
it healthy if we can load the model successfully."""
health = ScoringService.get_model() is not None # You can insert a health check here

status = 200 if health else 404
return flask.Response(response="\n", status=status, mimetype="application/json")


@app.route("/invocations", methods=["POST"])
def transformation():
"""Do an inference on a single batch of data. In this sample server, we take data as CSV, convert
it to a pandas data frame for internal use and then convert the predictions back to CSV (which really
just means one prediction per line, since there's a single column.
"""
data = None

# Convert from CSV to pandas
if flask.request.content_type == "text/csv":
data = flask.request.data.decode("utf-8")
s = StringIO(data)
data = pd.read_csv(s, index_col=0)
else:
return flask.Response(
response="This predictor only supports CSV data",
status=415,
mimetype="text/plain",
)

print("Invoked with {} records".format(data.shape[0]))

# Do the prediction
predictions = ScoringService.predict(data)

# Save viz data
shap_values = ScoringService.shap(data)
df_shap = pd.DataFrame(
data=shap_values, index=data.index, columns=data.columns
)

# Convert from numpy back to CSV
out = StringIO()
pd.DataFrame({"Prediction": predictions[:,1]}, index=data.index).join(df_shap).to_csv(
out, header=True, index=True
)
result = out.getvalue()

return flask.Response(response=result, status=200, mimetype="text/csv")
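
A sketch of how a client might exercise /invocations once the container is running locally on port 8080. The feature names, index values, and the use of the requests library are illustrative; the response parses back into the Prediction column plus one SHAP column per feature, matching the CSV layout produced above:

import io

import pandas as pd
import requests

# Illustrative payload: the first CSV column is the index (predictor.py reads
# with index_col=0); the remaining columns are model features.
df = pd.DataFrame(
    {"feature_a": [0.1, 0.5], "feature_b": [1.2, 3.4]},  # hypothetical features
    index=pd.Index([101, 102], name="id"),
)

resp = requests.post(
    "http://localhost:8080/invocations",
    data=df.to_csv(),
    headers={"Content-Type": "text/csv"},
)
resp.raise_for_status()

# One row per record: "Prediction" plus one SHAP column per feature.
scored = pd.read_csv(io.StringIO(resp.text), index_col=0)
print(scored)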
81 changes: 81 additions & 0 deletions containers/docker-mlops/xgboost/serve
@@ -0,0 +1,81 @@
#!/usr/bin/env python3.6

# This file implements the scoring service shell. You don't necessarily need to modify it for various
# algorithms. It starts nginx and gunicorn with the correct configurations and then simply waits until
# gunicorn exits.
#
# The flask server is specified to be the app object in wsgi.py
#
# We set the following parameters:
#
# Parameter             Environment Variable    Default Value
# ---------             --------------------    -------------
# number of workers     MODEL_SERVER_WORKERS    the number of CPU cores
# timeout               MODEL_SERVER_TIMEOUT    60 seconds

from __future__ import print_function
import multiprocessing
import os
import signal
import subprocess
import sys

cpu_count = multiprocessing.cpu_count()

model_server_timeout = os.environ.get("MODEL_SERVER_TIMEOUT", 60)
model_server_workers = int(os.environ.get("MODEL_SERVER_WORKERS", cpu_count))


def sigterm_handler(nginx_pid, gunicorn_pid):
    try:
        os.kill(nginx_pid, signal.SIGQUIT)
    except OSError:
        pass
    try:
        os.kill(gunicorn_pid, signal.SIGTERM)
    except OSError:
        pass

    sys.exit(0)


def start_server():
    print("Starting the inference server with {} workers.".format(model_server_workers))

    # link the log streams to stdout/err so they will be logged to the container logs
    subprocess.check_call(["ln", "-sf", "/dev/stdout", "/var/log/nginx/access.log"])
    subprocess.check_call(["ln", "-sf", "/dev/stderr", "/var/log/nginx/error.log"])

    nginx = subprocess.Popen(["nginx", "-c", "/opt/program/nginx.conf"])
    gunicorn = subprocess.Popen(
        [
            "gunicorn",
            "--timeout",
            str(model_server_timeout),
            "-k",
            "gevent",
            "-b",
            "unix:/tmp/gunicorn.sock",
            "-w",
            str(model_server_workers),
            "wsgi:app",
        ]
    )

    signal.signal(signal.SIGTERM, lambda a, b: sigterm_handler(nginx.pid, gunicorn.pid))

    # If either subprocess exits, so do we.
    pids = set([nginx.pid, gunicorn.pid])
    while True:
        pid, _ = os.wait()
        if pid in pids:
            break

    sigterm_handler(nginx.pid, gunicorn.pid)
    print("Inference server exiting")


# The main routine just invokes the start function.

if __name__ == "__main__":
    start_server()
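
gunicorn is pointed at wsgi:app, but wsgi.py itself is not included in this diff. A minimal sketch, assuming it does nothing more than re-export the Flask app from predictor.py:

# wsgi.py: minimal sketch, exposing the Flask app defined in predictor.py
import predictor

app = predictor.app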