Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding support for tracing child models invoked from a BLS model #6063

Merged
merged 15 commits into from
Aug 7, 2023
Merged
101 changes: 49 additions & 52 deletions qa/L0_trace/opentelemetry_unittest.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,7 @@
import sys

sys.path.append("../common")
import json
import time
import re
import unittest

import numpy as np
Expand All @@ -41,27 +40,29 @@

class OpenTelemetryTest(tu.TestResultCollector):
def setUp(self):
while True:
with open("trace_collector.log", "rt") as f:
data = f.read()
if data.count("resource_spans") != EXPECTED_NUM_SPANS:
time.sleep(5)
continue
else:
break

data = data.split("\n")
full_spans = [
entry.split("POST")[0] for entry in data if "resource_spans" in entry
]
self.spans = []
self.resource_attributes = []
for span in full_spans:
span = json.loads(span)
self.spans.append(span["resource_spans"][0]["scope_spans"][0]["spans"][0])
self.resource_attributes.append(
span["resource_spans"][0]["resource"]["attributes"]
with open("trace_collector.log", "rt") as f:
data = f.read()
json_string = re.sub("\n\t{\n\t", "{", data)
json_string = re.sub(
"resources : \n\t", "resources : {\n\t", json_string
)
json_string = re.sub(
"\n instr-lib :", "}\n instr-lib :", json_string
)
json_string = re.sub(": \n\t", ':"",', json_string)
json_string = re.sub(": \n", ':"",', json_string)
json_string = re.sub("\n|\n\t", ",", json_string)
json_string = re.sub("\t", "", json_string)
json_string = re.sub(r"\b([\w.-]+)\b", r'"\1"', json_string)
json_string = re.sub('"span" "kind"', '"span kind"', json_string)
json_string = re.sub("{,", "{", json_string)
json_string = re.sub(",}", "}", json_string)
json_string = re.sub("}{", "},{", json_string)
json_string = re.sub(
'"events" : {', '"events" : [{', json_string
)
json_string = re.sub('}, "links"', '}], "links"', json_string)
self.spans = eval(json_string[:-1])

self.simple_model_name = "simple"
self.ensemble_model_name = "ensemble_add_sub_int32_int32_int32"
Expand Down Expand Up @@ -126,20 +127,24 @@ def _check_parent(self, child_span, parent_span):
# Check that child and parent span have the same trace_id
# and child's `parent_span_id` is the same as parent's `span_id`
self.assertEqual(child_span["trace_id"], parent_span["trace_id"])
self.assertIn(
"parent_span_id",
child_span,
self.assertNotEqual(
child_span["parent_span_id"],
"0000000000000000",
"child span does not have parent span id specified",
)
self.assertEqual(child_span["parent_span_id"], parent_span["span_id"])
self.assertEqual(
child_span["parent_span_id"],
parent_span["span_id"],
"child {} , parent {}".format(child_span, parent_span),
)

def test_spans(self):
parsed_spans = []

# Check that collected spans have proper events recorded
for span in self.spans:
span_name = span["name"]
self._check_events(span_name, json.dumps(span["events"]))
self._check_events(span_name, str(span["events"]))
parsed_spans.append(span_name)

# There should be 16 spans in total:
Expand All @@ -164,42 +169,34 @@ def test_nested_spans(self):
for child, parent in zip(self.spans[:3], self.spans[1:3]):
self._check_parent(child, parent)

# root_span should not have `parent_span_id` field
self.assertNotIn(
"parent_span_id", self.spans[2], "root span has a parent_span_id specified"
)

# Next 3 spans in `self.spans` belong to GRPC request
# Order of spans and their relationship described earlier
for child, parent in zip(self.spans[3:6], self.spans[4:6]):
self._check_parent(child, parent)

# root_span should not have `parent_span_id` field
self.assertNotIn(
"parent_span_id", self.spans[5], "root span has a parent_span_id specified"
)

# Final 4 spans in `self.spans` belong to ensemble request
# Next 4 spans in `self.spans` belong to ensemble request
# Order of spans: compute span - request span - request span - root span
for child, parent in zip(self.spans[6:10], self.spans[7:10]):
self._check_parent(child, parent)

# root_span should not have `parent_span_id` field
self.assertNotIn(
"parent_span_id", self.spans[9], "root span has a parent_span_id specified"
)
# Final 6 spans in `self.spans` belong to the bls request that invokes an ensemble
# Order of spans:
# compute span - request span (simple) - request span (ensemble) -
# compute span (for bls) - request span (bls) - root span
# The request span (ensemble) and the compute span (for bls) are both
# children of the request span (bls)
children = self.spans[10:]
parents = (self.spans[11:13], self.spans[14], self.spans[14:])
print(parents)
for child, parent in zip(children, parents[0]):
self._check_parent(child, parent)

def test_resource_attributes(self):
test_attribute_entry = "{{'key': {k}, 'value': {{'string_value': {v}}}}}"
for attribute in self.resource_attributes:
self.assertIn(
test_attribute_entry.format(k="'test.key'", v="'test.value'"),
str(attribute),
)
self.assertIn(
test_attribute_entry.format(k="'service.name'", v="'test_triton'"),
str(attribute),
)
for span in self.spans:
self.assertIn("test.key", span["resources"])
self.assertEqual("test.value", span["resources"]["test.key"])
self.assertIn("service.name", span["resources"])
self.assertEqual("test_triton", span["resources"]["service.name"])


def prepare_data(client):
Expand Down
68 changes: 51 additions & 17 deletions qa/L0_trace/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -690,13 +690,51 @@ fi
# Check opentelemetry trace exporter sends proper info.
# A helper python script starts listening on $OTLP_PORT, where
# OTLP exporter sends traces.
# Unittests then check that produced spans have expected format and events
# FIXME: Redesign this test to remove time sensitivity
TRITON_OPENTELEMETRY_TEST='false'

# Using netcat as trace collector
apt-get update && apt-get install -y netcat
timeout 2m nc -l -k 127.0.0.1 $OTLP_PORT >> trace_collector_http_exporter.log 2>&1 & COLLECTOR_PID=$!

SERVER_ARGS="--trace-config=level=TIMESTAMPS --trace-config=rate=1 \
--trace-config=count=100 --trace-config=mode=opentelemetry \
--trace-config=opentelemetry,url=localhost:$OTLP_PORT \
--model-repository=$MODELSDIR"
SERVER_LOG="./inference_server_trace_config.log"

run_server
if [ "$SERVER_PID" == "0" ]; then
echo -e "\n***\n*** Failed to start $SERVER\n***"
cat $SERVER_LOG
exit 1
fi

$SIMPLE_HTTP_CLIENT >> client_update.log 2>&1

set +e

wait $COLLECTOR_PID

set -e

kill $SERVER_PID
wait $SERVER_PID

set +e

if ! [ -s trace_collector_http_exporter.log ] && [ `grep -c 'Host: localhost:10000' trace_collector_http_exporter.log` != 3 ] ; then
echo -e "\n***\n*** HTTP exporter test failed.\n***"
exit 1
fi


# Unittests then check that produced spans have expected format and events
OPENTELEMETRY_TEST=opentelemetry_unittest.py
OPENTELEMETRY_LOG="opentelemetry_unittest.log"
EXPECTED_NUM_TESTS="3"

TRITON_OPENTELEMETRY_TEST='true'

SERVER_ARGS="--trace-config=level=TIMESTAMPS --trace-config=rate=1 \
--trace-config=count=100 --trace-config=mode=opentelemetry \
--trace-config=opentelemetry,url=localhost:$OTLP_PORT \
Expand All @@ -705,20 +743,13 @@ SERVER_ARGS="--trace-config=level=TIMESTAMPS --trace-config=rate=1 \
--model-repository=$MODELSDIR"
SERVER_LOG="./inference_server_trace_config.log"

export OTEL_EXPORTER_OTLP_TIMEOUT=5
export OTEL_EXPORTER_OTLP_TRACES_TIMEOUT=5

run_server
if [ "$SERVER_PID" == "0" ]; then
echo -e "\n***\n*** Failed to start $SERVER\n***"
cat $SERVER_LOG
exit 1
fi

# Using netcat as trace collector
apt-get update && apt-get install -y netcat
nc -l -k 127.0.0.1 $OTLP_PORT >> $TRACE_COLLECTOR_LOG 2>&1 & COLLECTOR_PID=$!

set +e
# Preparing traces for unittest.
# Note: need to run this separately, to speed up trace collection.
Expand All @@ -727,6 +758,17 @@ set +e
python -c 'import opentelemetry_unittest; \
opentelemetry_unittest.prepare_traces()' >>$CLIENT_LOG 2>&1

sleep 5

set -e

kill $SERVER_PID
wait $SERVER_PID

set +e

grep -z -o -P '({\n(?s).*}\n)' inference_server_trace_config.log >> trace_collector.log

# Unittest will not start until expected number of spans is collected.
python $OPENTELEMETRY_TEST >>$OPENTELEMETRY_LOG 2>&1
if [ $? -ne 0 ]; then
Expand All @@ -741,12 +783,4 @@ else
fi
fi

kill $COLLECTOR_PID
wait $COLLECTOR_PID

set -e

kill $SERVER_PID
wait $SERVER_PID

exit $RET
8 changes: 8 additions & 0 deletions src/tracer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
#include <cuda_runtime_api.h>
#endif // TRITON_ENABLE_GPU
#ifndef _WIN32
#include "opentelemetry/exporters/ostream/span_exporter_factory.h"
#include "opentelemetry/exporters/otlp/otlp_http_exporter_factory.h"
#include "opentelemetry/sdk/resource/semantic_conventions.h"
namespace otlp = opentelemetry::exporter::otlp;
namespace otel_trace_sdk = opentelemetry::sdk::trace;
Expand Down Expand Up @@ -380,6 +382,12 @@ TraceManager::InitTracer(const triton::server::TraceConfigMap& config_map)
}
}
auto exporter = otlp::OtlpHttpExporterFactory::Create(opts);
auto test_exporter = triton::server::GetEnvironmentVariableOrDefault(
"TRITON_OPENTELEMETRY_TEST", "false");
if (test_exporter != "false") {
exporter =
opentelemetry::exporter::trace::OStreamSpanExporterFactory::Create();
}
auto processor =
otel_trace_sdk::SimpleSpanProcessorFactory::Create(std::move(exporter));
auto resource = otel_resource::Resource::Create(attributes);
Expand Down
2 changes: 0 additions & 2 deletions src/tracer.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,13 @@
#include <unordered_map>

#if !defined(_WIN32) && defined(TRITON_ENABLE_TRACING)
#include "opentelemetry/exporters/otlp/otlp_http_exporter_factory.h"
#include "opentelemetry/nostd/shared_ptr.h"
#include "opentelemetry/sdk/resource/resource.h"
#include "opentelemetry/sdk/trace/processor.h"
#include "opentelemetry/sdk/trace/simple_processor_factory.h"
#include "opentelemetry/sdk/trace/tracer_provider_factory.h"
#include "opentelemetry/trace/context.h"
#include "opentelemetry/trace/provider.h"
namespace otlp = opentelemetry::exporter::otlp;
namespace otel_trace_sdk = opentelemetry::sdk::trace;
namespace otel_trace_api = opentelemetry::trace;
#endif
Expand Down