forked from commaai/openpilot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstatsd.py
executable file
·181 lines (148 loc) · 5.26 KB
/
statsd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#!/usr/bin/env python3
import os
import zmq
import time
from pathlib import Path
from collections import defaultdict
from datetime import datetime, UTC
from typing import NoReturn
from openpilot.common.params import Params
from cereal.messaging import SubMaster
from openpilot.system.hardware.hw import Paths
from openpilot.common.swaglog import cloudlog
from openpilot.system.hardware import HARDWARE
from openpilot.common.file_helpers import atomic_write_in_dir
from openpilot.system.version import get_build_metadata
from openpilot.system.loggerd.config import STATS_DIR_FILE_LIMIT, STATS_SOCKET, STATS_FLUSH_TIME_S
class METRIC_TYPE:
GAUGE = 'g'
SAMPLE = 'sa'
class StatLog:
def __init__(self):
self.pid = None
self.zctx = None
self.sock = None
def connect(self) -> None:
self.zctx = zmq.Context()
self.sock = self.zctx.socket(zmq.PUSH)
self.sock.setsockopt(zmq.LINGER, 10)
self.sock.connect(STATS_SOCKET)
self.pid = os.getpid()
def __del__(self):
if self.sock is not None:
self.sock.close()
if self.zctx is not None:
self.zctx.term()
def _send(self, metric: str) -> None:
if os.getpid() != self.pid:
self.connect()
try:
self.sock.send_string(metric, zmq.NOBLOCK)
except zmq.error.Again:
# drop :/
pass
def gauge(self, name: str, value: float) -> None:
self._send(f"{name}:{value}|{METRIC_TYPE.GAUGE}")
# Samples will be recorded in a buffer and at aggregation time,
# statistical properties will be logged (mean, count, percentiles, ...)
def sample(self, name: str, value: float):
self._send(f"{name}:{value}|{METRIC_TYPE.SAMPLE}")
def main() -> NoReturn:
dongle_id = Params().get("DongleId", encoding='utf-8')
def get_influxdb_line(measurement: str, value: float | dict[str, float], timestamp: datetime, tags: dict) -> str:
res = f"{measurement}"
for k, v in tags.items():
res += f",{k}={str(v)}"
res += " "
if isinstance(value, float):
value = {'value': value}
for k, v in value.items():
res += f"{k}={v},"
res += f"dongle_id=\"{dongle_id}\" {int(timestamp.timestamp() * 1e9)}\n"
return res
# open statistics socket
ctx = zmq.Context.instance()
sock = ctx.socket(zmq.PULL)
sock.bind(STATS_SOCKET)
STATS_DIR = Paths.stats_root()
# initialize stats directory
Path(STATS_DIR).mkdir(parents=True, exist_ok=True)
build_metadata = get_build_metadata()
# initialize tags
tags = {
'started': False,
'version': build_metadata.openpilot.version,
'branch': build_metadata.channel,
'dirty': build_metadata.openpilot.is_dirty,
'origin': build_metadata.openpilot.git_normalized_origin,
'deviceType': HARDWARE.get_device_type(),
}
# subscribe to deviceState for started state
sm = SubMaster(['deviceState'])
idx = 0
last_flush_time = time.monotonic()
gauges = {}
samples: dict[str, list[float]] = defaultdict(list)
try:
while True:
started_prev = sm['deviceState'].started
sm.update()
# Update metrics
while True:
try:
metric = sock.recv_string(zmq.NOBLOCK)
try:
metric_type = metric.split('|')[1]
metric_name = metric.split(':')[0]
metric_value = float(metric.split('|')[0].split(':')[1])
if metric_type == METRIC_TYPE.GAUGE:
gauges[metric_name] = metric_value
elif metric_type == METRIC_TYPE.SAMPLE:
samples[metric_name].append(metric_value)
else:
cloudlog.event("unknown metric type", metric_type=metric_type)
except Exception:
cloudlog.event("malformed metric", metric=metric)
except zmq.error.Again:
break
# flush when started state changes or after FLUSH_TIME_S
if (time.monotonic() > last_flush_time + STATS_FLUSH_TIME_S) or (sm['deviceState'].started != started_prev):
result = ""
current_time = datetime.now(UTC)
tags['started'] = sm['deviceState'].started
for key, value in gauges.items():
result += get_influxdb_line(f"gauge.{key}", value, current_time, tags)
for key, values in samples.items():
values.sort()
sample_count = len(values)
sample_sum = sum(values)
stats = {
'count': sample_count,
'min': values[0],
'max': values[-1],
'mean': sample_sum / sample_count,
}
for percentile in [0.05, 0.5, 0.95]:
value = values[int(round(percentile * (sample_count - 1)))]
stats[f"p{int(percentile * 100)}"] = value
result += get_influxdb_line(f"sample.{key}", stats, current_time, tags)
# clear intermediate data
gauges.clear()
samples.clear()
last_flush_time = time.monotonic()
# check that we aren't filling up the drive
if len(os.listdir(STATS_DIR)) < STATS_DIR_FILE_LIMIT:
if len(result) > 0:
stats_path = os.path.join(STATS_DIR, f"{current_time.timestamp():.0f}_{idx}")
with atomic_write_in_dir(stats_path) as f:
f.write(result)
idx += 1
else:
cloudlog.error("stats dir full")
finally:
sock.close()
ctx.term()
if __name__ == "__main__":
main()
else:
statlog = StatLog()