From 050695a8f9203e1eecc66ed83219c18e8221f006 Mon Sep 17 00:00:00 2001 From: Robusta Runner Date: Mon, 24 Jun 2024 17:27:55 +0300 Subject: [PATCH 1/8] add confluence tool --- README.md | 25 +++++++++++++++++- holmes/core/tools.py | 35 ++++++++++++++++--------- holmes/plugins/toolsets/confluence.yaml | 13 +++++++++ 3 files changed, 60 insertions(+), 13 deletions(-) create mode 100644 holmes/plugins/toolsets/confluence.yaml diff --git a/README.md b/README.md index 8c807711..93a59ac2 100644 --- a/README.md +++ b/README.md @@ -230,9 +230,32 @@ In particular, note that [vLLM does not yet support function calling](https://gi +### Enabling Integrations + +
+Confluence +HolmesGPT can read runbooks from Confluence. To give it access, set the following environment variables: + +* CONFLUENCE_BASE_URL - e.g. https://robusta-dev-test.atlassian.net +* CONFLUENCE_USER - e.g. user@company.com +* CONFLUENCE_API_KEY - [refer to Atlassian docs on generating API keys](https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/) +
+ +
+ +Jira, GitHub, OpsGenie, PagerDuty, and AlertManager + + +HolmesGPT can pull tickets/alerts from each of these sources and investigate them. + +Refer to `holmes investigate --help` commands for configuration details. E.g. `holmes investigate jira --help`. + +See also, examples. +
+ ## Other Use Cases -HolmesGPT is usually used for incident response, but it can function as a general-purpose DevOps assistant too. Here are some examples: +HolmesGPT was designed for incident response, but it is a general DevOps assistant too. Here are some examples:
Ask Questions About Your Cloud diff --git a/holmes/core/tools.py b/holmes/core/tools.py index 3c466d4a..4209b214 100644 --- a/holmes/core/tools.py +++ b/holmes/core/tools.py @@ -1,4 +1,5 @@ import logging +import os import re import shlex import subprocess @@ -115,15 +116,18 @@ def __execute_subprocess(self, cmd) -> str: return f"Command `{cmd}` failed with return code {e.returncode}\nstdout:\n{e.stdout}\nstderr:\n{e.stderr}" -class ToolsetPrerequisite(BaseModel): +class ToolsetCommandPrerequisite(BaseModel): command: str # must complete successfully (error code 0) for prereq to be satisfied expected_output: str = None # optional +class ToolsetEnvironmentPrerequisite(BaseModel): + env: List[str] = [] # optional + class Toolset(BaseModel): model_config = ConfigDict(extra='forbid') name: str - prerequisites: List[ToolsetPrerequisite] = [] + prerequisites: List[Union[ToolsetCommandPrerequisite, ToolsetEnvironmentPrerequisite]] = [] tools: List[YAMLTool] _path: PrivateAttr = None @@ -148,17 +152,24 @@ def get_disabled_reason(self): def check_prerequisites(self): for prereq in self.prerequisites: - try: - result = subprocess.run(prereq.command, shell=True, check=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if prereq.expected_output and prereq.expected_output not in result.stdout: + if isinstance(prereq, ToolsetCommandPrerequisite): + try: + result = subprocess.run(prereq.command, shell=True, check=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if prereq.expected_output and prereq.expected_output not in result.stdout: + self._enabled = False + self._disabled_reason = f"prereq check gave wrong output" + return + except subprocess.CalledProcessError as e: self._enabled = False - self._disabled_reason = f"prereq check gave wrong output" - return - except subprocess.CalledProcessError as e: - self._enabled = False - self._disabled_reason = f"prereq check failed w/ errorcode {e.returncode}" - logging.debug(f"Toolset {self.name} : 
Failed to run prereq command {prereq}", exc_info=True) - return + self._disabled_reason = f"prereq check failed with errorcode {e.returncode}" + logging.debug(f"Toolset {self.name} : Failed to run prereq command {prereq}", exc_info=True) + return + elif isinstance(prereq, ToolsetEnvironmentPrerequisite): + for env_var in prereq.env: + if env_var not in os.environ: + self._enabled = False + self._disabled_reason = f"prereq check failed because environment variable {env_var} was not set" + return self._enabled = True class YAMLToolExecutor: diff --git a/holmes/plugins/toolsets/confluence.yaml b/holmes/plugins/toolsets/confluence.yaml new file mode 100644 index 00000000..ccdb7508 --- /dev/null +++ b/holmes/plugins/toolsets/confluence.yaml @@ -0,0 +1,13 @@ +toolsets: +- name: "confluence" + prerequisites: + - command: "curl --version" + - env: + - CONFLUENCE_USER + - CONFLUENCE_API_KEY + - CONFLUENCE_BASE_URL + + tools: + - name: "fetch_confluence_url" + description: "Fetch a page in confluence" + command: "curl -u ${CONFLUENCE_USER}:${CONFLUENCE_API_KEY} -X GET -H 'Content-Type: application/json' ${CONFLUENCE_BASE_URL}/wiki/rest/api/content/{{ confluence_page_id }}?expand=body.storage" From 1ed9f6773e019c12724002c48197fad9f3c8efc0 Mon Sep 17 00:00:00 2001 From: Robusta Runner Date: Mon, 24 Jun 2024 17:30:04 +0300 Subject: [PATCH 2/8] Update README.md --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 93a59ac2..f271f4b4 100644 --- a/README.md +++ b/README.md @@ -248,9 +248,7 @@ Jira, GitHub, OpsGenie, PagerDuty, and AlertManager HolmesGPT can pull tickets/alerts from each of these sources and investigate them. -Refer to `holmes investigate --help` commands for configuration details. E.g. `holmes investigate jira --help`. - -See also, examples. +Refer to `holmes investigate jira --help` etc for details, or view the examples.
## Other Use Cases From ef983478378237aea542f6b91c3eedd38adbc017 Mon Sep 17 00:00:00 2001 From: Robusta Runner Date: Mon, 24 Jun 2024 17:41:44 +0300 Subject: [PATCH 3/8] Remove old import --- holmes/plugins/toolsets/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/holmes/plugins/toolsets/__init__.py b/holmes/plugins/toolsets/__init__.py index 44e730ce..556e007f 100644 --- a/holmes/plugins/toolsets/__init__.py +++ b/holmes/plugins/toolsets/__init__.py @@ -6,7 +6,7 @@ from pydantic import BaseModel -from holmes.core.tools import Toolset, ToolsetPrerequisite +from holmes.core.tools import Toolset from holmes.utils.pydantic_utils import load_model_from_file THIS_DIR = os.path.abspath(os.path.dirname(__file__)) From 9384f4742bbf2a7ba77e38850ad11490d70176d1 Mon Sep 17 00:00:00 2001 From: Robusta Runner Date: Mon, 24 Jun 2024 18:31:14 +0300 Subject: [PATCH 4/8] experimental: add browsing tool --- .../prompts/generic_investigation.jinja2 | 9 ++++++-- holmes/plugins/toolsets/confluence.yaml | 13 ----------- .../plugins/toolsets/external-knowledge.yaml | 23 +++++++++++++++++++ 3 files changed, 30 insertions(+), 15 deletions(-) delete mode 100644 holmes/plugins/toolsets/confluence.yaml create mode 100644 holmes/plugins/toolsets/external-knowledge.yaml diff --git a/holmes/plugins/prompts/generic_investigation.jinja2 b/holmes/plugins/prompts/generic_investigation.jinja2 index c5cd0212..8079c5cb 100644 --- a/holmes/plugins/prompts/generic_investigation.jinja2 +++ b/holmes/plugins/prompts/generic_investigation.jinja2 @@ -15,8 +15,13 @@ Example investigation for a NodeUnavailableAlert: *Details:* Node `name-of-node` has 2.3% disk space remaining, causing the node to be unavailable for scheduling pods. If there are other resources that are impacted (other than the direct resource mentioned in the alert) list them as well under Resource. -Whenever there are precise numbers in the data available, quote them. -(E.g. 
don't say an app is repeatedly crashing, rather say the app has crashed X times so far. But only quote relevant numbers or metrics.) +Whenever there are precise numbers in the data available, quote them. For example: +* Don't say an app is repeatedly crashing, rather say the app has crashed X times so far +* Don't just say x/y nodes don't match a pod's affinity selector, rather say x/y nodes don't match the selector ABC +* And so on +But only quote relevant numbers or metrics that are available. Do not guess. + +If a runbook url is present as well as a tool that can fetch it, you MUST fetch the runbook before beginning your investigation. When it can provide extra information, first run as many tools as you need to gather more information, then respond. If possible, do so repeatedly on different IT resources. diff --git a/holmes/plugins/toolsets/confluence.yaml b/holmes/plugins/toolsets/confluence.yaml deleted file mode 100644 index ccdb7508..00000000 --- a/holmes/plugins/toolsets/confluence.yaml +++ /dev/null @@ -1,13 +0,0 @@ -toolsets: -- name: "confluence" - prerequisites: - - command: "curl --version" - - env: - - CONFLUENCE_USER - - CONFLUENCE_API_KEY - - CONFLUENCE_BASE_URL - - tools: - - name: "fetch_confluence_url" - description: "Fetch a page in confluence" - command: "curl -u ${CONFLUENCE_USER}:${CONFLUENCE_API_KEY} -X GET -H 'Content-Type: application/json' ${CONFLUENCE_BASE_URL}/wiki/rest/api/content/{{ confluence_page_id }}?expand=body.storage" diff --git a/holmes/plugins/toolsets/external-knowledge.yaml b/holmes/plugins/toolsets/external-knowledge.yaml new file mode 100644 index 00000000..120cc210 --- /dev/null +++ b/holmes/plugins/toolsets/external-knowledge.yaml @@ -0,0 +1,23 @@ +toolsets: +- name: "confluence" + prerequisites: + - command: "curl --version" + - env: + - CONFLUENCE_USER + - CONFLUENCE_API_KEY + - CONFLUENCE_BASE_URL + + tools: + - name: "fetch_confluence_url" + description: "Fetch a page in confluence. 
Use this to fetch confluence runbooks if they are present before starting your investigation." + command: "curl -u ${CONFLUENCE_USER}:${CONFLUENCE_API_KEY} -X GET -H 'Content-Type: application/json' ${CONFLUENCE_BASE_URL}/wiki/rest/api/content/{{ confluence_page_id }}?expand=body.storage" + +- name: "internet" + prerequisites: + - command: "w3m -version" + tools: + - name: "fetch_webpage" + description: "Fetch a webpage with w3m. Use this to fetch runbooks if they are present before starting your investigation (if no other tool like confluence is more appropriate)" + command: "w3m -dump {{ url }}" + + From a9f2916eda5f0d1618ac38a4611fe60adb952083 Mon Sep 17 00:00:00 2001 From: Robusta Runner Date: Mon, 1 Jul 2024 03:02:28 +0300 Subject: [PATCH 5/8] add slab tool --- holmes/core/issue.py | 2 +- holmes/plugins/toolsets/external-knowledge.yaml | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/holmes/core/issue.py b/holmes/core/issue.py index 5bc6c3cc..f811cd0e 100644 --- a/holmes/core/issue.py +++ b/holmes/core/issue.py @@ -20,7 +20,7 @@ class Issue(BaseModel): # Name of the issue - not necessarily unique name: str - # Source of the issue - e.g. Jira + # Source of the issue - e.g. jira source_type: str # Identifier for the instance of the source - e.g. Jira project key diff --git a/holmes/plugins/toolsets/external-knowledge.yaml b/holmes/plugins/toolsets/external-knowledge.yaml index 120cc210..88fd9076 100644 --- a/holmes/plugins/toolsets/external-knowledge.yaml +++ b/holmes/plugins/toolsets/external-knowledge.yaml @@ -21,3 +21,17 @@ toolsets: command: "w3m -dump {{ url }}" +- name: "slab" + prerequisites: + - command: "curl --version" + - env: + - SLAB_API_KEY + tools: + - name: "fetch_slab_document" + description: "Fetch a document from slab. Use this to fetch runbooks if they are present before starting your investigation." 
+ command: | + curl -X POST \ + -H "Authorization: ${SLAB_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"query":"query { post(id: \"{{ post_id }}\") { id title content } }"}' \ + https://api.slab.com/v1/graphql From 7bf7d3abea0e70db8629497e1f07c9e78c2493a9 Mon Sep 17 00:00:00 2001 From: Robusta Runner Date: Mon, 1 Jul 2024 03:34:31 +0300 Subject: [PATCH 6/8] Fix jira specific issues --- holmes/plugins/prompts/generic_investigation.jinja2 | 2 +- holmes/plugins/runbooks/jira.yaml | 12 ++++++++++++ holmes/plugins/sources/pagerduty/__init__.py | 2 +- 3 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 holmes/plugins/runbooks/jira.yaml diff --git a/holmes/plugins/prompts/generic_investigation.jinja2 b/holmes/plugins/prompts/generic_investigation.jinja2 index 8079c5cb..3e36a2df 100644 --- a/holmes/plugins/prompts/generic_investigation.jinja2 +++ b/holmes/plugins/prompts/generic_investigation.jinja2 @@ -60,7 +60,7 @@ Remove every unnecessary word. Do not use markdown other than what is described above. {% if runbooks %} -Here are runbooks for this specific investigation. Please follow them if relevant. +Here are runbooks for this specific investigation. Please follow them if relevant. THIS IS NOT IN PLACE OF RUNNING TOOLS! {% for r in runbooks %} * {{ r }} {% endfor %} diff --git a/holmes/plugins/runbooks/jira.yaml b/holmes/plugins/runbooks/jira.yaml new file mode 100644 index 00000000..546fc1cb --- /dev/null +++ b/holmes/plugins/runbooks/jira.yaml @@ -0,0 +1,12 @@ +# runbooks for jira alerts +# the AI will follow the instructions inside these runbooks to investigate alerts! +# please feel free to open PRs adding your own runbooks +runbooks: + - match: + source: "jira" + instructions: > + Investigate and try to solve whatever is written in the title and description of the ticket. + Ignore issues related to jira itself, like plugin or licensing problems. + Never give an answer like "XYZ is experiencing an issue, as indicated by the Jira issue. 
Further investigation is needed to determine the exact cause." + You are the agent that is supposed to investigate so do so! + If you have references to a service or a component, start by searching for related infrastructure or resources using tools that take keywords \ No newline at end of file diff --git a/holmes/plugins/sources/pagerduty/__init__.py b/holmes/plugins/sources/pagerduty/__init__.py index de472b08..174ead99 100644 --- a/holmes/plugins/sources/pagerduty/__init__.py +++ b/holmes/plugins/sources/pagerduty/__init__.py @@ -50,7 +50,7 @@ def convert_to_issue(self, source_issue): return Issue( id=source_issue["id"], name=source_issue["summary"], - source_type="PagerDuty", + source_type="pagerduty", source_instance_id=self.api_url, url=f"{source_issue['html_url']}", raw=source_issue, From 4161f72055971dabdc6fb1837aa0810ca5c29308 Mon Sep 17 00:00:00 2001 From: Robusta Runner Date: Mon, 1 Jul 2024 10:29:34 +0300 Subject: [PATCH 7/8] Update external-knowledge.yaml --- holmes/plugins/toolsets/external-knowledge.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/holmes/plugins/toolsets/external-knowledge.yaml b/holmes/plugins/toolsets/external-knowledge.yaml index 88fd9076..081d8ace 100644 --- a/holmes/plugins/toolsets/external-knowledge.yaml +++ b/holmes/plugins/toolsets/external-knowledge.yaml @@ -12,6 +12,7 @@ toolsets: description: "Fetch a page in confluence. Use this to fetch confluence runbooks if they are present before starting your investigation." 
command: "curl -u ${CONFLUENCE_USER}:${CONFLUENCE_API_KEY} -X GET -H 'Content-Type: application/json' ${CONFLUENCE_BASE_URL}/wiki/rest/api/content/{{ confluence_page_id }}?expand=body.storage" + - name: "internet" prerequisites: - command: "w3m -version" From ec66518dca7926fbaedd902e195743e8d6270795 Mon Sep 17 00:00:00 2001 From: Robusta Runner Date: Mon, 1 Jul 2024 14:03:07 +0300 Subject: [PATCH 8/8] More improvements to reliability --- holmes/plugins/prompts/generic_ask.jinja2 | 2 +- .../plugins/prompts/generic_investigation.jinja2 | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/holmes/plugins/prompts/generic_ask.jinja2 b/holmes/plugins/prompts/generic_ask.jinja2 index 8c207a05..cbe30929 100644 --- a/holmes/plugins/prompts/generic_ask.jinja2 +++ b/holmes/plugins/prompts/generic_ask.jinja2 @@ -24,7 +24,7 @@ Examples: User: Why did the webserver-example app crash? (Call tool kubectl_find_resource kind=pod keyword=webserver`) -(Call tool kubectl_logs_previous namespace=demos pod=webserver-example-1299492-d9g9d # this pod name was found from the previous tool call) +(Call tool kubectl_previous_logs namespace=demos pod=webserver-example-1299492-d9g9d # this pod name was found from the previous tool call) AI: `webserver-example-1299492-d9g9d` crashed due to email validation error during HTTP request for /api/create_user Relevant logs: diff --git a/holmes/plugins/prompts/generic_investigation.jinja2 b/holmes/plugins/prompts/generic_investigation.jinja2 index 3e36a2df..2d51c88a 100644 --- a/holmes/plugins/prompts/generic_investigation.jinja2 +++ b/holmes/plugins/prompts/generic_investigation.jinja2 @@ -30,7 +30,7 @@ You must use tools to investigate whenever possible. When investigating Kubernetes problems, run as many kubectl commands as you need to gather more information, then respond. If possible, do so repeatedly on different Kubernetes objects. 
For example, for deployments first run kubectl on the deployment then a replicaset inside it, then a pod inside that. -When investigating a pod that crashed, fetch pods logs with --previous so you see logs from before the crash. +Do not fetch logs for a pod that crashed with kubectl_logs, use the kubectl_previous_logs tool instead If you don't know, just say that the analysis was inconclusive. If there are multiple possible causes list them in a numbered list. @@ -59,6 +59,18 @@ Remove every unnecessary word. *Surround the title of the root cause like this*. Do not use markdown other than what is described above. +Examples of tool usage: + +User: Why did the webserver-example app crash? +(Call tool kubectl_find_resource kind=pod keyword=webserver) +(Call tool kubectl_previous_logs namespace=demos pod=webserver-example-1299492-d9g9d # this pod name was found from the previous tool call and we use previous whenever investigating a crash) + +*Email validation error during HTTP request for /api/create_user* +*Resource:* `webserver-example-1299492-d9g9d` in namespace `demos` +*Details:* Validation error led to unhandled Java exception causing a crash: `2021-01-01T00:00:00.000Z [ERROR] Missing required field 'email' in request body` + +End of Examples + {% if runbooks %} Here are runbooks for this specific investigation. Please follow them if relevant. THIS IS NOT IN PLACE OF RUNNING TOOLS! {% for r in runbooks %}