diff --git a/README.md b/README.md
index 8cb58b41..2d82b74a 100644
--- a/README.md
+++ b/README.md
@@ -237,9 +237,30 @@ In particular, note that [vLLM does not yet support function calling](https://gi
+### Enabling Integrations
+
+
+Confluence
+HolmesGPT can read runbooks from Confluence. To give it access, set the following environment variables:
+
+* CONFLUENCE_BASE_URL - e.g. https://robusta-dev-test.atlassian.net
+* CONFLUENCE_USER - e.g. user@company.com
+* CONFLUENCE_API_KEY - [refer to Atlassian docs on generating API keys](https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/)
+
+
+
+
+Jira, GitHub, OpsGenie, PagerDuty, and AlertManager
+
+
+HolmesGPT can pull tickets/alerts from each of these sources and investigate them.
+
+Refer to `holmes investigate jira --help` (and likewise for the other sources) for details, or view the examples.
+
+
## Other Use Cases
-HolmesGPT is usually used for incident response, but it can function as a general-purpose DevOps assistant too. Here are some examples:
+HolmesGPT was designed for incident response, but it is a general DevOps assistant too. Here are some examples:
Ask Questions About Your Cloud
diff --git a/holmes/core/issue.py b/holmes/core/issue.py
index 5bc6c3cc..f811cd0e 100644
--- a/holmes/core/issue.py
+++ b/holmes/core/issue.py
@@ -20,7 +20,7 @@ class Issue(BaseModel):
# Name of the issue - not necessarily unique
name: str
- # Source of the issue - e.g. Jira
+ # Source of the issue - e.g. jira
source_type: str
# Identifier for the instance of the source - e.g. Jira project key
diff --git a/holmes/core/tools.py b/holmes/core/tools.py
index 3c466d4a..4209b214 100644
--- a/holmes/core/tools.py
+++ b/holmes/core/tools.py
@@ -1,4 +1,5 @@
import logging
+import os
import re
import shlex
import subprocess
@@ -115,15 +116,18 @@ def __execute_subprocess(self, cmd) -> str:
return f"Command `{cmd}` failed with return code {e.returncode}\nstdout:\n{e.stdout}\nstderr:\n{e.stderr}"
-class ToolsetPrerequisite(BaseModel):
+class ToolsetCommandPrerequisite(BaseModel):
command: str # must complete successfully (error code 0) for prereq to be satisfied
expected_output: str = None # optional
+class ToolsetEnvironmentPrerequisite(BaseModel):
+ env: List[str] = [] # optional
+
class Toolset(BaseModel):
model_config = ConfigDict(extra='forbid')
name: str
- prerequisites: List[ToolsetPrerequisite] = []
+ prerequisites: List[Union[ToolsetCommandPrerequisite, ToolsetEnvironmentPrerequisite]] = []
tools: List[YAMLTool]
_path: PrivateAttr = None
@@ -148,17 +152,24 @@ def get_disabled_reason(self):
def check_prerequisites(self):
for prereq in self.prerequisites:
- try:
- result = subprocess.run(prereq.command, shell=True, check=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- if prereq.expected_output and prereq.expected_output not in result.stdout:
+ if isinstance(prereq, ToolsetCommandPrerequisite):
+ try:
+ result = subprocess.run(prereq.command, shell=True, check=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ if prereq.expected_output and prereq.expected_output not in result.stdout:
+ self._enabled = False
+ self._disabled_reason = f"prereq check gave wrong output"
+ return
+ except subprocess.CalledProcessError as e:
self._enabled = False
- self._disabled_reason = f"prereq check gave wrong output"
- return
- except subprocess.CalledProcessError as e:
- self._enabled = False
- self._disabled_reason = f"prereq check failed w/ errorcode {e.returncode}"
- logging.debug(f"Toolset {self.name} : Failed to run prereq command {prereq}", exc_info=True)
- return
+ self._disabled_reason = f"prereq check failed with errorcode {e.returncode}"
+ logging.debug(f"Toolset {self.name} : Failed to run prereq command {prereq}", exc_info=True)
+ return
+ elif isinstance(prereq, ToolsetEnvironmentPrerequisite):
+ for env_var in prereq.env:
+ if env_var not in os.environ:
+ self._enabled = False
+ self._disabled_reason = f"prereq check failed because environment variable {env_var} was not set"
+ return
self._enabled = True
class YAMLToolExecutor:
diff --git a/holmes/plugins/prompts/generic_ask.jinja2 b/holmes/plugins/prompts/generic_ask.jinja2
index 8c207a05..cbe30929 100644
--- a/holmes/plugins/prompts/generic_ask.jinja2
+++ b/holmes/plugins/prompts/generic_ask.jinja2
@@ -24,7 +24,7 @@ Examples:
User: Why did the webserver-example app crash?
(Call tool kubectl_find_resource kind=pod keyword=webserver`)
-(Call tool kubectl_logs_previous namespace=demos pod=webserver-example-1299492-d9g9d # this pod name was found from the previous tool call)
+(Call tool kubectl_previous_logs namespace=demos pod=webserver-example-1299492-d9g9d # this pod name was found from the previous tool call)
AI: `webserver-example-1299492-d9g9d` crashed due to email validation error during HTTP request for /api/create_user
Relevant logs:
diff --git a/holmes/plugins/prompts/generic_investigation.jinja2 b/holmes/plugins/prompts/generic_investigation.jinja2
index c5cd0212..2d51c88a 100644
--- a/holmes/plugins/prompts/generic_investigation.jinja2
+++ b/holmes/plugins/prompts/generic_investigation.jinja2
@@ -15,8 +15,13 @@ Example investigation for a NodeUnavailableAlert:
*Details:* Node `name-of-node` has 2.3% disk space remaining, causing the node to be unavailable for scheduling pods.
If there are other resources that are impacted (other than the direct resource mentioned in the alert) list them as well under Resource.
-Whenever there are precise numbers in the data available, quote them.
-(E.g. don't say an app is repeatedly crashing, rather say the app has crashed X times so far. But only quote relevant numbers or metrics.)
+Whenever there are precise numbers in the data available, quote them. For example:
+* Don't say an app is repeatedly crashing, rather say the app has crashed X times so far
+* Don't just say x/y nodes don't match a pod's affinity selector, rather say x/y nodes don't match the selector ABC
+* And so on
+But only quote relevant numbers or metrics that are available. Do not guess.
+
+If a runbook URL is present as well as a tool that can fetch it, you MUST fetch the runbook before beginning your investigation.
When it can provide extra information, first run as many tools as you need to gather more information, then respond.
If possible, do so repeatedly on different IT resources.
@@ -25,7 +30,7 @@ You must use tools to investigate whenever possible.
When investigating Kubernetes problems, run as many kubectl commands as you need to gather more information, then respond.
If possible, do so repeatedly on different Kubernetes objects.
For example, for deployments first run kubectl on the deployment then a replicaset inside it, then a pod inside that.
-When investigating a pod that crashed, fetch pods logs with --previous so you see logs from before the crash.
+When a pod has crashed, do not fetch its logs with kubectl_logs; use the kubectl_previous_logs tool instead.
If you don't know, just say that the analysis was inconclusive.
If there are multiple possible causes list them in a numbered list.
@@ -54,8 +59,20 @@ Remove every unnecessary word.
*Surround the title of the root cause like this*.
Do not use markdown other than what is described above.
+Examples of tool usage:
+
+User: Why did the webserver-example app crash?
+(Call tool kubectl_find_resource kind=pod keyword=webserver)
+(Call tool kubectl_previous_logs namespace=demos pod=webserver-example-1299492-d9g9d # this pod name was found from the previous tool call, and we use kubectl_previous_logs whenever investigating a crash)
+
+*Email validation error during HTTP request for /api/create_user*
+*Resource:* `webserver-example-1299492-d9g9d` in namespace `demos`
+*Details:* Validation error led to unhandled Java exception causing a crash: `2021-01-01T00:00:00.000Z [ERROR] Missing required field 'email' in request body`
+
+End of Examples
+
{% if runbooks %}
-Here are runbooks for this specific investigation. Please follow them if relevant.
+Here are runbooks for this specific investigation. Please follow them if relevant. THIS IS NOT IN PLACE OF RUNNING TOOLS!
{% for r in runbooks %}
* {{ r }}
{% endfor %}
diff --git a/holmes/plugins/runbooks/jira.yaml b/holmes/plugins/runbooks/jira.yaml
new file mode 100644
index 00000000..546fc1cb
--- /dev/null
+++ b/holmes/plugins/runbooks/jira.yaml
@@ -0,0 +1,12 @@
+# runbooks for jira alerts
+# the AI will follow the instructions inside these runbooks to investigate alerts!
+# please feel free to open PRs adding your own runbooks
+runbooks:
+ - match:
+ source: "jira"
+ instructions: >
+ Investigate and try to solve whatever is written in the title and description of the ticket.
+ Ignore issues related to jira itself, like plugin or licensing problems.
+ Never give an answer like "XYZ is experiencing an issue, as indicated by the Jira issue. Further investigation is needed to determine the exact cause."
+ You are the agent that is supposed to investigate so do so!
+ If you have references to a service or a component, start by searching for related infrastructure or resources using tools that take keywords
\ No newline at end of file
diff --git a/holmes/plugins/sources/pagerduty/__init__.py b/holmes/plugins/sources/pagerduty/__init__.py
index de472b08..174ead99 100644
--- a/holmes/plugins/sources/pagerduty/__init__.py
+++ b/holmes/plugins/sources/pagerduty/__init__.py
@@ -50,7 +50,7 @@ def convert_to_issue(self, source_issue):
return Issue(
id=source_issue["id"],
name=source_issue["summary"],
- source_type="PagerDuty",
+ source_type="pagerduty",
source_instance_id=self.api_url,
url=f"{source_issue['html_url']}",
raw=source_issue,
diff --git a/holmes/plugins/toolsets/__init__.py b/holmes/plugins/toolsets/__init__.py
index 44e730ce..556e007f 100644
--- a/holmes/plugins/toolsets/__init__.py
+++ b/holmes/plugins/toolsets/__init__.py
@@ -6,7 +6,7 @@
from pydantic import BaseModel
-from holmes.core.tools import Toolset, ToolsetPrerequisite
+from holmes.core.tools import Toolset
from holmes.utils.pydantic_utils import load_model_from_file
THIS_DIR = os.path.abspath(os.path.dirname(__file__))
diff --git a/holmes/plugins/toolsets/external-knowledge.yaml b/holmes/plugins/toolsets/external-knowledge.yaml
new file mode 100644
index 00000000..081d8ace
--- /dev/null
+++ b/holmes/plugins/toolsets/external-knowledge.yaml
@@ -0,0 +1,38 @@
+toolsets:
+- name: "confluence"
+ prerequisites:
+ - command: "curl --version"
+ - env:
+ - CONFLUENCE_USER
+ - CONFLUENCE_API_KEY
+ - CONFLUENCE_BASE_URL
+
+ tools:
+ - name: "fetch_confluence_url"
+ description: "Fetch a page in confluence. Use this to fetch confluence runbooks if they are present before starting your investigation."
+ command: "curl -u ${CONFLUENCE_USER}:${CONFLUENCE_API_KEY} -X GET -H 'Content-Type: application/json' ${CONFLUENCE_BASE_URL}/wiki/rest/api/content/{{ confluence_page_id }}?expand=body.storage"
+
+
+- name: "internet"
+ prerequisites:
+ - command: "w3m -version"
+ tools:
+ - name: "fetch_webpage"
+ description: "Fetch a webpage with w3m. Use this to fetch runbooks if they are present before starting your investigation (if no other tool like confluence is more appropriate)"
+ command: "w3m -dump {{ url }}"
+
+
+- name: "slab"
+ prerequisites:
+ - command: "curl --version"
+ - env:
+ - SLAB_API_KEY
+ tools:
+ - name: "fetch_slab_document"
+ description: "Fetch a document from slab. Use this to fetch runbooks if they are present before starting your investigation."
+ command: |
+ curl -X POST \
+ -H "Authorization: ${SLAB_API_KEY}" \
+ -H "Content-Type: application/json" \
+ -d '{"query":"query { post(id: \"{{ post_id }}\") { id title content } }"}' \
+ https://api.slab.com/v1/graphql