From 050695a8f9203e1eecc66ed83219c18e8221f006 Mon Sep 17 00:00:00 2001
From: Robusta Runner <aantny@gmail.com>
Date: Mon, 24 Jun 2024 17:27:55 +0300
Subject: [PATCH 1/8] add confluence tool

---
 README.md                               | 25 +++++++++++++++++-
 holmes/core/tools.py                    | 35 ++++++++++++++++---------
 holmes/plugins/toolsets/confluence.yaml | 13 +++++++++
 3 files changed, 60 insertions(+), 13 deletions(-)
 create mode 100644 holmes/plugins/toolsets/confluence.yaml
diff --git a/README.md b/README.md
index 8c807711..93a59ac2 100644
--- a/README.md
+++ b/README.md
@@ -230,9 +230,32 @@ In particular, note that [vLLM does not yet support function calling](https://gi
 
 </details>
 
+### Enabling Integrations
+
+<details>
+<summary>Confluence</summary>
+HolmesGPT can read runbooks from Confluence. To give it access, set the following environment variables:
+
+* CONFLUENCE_BASE_URL - e.g. https://robusta-dev-test.atlassian.net
+* CONFLUENCE_USER - e.g. user@company.com
+* CONFLUENCE_API_KEY - [refer to Atlassian docs on generating API keys](https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/)
+</details>
+
+<details>
+<summary>
+Jira, GitHub, OpsGenie, PagerDuty, and AlertManager
+</summary>
+
+HolmesGPT can pull tickets/alerts from each of these sources and investigate them.
+
+Refer to `holmes investigate --help` commands for configuration details. E.g. `holmes investigate jira --help`.
+
+See also, <a href="#examples">examples</a>.
+</details>
+
 ## Other Use Cases
 
-HolmesGPT is usually used for incident response, but it can function as a general-purpose DevOps assistant too. Here are some examples:
+HolmesGPT was designed for incident response, but it is a general DevOps assistant too. Here are some examples:
 
 <details>
 <summary>Ask Questions About Your Cloud</summary>
diff --git a/holmes/core/tools.py b/holmes/core/tools.py
index 3c466d4a..4209b214 100644
--- a/holmes/core/tools.py
+++ b/holmes/core/tools.py
@@ -1,4 +1,5 @@
 import logging
+import os
 import re
 import shlex
 import subprocess
@@ -115,15 +116,18 @@ def __execute_subprocess(self, cmd) -> str:
             return f"Command `{cmd}` failed with return code {e.returncode}\nstdout:\n{e.stdout}\nstderr:\n{e.stderr}"
 
 
-class ToolsetPrerequisite(BaseModel):
+class ToolsetCommandPrerequisite(BaseModel):
     command: str                 # must complete successfully (error code 0) for prereq to be satisfied
     expected_output: str = None  # optional
 
+class ToolsetEnvironmentPrerequisite(BaseModel):
+    env: List[str] = []          # optional
+
 class Toolset(BaseModel):
     model_config = ConfigDict(extra='forbid')
 
     name: str
-    prerequisites: List[ToolsetPrerequisite] = []
+    prerequisites: List[Union[ToolsetCommandPrerequisite, ToolsetEnvironmentPrerequisite]] = []
     tools: List[YAMLTool]
 
     _path: PrivateAttr = None
@@ -148,17 +152,24 @@ def get_disabled_reason(self):
 
     def check_prerequisites(self):
         for prereq in self.prerequisites:
-            try:
-                result = subprocess.run(prereq.command, shell=True, check=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-                if prereq.expected_output and prereq.expected_output not in result.stdout:
+            if isinstance(prereq, ToolsetCommandPrerequisite):
+                try:
+                    result = subprocess.run(prereq.command, shell=True, check=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                    if prereq.expected_output and prereq.expected_output not in result.stdout:
+                        self._enabled = False
+                        self._disabled_reason = f"prereq check gave wrong output"
+                        return
+                except subprocess.CalledProcessError as e:
                     self._enabled = False
-                    self._disabled_reason = f"prereq check gave wrong output"
-                    return 
-            except subprocess.CalledProcessError as e:
-                self._enabled = False
-                self._disabled_reason = f"prereq check failed w/ errorcode {e.returncode}"
-                logging.debug(f"Toolset {self.name} : Failed to run prereq command {prereq}", exc_info=True)
-                return
+                    self._disabled_reason = f"prereq check failed with errorcode {e.returncode}"
+                    logging.debug(f"Toolset {self.name} : Failed to run prereq command {prereq}", exc_info=True)
+                    return
+            elif isinstance(prereq, ToolsetEnvironmentPrerequisite):
+                for env_var in prereq.env:
+                    if env_var not in os.environ:
+                        self._enabled = False
+                        self._disabled_reason = f"prereq check failed because environment variable {env_var} was not set"
+                        return
         self._enabled = True
 
 class YAMLToolExecutor:
diff --git a/holmes/plugins/toolsets/confluence.yaml b/holmes/plugins/toolsets/confluence.yaml
new file mode 100644
index 00000000..ccdb7508
--- /dev/null
+++ b/holmes/plugins/toolsets/confluence.yaml
@@ -0,0 +1,13 @@
+toolsets:
+- name: "confluence"
+  prerequisites:
+  - command: "curl --version"
+  - env:
+    - CONFLUENCE_USER
+    - CONFLUENCE_API_KEY
+    - CONFLUENCE_BASE_URL
+
+  tools:
+  - name: "fetch_confluence_url"
+    description: "Fetch a page in confluence"
+    command: "curl -u ${CONFLUENCE_USER}:${CONFLUENCE_API_KEY} -X GET -H 'Content-Type: application/json' ${CONFLUENCE_BASE_URL}/wiki/rest/api/content/{{ confluence_page_id }}?expand=body.storage"

From 1ed9f6773e019c12724002c48197fad9f3c8efc0 Mon Sep 17 00:00:00 2001
From: Robusta Runner <aantny@gmail.com>
Date: Mon, 24 Jun 2024 17:30:04 +0300
Subject: [PATCH 2/8] Update README.md

---
 README.md | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 93a59ac2..f271f4b4 100644
--- a/README.md
+++ b/README.md
@@ -248,9 +248,7 @@ Jira, GitHub, OpsGenie, PagerDuty, and AlertManager
 
 HolmesGPT can pull tickets/alerts from each of these sources and investigate them.
 
-Refer to `holmes investigate --help` commands for configuration details. E.g. `holmes investigate jira --help`.
-
-See also, <a href="#examples">examples</a>.
+Refer to `holmes investigate jira --help` etc for details, or view the <a href="#examples">examples</a>.
 </details>
 
 ## Other Use Cases

From ef983478378237aea542f6b91c3eedd38adbc017 Mon Sep 17 00:00:00 2001
From: Robusta Runner <aantny@gmail.com>
Date: Mon, 24 Jun 2024 17:41:44 +0300
Subject: [PATCH 3/8] Remove old import

---
 holmes/plugins/toolsets/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/holmes/plugins/toolsets/__init__.py b/holmes/plugins/toolsets/__init__.py
index 44e730ce..556e007f 100644
--- a/holmes/plugins/toolsets/__init__.py
+++ b/holmes/plugins/toolsets/__init__.py
@@ -6,7 +6,7 @@
 
 from pydantic import BaseModel
 
-from holmes.core.tools import Toolset, ToolsetPrerequisite
+from holmes.core.tools import Toolset
 from holmes.utils.pydantic_utils import load_model_from_file
 
 THIS_DIR = os.path.abspath(os.path.dirname(__file__))

From 9384f4742bbf2a7ba77e38850ad11490d70176d1 Mon Sep 17 00:00:00 2001
From: Robusta Runner <aantny@gmail.com>
Date: Mon, 24 Jun 2024 18:31:14 +0300
Subject: [PATCH 4/8] experimental: add browsing tool

---
 .../prompts/generic_investigation.jinja2      |  9 ++++++--
 holmes/plugins/toolsets/confluence.yaml       | 13 -----------
 .../plugins/toolsets/external-knowledge.yaml  | 23 +++++++++++++++++++
 3 files changed, 30 insertions(+), 15 deletions(-)
 delete mode 100644 holmes/plugins/toolsets/confluence.yaml
 create mode 100644 holmes/plugins/toolsets/external-knowledge.yaml

diff --git a/holmes/plugins/prompts/generic_investigation.jinja2 b/holmes/plugins/prompts/generic_investigation.jinja2
index c5cd0212..8079c5cb 100644
--- a/holmes/plugins/prompts/generic_investigation.jinja2
+++ b/holmes/plugins/prompts/generic_investigation.jinja2
@@ -15,8 +15,13 @@ Example investigation for a NodeUnavailableAlert:
 *Details:* Node `name-of-node` has 2.3% disk space remaining, causing the node to be unavailable for scheduling pods.
 
 If there are other resources that are impacted (other than the direct resource mentioned in the alert) list them as well under Resource.
-Whenever there are precise numbers in the data available, quote them.
-(E.g. don't say an app is repeatedly crashing, rather say the app has crashed X times so far. But only quote relevant numbers or metrics.)
+Whenever there are precise numbers in the data available, quote them. For example:
+* Don't say an app is repeatedly crashing, rather say the app has crashed X times so far
+* Don't just say x/y nodes don't match a pod's affinity selector, rather say x/y nodes don't match the selector ABC
+* And so on
+But only quote relevant numbers or metrics that are available. Do not guess.
+
+If a runbook url is present as well as a tool that can fetch it, you MUST fetch the runbook before beginning your investigation.
 
 When it can provide extra information, first run as many tools as you need to gather more information, then respond. 
 If possible, do so repeatedly on different IT resources.
diff --git a/holmes/plugins/toolsets/confluence.yaml b/holmes/plugins/toolsets/confluence.yaml
deleted file mode 100644
index ccdb7508..00000000
--- a/holmes/plugins/toolsets/confluence.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-toolsets:
-- name: "confluence"
-  prerequisites:
-  - command: "curl --version"
-  - env:
-    - CONFLUENCE_USER
-    - CONFLUENCE_API_KEY
-    - CONFLUENCE_BASE_URL
-
-  tools:
-  - name: "fetch_confluence_url"
-    description: "Fetch a page in confluence"
-    command: "curl -u ${CONFLUENCE_USER}:${CONFLUENCE_API_KEY} -X GET -H 'Content-Type: application/json' ${CONFLUENCE_BASE_URL}/wiki/rest/api/content/{{ confluence_page_id }}?expand=body.storage"
diff --git a/holmes/plugins/toolsets/external-knowledge.yaml b/holmes/plugins/toolsets/external-knowledge.yaml
new file mode 100644
index 00000000..120cc210
--- /dev/null
+++ b/holmes/plugins/toolsets/external-knowledge.yaml
@@ -0,0 +1,23 @@
+toolsets:
+- name: "confluence"
+  prerequisites:
+  - command: "curl --version"
+  - env:
+    - CONFLUENCE_USER
+    - CONFLUENCE_API_KEY
+    - CONFLUENCE_BASE_URL
+
+  tools:
+  - name: "fetch_confluence_url"
+    description: "Fetch a page in confluence.  Use this to fetch confluence runbooks if they are present before starting your investigation."
+    command: "curl -u ${CONFLUENCE_USER}:${CONFLUENCE_API_KEY} -X GET -H 'Content-Type: application/json' ${CONFLUENCE_BASE_URL}/wiki/rest/api/content/{{ confluence_page_id }}?expand=body.storage"
+
+- name: "internet"
+  prerequisites:
+  - command: "w3m -version"
+  tools:
+  - name: "fetch_webpage"
+    description: "Fetch a webpage with w3m. Use this to fetch runbooks if they are present before starting your investigation (if no other tool like confluence is more appropriate)"
+    command: "w3m -dump {{ url }}"
+
+

From a9f2916eda5f0d1618ac38a4611fe60adb952083 Mon Sep 17 00:00:00 2001
From: Robusta Runner <aantny@gmail.com>
Date: Mon, 1 Jul 2024 03:02:28 +0300
Subject: [PATCH 5/8] add slab tool

---
 holmes/core/issue.py                            |  2 +-
 holmes/plugins/toolsets/external-knowledge.yaml | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/holmes/core/issue.py b/holmes/core/issue.py
index 5bc6c3cc..f811cd0e 100644
--- a/holmes/core/issue.py
+++ b/holmes/core/issue.py
@@ -20,7 +20,7 @@ class Issue(BaseModel):
     # Name of the issue - not necessarily unique  
     name: str                                      
 
-    # Source of the issue - e.g. Jira
+    # Source of the issue - e.g. jira
     source_type: str
 
     # Identifier for the instance of the source - e.g. Jira project key                                
diff --git a/holmes/plugins/toolsets/external-knowledge.yaml b/holmes/plugins/toolsets/external-knowledge.yaml
index 120cc210..88fd9076 100644
--- a/holmes/plugins/toolsets/external-knowledge.yaml
+++ b/holmes/plugins/toolsets/external-knowledge.yaml
@@ -21,3 +21,17 @@ toolsets:
     command: "w3m -dump {{ url }}"
 
 
+- name: "slab"
+  prerequisites:
+  - command: "curl --version"
+  - env:
+    - SLAB_API_KEY
+  tools:
+  - name: "fetch_slab_document"
+    description: "Fetch a document from slab. Use this to fetch runbooks if they are present before starting your investigation."
+    command: |
+      curl -X POST \
+        -H "Authorization: ${SLAB_API_KEY}" \
+        -H "Content-Type: application/json" \
+        -d '{"query":"query { post(id: \"{{ post_id }}\") { id title content } }"}' \
+        https://api.slab.com/v1/graphql

From 7bf7d3abea0e70db8629497e1f07c9e78c2493a9 Mon Sep 17 00:00:00 2001
From: Robusta Runner <aantny@gmail.com>
Date: Mon, 1 Jul 2024 03:34:31 +0300
Subject: [PATCH 6/8] Fix jira specific issues

---
 holmes/plugins/prompts/generic_investigation.jinja2 |  2 +-
 holmes/plugins/runbooks/jira.yaml                   | 12 ++++++++++++
 holmes/plugins/sources/pagerduty/__init__.py        |  2 +-
 3 files changed, 14 insertions(+), 2 deletions(-)
 create mode 100644 holmes/plugins/runbooks/jira.yaml

diff --git a/holmes/plugins/prompts/generic_investigation.jinja2 b/holmes/plugins/prompts/generic_investigation.jinja2
index 8079c5cb..3e36a2df 100644
--- a/holmes/plugins/prompts/generic_investigation.jinja2
+++ b/holmes/plugins/prompts/generic_investigation.jinja2
@@ -60,7 +60,7 @@ Remove every unnecessary word.
 Do not use markdown other than what is described above.
 
 {% if runbooks %}
-Here are runbooks for this specific investigation. Please follow them if relevant.
+Here are runbooks for this specific investigation. Please follow them if relevant. THIS IS NOT IN PLACE OF RUNNING TOOLS!
 {% for r in runbooks %}
 * {{ r }}
 {% endfor %}
diff --git a/holmes/plugins/runbooks/jira.yaml b/holmes/plugins/runbooks/jira.yaml
new file mode 100644
index 00000000..546fc1cb
--- /dev/null
+++ b/holmes/plugins/runbooks/jira.yaml
@@ -0,0 +1,12 @@
+# runbooks for jira alerts
+# the AI will follow the instructions inside these runbooks to investigate alerts!
+# please feel free to open PRs adding your own runbooks
+runbooks:
+  - match:
+      source: "jira"
+    instructions: >
+      Investigate and try to solve whatever is written in the title and description of the ticket.
+      Ignore issues related to jira itself, like plugin or licensing problems.
+      Never give an answer like "XYZ is experiencing an issue, as indicated by the Jira issue. Further investigation is needed to determine the exact cause."
+      You are the agent that is supposed to investigate so do so!
+      If you have references to a service or a component, start by searching for related infrastructure or resources using tools that take keywords
\ No newline at end of file
diff --git a/holmes/plugins/sources/pagerduty/__init__.py b/holmes/plugins/sources/pagerduty/__init__.py
index de472b08..174ead99 100644
--- a/holmes/plugins/sources/pagerduty/__init__.py
+++ b/holmes/plugins/sources/pagerduty/__init__.py
@@ -50,7 +50,7 @@ def convert_to_issue(self, source_issue):
         return Issue(
             id=source_issue["id"],
             name=source_issue["summary"],
-            source_type="PagerDuty",
+            source_type="pagerduty",
             source_instance_id=self.api_url,
             url=f"{source_issue['html_url']}",
             raw=source_issue,

From 4161f72055971dabdc6fb1837aa0810ca5c29308 Mon Sep 17 00:00:00 2001
From: Robusta Runner <aantny@gmail.com>
Date: Mon, 1 Jul 2024 10:29:34 +0300
Subject: [PATCH 7/8] Update external-knowledge.yaml

---
 holmes/plugins/toolsets/external-knowledge.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/holmes/plugins/toolsets/external-knowledge.yaml b/holmes/plugins/toolsets/external-knowledge.yaml
index 88fd9076..081d8ace 100644
--- a/holmes/plugins/toolsets/external-knowledge.yaml
+++ b/holmes/plugins/toolsets/external-knowledge.yaml
@@ -12,6 +12,7 @@ toolsets:
     description: "Fetch a page in confluence.  Use this to fetch confluence runbooks if they are present before starting your investigation."
     command: "curl -u ${CONFLUENCE_USER}:${CONFLUENCE_API_KEY} -X GET -H 'Content-Type: application/json' ${CONFLUENCE_BASE_URL}/wiki/rest/api/content/{{ confluence_page_id }}?expand=body.storage"
 
+
 - name: "internet"
   prerequisites:
   - command: "w3m -version"

From ec66518dca7926fbaedd902e195743e8d6270795 Mon Sep 17 00:00:00 2001
From: Robusta Runner <aantny@gmail.com>
Date: Mon, 1 Jul 2024 14:03:07 +0300
Subject: [PATCH 8/8] More improvements to reliability

---
 holmes/plugins/prompts/generic_ask.jinja2          |  2 +-
 .../plugins/prompts/generic_investigation.jinja2   | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/holmes/plugins/prompts/generic_ask.jinja2 b/holmes/plugins/prompts/generic_ask.jinja2
index 8c207a05..cbe30929 100644
--- a/holmes/plugins/prompts/generic_ask.jinja2
+++ b/holmes/plugins/prompts/generic_ask.jinja2
@@ -24,7 +24,7 @@ Examples:
 
 User: Why did the webserver-example app crash?
 (Call tool kubectl_find_resource kind=pod keyword=webserver`)
-(Call tool kubectl_logs_previous namespace=demos pod=webserver-example-1299492-d9g9d # this pod name was found from the previous tool call)
+(Call tool kubectl_previous_logs namespace=demos pod=webserver-example-1299492-d9g9d # this pod name was found from the previous tool call)
 
 AI: `webserver-example-1299492-d9g9d` crashed due to email validation error during HTTP request for /api/create_user
 Relevant logs:
diff --git a/holmes/plugins/prompts/generic_investigation.jinja2 b/holmes/plugins/prompts/generic_investigation.jinja2
index 3e36a2df..2d51c88a 100644
--- a/holmes/plugins/prompts/generic_investigation.jinja2
+++ b/holmes/plugins/prompts/generic_investigation.jinja2
@@ -30,7 +30,7 @@ You must use tools to investigate whenever possible.
 When investigating Kubernetes problems, run as many kubectl commands as you need to gather more information, then respond.
 If possible, do so repeatedly on different Kubernetes objects.
 For example, for deployments first run kubectl on the deployment then a replicaset inside it, then a pod inside that.
-When investigating a pod that crashed, fetch pods logs with --previous so you see logs from before the crash.
+For a pod that crashed, do not fetch logs with kubectl_logs; use the kubectl_previous_logs tool instead.
 
 If you don't know, just say that the analysis was inconclusive.
 If there are multiple possible causes list them in a numbered list.
@@ -59,6 +59,18 @@ Remove every unnecessary word.
 *Surround the title of the root cause like this*. 
 Do not use markdown other than what is described above.
 
+Examples of tool usage:
+
+User: Why did the webserver-example app crash?
+(Call tool kubectl_find_resource kind=pod keyword=webserver)
+(Call tool kubectl_previous_logs namespace=demos pod=webserver-example-1299492-d9g9d # this pod name was found from the previous tool call and we use previous whenever investigating a crash)
+
+*Email validation error during HTTP request for /api/create_user*
+*Resource:* `webserver-example-1299492-d9g9d` in namespace `demos`
+*Details:* Validation error led to unhandled Java exception causing a crash: `2021-01-01T00:00:00.000Z [ERROR] Missing required field 'email' in request body`
+
+End of Examples
+
 {% if runbooks %}
 Here are runbooks for this specific investigation. Please follow them if relevant. THIS IS NOT IN PLACE OF RUNNING TOOLS!
 {% for r in runbooks %}