feat: add lab-monitoring MCP server for Prometheus and Alertmanager

New MCP server that queries live Prometheus and Alertmanager HTTP APIs
with 8 tools: list_alerts, get_alert, search_metrics, get_metric_metadata,
query (PromQL), list_targets, list_silences, and create_silence.

Extends the MCP core with ModeCustom and NewGenericServer for servers
that don't require a database. Includes CLI with direct commands
(alerts, query, targets, metrics), NixOS module, and comprehensive
httptest-based tests.

Bumps existing binaries to 0.2.1 due to shared internal/mcp change.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-02-04 23:11:53 +01:00
parent 0bd4ed778a
commit 1755364bba
19 changed files with 2567 additions and 22 deletions

View File

@@ -0,0 +1,434 @@
package monitoring
import (
"context"
"encoding/json"
"fmt"
"strings"
"time"
"git.t-juice.club/torjus/labmcp/internal/mcp"
)
// RegisterHandlers attaches every monitoring tool and its handler to server.
//
// The alert and silence tools are backed by the Alertmanager client am; the
// metric, query and target tools are backed by the Prometheus client prom.
// Tools are registered in the order they are listed below.
func RegisterHandlers(server *mcp.Server, prom *PrometheusClient, am *AlertmanagerClient) {
	registrations := []struct {
		tool    mcp.Tool
		handler mcp.ToolHandler
	}{
		{listAlertsTool(), makeListAlertsHandler(am)},
		{getAlertTool(), makeGetAlertHandler(am)},
		{searchMetricsTool(), makeSearchMetricsHandler(prom)},
		{getMetricMetadataTool(), makeGetMetricMetadataHandler(prom)},
		{queryTool(), makeQueryHandler(prom)},
		{listTargetsTool(), makeListTargetsHandler(prom)},
		{listSilencesTool(), makeListSilencesHandler(am)},
		{createSilenceTool(), makeCreateSilenceHandler(am)},
	}
	for _, r := range registrations {
		server.RegisterTool(r.tool, r.handler)
	}
}
// Tool definitions
// listAlertsTool returns the definition of the "list_alerts" tool.
//
// The schema declares three optional string arguments: state (restricted to
// the enumerated values), severity and receiver. None are marked required.
func listAlertsTool() mcp.Tool {
	properties := make(map[string]mcp.Property, 3)
	properties["state"] = mcp.Property{
		Type:        "string",
		Description: "Filter by alert state: 'active', 'suppressed', or 'unprocessed'",
		Enum:        []string{"active", "suppressed", "unprocessed"},
	}
	properties["severity"] = mcp.Property{
		Type:        "string",
		Description: "Filter by severity label (e.g., 'critical', 'warning')",
	}
	properties["receiver"] = mcp.Property{
		Type:        "string",
		Description: "Filter by receiver name",
	}

	schema := mcp.InputSchema{
		Type:       "object",
		Properties: properties,
	}
	return mcp.Tool{
		Name:        "list_alerts",
		Description: "List alerts from Alertmanager with optional filters",
		InputSchema: schema,
	}
}
// getAlertTool returns the definition of the "get_alert" tool, which takes a
// single required string argument: the alert fingerprint.
func getAlertTool() mcp.Tool {
	fingerprint := mcp.Property{
		Type:        "string",
		Description: "Alert fingerprint identifier",
	}
	schema := mcp.InputSchema{
		Type:       "object",
		Properties: map[string]mcp.Property{"fingerprint": fingerprint},
		Required:   []string{"fingerprint"},
	}
	return mcp.Tool{
		Name:        "get_alert",
		Description: "Get full details for a specific alert by fingerprint",
		InputSchema: schema,
	}
}
// searchMetricsTool returns the definition of the "search_metrics" tool.
//
// Both arguments are optional: query is a substring filter on metric names,
// and limit caps the number of results (schema default 50).
func searchMetricsTool() mcp.Tool {
	properties := make(map[string]mcp.Property, 2)
	properties["query"] = mcp.Property{
		Type:        "string",
		Description: "Substring to filter metric names (e.g., 'cpu', 'memory', 'node_'). Empty returns all metrics.",
	}
	properties["limit"] = mcp.Property{
		Type:        "integer",
		Description: "Maximum number of results (default: 50)",
		Default:     50,
	}

	schema := mcp.InputSchema{
		Type:       "object",
		Properties: properties,
	}
	return mcp.Tool{
		Name:        "search_metrics",
		Description: "Search Prometheus metric names with optional substring filter, enriched with metadata (type, help text)",
		InputSchema: schema,
	}
}
// getMetricMetadataTool returns the definition of the "get_metric_metadata"
// tool, which takes a single required string argument: the metric name.
func getMetricMetadataTool() mcp.Tool {
	metric := mcp.Property{
		Type:        "string",
		Description: "Metric name (e.g., 'node_cpu_seconds_total')",
	}
	schema := mcp.InputSchema{
		Type:       "object",
		Properties: map[string]mcp.Property{"metric": metric},
		Required:   []string{"metric"},
	}
	return mcp.Tool{
		Name:        "get_metric_metadata",
		Description: "Get type, help text, and unit for a specific Prometheus metric",
		InputSchema: schema,
	}
}
// queryTool returns the definition of the "query" tool, which takes a single
// required string argument: the PromQL expression to evaluate.
func queryTool() mcp.Tool {
	promql := mcp.Property{
		Type:        "string",
		Description: "PromQL expression to evaluate (e.g., 'up', 'rate(http_requests_total[5m])', 'avg_over_time(node_load1[1h])')",
	}
	schema := mcp.InputSchema{
		Type:       "object",
		Properties: map[string]mcp.Property{"promql": promql},
		Required:   []string{"promql"},
	}
	return mcp.Tool{
		Name:        "query",
		Description: "Execute an instant PromQL query against Prometheus. Supports aggregations like avg_over_time(metric[1h]), rate(), sum(), etc.",
		InputSchema: schema,
	}
}
// listTargetsTool returns the definition of the "list_targets" tool. The tool
// takes no arguments; its schema carries an empty (non-nil) property map.
func listTargetsTool() mcp.Tool {
	noArgs := mcp.InputSchema{
		Type:       "object",
		Properties: make(map[string]mcp.Property),
	}
	return mcp.Tool{
		Name:        "list_targets",
		Description: "List Prometheus scrape targets with health status, grouped by job",
		InputSchema: noArgs,
	}
}
// listSilencesTool returns the definition of the "list_silences" tool. The
// tool takes no arguments; its schema carries an empty (non-nil) property map.
func listSilencesTool() mcp.Tool {
	noArgs := mcp.InputSchema{
		Type:       "object",
		Properties: make(map[string]mcp.Property),
	}
	return mcp.Tool{
		Name:        "list_silences",
		Description: "List active and pending alert silences from Alertmanager",
		InputSchema: noArgs,
	}
}
// createSilenceTool returns the definition of the "create_silence" tool.
//
// All four arguments are required strings: matchers (a JSON array encoded as
// a string), duration (Go duration syntax), author and comment. The tool
// description instructs the caller to confirm with the user before use.
func createSilenceTool() mcp.Tool {
	properties := make(map[string]mcp.Property, 4)
	properties["matchers"] = mcp.Property{
		Type:        "string",
		Description: `JSON array of matchers, e.g. [{"name":"alertname","value":"TargetDown","isRegex":false}]`,
	}
	properties["duration"] = mcp.Property{
		Type:        "string",
		Description: "Silence duration in Go duration format (e.g., '2h', '30m', '1h30m')",
	}
	properties["author"] = mcp.Property{
		Type:        "string",
		Description: "Author of the silence",
	}
	properties["comment"] = mcp.Property{
		Type:        "string",
		Description: "Reason for the silence",
	}

	schema := mcp.InputSchema{
		Type:       "object",
		Properties: properties,
		Required:   []string{"matchers", "duration", "author", "comment"},
	}
	return mcp.Tool{
		Name:        "create_silence",
		Description: `Create a new silence in Alertmanager. IMPORTANT: Always confirm with the user before creating a silence, showing them the matchers, duration, and reason.`,
		InputSchema: schema,
	}
}
// Handler constructors
// makeListAlertsHandler returns the handler for the "list_alerts" tool.
//
// Optional string arguments (missing, empty or non-string values are ignored):
//   - state: "active" sets Active=true, Silenced=false, Inhibited=false;
//     "suppressed" sets Active=false; "unprocessed" sets Unprocessed=true.
//     Any other value leaves the state filters unset.
//   - severity: appended to the filter list as a severity="<value>" matcher.
//   - receiver: copied into the Receiver filter.
//
// A failure from Alertmanager is reported as MCP error content; the handler
// itself always returns a nil Go error.
func makeListAlertsHandler(am *AlertmanagerClient) mcp.ToolHandler {
	// The severity value is caller-controlled and ends up inside a quoted
	// matcher string, so backslash, double quote and newline are escaped to
	// keep the matcher well-formed. Ordinary values pass through unchanged.
	// NOTE(review): escape set follows the Alertmanager matcher syntax docs.
	matcherValueEscaper := strings.NewReplacer(`\`, `\\`, `"`, `\"`, "\n", `\n`)

	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		filters := AlertFilters{}

		if state, ok := args["state"].(string); ok && state != "" {
			switch state {
			case "active":
				active := true
				filters.Active = &active
				silenced := false
				filters.Silenced = &silenced
				inhibited := false
				filters.Inhibited = &inhibited
			case "suppressed":
				active := false
				filters.Active = &active
			case "unprocessed":
				unprocessed := true
				filters.Unprocessed = &unprocessed
			}
		}

		if severity, ok := args["severity"].(string); ok && severity != "" {
			matcher := `severity="` + matcherValueEscaper.Replace(severity) + `"`
			filters.Filter = append(filters.Filter, matcher)
		}

		if receiver, ok := args["receiver"].(string); ok && receiver != "" {
			filters.Receiver = receiver
		}

		alerts, err := am.ListAlerts(ctx, filters)
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to list alerts: %w", err)), nil
		}

		return mcp.CallToolResult{
			Content: []mcp.Content{mcp.TextContent(formatAlerts(alerts))},
		}, nil
	}
}
// makeGetAlertHandler returns the handler for the "get_alert" tool.
//
// The handler requires a non-empty string "fingerprint" argument. It lists
// alerts with an empty filter set and formats the first alert whose
// fingerprint matches exactly. A missing argument, a listing failure or no
// match is reported as MCP error content; the Go error is always nil.
func makeGetAlertHandler(am *AlertmanagerClient) mcp.ToolHandler {
	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		wanted, _ := args["fingerprint"].(string)
		if wanted == "" {
			return mcp.ErrorContent(fmt.Errorf("fingerprint is required")), nil
		}

		// The lookup is a linear scan over the full, unfiltered alert list.
		all, err := am.ListAlerts(ctx, AlertFilters{})
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to fetch alerts: %w", err)), nil
		}

		for i := range all {
			if all[i].Fingerprint != wanted {
				continue
			}
			text := formatAlerts([]Alert{all[i]})
			return mcp.CallToolResult{
				Content: []mcp.Content{mcp.TextContent(text)},
			}, nil
		}

		return mcp.ErrorContent(fmt.Errorf("alert with fingerprint '%s' not found", wanted)), nil
	}
}
// makeSearchMetricsHandler returns the handler for the "search_metrics" tool.
//
// Arguments:
//   - query (optional string): case-insensitive substring to match against
//     metric names; empty matches every name.
//   - limit (optional number): maximum number of names returned; values that
//     are missing, non-numeric or not greater than zero fall back to 50.
//
// Metric names come from the "__name__" label values. Metadata is fetched
// with an empty metric argument and handed to the formatter together with the
// matched names; a metadata failure is tolerated and formatting proceeds with
// nil metadata. A failure to fetch the names is reported as MCP error content.
func makeSearchMetricsHandler(prom *PrometheusClient) mcp.ToolHandler {
	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		needle, _ := args["query"].(string)

		maxResults := 50
		if requested, ok := args["limit"].(float64); ok && requested > 0 {
			maxResults = int(requested)
		}

		names, err := prom.LabelValues(ctx, "__name__")
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to fetch metric names: %w", err)), nil
		}

		// Collect names in the order received, stopping once the cap is hit.
		var hits []string
		matchAll := needle == ""
		loweredNeedle := strings.ToLower(needle)
		for _, name := range names {
			if !matchAll && !strings.Contains(strings.ToLower(name), loweredNeedle) {
				continue
			}
			hits = append(hits, name)
			if len(hits) >= maxResults {
				break
			}
		}

		// Metadata is best-effort: on failure the names are still returned.
		metadata, metaErr := prom.Metadata(ctx, "")
		if metaErr != nil {
			metadata = nil
		}

		text := formatMetricSearch(hits, metadata)
		return mcp.CallToolResult{
			Content: []mcp.Content{mcp.TextContent(text)},
		}, nil
	}
}
// makeGetMetricMetadataHandler returns the handler for the
// "get_metric_metadata" tool.
//
// The handler requires a non-empty string "metric" argument, asks Prometheus
// for that metric's metadata and formats the entries stored under the metric
// name. A missing argument or a fetch failure is reported as MCP error
// content; the Go error is always nil.
func makeGetMetricMetadataHandler(prom *PrometheusClient) mcp.ToolHandler {
	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		name, _ := args["metric"].(string)
		if name == "" {
			return mcp.ErrorContent(fmt.Errorf("metric is required")), nil
		}

		byMetric, err := prom.Metadata(ctx, name)
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to fetch metadata: %w", err)), nil
		}

		text := formatMetricMetadata(name, byMetric[name])
		return mcp.CallToolResult{
			Content: []mcp.Content{mcp.TextContent(text)},
		}, nil
	}
}
// makeQueryHandler returns the handler for the "query" tool.
//
// The handler requires a non-empty string "promql" argument and evaluates it
// with a zero time.Time as the evaluation time.
// NOTE(review): presumably the client treats the zero time as "now" — confirm.
//
// Output depends on the result type:
//   - "vector": the samples formatted by formatInstantVector.
//   - "scalar": the second element of the first sample's Value when it is a
//     string, otherwise a "no value" message.
//   - anything else: the result type followed by the formatInstantVector
//     rendering of the result.
//
// A missing argument or a query failure is reported as MCP error content.
func makeQueryHandler(prom *PrometheusClient) mcp.ToolHandler {
	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		expr, _ := args["promql"].(string)
		if expr == "" {
			return mcp.ErrorContent(fmt.Errorf("promql is required")), nil
		}

		data, err := prom.Query(ctx, expr, time.Time{})
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("query failed: %w", err)), nil
		}

		var text string
		switch data.ResultType {
		case "scalar":
			// Start from the fallback and overwrite it only when a string
			// value is actually present at index 1.
			text = "Scalar query returned no value."
			if samples := data.Result; len(samples) > 0 && len(samples[0].Value) >= 2 {
				if value, isString := samples[0].Value[1].(string); isString {
					text = fmt.Sprintf("**Scalar result:** %s", value)
				}
			}
		case "vector":
			text = formatInstantVector(data.Result)
		default:
			rendered := formatInstantVector(data.Result)
			text = fmt.Sprintf("Result type: %s\n\n%s", data.ResultType, rendered)
		}

		return mcp.CallToolResult{
			Content: []mcp.Content{mcp.TextContent(text)},
		}, nil
	}
}
// makeListTargetsHandler returns the handler for the "list_targets" tool.
//
// The handler ignores its arguments, fetches the scrape targets from
// Prometheus and returns them rendered by formatTargets. A fetch failure is
// reported as MCP error content; the Go error is always nil.
func makeListTargetsHandler(prom *PrometheusClient) mcp.ToolHandler {
	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		targets, err := prom.Targets(ctx)
		if err != nil {
			wrapped := fmt.Errorf("failed to fetch targets: %w", err)
			return mcp.ErrorContent(wrapped), nil
		}

		body := mcp.TextContent(formatTargets(targets))
		return mcp.CallToolResult{Content: []mcp.Content{body}}, nil
	}
}
// makeListSilencesHandler returns the handler for the "list_silences" tool.
//
// The handler ignores its arguments, fetches all silences from Alertmanager
// and keeps only those whose status is present and whose state is "active" or
// "pending". A fetch failure is reported as MCP error content; the Go error
// is always nil.
func makeListSilencesHandler(am *AlertmanagerClient) mcp.ToolHandler {
	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		all, err := am.ListSilences(ctx)
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to fetch silences: %w", err)), nil
		}

		var current []Silence
		for _, silence := range all {
			// Silences without a status cannot be classified and are skipped.
			if silence.Status == nil {
				continue
			}
			switch silence.Status.State {
			case "active", "pending":
				current = append(current, silence)
			}
		}

		text := formatSilences(current)
		return mcp.CallToolResult{
			Content: []mcp.Content{mcp.TextContent(text)},
		}, nil
	}
}
// makeCreateSilenceHandler returns the handler for the "create_silence" tool.
//
// Required string arguments (each must be non-empty):
//   - matchers: a JSON array that decodes into []Matcher, with at least one
//     element.
//   - duration: a Go duration string (time.ParseDuration) that is strictly
//     positive.
//   - author: stored as the silence's CreatedBy.
//   - comment: stored as the silence's Comment.
//
// The silence starts at the current time and ends duration later. On success
// the handler returns a short summary with the new silence ID, its expiry in
// RFC 3339 format, the author and the comment. Every validation or
// Alertmanager failure is reported as MCP error content; the Go error is
// always nil.
func makeCreateSilenceHandler(am *AlertmanagerClient) mcp.ToolHandler {
	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		matchersJSON, _ := args["matchers"].(string)
		if matchersJSON == "" {
			return mcp.ErrorContent(fmt.Errorf("matchers is required")), nil
		}

		durationStr, _ := args["duration"].(string)
		if durationStr == "" {
			return mcp.ErrorContent(fmt.Errorf("duration is required")), nil
		}

		author, _ := args["author"].(string)
		if author == "" {
			return mcp.ErrorContent(fmt.Errorf("author is required")), nil
		}

		comment, _ := args["comment"].(string)
		if comment == "" {
			return mcp.ErrorContent(fmt.Errorf("comment is required")), nil
		}

		// Parse matchers
		var matchers []Matcher
		if err := parseJSON(matchersJSON, &matchers); err != nil {
			return mcp.ErrorContent(fmt.Errorf("invalid matchers JSON: %w", err)), nil
		}
		// "[]" and "null" decode without error but yield no matchers; reject
		// them here with a clear message instead of sending the request.
		if len(matchers) == 0 {
			return mcp.ErrorContent(fmt.Errorf("invalid matchers: at least one matcher is required")), nil
		}

		// Parse duration
		duration, err := time.ParseDuration(durationStr)
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("invalid duration: %w", err)), nil
		}
		// A zero or negative duration would produce a silence that ends at or
		// before its start time.
		if duration <= 0 {
			return mcp.ErrorContent(fmt.Errorf("invalid duration: must be positive, got %q", durationStr)), nil
		}

		now := time.Now()
		silence := Silence{
			Matchers:  matchers,
			StartsAt:  now,
			EndsAt:    now.Add(duration),
			CreatedBy: author,
			Comment:   comment,
		}

		id, err := am.CreateSilence(ctx, silence)
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to create silence: %w", err)), nil
		}

		var sb strings.Builder
		sb.WriteString("Silence created successfully.\n\n")
		fmt.Fprintf(&sb, "**ID:** %s\n", id)
		fmt.Fprintf(&sb, "**Expires:** %s\n", silence.EndsAt.Format(time.RFC3339))
		fmt.Fprintf(&sb, "**Author:** %s\n", author)
		fmt.Fprintf(&sb, "**Comment:** %s\n", comment)

		return mcp.CallToolResult{
			Content: []mcp.Content{mcp.TextContent(sb.String())},
		}, nil
	}
}
// parseJSON decodes the JSON document held in s into v by passing the string's
// bytes to json.Unmarshal. The error from json.Unmarshal, if any, is returned
// unchanged; nil means v was populated.
func parseJSON(s string, v interface{}) error {
	raw := []byte(s)
	if err := json.Unmarshal(raw, v); err != nil {
		return err
	}
	return nil
}