feat: add lab-monitoring MCP server for Prometheus and Alertmanager

New MCP server that queries live Prometheus and Alertmanager HTTP APIs with 8 tools: list_alerts, get_alert, search_metrics, get_metric_metadata, query (PromQL), list_targets, list_silences, and create_silence. Extends the MCP core with ModeCustom and NewGenericServer for servers that don't require a database. Includes CLI with direct commands (alerts, query, targets, metrics), NixOS module, and comprehensive httptest-based tests. Bumps existing binaries to 0.2.1 due to shared internal/mcp change. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-04 23:11:53 +01:00
parent 0bd4ed778a
commit 1755364bba
19 changed files with 2567 additions and 22 deletions
--- a/internal/monitoring/handlers.go
+++ b/internal/monitoring/handlers.go
@@ -0,0 +1,434 @@
+package monitoring
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"strings"
+	"time"
+
+	"git.t-juice.club/torjus/labmcp/internal/mcp"
+)
+
+// RegisterHandlers wires every monitoring tool into the given MCP server.
+//
+// The alert and silence tools are backed by am; the metric, query, and target
+// tools are backed by prom. Each tool definition is paired with its handler
+// and registered in the order listed below.
+func RegisterHandlers(server *mcp.Server, prom *PrometheusClient, am *AlertmanagerClient) {
+	registrations := []struct {
+		tool    mcp.Tool
+		handler mcp.ToolHandler
+	}{
+		{listAlertsTool(), makeListAlertsHandler(am)},
+		{getAlertTool(), makeGetAlertHandler(am)},
+		{searchMetricsTool(), makeSearchMetricsHandler(prom)},
+		{getMetricMetadataTool(), makeGetMetricMetadataHandler(prom)},
+		{queryTool(), makeQueryHandler(prom)},
+		{listTargetsTool(), makeListTargetsHandler(prom)},
+		{listSilencesTool(), makeListSilencesHandler(am)},
+		{createSilenceTool(), makeCreateSilenceHandler(am)},
+	}
+
+	for _, r := range registrations {
+		server.RegisterTool(r.tool, r.handler)
+	}
+}
+
+// Tool definitions
+
+// listAlertsTool returns the definition of the "list_alerts" tool.
+//
+// All three inputs (state, severity, receiver) are optional strings; state is
+// restricted to an enumerated set of values.
+func listAlertsTool() mcp.Tool {
+	props := map[string]mcp.Property{
+		"state": {
+			Type:        "string",
+			Description: "Filter by alert state: 'active', 'suppressed', or 'unprocessed'",
+			Enum:        []string{"active", "suppressed", "unprocessed"},
+		},
+		"severity": {
+			Type:        "string",
+			Description: "Filter by severity label (e.g., 'critical', 'warning')",
+		},
+		"receiver": {
+			Type:        "string",
+			Description: "Filter by receiver name",
+		},
+	}
+
+	schema := mcp.InputSchema{
+		Type:       "object",
+		Properties: props,
+	}
+
+	return mcp.Tool{
+		Name:        "list_alerts",
+		Description: "List alerts from Alertmanager with optional filters",
+		InputSchema: schema,
+	}
+}
+
+// getAlertTool returns the definition of the "get_alert" tool, which takes a
+// single required string input: the alert fingerprint.
+func getAlertTool() mcp.Tool {
+	props := map[string]mcp.Property{
+		"fingerprint": {
+			Type:        "string",
+			Description: "Alert fingerprint identifier",
+		},
+	}
+
+	schema := mcp.InputSchema{
+		Type:       "object",
+		Properties: props,
+		Required:   []string{"fingerprint"},
+	}
+
+	return mcp.Tool{
+		Name:        "get_alert",
+		Description: "Get full details for a specific alert by fingerprint",
+		InputSchema: schema,
+	}
+}
+
+// searchMetricsTool returns the definition of the "search_metrics" tool.
+//
+// Both inputs are optional: "query" is a substring filter and "limit" caps the
+// number of results, with a declared default of 50.
+func searchMetricsTool() mcp.Tool {
+	props := map[string]mcp.Property{
+		"query": {
+			Type:        "string",
+			Description: "Substring to filter metric names (e.g., 'cpu', 'memory', 'node_'). Empty returns all metrics.",
+		},
+		"limit": {
+			Type:        "integer",
+			Description: "Maximum number of results (default: 50)",
+			Default:     50,
+		},
+	}
+
+	schema := mcp.InputSchema{
+		Type:       "object",
+		Properties: props,
+	}
+
+	return mcp.Tool{
+		Name:        "search_metrics",
+		Description: "Search Prometheus metric names with optional substring filter, enriched with metadata (type, help text)",
+		InputSchema: schema,
+	}
+}
+
+// getMetricMetadataTool returns the definition of the "get_metric_metadata"
+// tool, which takes a single required string input: the metric name.
+func getMetricMetadataTool() mcp.Tool {
+	props := map[string]mcp.Property{
+		"metric": {
+			Type:        "string",
+			Description: "Metric name (e.g., 'node_cpu_seconds_total')",
+		},
+	}
+
+	schema := mcp.InputSchema{
+		Type:       "object",
+		Properties: props,
+		Required:   []string{"metric"},
+	}
+
+	return mcp.Tool{
+		Name:        "get_metric_metadata",
+		Description: "Get type, help text, and unit for a specific Prometheus metric",
+		InputSchema: schema,
+	}
+}
+
+// queryTool returns the definition of the "query" tool, which takes a single
+// required string input: the PromQL expression to evaluate.
+func queryTool() mcp.Tool {
+	props := map[string]mcp.Property{
+		"promql": {
+			Type:        "string",
+			Description: "PromQL expression to evaluate (e.g., 'up', 'rate(http_requests_total[5m])', 'avg_over_time(node_load1[1h])')",
+		},
+	}
+
+	schema := mcp.InputSchema{
+		Type:       "object",
+		Properties: props,
+		Required:   []string{"promql"},
+	}
+
+	return mcp.Tool{
+		Name:        "query",
+		Description: "Execute an instant PromQL query against Prometheus. Supports aggregations like avg_over_time(metric[1h]), rate(), sum(), etc.",
+		InputSchema: schema,
+	}
+}
+
+// listTargetsTool returns the definition of the "list_targets" tool. The tool
+// takes no inputs, so its schema carries an empty (non-nil) property map.
+func listTargetsTool() mcp.Tool {
+	noInputs := mcp.InputSchema{
+		Type:       "object",
+		Properties: map[string]mcp.Property{},
+	}
+
+	return mcp.Tool{
+		Name:        "list_targets",
+		Description: "List Prometheus scrape targets with health status, grouped by job",
+		InputSchema: noInputs,
+	}
+}
+
+// listSilencesTool returns the definition of the "list_silences" tool. The
+// tool takes no inputs, so its schema carries an empty (non-nil) property map.
+func listSilencesTool() mcp.Tool {
+	noInputs := mcp.InputSchema{
+		Type:       "object",
+		Properties: map[string]mcp.Property{},
+	}
+
+	return mcp.Tool{
+		Name:        "list_silences",
+		Description: "List active and pending alert silences from Alertmanager",
+		InputSchema: noInputs,
+	}
+}
+
+// createSilenceTool returns the definition of the "create_silence" tool.
+//
+// All four inputs (matchers, duration, author, comment) are required strings;
+// matchers is a JSON array encoded as a string.
+func createSilenceTool() mcp.Tool {
+	props := map[string]mcp.Property{
+		"matchers": {
+			Type:        "string",
+			Description: `JSON array of matchers, e.g. [{"name":"alertname","value":"TargetDown","isRegex":false}]`,
+		},
+		"duration": {
+			Type:        "string",
+			Description: "Silence duration in Go duration format (e.g., '2h', '30m', '1h30m')",
+		},
+		"author": {
+			Type:        "string",
+			Description: "Author of the silence",
+		},
+		"comment": {
+			Type:        "string",
+			Description: "Reason for the silence",
+		},
+	}
+
+	schema := mcp.InputSchema{
+		Type:       "object",
+		Properties: props,
+		Required:   []string{"matchers", "duration", "author", "comment"},
+	}
+
+	return mcp.Tool{
+		Name:        "create_silence",
+		Description: `Create a new silence in Alertmanager. IMPORTANT: Always confirm with the user before creating a silence, showing them the matchers, duration, and reason.`,
+		InputSchema: schema,
+	}
+}
+
+// Handler constructors
+
+// makeListAlertsHandler builds the handler for the "list_alerts" tool.
+//
+// The handler reads three optional string arguments:
+//   - "state": "active", "suppressed", or "unprocessed"; any other non-empty
+//     value is rejected with a tool error instead of being silently ignored.
+//   - "severity": added as a severity="<value>" label matcher.
+//   - "receiver": passed through as the receiver filter.
+//
+// Failures are reported as tool error content; the returned Go error is
+// always nil.
+func makeListAlertsHandler(am *AlertmanagerClient) mcp.ToolHandler {
+	// The severity value is embedded in a double-quoted matcher, so backslash,
+	// double quote, and newline must be escaped; otherwise a value containing
+	// a quote would produce a malformed (or different) filter expression.
+	// A Replacer is safe for concurrent use, so one instance is shared.
+	escapeValue := strings.NewReplacer(`\`, `\\`, `"`, `\"`, "\n", `\n`)
+
+	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
+		filters := AlertFilters{}
+
+		if state, ok := args["state"].(string); ok && state != "" {
+			switch state {
+			case "active":
+				// Active means firing and neither silenced nor inhibited.
+				active := true
+				filters.Active = &active
+				silenced := false
+				filters.Silenced = &silenced
+				inhibited := false
+				filters.Inhibited = &inhibited
+			case "suppressed":
+				active := false
+				filters.Active = &active
+			case "unprocessed":
+				unprocessed := true
+				filters.Unprocessed = &unprocessed
+			default:
+				return mcp.ErrorContent(fmt.Errorf("invalid state %q: must be 'active', 'suppressed', or 'unprocessed'", state)), nil
+			}
+		}
+
+		if severity, ok := args["severity"].(string); ok && severity != "" {
+			filters.Filter = append(filters.Filter, `severity="`+escapeValue.Replace(severity)+`"`)
+		}
+
+		if receiver, ok := args["receiver"].(string); ok && receiver != "" {
+			filters.Receiver = receiver
+		}
+
+		alerts, err := am.ListAlerts(ctx, filters)
+		if err != nil {
+			return mcp.ErrorContent(fmt.Errorf("failed to list alerts: %w", err)), nil
+		}
+
+		return mcp.CallToolResult{
+			Content: []mcp.Content{mcp.TextContent(formatAlerts(alerts))},
+		}, nil
+	}
+}
+
+// makeGetAlertHandler builds the handler for the "get_alert" tool.
+//
+// The handler requires a non-empty "fingerprint" string argument. It lists
+// all alerts with no filters applied and returns the first one whose
+// fingerprint matches; if none matches, a "not found" tool error is returned.
+// Failures are reported as tool error content; the Go error is always nil.
+func makeGetAlertHandler(am *AlertmanagerClient) mcp.ToolHandler {
+	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
+		wanted, ok := args["fingerprint"].(string)
+		if !ok || wanted == "" {
+			return mcp.ErrorContent(fmt.Errorf("fingerprint is required")), nil
+		}
+
+		// The lookup is done client-side: pull the unfiltered list and scan it.
+		all, err := am.ListAlerts(ctx, AlertFilters{})
+		if err != nil {
+			return mcp.ErrorContent(fmt.Errorf("failed to fetch alerts: %w", err)), nil
+		}
+
+		for i := range all {
+			if all[i].Fingerprint != wanted {
+				continue
+			}
+			text := formatAlerts([]Alert{all[i]})
+			return mcp.CallToolResult{
+				Content: []mcp.Content{mcp.TextContent(text)},
+			}, nil
+		}
+
+		return mcp.ErrorContent(fmt.Errorf("alert with fingerprint '%s' not found", wanted)), nil
+	}
+}
+
+// makeSearchMetricsHandler builds the handler for the "search_metrics" tool.
+//
+// Optional arguments:
+//   - "query": case-insensitive substring to match against metric names; an
+//     empty query matches every name.
+//   - "limit": maximum number of names returned (default 50); only positive
+//     numeric values override the default.
+//
+// Metric names come from the values of the "__name__" label. Metadata is
+// fetched on a best-effort basis: if that request fails, the names are still
+// returned without metadata. A failure to fetch the names themselves is
+// reported as tool error content; the Go error is always nil.
+func makeSearchMetricsHandler(prom *PrometheusClient) mcp.ToolHandler {
+	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
+		query, _ := args["query"].(string)
+		limit := 50
+		// JSON numbers decode as float64, hence the float assertion.
+		if l, ok := args["limit"].(float64); ok && l > 0 {
+			limit = int(l)
+		}
+
+		// Get all metric names
+		allNames, err := prom.LabelValues(ctx, "__name__")
+		if err != nil {
+			return mcp.ErrorContent(fmt.Errorf("failed to fetch metric names: %w", err)), nil
+		}
+
+		// Filter by substring, stopping as soon as the limit is reached.
+		var matched []string
+		queryLower := strings.ToLower(query)
+		for _, name := range allNames {
+			if query == "" || strings.Contains(strings.ToLower(name), queryLower) {
+				matched = append(matched, name)
+				if len(matched) >= limit {
+					break
+				}
+			}
+		}
+
+		// Nothing matched: skip the metadata round-trip entirely, since there
+		// is no metric to enrich.
+		if len(matched) == 0 {
+			return mcp.CallToolResult{
+				Content: []mcp.Content{mcp.TextContent(formatMetricSearch(matched, nil))},
+			}, nil
+		}
+
+		// Fetch metadata with an empty metric argument (not restricted to the
+		// matched names); the formatter receives it alongside the matches.
+		metadata, err := prom.Metadata(ctx, "")
+		if err != nil {
+			// Non-fatal: proceed without metadata
+			metadata = nil
+		}
+
+		return mcp.CallToolResult{
+			Content: []mcp.Content{mcp.TextContent(formatMetricSearch(matched, metadata))},
+		}, nil
+	}
+}
+
+// makeGetMetricMetadataHandler builds the handler for the
+// "get_metric_metadata" tool.
+//
+// The handler requires a non-empty "metric" string argument, requests the
+// metadata for that metric, and formats the entries stored under that name.
+// Failures are reported as tool error content; the Go error is always nil.
+func makeGetMetricMetadataHandler(prom *PrometheusClient) mcp.ToolHandler {
+	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
+		name, _ := args["metric"].(string)
+		if len(name) == 0 {
+			return mcp.ErrorContent(fmt.Errorf("metric is required")), nil
+		}
+
+		byMetric, err := prom.Metadata(ctx, name)
+		if err != nil {
+			return mcp.ErrorContent(fmt.Errorf("failed to fetch metadata: %w", err)), nil
+		}
+
+		// A missing key yields the zero value, which the formatter receives as-is.
+		text := formatMetricMetadata(name, byMetric[name])
+		return mcp.CallToolResult{
+			Content: []mcp.Content{mcp.TextContent(text)},
+		}, nil
+	}
+}
+
+// makeQueryHandler builds the handler for the "query" tool.
+//
+// The handler requires a non-empty "promql" string argument and runs it as a
+// query with a zero time.Time as the evaluation time. The output depends on
+// the result type:
+//   - "vector": the formatted instant vector.
+//   - "scalar": the second element of the first result's value when it is a
+//     string, otherwise a "no value" message.
+//   - anything else: the result type name followed by the vector formatting.
+//
+// Failures are reported as tool error content; the Go error is always nil.
+func makeQueryHandler(prom *PrometheusClient) mcp.ToolHandler {
+	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
+		expr, _ := args["promql"].(string)
+		if expr == "" {
+			return mcp.ErrorContent(fmt.Errorf("promql is required")), nil
+		}
+
+		// NOTE(review): the zero time presumably means "evaluate now" —
+		// confirm against PrometheusClient.Query.
+		var evalTime time.Time
+		data, err := prom.Query(ctx, expr, evalTime)
+		if err != nil {
+			return mcp.ErrorContent(fmt.Errorf("query failed: %w", err)), nil
+		}
+
+		var text string
+		if data.ResultType == "vector" {
+			text = formatInstantVector(data.Result)
+		} else if data.ResultType == "scalar" {
+			// Start from the fallback and overwrite it only when a string
+			// value is actually present.
+			text = "Scalar query returned no value."
+			if len(data.Result) > 0 && len(data.Result[0].Value) >= 2 {
+				if v, ok := data.Result[0].Value[1].(string); ok {
+					text = fmt.Sprintf("**Scalar result:** %s", v)
+				}
+			}
+		} else {
+			text = fmt.Sprintf("Result type: %s\n\n%s", data.ResultType, formatInstantVector(data.Result))
+		}
+
+		return mcp.CallToolResult{
+			Content: []mcp.Content{mcp.TextContent(text)},
+		}, nil
+	}
+}
+
+// makeListTargetsHandler builds the handler for the "list_targets" tool.
+//
+// The handler ignores its arguments, fetches the targets from Prometheus, and
+// returns them formatted as text. A fetch failure is reported as tool error
+// content; the Go error is always nil.
+func makeListTargetsHandler(prom *PrometheusClient) mcp.ToolHandler {
+	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
+		targets, err := prom.Targets(ctx)
+		if err != nil {
+			return mcp.ErrorContent(fmt.Errorf("failed to fetch targets: %w", err)), nil
+		}
+
+		text := formatTargets(targets)
+		return mcp.CallToolResult{
+			Content: []mcp.Content{mcp.TextContent(text)},
+		}, nil
+	}
+}
+
+// makeListSilencesHandler builds the handler for the "list_silences" tool.
+//
+// The handler ignores its arguments, fetches all silences, and keeps only
+// those whose status is present and in the "active" or "pending" state.
+// A fetch failure is reported as tool error content; the Go error is always
+// nil.
+func makeListSilencesHandler(am *AlertmanagerClient) mcp.ToolHandler {
+	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
+		all, err := am.ListSilences(ctx)
+		if err != nil {
+			return mcp.ErrorContent(fmt.Errorf("failed to fetch silences: %w", err)), nil
+		}
+
+		// Drop silences without a status as well as those in any other state.
+		var kept []Silence
+		for _, s := range all {
+			if s.Status == nil {
+				continue
+			}
+			switch s.Status.State {
+			case "active", "pending":
+				kept = append(kept, s)
+			}
+		}
+
+		text := formatSilences(kept)
+		return mcp.CallToolResult{
+			Content: []mcp.Content{mcp.TextContent(text)},
+		}, nil
+	}
+}
+
+// makeCreateSilenceHandler builds the handler for the "create_silence" tool.
+//
+// Required string arguments:
+//   - "matchers": JSON array of matchers; must decode to at least one entry.
+//   - "duration": Go duration string; must be strictly positive.
+//   - "author": recorded as the silence creator.
+//   - "comment": recorded as the silence comment.
+//
+// The silence starts at the current time and ends after the given duration.
+// On success the response lists the new silence ID, its expiry (RFC 3339),
+// the author, and the comment. Validation and request failures are reported
+// as tool error content; the Go error is always nil.
+func makeCreateSilenceHandler(am *AlertmanagerClient) mcp.ToolHandler {
+	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
+		matchersJSON, _ := args["matchers"].(string)
+		if matchersJSON == "" {
+			return mcp.ErrorContent(fmt.Errorf("matchers is required")), nil
+		}
+
+		durationStr, _ := args["duration"].(string)
+		if durationStr == "" {
+			return mcp.ErrorContent(fmt.Errorf("duration is required")), nil
+		}
+
+		author, _ := args["author"].(string)
+		if author == "" {
+			return mcp.ErrorContent(fmt.Errorf("author is required")), nil
+		}
+
+		comment, _ := args["comment"].(string)
+		if comment == "" {
+			return mcp.ErrorContent(fmt.Errorf("comment is required")), nil
+		}
+
+		// Parse matchers
+		var matchers []Matcher
+		if err := parseJSON(matchersJSON, &matchers); err != nil {
+			return mcp.ErrorContent(fmt.Errorf("invalid matchers JSON: %w", err)), nil
+		}
+		// "[]" and "null" both decode without error but leave nothing to
+		// match on, so reject them before contacting Alertmanager.
+		if len(matchers) == 0 {
+			return mcp.ErrorContent(fmt.Errorf("matchers must contain at least one matcher")), nil
+		}
+
+		// Parse duration
+		duration, err := time.ParseDuration(durationStr)
+		if err != nil {
+			return mcp.ErrorContent(fmt.Errorf("invalid duration: %w", err)), nil
+		}
+		// time.ParseDuration accepts "0" and negative values, which would
+		// yield a silence that ends at or before its start.
+		if duration <= 0 {
+			return mcp.ErrorContent(fmt.Errorf("invalid duration: must be positive, got %s", duration)), nil
+		}
+
+		now := time.Now()
+		silence := Silence{
+			Matchers:  matchers,
+			StartsAt:  now,
+			EndsAt:    now.Add(duration),
+			CreatedBy: author,
+			Comment:   comment,
+		}
+
+		id, err := am.CreateSilence(ctx, silence)
+		if err != nil {
+			return mcp.ErrorContent(fmt.Errorf("failed to create silence: %w", err)), nil
+		}
+
+		var sb strings.Builder
+		sb.WriteString("Silence created successfully.\n\n")
+		fmt.Fprintf(&sb, "**ID:** %s\n", id)
+		fmt.Fprintf(&sb, "**Expires:** %s\n", silence.EndsAt.Format(time.RFC3339))
+		fmt.Fprintf(&sb, "**Author:** %s\n", author)
+		fmt.Fprintf(&sb, "**Comment:** %s\n", comment)
+
+		return mcp.CallToolResult{
+			Content: []mcp.Content{mcp.TextContent(sb.String())},
+		}, nil
+	}
+}
+
+// parseJSON decodes the JSON document held in the string s into v, which
+// must be a pointer as required by json.Unmarshal. It returns the decoding
+// error, if any.
+func parseJSON(s string, v interface{}) error {
+	raw := []byte(s)
+	return json.Unmarshal(raw, v)
+}