package monitoring import ( "context" "encoding/json" "fmt" "strings" "time" "git.t-juice.club/torjus/labmcp/internal/mcp" ) // AlertSummary queries Alertmanager for active (non-silenced) alerts and returns // a short summary string. Returns empty string if there are no alerts or if // Alertmanager is unreachable. func AlertSummary(am *AlertmanagerClient) string { active := true silenced := false inhibited := false alerts, err := am.ListAlerts(context.Background(), AlertFilters{ Active: &active, Silenced: &silenced, Inhibited: &inhibited, }) if err != nil || len(alerts) == 0 { return "" } // Count by severity severities := make(map[string]int) for _, a := range alerts { sev := a.Labels["severity"] if sev == "" { sev = "unknown" } severities[sev]++ } var parts []string // Show critical first if present if n, ok := severities["critical"]; ok { parts = append(parts, fmt.Sprintf("%d critical", n)) delete(severities, "critical") } if n, ok := severities["warning"]; ok { parts = append(parts, fmt.Sprintf("%d warning", n)) delete(severities, "warning") } for sev, n := range severities { parts = append(parts, fmt.Sprintf("%d %s", n, sev)) } return fmt.Sprintf("ALERT STATUS: %d active alert(s) firing (%s). Use list_alerts for details. Let the user know about this.", len(alerts), strings.Join(parts, ", ")) } // HandlerOptions configures which handlers are registered. type HandlerOptions struct { // EnableSilences enables the create_silence tool, which is a write operation. // Disabled by default as a safety measure. EnableSilences bool } // RegisterHandlers registers all monitoring tool handlers on the MCP server. 
func RegisterHandlers(server *mcp.Server, prom *PrometheusClient, am *AlertmanagerClient, loki *LokiClient, opts HandlerOptions) { server.RegisterTool(listAlertsTool(), makeListAlertsHandler(am)) server.RegisterTool(getAlertTool(), makeGetAlertHandler(am)) server.RegisterTool(searchMetricsTool(), makeSearchMetricsHandler(prom)) server.RegisterTool(getMetricMetadataTool(), makeGetMetricMetadataHandler(prom)) server.RegisterTool(queryTool(), makeQueryHandler(prom)) server.RegisterTool(listTargetsTool(), makeListTargetsHandler(prom)) server.RegisterTool(listSilencesTool(), makeListSilencesHandler(am)) if opts.EnableSilences { server.RegisterTool(createSilenceTool(), makeCreateSilenceHandler(am)) } if loki != nil { server.RegisterTool(queryLogsTool(), makeQueryLogsHandler(loki)) server.RegisterTool(listLabelsTool(), makeListLabelsHandler(loki)) server.RegisterTool(listLabelValuesTool(), makeListLabelValuesHandler(loki)) } } // Tool definitions func listAlertsTool() mcp.Tool { return mcp.Tool{ Name: "list_alerts", Description: "List alerts from Alertmanager with optional filters", InputSchema: mcp.InputSchema{ Type: "object", Properties: map[string]mcp.Property{ "state": { Type: "string", Description: "Filter by alert state: 'active', 'suppressed', 'unprocessed', or 'all' (default: active)", Enum: []string{"active", "suppressed", "unprocessed", "all"}, }, "severity": { Type: "string", Description: "Filter by severity label (e.g., 'critical', 'warning')", }, "receiver": { Type: "string", Description: "Filter by receiver name", }, }, }, } } func getAlertTool() mcp.Tool { return mcp.Tool{ Name: "get_alert", Description: "Get full details for a specific alert by fingerprint", InputSchema: mcp.InputSchema{ Type: "object", Properties: map[string]mcp.Property{ "fingerprint": { Type: "string", Description: "Alert fingerprint identifier", }, }, Required: []string{"fingerprint"}, }, } } func searchMetricsTool() mcp.Tool { return mcp.Tool{ Name: "search_metrics", Description: 
"Search Prometheus metric names with optional substring filter, enriched with metadata (type, help text)", InputSchema: mcp.InputSchema{ Type: "object", Properties: map[string]mcp.Property{ "query": { Type: "string", Description: "Substring to filter metric names (e.g., 'cpu', 'memory', 'node_'). Empty returns all metrics.", }, "limit": { Type: "integer", Description: "Maximum number of results (default: 50)", Default: 50, }, }, }, } } func getMetricMetadataTool() mcp.Tool { return mcp.Tool{ Name: "get_metric_metadata", Description: "Get type, help text, and unit for a specific Prometheus metric", InputSchema: mcp.InputSchema{ Type: "object", Properties: map[string]mcp.Property{ "metric": { Type: "string", Description: "Metric name (e.g., 'node_cpu_seconds_total')", }, }, Required: []string{"metric"}, }, } } func queryTool() mcp.Tool { return mcp.Tool{ Name: "query", Description: "Execute an instant PromQL query against Prometheus. Supports aggregations like avg_over_time(metric[1h]), rate(), sum(), etc.", InputSchema: mcp.InputSchema{ Type: "object", Properties: map[string]mcp.Property{ "promql": { Type: "string", Description: "PromQL expression to evaluate (e.g., 'up', 'rate(http_requests_total[5m])', 'avg_over_time(node_load1[1h])')", }, }, Required: []string{"promql"}, }, } } func listTargetsTool() mcp.Tool { return mcp.Tool{ Name: "list_targets", Description: "List Prometheus scrape targets with health status, grouped by job", InputSchema: mcp.InputSchema{ Type: "object", Properties: map[string]mcp.Property{}, }, } } func listSilencesTool() mcp.Tool { return mcp.Tool{ Name: "list_silences", Description: "List active and pending alert silences from Alertmanager", InputSchema: mcp.InputSchema{ Type: "object", Properties: map[string]mcp.Property{}, }, } } func createSilenceTool() mcp.Tool { return mcp.Tool{ Name: "create_silence", Description: `Create a new silence in Alertmanager. 
IMPORTANT: Always confirm with the user before creating a silence, showing them the matchers, duration, and reason.`, InputSchema: mcp.InputSchema{ Type: "object", Properties: map[string]mcp.Property{ "matchers": { Type: "string", Description: `JSON array of matchers, e.g. [{"name":"alertname","value":"TargetDown","isRegex":false}]`, }, "duration": { Type: "string", Description: "Silence duration in Go duration format (e.g., '2h', '30m', '1h30m')", }, "author": { Type: "string", Description: "Author of the silence", }, "comment": { Type: "string", Description: "Reason for the silence", }, }, Required: []string{"matchers", "duration", "author", "comment"}, }, } } // Handler constructors func makeListAlertsHandler(am *AlertmanagerClient) mcp.ToolHandler { return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) { filters := AlertFilters{} state, _ := args["state"].(string) switch state { case "active", "": // Default to active alerts only (non-silenced, non-inhibited) active := true filters.Active = &active silenced := false filters.Silenced = &silenced inhibited := false filters.Inhibited = &inhibited case "suppressed": active := false filters.Active = &active case "unprocessed": unprocessed := true filters.Unprocessed = &unprocessed case "all": // No filters - return everything } if severity, ok := args["severity"].(string); ok && severity != "" { filters.Filter = append(filters.Filter, fmt.Sprintf(`severity="%s"`, severity)) } if receiver, ok := args["receiver"].(string); ok && receiver != "" { filters.Receiver = receiver } alerts, err := am.ListAlerts(ctx, filters) if err != nil { return mcp.ErrorContent(fmt.Errorf("failed to list alerts: %w", err)), nil } return mcp.CallToolResult{ Content: []mcp.Content{mcp.TextContent(formatAlerts(alerts))}, }, nil } } func makeGetAlertHandler(am *AlertmanagerClient) mcp.ToolHandler { return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) { fingerprint, _ := 
args["fingerprint"].(string) if fingerprint == "" { return mcp.ErrorContent(fmt.Errorf("fingerprint is required")), nil } // Fetch all alerts and find the one matching the fingerprint alerts, err := am.ListAlerts(ctx, AlertFilters{}) if err != nil { return mcp.ErrorContent(fmt.Errorf("failed to fetch alerts: %w", err)), nil } for _, a := range alerts { if a.Fingerprint == fingerprint { return mcp.CallToolResult{ Content: []mcp.Content{mcp.TextContent(formatAlerts([]Alert{a}))}, }, nil } } return mcp.ErrorContent(fmt.Errorf("alert with fingerprint '%s' not found", fingerprint)), nil } } func makeSearchMetricsHandler(prom *PrometheusClient) mcp.ToolHandler { return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) { query, _ := args["query"].(string) limit := 50 if l, ok := args["limit"].(float64); ok && l > 0 { limit = int(l) } // Get all metric names allNames, err := prom.LabelValues(ctx, "__name__") if err != nil { return mcp.ErrorContent(fmt.Errorf("failed to fetch metric names: %w", err)), nil } // Filter by substring var matched []string queryLower := strings.ToLower(query) for _, name := range allNames { if query == "" || strings.Contains(strings.ToLower(name), queryLower) { matched = append(matched, name) if len(matched) >= limit { break } } } // Fetch metadata for matched metrics metadata, err := prom.Metadata(ctx, "") if err != nil { // Non-fatal: proceed without metadata metadata = nil } return mcp.CallToolResult{ Content: []mcp.Content{mcp.TextContent(formatMetricSearch(matched, metadata))}, }, nil } } func makeGetMetricMetadataHandler(prom *PrometheusClient) mcp.ToolHandler { return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) { metric, _ := args["metric"].(string) if metric == "" { return mcp.ErrorContent(fmt.Errorf("metric is required")), nil } metadata, err := prom.Metadata(ctx, metric) if err != nil { return mcp.ErrorContent(fmt.Errorf("failed to fetch metadata: %w", err)), nil } 
metas := metadata[metric] return mcp.CallToolResult{ Content: []mcp.Content{mcp.TextContent(formatMetricMetadata(metric, metas))}, }, nil } } func makeQueryHandler(prom *PrometheusClient) mcp.ToolHandler { return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) { promql, _ := args["promql"].(string) if promql == "" { return mcp.ErrorContent(fmt.Errorf("promql is required")), nil } data, err := prom.Query(ctx, promql, time.Time{}) if err != nil { return mcp.ErrorContent(fmt.Errorf("query failed: %w", err)), nil } var result string switch data.ResultType { case "vector": result = formatInstantVector(data.Result) case "scalar": if len(data.Result) > 0 && len(data.Result[0].Value) >= 2 { if v, ok := data.Result[0].Value[1].(string); ok { result = fmt.Sprintf("**Scalar result:** %s", v) } } if result == "" { result = "Scalar query returned no value." } default: result = fmt.Sprintf("Result type: %s\n\n%s", data.ResultType, formatInstantVector(data.Result)) } return mcp.CallToolResult{ Content: []mcp.Content{mcp.TextContent(result)}, }, nil } } func makeListTargetsHandler(prom *PrometheusClient) mcp.ToolHandler { return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) { data, err := prom.Targets(ctx) if err != nil { return mcp.ErrorContent(fmt.Errorf("failed to fetch targets: %w", err)), nil } return mcp.CallToolResult{ Content: []mcp.Content{mcp.TextContent(formatTargets(data))}, }, nil } } func makeListSilencesHandler(am *AlertmanagerClient) mcp.ToolHandler { return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) { silences, err := am.ListSilences(ctx) if err != nil { return mcp.ErrorContent(fmt.Errorf("failed to fetch silences: %w", err)), nil } // Filter to active/pending only var filtered []Silence for _, s := range silences { if s.Status != nil && (s.Status.State == "active" || s.Status.State == "pending") { filtered = append(filtered, s) } } return 
mcp.CallToolResult{ Content: []mcp.Content{mcp.TextContent(formatSilences(filtered))}, }, nil } } func makeCreateSilenceHandler(am *AlertmanagerClient) mcp.ToolHandler { return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) { matchersJSON, _ := args["matchers"].(string) if matchersJSON == "" { return mcp.ErrorContent(fmt.Errorf("matchers is required")), nil } durationStr, _ := args["duration"].(string) if durationStr == "" { return mcp.ErrorContent(fmt.Errorf("duration is required")), nil } author, _ := args["author"].(string) if author == "" { return mcp.ErrorContent(fmt.Errorf("author is required")), nil } comment, _ := args["comment"].(string) if comment == "" { return mcp.ErrorContent(fmt.Errorf("comment is required")), nil } // Parse matchers var matchers []Matcher if err := parseJSON(matchersJSON, &matchers); err != nil { return mcp.ErrorContent(fmt.Errorf("invalid matchers JSON: %w", err)), nil } // Parse duration duration, err := time.ParseDuration(durationStr) if err != nil { return mcp.ErrorContent(fmt.Errorf("invalid duration: %w", err)), nil } now := time.Now() silence := Silence{ Matchers: matchers, StartsAt: now, EndsAt: now.Add(duration), CreatedBy: author, Comment: comment, } id, err := am.CreateSilence(ctx, silence) if err != nil { return mcp.ErrorContent(fmt.Errorf("failed to create silence: %w", err)), nil } var sb strings.Builder sb.WriteString("Silence created successfully.\n\n") sb.WriteString(fmt.Sprintf("**ID:** %s\n", id)) sb.WriteString(fmt.Sprintf("**Expires:** %s\n", silence.EndsAt.Format(time.RFC3339))) sb.WriteString(fmt.Sprintf("**Author:** %s\n", author)) sb.WriteString(fmt.Sprintf("**Comment:** %s\n", comment)) return mcp.CallToolResult{ Content: []mcp.Content{mcp.TextContent(sb.String())}, }, nil } } // parseJSON is a helper to unmarshal JSON from a string. 
func parseJSON(s string, v interface{}) error { return json.Unmarshal([]byte(s), v) } // Loki tool definitions func queryLogsTool() mcp.Tool { return mcp.Tool{ Name: "query_logs", Description: "Execute a LogQL range query against Loki to search and retrieve log entries", InputSchema: mcp.InputSchema{ Type: "object", Properties: map[string]mcp.Property{ "logql": { Type: "string", Description: `LogQL query expression (e.g., '{job="varlogs"}', '{job="nginx"} |= "error"')`, }, "start": { Type: "string", Description: "Start time: relative duration (e.g., '1h', '30m'), RFC3339 timestamp, or Unix epoch seconds. Default: 1h ago", }, "end": { Type: "string", Description: "End time: relative duration (e.g., '5m'), RFC3339 timestamp, or Unix epoch seconds. Default: now", }, "limit": { Type: "integer", Description: "Maximum number of log entries to return (default: 100)", Default: 100, }, "direction": { Type: "string", Description: "Sort order for log entries: 'backward' (newest first) or 'forward' (oldest first)", Enum: []string{"backward", "forward"}, }, }, Required: []string{"logql"}, }, } } func listLabelsTool() mcp.Tool { return mcp.Tool{ Name: "list_labels", Description: "List available label names from Loki", InputSchema: mcp.InputSchema{ Type: "object", Properties: map[string]mcp.Property{}, }, } } func listLabelValuesTool() mcp.Tool { return mcp.Tool{ Name: "list_label_values", Description: "List values for a specific label from Loki", InputSchema: mcp.InputSchema{ Type: "object", Properties: map[string]mcp.Property{ "label": { Type: "string", Description: "Label name to get values for (e.g., 'job', 'instance')", }, }, Required: []string{"label"}, }, } } // Loki handler constructors func makeQueryLogsHandler(loki *LokiClient) mcp.ToolHandler { return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) { logql, _ := args["logql"].(string) if logql == "" { return mcp.ErrorContent(fmt.Errorf("logql is required")), nil } now := time.Now() 
start := now.Add(-time.Hour) end := now if startStr, ok := args["start"].(string); ok && startStr != "" { parsed, err := parseTimeArg(startStr, now.Add(-time.Hour)) if err != nil { return mcp.ErrorContent(fmt.Errorf("invalid start time: %w", err)), nil } start = parsed } if endStr, ok := args["end"].(string); ok && endStr != "" { parsed, err := parseTimeArg(endStr, now) if err != nil { return mcp.ErrorContent(fmt.Errorf("invalid end time: %w", err)), nil } end = parsed } limit := 100 if l, ok := args["limit"].(float64); ok && l > 0 { limit = int(l) } if limit > 5000 { limit = 5000 } direction := "backward" if d, ok := args["direction"].(string); ok && d != "" { if d != "backward" && d != "forward" { return mcp.ErrorContent(fmt.Errorf("direction must be 'backward' or 'forward'")), nil } direction = d } data, err := loki.QueryRange(ctx, logql, start, end, limit, direction) if err != nil { return mcp.ErrorContent(fmt.Errorf("log query failed: %w", err)), nil } return mcp.CallToolResult{ Content: []mcp.Content{mcp.TextContent(formatLogStreams(data))}, }, nil } } func makeListLabelsHandler(loki *LokiClient) mcp.ToolHandler { return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) { labels, err := loki.Labels(ctx) if err != nil { return mcp.ErrorContent(fmt.Errorf("failed to list labels: %w", err)), nil } return mcp.CallToolResult{ Content: []mcp.Content{mcp.TextContent(formatLabels(labels))}, }, nil } } func makeListLabelValuesHandler(loki *LokiClient) mcp.ToolHandler { return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) { label, _ := args["label"].(string) if label == "" { return mcp.ErrorContent(fmt.Errorf("label is required")), nil } values, err := loki.LabelValues(ctx, label) if err != nil { return mcp.ErrorContent(fmt.Errorf("failed to list label values: %w", err)), nil } return mcp.CallToolResult{ Content: []mcp.Content{mcp.TextContent(formatLabelValues(label, values))}, }, nil } } // 
// parseTimeArg parses a time argument that can be:
//   - A relative duration (e.g., "1h", "30m", "2h30m") — interpreted as that
//     duration ago from now
//   - An RFC3339 timestamp (e.g., "2024-01-15T10:30:00Z")
//   - A Unix epoch in seconds (e.g., "1705312200")
//
// The formats are tried in that order, so "0" is read as a zero duration
// (i.e. now) rather than as the Unix epoch. If parsing fails, it returns
// defaultTime together with a non-nil error.
func parseTimeArg(s string, defaultTime time.Time) (time.Time, error) {
	// Try as relative duration first.
	if d, err := time.ParseDuration(s); err == nil {
		return time.Now().Add(-d), nil
	}

	// Try as RFC3339.
	if t, err := time.Parse(time.RFC3339, s); err == nil {
		return t, nil
	}

	// Try as Unix epoch seconds.
	if epoch, ok := parseEpochSeconds(s); ok {
		return time.Unix(epoch, 0), nil
	}

	return defaultTime, fmt.Errorf("cannot parse time '%s': use relative duration (e.g., '1h'), RFC3339, or Unix epoch seconds", s)
}

// parseEpochSeconds parses s as a non-negative decimal integer made only of
// ASCII digits. It reports false if s is empty, contains any other character,
// or does not fit in an int64.
func parseEpochSeconds(s string) (int64, bool) {
	const maxInt64 = 1<<63 - 1

	if s == "" {
		return 0, false
	}

	var epoch int64
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c < '0' || c > '9' {
			return 0, false
		}
		digit := int64(c - '0')
		// Refuse values that would overflow instead of silently wrapping
		// around to an unrelated timestamp.
		if epoch > (maxInt64-digit)/10 {
			return 0, false
		}
		epoch = epoch*10 + digit
	}
	return epoch, true
}