This repository has been archived on 2026-03-10. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
labmcp/internal/monitoring/handlers.go
Torjus Håkestad 4ae92b4f85 chore: migrate module path from git.t-juice.club to code.t-juice.club
Update Go module path and all import references for Gitea to Forgejo
host migration.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 19:48:25 +01:00

681 lines
20 KiB
Go

package monitoring
import (
"context"
"encoding/json"
"fmt"
"strings"
"time"
"code.t-juice.club/torjus/labmcp/internal/mcp"
)
// AlertSummary queries Alertmanager for active (non-silenced) alerts and returns
// a short summary string. Returns empty string if there are no alerts or if
// Alertmanager is unreachable.
func AlertSummary(am *AlertmanagerClient) string {
active := true
silenced := false
inhibited := false
alerts, err := am.ListAlerts(context.Background(), AlertFilters{
Active: &active,
Silenced: &silenced,
Inhibited: &inhibited,
})
if err != nil || len(alerts) == 0 {
return ""
}
// Count by severity
severities := make(map[string]int)
for _, a := range alerts {
sev := a.Labels["severity"]
if sev == "" {
sev = "unknown"
}
severities[sev]++
}
var parts []string
// Show critical first if present
if n, ok := severities["critical"]; ok {
parts = append(parts, fmt.Sprintf("%d critical", n))
delete(severities, "critical")
}
if n, ok := severities["warning"]; ok {
parts = append(parts, fmt.Sprintf("%d warning", n))
delete(severities, "warning")
}
for sev, n := range severities {
parts = append(parts, fmt.Sprintf("%d %s", n, sev))
}
return fmt.Sprintf("ALERT STATUS: %d active alert(s) firing (%s). Use list_alerts for details. Let the user know about this.",
len(alerts), strings.Join(parts, ", "))
}
// HandlerOptions configures which handlers are registered.
type HandlerOptions struct {
// EnableSilences enables the create_silence tool, which is a write operation.
// Disabled by default as a safety measure.
EnableSilences bool
}
// RegisterHandlers registers all monitoring tool handlers on the MCP server.
func RegisterHandlers(server *mcp.Server, prom *PrometheusClient, am *AlertmanagerClient, loki *LokiClient, opts HandlerOptions) {
server.RegisterTool(listAlertsTool(), makeListAlertsHandler(am))
server.RegisterTool(getAlertTool(), makeGetAlertHandler(am))
server.RegisterTool(searchMetricsTool(), makeSearchMetricsHandler(prom))
server.RegisterTool(getMetricMetadataTool(), makeGetMetricMetadataHandler(prom))
server.RegisterTool(queryTool(), makeQueryHandler(prom))
server.RegisterTool(listTargetsTool(), makeListTargetsHandler(prom))
server.RegisterTool(listSilencesTool(), makeListSilencesHandler(am))
if opts.EnableSilences {
server.RegisterTool(createSilenceTool(), makeCreateSilenceHandler(am))
}
if loki != nil {
server.RegisterTool(queryLogsTool(), makeQueryLogsHandler(loki))
server.RegisterTool(listLabelsTool(), makeListLabelsHandler(loki))
server.RegisterTool(listLabelValuesTool(), makeListLabelValuesHandler(loki))
}
}
// Tool definitions
func listAlertsTool() mcp.Tool {
return mcp.Tool{
Name: "list_alerts",
Description: "List alerts from Alertmanager with optional filters",
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{
"state": {
Type: "string",
Description: "Filter by alert state: 'active', 'suppressed', 'unprocessed', or 'all' (default: active)",
Enum: []string{"active", "suppressed", "unprocessed", "all"},
},
"severity": {
Type: "string",
Description: "Filter by severity label (e.g., 'critical', 'warning')",
},
"receiver": {
Type: "string",
Description: "Filter by receiver name",
},
},
},
}
}
func getAlertTool() mcp.Tool {
return mcp.Tool{
Name: "get_alert",
Description: "Get full details for a specific alert by fingerprint",
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{
"fingerprint": {
Type: "string",
Description: "Alert fingerprint identifier",
},
},
Required: []string{"fingerprint"},
},
}
}
func searchMetricsTool() mcp.Tool {
return mcp.Tool{
Name: "search_metrics",
Description: "Search Prometheus metric names with optional substring filter, enriched with metadata (type, help text)",
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{
"query": {
Type: "string",
Description: "Substring to filter metric names (e.g., 'cpu', 'memory', 'node_'). Empty returns all metrics.",
},
"limit": {
Type: "integer",
Description: "Maximum number of results (default: 50)",
Default: 50,
},
},
},
}
}
func getMetricMetadataTool() mcp.Tool {
return mcp.Tool{
Name: "get_metric_metadata",
Description: "Get type, help text, and unit for a specific Prometheus metric",
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{
"metric": {
Type: "string",
Description: "Metric name (e.g., 'node_cpu_seconds_total')",
},
},
Required: []string{"metric"},
},
}
}
func queryTool() mcp.Tool {
return mcp.Tool{
Name: "query",
Description: "Execute an instant PromQL query against Prometheus. Supports aggregations like avg_over_time(metric[1h]), rate(), sum(), etc.",
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{
"promql": {
Type: "string",
Description: "PromQL expression to evaluate (e.g., 'up', 'rate(http_requests_total[5m])', 'avg_over_time(node_load1[1h])')",
},
},
Required: []string{"promql"},
},
}
}
func listTargetsTool() mcp.Tool {
return mcp.Tool{
Name: "list_targets",
Description: "List Prometheus scrape targets with health status, grouped by job",
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{},
},
}
}
func listSilencesTool() mcp.Tool {
return mcp.Tool{
Name: "list_silences",
Description: "List active and pending alert silences from Alertmanager",
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{},
},
}
}
func createSilenceTool() mcp.Tool {
return mcp.Tool{
Name: "create_silence",
Description: `Create a new silence in Alertmanager. IMPORTANT: Always confirm with the user before creating a silence, showing them the matchers, duration, and reason.`,
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{
"matchers": {
Type: "string",
Description: `JSON array of matchers, e.g. [{"name":"alertname","value":"TargetDown","isRegex":false}]`,
},
"duration": {
Type: "string",
Description: "Silence duration in Go duration format (e.g., '2h', '30m', '1h30m')",
},
"author": {
Type: "string",
Description: "Author of the silence",
},
"comment": {
Type: "string",
Description: "Reason for the silence",
},
},
Required: []string{"matchers", "duration", "author", "comment"},
},
}
}
// Handler constructors
func makeListAlertsHandler(am *AlertmanagerClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
filters := AlertFilters{}
state, _ := args["state"].(string)
switch state {
case "active", "":
// Default to active alerts only (non-silenced, non-inhibited)
active := true
filters.Active = &active
silenced := false
filters.Silenced = &silenced
inhibited := false
filters.Inhibited = &inhibited
case "suppressed":
active := false
filters.Active = &active
case "unprocessed":
unprocessed := true
filters.Unprocessed = &unprocessed
case "all":
// No filters - return everything
}
if severity, ok := args["severity"].(string); ok && severity != "" {
filters.Filter = append(filters.Filter, fmt.Sprintf(`severity="%s"`, severity))
}
if receiver, ok := args["receiver"].(string); ok && receiver != "" {
filters.Receiver = receiver
}
alerts, err := am.ListAlerts(ctx, filters)
if err != nil {
return mcp.ErrorContent(fmt.Errorf("failed to list alerts: %w", err)), nil
}
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(formatAlerts(alerts))},
}, nil
}
}
func makeGetAlertHandler(am *AlertmanagerClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
fingerprint, _ := args["fingerprint"].(string)
if fingerprint == "" {
return mcp.ErrorContent(fmt.Errorf("fingerprint is required")), nil
}
// Fetch all alerts and find the one matching the fingerprint
alerts, err := am.ListAlerts(ctx, AlertFilters{})
if err != nil {
return mcp.ErrorContent(fmt.Errorf("failed to fetch alerts: %w", err)), nil
}
for _, a := range alerts {
if a.Fingerprint == fingerprint {
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(formatAlerts([]Alert{a}))},
}, nil
}
}
return mcp.ErrorContent(fmt.Errorf("alert with fingerprint '%s' not found", fingerprint)), nil
}
}
func makeSearchMetricsHandler(prom *PrometheusClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
query, _ := args["query"].(string)
limit := 50
if l, ok := args["limit"].(float64); ok && l > 0 {
limit = int(l)
}
// Get all metric names
allNames, err := prom.LabelValues(ctx, "__name__")
if err != nil {
return mcp.ErrorContent(fmt.Errorf("failed to fetch metric names: %w", err)), nil
}
// Filter by substring
var matched []string
queryLower := strings.ToLower(query)
for _, name := range allNames {
if query == "" || strings.Contains(strings.ToLower(name), queryLower) {
matched = append(matched, name)
if len(matched) >= limit {
break
}
}
}
// Fetch metadata for matched metrics
metadata, err := prom.Metadata(ctx, "")
if err != nil {
// Non-fatal: proceed without metadata
metadata = nil
}
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(formatMetricSearch(matched, metadata))},
}, nil
}
}
func makeGetMetricMetadataHandler(prom *PrometheusClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
metric, _ := args["metric"].(string)
if metric == "" {
return mcp.ErrorContent(fmt.Errorf("metric is required")), nil
}
metadata, err := prom.Metadata(ctx, metric)
if err != nil {
return mcp.ErrorContent(fmt.Errorf("failed to fetch metadata: %w", err)), nil
}
metas := metadata[metric]
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(formatMetricMetadata(metric, metas))},
}, nil
}
}
func makeQueryHandler(prom *PrometheusClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
promql, _ := args["promql"].(string)
if promql == "" {
return mcp.ErrorContent(fmt.Errorf("promql is required")), nil
}
data, err := prom.Query(ctx, promql, time.Time{})
if err != nil {
return mcp.ErrorContent(fmt.Errorf("query failed: %w", err)), nil
}
var result string
switch data.ResultType {
case "vector":
result = formatInstantVector(data.Result)
case "scalar":
if len(data.Result) > 0 && len(data.Result[0].Value) >= 2 {
if v, ok := data.Result[0].Value[1].(string); ok {
result = fmt.Sprintf("**Scalar result:** %s", v)
}
}
if result == "" {
result = "Scalar query returned no value."
}
default:
result = fmt.Sprintf("Result type: %s\n\n%s", data.ResultType, formatInstantVector(data.Result))
}
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(result)},
}, nil
}
}
func makeListTargetsHandler(prom *PrometheusClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
data, err := prom.Targets(ctx)
if err != nil {
return mcp.ErrorContent(fmt.Errorf("failed to fetch targets: %w", err)), nil
}
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(formatTargets(data))},
}, nil
}
}
func makeListSilencesHandler(am *AlertmanagerClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
silences, err := am.ListSilences(ctx)
if err != nil {
return mcp.ErrorContent(fmt.Errorf("failed to fetch silences: %w", err)), nil
}
// Filter to active/pending only
var filtered []Silence
for _, s := range silences {
if s.Status != nil && (s.Status.State == "active" || s.Status.State == "pending") {
filtered = append(filtered, s)
}
}
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(formatSilences(filtered))},
}, nil
}
}
func makeCreateSilenceHandler(am *AlertmanagerClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
matchersJSON, _ := args["matchers"].(string)
if matchersJSON == "" {
return mcp.ErrorContent(fmt.Errorf("matchers is required")), nil
}
durationStr, _ := args["duration"].(string)
if durationStr == "" {
return mcp.ErrorContent(fmt.Errorf("duration is required")), nil
}
author, _ := args["author"].(string)
if author == "" {
return mcp.ErrorContent(fmt.Errorf("author is required")), nil
}
comment, _ := args["comment"].(string)
if comment == "" {
return mcp.ErrorContent(fmt.Errorf("comment is required")), nil
}
// Parse matchers
var matchers []Matcher
if err := parseJSON(matchersJSON, &matchers); err != nil {
return mcp.ErrorContent(fmt.Errorf("invalid matchers JSON: %w", err)), nil
}
// Parse duration
duration, err := time.ParseDuration(durationStr)
if err != nil {
return mcp.ErrorContent(fmt.Errorf("invalid duration: %w", err)), nil
}
now := time.Now()
silence := Silence{
Matchers: matchers,
StartsAt: now,
EndsAt: now.Add(duration),
CreatedBy: author,
Comment: comment,
}
id, err := am.CreateSilence(ctx, silence)
if err != nil {
return mcp.ErrorContent(fmt.Errorf("failed to create silence: %w", err)), nil
}
var sb strings.Builder
sb.WriteString("Silence created successfully.\n\n")
sb.WriteString(fmt.Sprintf("**ID:** %s\n", id))
sb.WriteString(fmt.Sprintf("**Expires:** %s\n", silence.EndsAt.Format(time.RFC3339)))
sb.WriteString(fmt.Sprintf("**Author:** %s\n", author))
sb.WriteString(fmt.Sprintf("**Comment:** %s\n", comment))
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(sb.String())},
}, nil
}
}
// parseJSON is a helper to unmarshal JSON from a string.
func parseJSON(s string, v interface{}) error {
return json.Unmarshal([]byte(s), v)
}
// Loki tool definitions
func queryLogsTool() mcp.Tool {
return mcp.Tool{
Name: "query_logs",
Description: "Execute a LogQL range query against Loki to search and retrieve log entries",
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{
"logql": {
Type: "string",
Description: `LogQL query expression (e.g., '{job="varlogs"}', '{job="nginx"} |= "error"')`,
},
"start": {
Type: "string",
Description: "Start time: relative duration (e.g., '1h', '30m'), RFC3339 timestamp, or Unix epoch seconds. Default: 1h ago",
},
"end": {
Type: "string",
Description: "End time: relative duration (e.g., '5m'), RFC3339 timestamp, or Unix epoch seconds. Default: now",
},
"limit": {
Type: "integer",
Description: "Maximum number of log entries to return (default: 100)",
Default: 100,
},
"direction": {
Type: "string",
Description: "Sort order for log entries: 'backward' (newest first) or 'forward' (oldest first)",
Enum: []string{"backward", "forward"},
},
},
Required: []string{"logql"},
},
}
}
func listLabelsTool() mcp.Tool {
return mcp.Tool{
Name: "list_labels",
Description: "List available label names from Loki",
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{},
},
}
}
func listLabelValuesTool() mcp.Tool {
return mcp.Tool{
Name: "list_label_values",
Description: "List values for a specific label from Loki",
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{
"label": {
Type: "string",
Description: "Label name to get values for (e.g., 'job', 'instance')",
},
},
Required: []string{"label"},
},
}
}
// Loki handler constructors
func makeQueryLogsHandler(loki *LokiClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
logql, _ := args["logql"].(string)
if logql == "" {
return mcp.ErrorContent(fmt.Errorf("logql is required")), nil
}
now := time.Now()
start := now.Add(-time.Hour)
end := now
if startStr, ok := args["start"].(string); ok && startStr != "" {
parsed, err := parseTimeArg(startStr, now.Add(-time.Hour))
if err != nil {
return mcp.ErrorContent(fmt.Errorf("invalid start time: %w", err)), nil
}
start = parsed
}
if endStr, ok := args["end"].(string); ok && endStr != "" {
parsed, err := parseTimeArg(endStr, now)
if err != nil {
return mcp.ErrorContent(fmt.Errorf("invalid end time: %w", err)), nil
}
end = parsed
}
limit := 100
if l, ok := args["limit"].(float64); ok && l > 0 {
limit = int(l)
}
if limit > 5000 {
limit = 5000
}
direction := "backward"
if d, ok := args["direction"].(string); ok && d != "" {
if d != "backward" && d != "forward" {
return mcp.ErrorContent(fmt.Errorf("direction must be 'backward' or 'forward'")), nil
}
direction = d
}
data, err := loki.QueryRange(ctx, logql, start, end, limit, direction)
if err != nil {
return mcp.ErrorContent(fmt.Errorf("log query failed: %w", err)), nil
}
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(formatLogStreams(data))},
}, nil
}
}
func makeListLabelsHandler(loki *LokiClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
labels, err := loki.Labels(ctx)
if err != nil {
return mcp.ErrorContent(fmt.Errorf("failed to list labels: %w", err)), nil
}
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(formatLabels(labels))},
}, nil
}
}
func makeListLabelValuesHandler(loki *LokiClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
label, _ := args["label"].(string)
if label == "" {
return mcp.ErrorContent(fmt.Errorf("label is required")), nil
}
values, err := loki.LabelValues(ctx, label)
if err != nil {
return mcp.ErrorContent(fmt.Errorf("failed to list label values: %w", err)), nil
}
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(formatLabelValues(label, values))},
}, nil
}
}
// parseTimeArg parses a time argument that can be:
// - A relative duration (e.g., "1h", "30m", "2h30m") — interpreted as that duration ago from now
// - An RFC3339 timestamp (e.g., "2024-01-15T10:30:00Z")
// - A Unix epoch in seconds (e.g., "1705312200")
// If parsing fails, returns the provided default time.
func parseTimeArg(s string, defaultTime time.Time) (time.Time, error) {
// Try as relative duration first
if d, err := time.ParseDuration(s); err == nil {
return time.Now().Add(-d), nil
}
// Try as RFC3339
if t, err := time.Parse(time.RFC3339, s); err == nil {
return t, nil
}
// Try as Unix epoch seconds
var epoch int64
validDigits := true
for _, c := range s {
if c >= '0' && c <= '9' {
epoch = epoch*10 + int64(c-'0')
} else {
validDigits = false
break
}
}
if validDigits && len(s) > 0 {
return time.Unix(epoch, 0), nil
}
return defaultTime, fmt.Errorf("cannot parse time '%s': use relative duration (e.g., '1h'), RFC3339, or Unix epoch seconds", s)
}