This repository has been archived on 2026-03-10. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
labmcp/internal/monitoring/handlers.go
Torjus Håkestad d97e554dfc fix: cap log query limit and validate direction parameter
Prevent unbounded memory usage by capping the limit parameter to 5000.
Validate direction against allowed values instead of passing through
to Loki unchecked.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 20:58:35 +01:00

679 lines
19 KiB
Go

package monitoring
import (
"context"
"encoding/json"
"fmt"
"strings"
"time"
"git.t-juice.club/torjus/labmcp/internal/mcp"
)
// AlertSummary queries Alertmanager for active (non-silenced) alerts and returns
// a short summary string. Returns empty string if there are no alerts or if
// Alertmanager is unreachable.
func AlertSummary(am *AlertmanagerClient) string {
active := true
silenced := false
inhibited := false
alerts, err := am.ListAlerts(context.Background(), AlertFilters{
Active: &active,
Silenced: &silenced,
Inhibited: &inhibited,
})
if err != nil || len(alerts) == 0 {
return ""
}
// Count by severity
severities := make(map[string]int)
for _, a := range alerts {
sev := a.Labels["severity"]
if sev == "" {
sev = "unknown"
}
severities[sev]++
}
var parts []string
// Show critical first if present
if n, ok := severities["critical"]; ok {
parts = append(parts, fmt.Sprintf("%d critical", n))
delete(severities, "critical")
}
if n, ok := severities["warning"]; ok {
parts = append(parts, fmt.Sprintf("%d warning", n))
delete(severities, "warning")
}
for sev, n := range severities {
parts = append(parts, fmt.Sprintf("%d %s", n, sev))
}
return fmt.Sprintf("ALERT STATUS: %d active alert(s) firing (%s). Use list_alerts for details. Let the user know about this.",
len(alerts), strings.Join(parts, ", "))
}
// HandlerOptions configures which handlers are registered.
type HandlerOptions struct {
// EnableSilences enables the create_silence tool, which is a write operation.
// Disabled by default as a safety measure.
EnableSilences bool
}
// RegisterHandlers registers all monitoring tool handlers on the MCP server.
func RegisterHandlers(server *mcp.Server, prom *PrometheusClient, am *AlertmanagerClient, loki *LokiClient, opts HandlerOptions) {
server.RegisterTool(listAlertsTool(), makeListAlertsHandler(am))
server.RegisterTool(getAlertTool(), makeGetAlertHandler(am))
server.RegisterTool(searchMetricsTool(), makeSearchMetricsHandler(prom))
server.RegisterTool(getMetricMetadataTool(), makeGetMetricMetadataHandler(prom))
server.RegisterTool(queryTool(), makeQueryHandler(prom))
server.RegisterTool(listTargetsTool(), makeListTargetsHandler(prom))
server.RegisterTool(listSilencesTool(), makeListSilencesHandler(am))
if opts.EnableSilences {
server.RegisterTool(createSilenceTool(), makeCreateSilenceHandler(am))
}
if loki != nil {
server.RegisterTool(queryLogsTool(), makeQueryLogsHandler(loki))
server.RegisterTool(listLabelsTool(), makeListLabelsHandler(loki))
server.RegisterTool(listLabelValuesTool(), makeListLabelValuesHandler(loki))
}
}
// Tool definitions
func listAlertsTool() mcp.Tool {
return mcp.Tool{
Name: "list_alerts",
Description: "List alerts from Alertmanager with optional filters",
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{
"state": {
Type: "string",
Description: "Filter by alert state: 'active', 'suppressed', or 'unprocessed'",
Enum: []string{"active", "suppressed", "unprocessed"},
},
"severity": {
Type: "string",
Description: "Filter by severity label (e.g., 'critical', 'warning')",
},
"receiver": {
Type: "string",
Description: "Filter by receiver name",
},
},
},
}
}
func getAlertTool() mcp.Tool {
return mcp.Tool{
Name: "get_alert",
Description: "Get full details for a specific alert by fingerprint",
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{
"fingerprint": {
Type: "string",
Description: "Alert fingerprint identifier",
},
},
Required: []string{"fingerprint"},
},
}
}
func searchMetricsTool() mcp.Tool {
return mcp.Tool{
Name: "search_metrics",
Description: "Search Prometheus metric names with optional substring filter, enriched with metadata (type, help text)",
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{
"query": {
Type: "string",
Description: "Substring to filter metric names (e.g., 'cpu', 'memory', 'node_'). Empty returns all metrics.",
},
"limit": {
Type: "integer",
Description: "Maximum number of results (default: 50)",
Default: 50,
},
},
},
}
}
func getMetricMetadataTool() mcp.Tool {
return mcp.Tool{
Name: "get_metric_metadata",
Description: "Get type, help text, and unit for a specific Prometheus metric",
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{
"metric": {
Type: "string",
Description: "Metric name (e.g., 'node_cpu_seconds_total')",
},
},
Required: []string{"metric"},
},
}
}
func queryTool() mcp.Tool {
return mcp.Tool{
Name: "query",
Description: "Execute an instant PromQL query against Prometheus. Supports aggregations like avg_over_time(metric[1h]), rate(), sum(), etc.",
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{
"promql": {
Type: "string",
Description: "PromQL expression to evaluate (e.g., 'up', 'rate(http_requests_total[5m])', 'avg_over_time(node_load1[1h])')",
},
},
Required: []string{"promql"},
},
}
}
func listTargetsTool() mcp.Tool {
return mcp.Tool{
Name: "list_targets",
Description: "List Prometheus scrape targets with health status, grouped by job",
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{},
},
}
}
func listSilencesTool() mcp.Tool {
return mcp.Tool{
Name: "list_silences",
Description: "List active and pending alert silences from Alertmanager",
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{},
},
}
}
func createSilenceTool() mcp.Tool {
return mcp.Tool{
Name: "create_silence",
Description: `Create a new silence in Alertmanager. IMPORTANT: Always confirm with the user before creating a silence, showing them the matchers, duration, and reason.`,
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{
"matchers": {
Type: "string",
Description: `JSON array of matchers, e.g. [{"name":"alertname","value":"TargetDown","isRegex":false}]`,
},
"duration": {
Type: "string",
Description: "Silence duration in Go duration format (e.g., '2h', '30m', '1h30m')",
},
"author": {
Type: "string",
Description: "Author of the silence",
},
"comment": {
Type: "string",
Description: "Reason for the silence",
},
},
Required: []string{"matchers", "duration", "author", "comment"},
},
}
}
// Handler constructors
func makeListAlertsHandler(am *AlertmanagerClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
filters := AlertFilters{}
if state, ok := args["state"].(string); ok && state != "" {
switch state {
case "active":
active := true
filters.Active = &active
silenced := false
filters.Silenced = &silenced
inhibited := false
filters.Inhibited = &inhibited
case "suppressed":
active := false
filters.Active = &active
case "unprocessed":
unprocessed := true
filters.Unprocessed = &unprocessed
}
}
if severity, ok := args["severity"].(string); ok && severity != "" {
filters.Filter = append(filters.Filter, fmt.Sprintf(`severity="%s"`, severity))
}
if receiver, ok := args["receiver"].(string); ok && receiver != "" {
filters.Receiver = receiver
}
alerts, err := am.ListAlerts(ctx, filters)
if err != nil {
return mcp.ErrorContent(fmt.Errorf("failed to list alerts: %w", err)), nil
}
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(formatAlerts(alerts))},
}, nil
}
}
func makeGetAlertHandler(am *AlertmanagerClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
fingerprint, _ := args["fingerprint"].(string)
if fingerprint == "" {
return mcp.ErrorContent(fmt.Errorf("fingerprint is required")), nil
}
// Fetch all alerts and find the one matching the fingerprint
alerts, err := am.ListAlerts(ctx, AlertFilters{})
if err != nil {
return mcp.ErrorContent(fmt.Errorf("failed to fetch alerts: %w", err)), nil
}
for _, a := range alerts {
if a.Fingerprint == fingerprint {
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(formatAlerts([]Alert{a}))},
}, nil
}
}
return mcp.ErrorContent(fmt.Errorf("alert with fingerprint '%s' not found", fingerprint)), nil
}
}
func makeSearchMetricsHandler(prom *PrometheusClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
query, _ := args["query"].(string)
limit := 50
if l, ok := args["limit"].(float64); ok && l > 0 {
limit = int(l)
}
// Get all metric names
allNames, err := prom.LabelValues(ctx, "__name__")
if err != nil {
return mcp.ErrorContent(fmt.Errorf("failed to fetch metric names: %w", err)), nil
}
// Filter by substring
var matched []string
queryLower := strings.ToLower(query)
for _, name := range allNames {
if query == "" || strings.Contains(strings.ToLower(name), queryLower) {
matched = append(matched, name)
if len(matched) >= limit {
break
}
}
}
// Fetch metadata for matched metrics
metadata, err := prom.Metadata(ctx, "")
if err != nil {
// Non-fatal: proceed without metadata
metadata = nil
}
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(formatMetricSearch(matched, metadata))},
}, nil
}
}
func makeGetMetricMetadataHandler(prom *PrometheusClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
metric, _ := args["metric"].(string)
if metric == "" {
return mcp.ErrorContent(fmt.Errorf("metric is required")), nil
}
metadata, err := prom.Metadata(ctx, metric)
if err != nil {
return mcp.ErrorContent(fmt.Errorf("failed to fetch metadata: %w", err)), nil
}
metas := metadata[metric]
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(formatMetricMetadata(metric, metas))},
}, nil
}
}
func makeQueryHandler(prom *PrometheusClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
promql, _ := args["promql"].(string)
if promql == "" {
return mcp.ErrorContent(fmt.Errorf("promql is required")), nil
}
data, err := prom.Query(ctx, promql, time.Time{})
if err != nil {
return mcp.ErrorContent(fmt.Errorf("query failed: %w", err)), nil
}
var result string
switch data.ResultType {
case "vector":
result = formatInstantVector(data.Result)
case "scalar":
if len(data.Result) > 0 && len(data.Result[0].Value) >= 2 {
if v, ok := data.Result[0].Value[1].(string); ok {
result = fmt.Sprintf("**Scalar result:** %s", v)
}
}
if result == "" {
result = "Scalar query returned no value."
}
default:
result = fmt.Sprintf("Result type: %s\n\n%s", data.ResultType, formatInstantVector(data.Result))
}
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(result)},
}, nil
}
}
func makeListTargetsHandler(prom *PrometheusClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
data, err := prom.Targets(ctx)
if err != nil {
return mcp.ErrorContent(fmt.Errorf("failed to fetch targets: %w", err)), nil
}
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(formatTargets(data))},
}, nil
}
}
func makeListSilencesHandler(am *AlertmanagerClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
silences, err := am.ListSilences(ctx)
if err != nil {
return mcp.ErrorContent(fmt.Errorf("failed to fetch silences: %w", err)), nil
}
// Filter to active/pending only
var filtered []Silence
for _, s := range silences {
if s.Status != nil && (s.Status.State == "active" || s.Status.State == "pending") {
filtered = append(filtered, s)
}
}
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(formatSilences(filtered))},
}, nil
}
}
func makeCreateSilenceHandler(am *AlertmanagerClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
matchersJSON, _ := args["matchers"].(string)
if matchersJSON == "" {
return mcp.ErrorContent(fmt.Errorf("matchers is required")), nil
}
durationStr, _ := args["duration"].(string)
if durationStr == "" {
return mcp.ErrorContent(fmt.Errorf("duration is required")), nil
}
author, _ := args["author"].(string)
if author == "" {
return mcp.ErrorContent(fmt.Errorf("author is required")), nil
}
comment, _ := args["comment"].(string)
if comment == "" {
return mcp.ErrorContent(fmt.Errorf("comment is required")), nil
}
// Parse matchers
var matchers []Matcher
if err := parseJSON(matchersJSON, &matchers); err != nil {
return mcp.ErrorContent(fmt.Errorf("invalid matchers JSON: %w", err)), nil
}
// Parse duration
duration, err := time.ParseDuration(durationStr)
if err != nil {
return mcp.ErrorContent(fmt.Errorf("invalid duration: %w", err)), nil
}
now := time.Now()
silence := Silence{
Matchers: matchers,
StartsAt: now,
EndsAt: now.Add(duration),
CreatedBy: author,
Comment: comment,
}
id, err := am.CreateSilence(ctx, silence)
if err != nil {
return mcp.ErrorContent(fmt.Errorf("failed to create silence: %w", err)), nil
}
var sb strings.Builder
sb.WriteString("Silence created successfully.\n\n")
sb.WriteString(fmt.Sprintf("**ID:** %s\n", id))
sb.WriteString(fmt.Sprintf("**Expires:** %s\n", silence.EndsAt.Format(time.RFC3339)))
sb.WriteString(fmt.Sprintf("**Author:** %s\n", author))
sb.WriteString(fmt.Sprintf("**Comment:** %s\n", comment))
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(sb.String())},
}, nil
}
}
// parseJSON is a helper to unmarshal JSON from a string.
func parseJSON(s string, v interface{}) error {
return json.Unmarshal([]byte(s), v)
}
// Loki tool definitions
func queryLogsTool() mcp.Tool {
return mcp.Tool{
Name: "query_logs",
Description: "Execute a LogQL range query against Loki to search and retrieve log entries",
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{
"logql": {
Type: "string",
Description: `LogQL query expression (e.g., '{job="varlogs"}', '{job="nginx"} |= "error"')`,
},
"start": {
Type: "string",
Description: "Start time: relative duration (e.g., '1h', '30m'), RFC3339 timestamp, or Unix epoch seconds. Default: 1h ago",
},
"end": {
Type: "string",
Description: "End time: relative duration (e.g., '5m'), RFC3339 timestamp, or Unix epoch seconds. Default: now",
},
"limit": {
Type: "integer",
Description: "Maximum number of log entries to return (default: 100)",
Default: 100,
},
"direction": {
Type: "string",
Description: "Sort order for log entries: 'backward' (newest first) or 'forward' (oldest first)",
Enum: []string{"backward", "forward"},
},
},
Required: []string{"logql"},
},
}
}
func listLabelsTool() mcp.Tool {
return mcp.Tool{
Name: "list_labels",
Description: "List available label names from Loki",
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{},
},
}
}
func listLabelValuesTool() mcp.Tool {
return mcp.Tool{
Name: "list_label_values",
Description: "List values for a specific label from Loki",
InputSchema: mcp.InputSchema{
Type: "object",
Properties: map[string]mcp.Property{
"label": {
Type: "string",
Description: "Label name to get values for (e.g., 'job', 'instance')",
},
},
Required: []string{"label"},
},
}
}
// Loki handler constructors
func makeQueryLogsHandler(loki *LokiClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
logql, _ := args["logql"].(string)
if logql == "" {
return mcp.ErrorContent(fmt.Errorf("logql is required")), nil
}
now := time.Now()
start := now.Add(-time.Hour)
end := now
if startStr, ok := args["start"].(string); ok && startStr != "" {
parsed, err := parseTimeArg(startStr, now.Add(-time.Hour))
if err != nil {
return mcp.ErrorContent(fmt.Errorf("invalid start time: %w", err)), nil
}
start = parsed
}
if endStr, ok := args["end"].(string); ok && endStr != "" {
parsed, err := parseTimeArg(endStr, now)
if err != nil {
return mcp.ErrorContent(fmt.Errorf("invalid end time: %w", err)), nil
}
end = parsed
}
limit := 100
if l, ok := args["limit"].(float64); ok && l > 0 {
limit = int(l)
}
if limit > 5000 {
limit = 5000
}
direction := "backward"
if d, ok := args["direction"].(string); ok && d != "" {
if d != "backward" && d != "forward" {
return mcp.ErrorContent(fmt.Errorf("direction must be 'backward' or 'forward'")), nil
}
direction = d
}
data, err := loki.QueryRange(ctx, logql, start, end, limit, direction)
if err != nil {
return mcp.ErrorContent(fmt.Errorf("log query failed: %w", err)), nil
}
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(formatLogStreams(data))},
}, nil
}
}
func makeListLabelsHandler(loki *LokiClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
labels, err := loki.Labels(ctx)
if err != nil {
return mcp.ErrorContent(fmt.Errorf("failed to list labels: %w", err)), nil
}
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(formatLabels(labels))},
}, nil
}
}
func makeListLabelValuesHandler(loki *LokiClient) mcp.ToolHandler {
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
label, _ := args["label"].(string)
if label == "" {
return mcp.ErrorContent(fmt.Errorf("label is required")), nil
}
values, err := loki.LabelValues(ctx, label)
if err != nil {
return mcp.ErrorContent(fmt.Errorf("failed to list label values: %w", err)), nil
}
return mcp.CallToolResult{
Content: []mcp.Content{mcp.TextContent(formatLabelValues(label, values))},
}, nil
}
}
// parseTimeArg parses a time argument that can be:
// - A relative duration (e.g., "1h", "30m", "2h30m") — interpreted as that duration ago from now
// - An RFC3339 timestamp (e.g., "2024-01-15T10:30:00Z")
// - A Unix epoch in seconds (e.g., "1705312200")
// If parsing fails, returns the provided default time.
func parseTimeArg(s string, defaultTime time.Time) (time.Time, error) {
// Try as relative duration first
if d, err := time.ParseDuration(s); err == nil {
return time.Now().Add(-d), nil
}
// Try as RFC3339
if t, err := time.Parse(time.RFC3339, s); err == nil {
return t, nil
}
// Try as Unix epoch seconds
var epoch int64
validDigits := true
for _, c := range s {
if c >= '0' && c <= '9' {
epoch = epoch*10 + int64(c-'0')
} else {
validDigits = false
break
}
}
if validDigits && len(s) > 0 {
return time.Unix(epoch, 0), nil
}
return defaultTime, fmt.Errorf("cannot parse time '%s': use relative duration (e.g., '1h'), RFC3339, or Unix epoch seconds", s)
}