labmcp/internal/monitoring/handlers.go

package monitoring

import (
	"context"
	"encoding/json"
	"fmt"
	"sort"
	"strconv"
	"strings"
	"time"

	"code.t-juice.club/torjus/labmcp/internal/mcp"
)

// AlertSummary queries Alertmanager for active (non-silenced, non-inhibited)
// alerts and returns a short one-line summary with per-severity counts.
//
// Critical and warning counts are listed first, followed by every other
// severity value in alphabetical order so the output is stable between calls.
// Alerts without a severity label are counted as "unknown".
//
// It returns an empty string when there are no matching alerts or when the
// Alertmanager request fails.
func AlertSummary(am *AlertmanagerClient) string {
	active := true
	silenced := false
	inhibited := false
	alerts, err := am.ListAlerts(context.Background(), AlertFilters{
		Active:    &active,
		Silenced:  &silenced,
		Inhibited: &inhibited,
	})
	if err != nil || len(alerts) == 0 {
		return ""
	}

	// Count alerts per severity label value.
	severities := make(map[string]int)
	for _, a := range alerts {
		sev := a.Labels["severity"]
		if sev == "" {
			sev = "unknown"
		}
		severities[sev]++
	}

	parts := make([]string, 0, len(severities))

	// The well-known severities go first, most urgent first.
	for _, sev := range []string{"critical", "warning"} {
		if n, ok := severities[sev]; ok {
			parts = append(parts, fmt.Sprintf("%d %s", n, sev))
			delete(severities, sev)
		}
	}

	// Go randomizes map iteration order, so sort the remaining severities to
	// keep the summary text deterministic.
	rest := make([]string, 0, len(severities))
	for sev := range severities {
		rest = append(rest, sev)
	}
	sort.Strings(rest)
	for _, sev := range rest {
		parts = append(parts, fmt.Sprintf("%d %s", severities[sev], sev))
	}

	return fmt.Sprintf("ALERT STATUS: %d active alert(s) firing (%s). Use list_alerts for details. Let the user know about this.",
		len(alerts), strings.Join(parts, ", "))
}

// HandlerOptions configures which optional handlers RegisterHandlers registers.
type HandlerOptions struct {
	// EnableSilences enables the create_silence tool, which is a write operation.
	// Disabled by default (zero value) as a safety measure.
	EnableSilences bool
}

// RegisterHandlers wires every monitoring tool into the given MCP server.
//
// The Prometheus and Alertmanager read-only tools are always registered. The
// create_silence tool is added only when opts.EnableSilences is set, and the
// Loki tools are added only when a non-nil Loki client is supplied.
func RegisterHandlers(server *mcp.Server, prom *PrometheusClient, am *AlertmanagerClient, loki *LokiClient, opts HandlerOptions) {
	type registration struct {
		tool    mcp.Tool
		handler mcp.ToolHandler
	}

	// Collect registrations first so the registration order is easy to read.
	regs := []registration{
		{listAlertsTool(), makeListAlertsHandler(am)},
		{getAlertTool(), makeGetAlertHandler(am)},
		{searchMetricsTool(), makeSearchMetricsHandler(prom)},
		{getMetricMetadataTool(), makeGetMetricMetadataHandler(prom)},
		{queryTool(), makeQueryHandler(prom)},
		{listTargetsTool(), makeListTargetsHandler(prom)},
		{listSilencesTool(), makeListSilencesHandler(am)},
	}

	if opts.EnableSilences {
		regs = append(regs, registration{createSilenceTool(), makeCreateSilenceHandler(am)})
	}

	if loki != nil {
		regs = append(regs,
			registration{queryLogsTool(), makeQueryLogsHandler(loki)},
			registration{listLabelsTool(), makeListLabelsHandler(loki)},
			registration{listLabelValuesTool(), makeListLabelValuesHandler(loki)},
		)
	}

	for _, r := range regs {
		server.RegisterTool(r.tool, r.handler)
	}
}

// Tool definitions

// listAlertsTool describes the list_alerts tool and its optional state,
// severity and receiver filters.
func listAlertsTool() mcp.Tool {
	props := map[string]mcp.Property{
		"state": {
			Type:        "string",
			Description: "Filter by alert state: 'active', 'suppressed', 'unprocessed', or 'all' (default: active)",
			Enum:        []string{"active", "suppressed", "unprocessed", "all"},
		},
		"severity": {
			Type:        "string",
			Description: "Filter by severity label (e.g., 'critical', 'warning')",
		},
		"receiver": {
			Type:        "string",
			Description: "Filter by receiver name",
		},
	}

	schema := mcp.InputSchema{Type: "object", Properties: props}

	return mcp.Tool{
		Name:        "list_alerts",
		Description: "List alerts from Alertmanager with optional filters",
		InputSchema: schema,
	}
}

// getAlertTool describes the get_alert tool, which requires an alert
// fingerprint.
func getAlertTool() mcp.Tool {
	schema := mcp.InputSchema{
		Type: "object",
		Properties: map[string]mcp.Property{
			"fingerprint": {
				Type:        "string",
				Description: "Alert fingerprint identifier",
			},
		},
		Required: []string{"fingerprint"},
	}

	return mcp.Tool{
		Name:        "get_alert",
		Description: "Get full details for a specific alert by fingerprint",
		InputSchema: schema,
	}
}

// searchMetricsTool describes the search_metrics tool with its optional
// substring query and result limit.
func searchMetricsTool() mcp.Tool {
	props := map[string]mcp.Property{
		"query": {
			Type:        "string",
			Description: "Substring to filter metric names (e.g., 'cpu', 'memory', 'node_'). Empty returns all metrics.",
		},
		"limit": {
			Type:        "integer",
			Description: "Maximum number of results (default: 50)",
			Default:     50,
		},
	}

	return mcp.Tool{
		Name:        "search_metrics",
		Description: "Search Prometheus metric names with optional substring filter, enriched with metadata (type, help text)",
		InputSchema: mcp.InputSchema{Type: "object", Properties: props},
	}
}

// getMetricMetadataTool describes the get_metric_metadata tool, which requires
// a metric name.
func getMetricMetadataTool() mcp.Tool {
	schema := mcp.InputSchema{
		Type: "object",
		Properties: map[string]mcp.Property{
			"metric": {
				Type:        "string",
				Description: "Metric name (e.g., 'node_cpu_seconds_total')",
			},
		},
		Required: []string{"metric"},
	}

	return mcp.Tool{
		Name:        "get_metric_metadata",
		Description: "Get type, help text, and unit for a specific Prometheus metric",
		InputSchema: schema,
	}
}

// queryTool describes the query tool, which requires a PromQL expression.
func queryTool() mcp.Tool {
	props := map[string]mcp.Property{
		"promql": {
			Type:        "string",
			Description: "PromQL expression to evaluate (e.g., 'up', 'rate(http_requests_total[5m])', 'avg_over_time(node_load1[1h])')",
		},
	}

	return mcp.Tool{
		Name:        "query",
		Description: "Execute an instant PromQL query against Prometheus. Supports aggregations like avg_over_time(metric[1h]), rate(), sum(), etc.",
		InputSchema: mcp.InputSchema{
			Type:       "object",
			Properties: props,
			Required:   []string{"promql"},
		},
	}
}

// listTargetsTool describes the list_targets tool, which takes no arguments.
func listTargetsTool() mcp.Tool {
	// An empty, non-nil property map: the tool accepts no parameters.
	noProps := make(map[string]mcp.Property)

	return mcp.Tool{
		Name:        "list_targets",
		Description: "List Prometheus scrape targets with health status, grouped by job",
		InputSchema: mcp.InputSchema{Type: "object", Properties: noProps},
	}
}

// listSilencesTool describes the list_silences tool, which takes no arguments.
func listSilencesTool() mcp.Tool {
	// An empty, non-nil property map: the tool accepts no parameters.
	noProps := make(map[string]mcp.Property)

	return mcp.Tool{
		Name:        "list_silences",
		Description: "List active and pending alert silences from Alertmanager",
		InputSchema: mcp.InputSchema{Type: "object", Properties: noProps},
	}
}

// createSilenceTool describes the create_silence tool. All four arguments
// (matchers, duration, author, comment) are required.
func createSilenceTool() mcp.Tool {
	props := map[string]mcp.Property{
		"matchers": {
			Type:        "string",
			Description: `JSON array of matchers, e.g. [{"name":"alertname","value":"TargetDown","isRegex":false}]`,
		},
		"duration": {
			Type:        "string",
			Description: "Silence duration in Go duration format (e.g., '2h', '30m', '1h30m')",
		},
		"author": {
			Type:        "string",
			Description: "Author of the silence",
		},
		"comment": {
			Type:        "string",
			Description: "Reason for the silence",
		},
	}

	schema := mcp.InputSchema{
		Type:       "object",
		Properties: props,
		Required:   []string{"matchers", "duration", "author", "comment"},
	}

	return mcp.Tool{
		Name:        "create_silence",
		Description: `Create a new silence in Alertmanager. IMPORTANT: Always confirm with the user before creating a silence, showing them the matchers, duration, and reason.`,
		InputSchema: schema,
	}
}

// Handler constructors

// makeListAlertsHandler builds the handler for the list_alerts tool.
//
// Supported arguments:
//   - state: "active" (default when omitted or empty), "suppressed",
//     "unprocessed" or "all"; any other value is rejected with an error result.
//   - severity: optional severity label value to match exactly.
//   - receiver: optional receiver name.
//
// Failures are reported as error content in the tool result; the returned Go
// error is always nil.
func makeListAlertsHandler(am *AlertmanagerClient) mcp.ToolHandler {
	// Escapes characters that would otherwise terminate or corrupt the quoted
	// matcher value built below.
	// NOTE(review): assumes AlertFilters.Filter entries are forwarded to
	// Alertmanager as label matcher expressions — confirm against the client.
	matcherEscaper := strings.NewReplacer(`\`, `\\`, `"`, `\"`, "\n", `\n`)

	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		filters := AlertFilters{}

		state, _ := args["state"].(string)
		switch state {
		case "active", "":
			// Default to active alerts only (non-silenced, non-inhibited)
			active := true
			filters.Active = &active
			silenced := false
			filters.Silenced = &silenced
			inhibited := false
			filters.Inhibited = &inhibited
		case "suppressed":
			active := false
			filters.Active = &active
		case "unprocessed":
			unprocessed := true
			filters.Unprocessed = &unprocessed
		case "all":
			// No filters - return everything
		default:
			// Reject typos instead of silently falling back to "all".
			return mcp.ErrorContent(fmt.Errorf("state must be 'active', 'suppressed', 'unprocessed', or 'all'")), nil
		}

		if severity, ok := args["severity"].(string); ok && severity != "" {
			filters.Filter = append(filters.Filter, fmt.Sprintf(`severity="%s"`, matcherEscaper.Replace(severity)))
		}

		if receiver, ok := args["receiver"].(string); ok && receiver != "" {
			filters.Receiver = receiver
		}

		alerts, err := am.ListAlerts(ctx, filters)
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to list alerts: %w", err)), nil
		}

		return mcp.CallToolResult{
			Content: []mcp.Content{mcp.TextContent(formatAlerts(alerts))},
		}, nil
	}
}

// makeGetAlertHandler builds the handler for the get_alert tool. It lists all
// alerts without filters and returns the one whose fingerprint equals the
// required "fingerprint" argument, or an error result when none matches.
func makeGetAlertHandler(am *AlertmanagerClient) mcp.ToolHandler {
	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		wanted, _ := args["fingerprint"].(string)
		if wanted == "" {
			return mcp.ErrorContent(fmt.Errorf("fingerprint is required")), nil
		}

		// The lookup is done client-side over the unfiltered alert list.
		all, err := am.ListAlerts(ctx, AlertFilters{})
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to fetch alerts: %w", err)), nil
		}

		for i := range all {
			if all[i].Fingerprint != wanted {
				continue
			}
			text := formatAlerts([]Alert{all[i]})
			return mcp.CallToolResult{
				Content: []mcp.Content{mcp.TextContent(text)},
			}, nil
		}

		return mcp.ErrorContent(fmt.Errorf("alert with fingerprint '%s' not found", wanted)), nil
	}
}

// makeSearchMetricsHandler builds the handler for the search_metrics tool. It
// fetches all metric names, keeps those containing the optional "query"
// substring (case-insensitive) up to "limit" entries (default 50), and formats
// them together with whatever metadata could be fetched.
func makeSearchMetricsHandler(prom *PrometheusClient) mcp.ToolHandler {
	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		needle, _ := args["query"].(string)

		maxResults := 50
		if v, ok := args["limit"].(float64); ok && v > 0 {
			maxResults = int(v)
		}

		// Metric names are the values of the __name__ label.
		names, err := prom.LabelValues(ctx, "__name__")
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to fetch metric names: %w", err)), nil
		}

		// Case-insensitive substring match; an empty needle matches everything.
		needleLower := strings.ToLower(needle)
		var hits []string
		for _, name := range names {
			if needle != "" && !strings.Contains(strings.ToLower(name), needleLower) {
				continue
			}
			hits = append(hits, name)
			if len(hits) >= maxResults {
				break
			}
		}

		// Metadata is best-effort: on failure the names are still returned.
		metadata, err := prom.Metadata(ctx, "")
		if err != nil {
			metadata = nil
		}

		text := formatMetricSearch(hits, metadata)
		return mcp.CallToolResult{
			Content: []mcp.Content{mcp.TextContent(text)},
		}, nil
	}
}

// makeGetMetricMetadataHandler builds the handler for the get_metric_metadata
// tool. It requires a "metric" argument and formats the metadata entries
// returned for that metric name.
func makeGetMetricMetadataHandler(prom *PrometheusClient) mcp.ToolHandler {
	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		name, _ := args["metric"].(string)
		if name == "" {
			return mcp.ErrorContent(fmt.Errorf("metric is required")), nil
		}

		byMetric, err := prom.Metadata(ctx, name)
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to fetch metadata: %w", err)), nil
		}

		// Only the entries keyed by the requested metric are formatted.
		text := formatMetricMetadata(name, byMetric[name])
		return mcp.CallToolResult{
			Content: []mcp.Content{mcp.TextContent(text)},
		}, nil
	}
}

// makeQueryHandler builds the handler for the query tool. It runs the required
// "promql" expression as an instant query (passing the zero time to the
// client) and renders the result according to its result type.
func makeQueryHandler(prom *PrometheusClient) mcp.ToolHandler {
	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		expr, _ := args["promql"].(string)
		if expr == "" {
			return mcp.ErrorContent(fmt.Errorf("promql is required")), nil
		}

		data, err := prom.Query(ctx, expr, time.Time{})
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("query failed: %w", err)), nil
		}

		var text string
		if data.ResultType == "vector" {
			text = formatInstantVector(data.Result)
		} else if data.ResultType == "scalar" {
			// Fall back to this message unless a string value is found at
			// index 1 of the first result's Value.
			text = "Scalar query returned no value."
			if len(data.Result) > 0 && len(data.Result[0].Value) >= 2 {
				if v, ok := data.Result[0].Value[1].(string); ok {
					text = fmt.Sprintf("**Scalar result:** %s", v)
				}
			}
		} else {
			// Other result types are rendered like a vector, prefixed with the type.
			text = fmt.Sprintf("Result type: %s\n\n%s", data.ResultType, formatInstantVector(data.Result))
		}

		return mcp.CallToolResult{
			Content: []mcp.Content{mcp.TextContent(text)},
		}, nil
	}
}

// makeListTargetsHandler builds the handler for the list_targets tool. The
// tool takes no arguments; it fetches the targets and formats them.
func makeListTargetsHandler(prom *PrometheusClient) mcp.ToolHandler {
	return func(ctx context.Context, _ map[string]interface{}) (mcp.CallToolResult, error) {
		targets, err := prom.Targets(ctx)
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to fetch targets: %w", err)), nil
		}

		text := formatTargets(targets)
		return mcp.CallToolResult{Content: []mcp.Content{mcp.TextContent(text)}}, nil
	}
}

// makeListSilencesHandler builds the handler for the list_silences tool. It
// fetches all silences and keeps only those whose status state is "active" or
// "pending"; silences without a status are dropped.
func makeListSilencesHandler(am *AlertmanagerClient) mcp.ToolHandler {
	return func(ctx context.Context, _ map[string]interface{}) (mcp.CallToolResult, error) {
		all, err := am.ListSilences(ctx)
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to fetch silences: %w", err)), nil
		}

		var visible []Silence
		for _, s := range all {
			if s.Status == nil {
				continue
			}
			switch s.Status.State {
			case "active", "pending":
				visible = append(visible, s)
			}
		}

		text := formatSilences(visible)
		return mcp.CallToolResult{
			Content: []mcp.Content{mcp.TextContent(text)},
		}, nil
	}
}

// makeCreateSilenceHandler builds the handler for the create_silence tool.
//
// Required string arguments:
//   - matchers: JSON array of matchers; must contain at least one entry.
//   - duration: Go duration string; must be greater than zero.
//   - author:   recorded as the silence creator.
//   - comment:  reason for the silence.
//
// The silence starts now and ends after the given duration. Validation and
// Alertmanager failures are reported as error content in the tool result; the
// returned Go error is always nil.
func makeCreateSilenceHandler(am *AlertmanagerClient) mcp.ToolHandler {
	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		matchersJSON, _ := args["matchers"].(string)
		if matchersJSON == "" {
			return mcp.ErrorContent(fmt.Errorf("matchers is required")), nil
		}

		durationStr, _ := args["duration"].(string)
		if durationStr == "" {
			return mcp.ErrorContent(fmt.Errorf("duration is required")), nil
		}

		author, _ := args["author"].(string)
		if author == "" {
			return mcp.ErrorContent(fmt.Errorf("author is required")), nil
		}

		comment, _ := args["comment"].(string)
		if comment == "" {
			return mcp.ErrorContent(fmt.Errorf("comment is required")), nil
		}

		// Parse matchers
		var matchers []Matcher
		if err := parseJSON(matchersJSON, &matchers); err != nil {
			return mcp.ErrorContent(fmt.Errorf("invalid matchers JSON: %w", err)), nil
		}
		// "[]" and "null" both decode without error but describe no matcher.
		if len(matchers) == 0 {
			return mcp.ErrorContent(fmt.Errorf("at least one matcher is required")), nil
		}

		// Parse duration
		duration, err := time.ParseDuration(durationStr)
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("invalid duration: %w", err)), nil
		}
		// ParseDuration accepts "0" and negative values, which would produce a
		// silence that ends at or before its start.
		if duration <= 0 {
			return mcp.ErrorContent(fmt.Errorf("duration must be positive")), nil
		}

		now := time.Now()
		silence := Silence{
			Matchers:  matchers,
			StartsAt:  now,
			EndsAt:    now.Add(duration),
			CreatedBy: author,
			Comment:   comment,
		}

		id, err := am.CreateSilence(ctx, silence)
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to create silence: %w", err)), nil
		}

		var sb strings.Builder
		sb.WriteString("Silence created successfully.\n\n")
		fmt.Fprintf(&sb, "**ID:** %s\n", id)
		fmt.Fprintf(&sb, "**Expires:** %s\n", silence.EndsAt.Format(time.RFC3339))
		fmt.Fprintf(&sb, "**Author:** %s\n", author)
		fmt.Fprintf(&sb, "**Comment:** %s\n", comment)

		return mcp.CallToolResult{
			Content: []mcp.Content{mcp.TextContent(sb.String())},
		}, nil
	}
}

// parseJSON decodes the JSON document held in s into v, which must be a
// pointer as required by json.Unmarshal. It returns the decoder's error, if any.
func parseJSON(s string, v interface{}) error {
	raw := []byte(s)
	if err := json.Unmarshal(raw, v); err != nil {
		return err
	}
	return nil
}

// Loki tool definitions

// queryLogsTool describes the query_logs tool: a required LogQL expression
// plus optional time range, limit and direction.
func queryLogsTool() mcp.Tool {
	props := map[string]mcp.Property{
		"logql": {
			Type:        "string",
			Description: `LogQL query expression (e.g., '{job="varlogs"}', '{job="nginx"} |= "error"')`,
		},
		"start": {
			Type:        "string",
			Description: "Start time: relative duration (e.g., '1h', '30m'), RFC3339 timestamp, or Unix epoch seconds. Default: 1h ago",
		},
		"end": {
			Type:        "string",
			Description: "End time: relative duration (e.g., '5m'), RFC3339 timestamp, or Unix epoch seconds. Default: now",
		},
		"limit": {
			Type:        "integer",
			Description: "Maximum number of log entries to return (default: 100)",
			Default:     100,
		},
		"direction": {
			Type:        "string",
			Description: "Sort order for log entries: 'backward' (newest first) or 'forward' (oldest first)",
			Enum:        []string{"backward", "forward"},
		},
	}

	schema := mcp.InputSchema{
		Type:       "object",
		Properties: props,
		Required:   []string{"logql"},
	}

	return mcp.Tool{
		Name:        "query_logs",
		Description: "Execute a LogQL range query against Loki to search and retrieve log entries",
		InputSchema: schema,
	}
}

// listLabelsTool describes the list_labels tool, which takes no arguments.
func listLabelsTool() mcp.Tool {
	// An empty, non-nil property map: the tool accepts no parameters.
	noProps := make(map[string]mcp.Property)

	return mcp.Tool{
		Name:        "list_labels",
		Description: "List available label names from Loki",
		InputSchema: mcp.InputSchema{Type: "object", Properties: noProps},
	}
}

// listLabelValuesTool describes the list_label_values tool, which requires a
// label name.
func listLabelValuesTool() mcp.Tool {
	schema := mcp.InputSchema{
		Type: "object",
		Properties: map[string]mcp.Property{
			"label": {
				Type:        "string",
				Description: "Label name to get values for (e.g., 'job', 'instance')",
			},
		},
		Required: []string{"label"},
	}

	return mcp.Tool{
		Name:        "list_label_values",
		Description: "List values for a specific label from Loki",
		InputSchema: schema,
	}
}

// Loki handler constructors

// makeQueryLogsHandler builds the handler for the query_logs tool.
//
// Arguments:
//   - logql (required): LogQL expression.
//   - start, end: parsed by parseTimeArg; default to one hour ago and now.
//   - limit: number of entries, default 100, capped at 5000. Values below 1
//     are ignored and the default is used.
//   - direction: "backward" (default) or "forward"; anything else is rejected.
//
// Validation and Loki failures are reported as error content in the tool
// result; the returned Go error is always nil.
func makeQueryLogsHandler(loki *LokiClient) mcp.ToolHandler {
	const (
		defaultLimit = 100
		maxLimit     = 5000
	)

	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		logql, _ := args["logql"].(string)
		if logql == "" {
			return mcp.ErrorContent(fmt.Errorf("logql is required")), nil
		}

		now := time.Now()
		start := now.Add(-time.Hour)
		end := now

		if startStr, ok := args["start"].(string); ok && startStr != "" {
			parsed, err := parseTimeArg(startStr, now.Add(-time.Hour))
			if err != nil {
				return mcp.ErrorContent(fmt.Errorf("invalid start time: %w", err)), nil
			}
			start = parsed
		}

		if endStr, ok := args["end"].(string); ok && endStr != "" {
			parsed, err := parseTimeArg(endStr, now)
			if err != nil {
				return mcp.ErrorContent(fmt.Errorf("invalid end time: %w", err)), nil
			}
			end = parsed
		}

		limit := defaultLimit
		if l, ok := args["limit"].(float64); ok && l >= 1 {
			// Clamp while the value is still a float: converting an
			// out-of-range float64 to int yields an implementation-dependent
			// result that could slip past an integer cap.
			if l > maxLimit {
				limit = maxLimit
			} else {
				limit = int(l)
			}
		}

		direction := "backward"
		if d, ok := args["direction"].(string); ok && d != "" {
			if d != "backward" && d != "forward" {
				return mcp.ErrorContent(fmt.Errorf("direction must be 'backward' or 'forward'")), nil
			}
			direction = d
		}

		data, err := loki.QueryRange(ctx, logql, start, end, limit, direction)
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("log query failed: %w", err)), nil
		}

		return mcp.CallToolResult{
			Content: []mcp.Content{mcp.TextContent(formatLogStreams(data))},
		}, nil
	}
}

// makeListLabelsHandler builds the handler for the list_labels tool. The tool
// takes no arguments; it fetches the label names from Loki and formats them.
func makeListLabelsHandler(loki *LokiClient) mcp.ToolHandler {
	return func(ctx context.Context, _ map[string]interface{}) (mcp.CallToolResult, error) {
		names, err := loki.Labels(ctx)
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to list labels: %w", err)), nil
		}

		text := formatLabels(names)
		return mcp.CallToolResult{Content: []mcp.Content{mcp.TextContent(text)}}, nil
	}
}

// makeListLabelValuesHandler builds the handler for the list_label_values
// tool. It requires a "label" argument and formats the values Loki returns
// for that label.
func makeListLabelValuesHandler(loki *LokiClient) mcp.ToolHandler {
	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		name, _ := args["label"].(string)
		if name == "" {
			return mcp.ErrorContent(fmt.Errorf("label is required")), nil
		}

		vals, err := loki.LabelValues(ctx, name)
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to list label values: %w", err)), nil
		}

		text := formatLabelValues(name, vals)
		return mcp.CallToolResult{Content: []mcp.Content{mcp.TextContent(text)}}, nil
	}
}

// parseTimeArg parses a time argument. The formats are tried in this order:
//   - A relative duration (e.g., "1h", "30m", "2h30m"), interpreted as that
//     duration before the current time.
//   - An RFC3339 timestamp (e.g., "2024-01-15T10:30:00Z").
//   - A Unix epoch in seconds, digits only (e.g., "1705312200").
//
// If s matches none of these, or the epoch value does not fit in an int64,
// it returns defaultTime together with a non-nil error.
func parseTimeArg(s string, defaultTime time.Time) (time.Time, error) {
	// Try as relative duration first
	if d, err := time.ParseDuration(s); err == nil {
		return time.Now().Add(-d), nil
	}

	// Try as RFC3339
	if t, err := time.Parse(time.RFC3339, s); err == nil {
		return t, nil
	}

	// Try as Unix epoch seconds. ParseUint rejects signs and any non-digit
	// character, and a bit size of 63 makes it report values that would
	// overflow int64 instead of silently wrapping.
	if epoch, err := strconv.ParseUint(s, 10, 63); err == nil {
		return time.Unix(int64(epoch), 0), nil
	}

	return defaultTime, fmt.Errorf("cannot parse time '%s': use relative duration (e.g., '1h'), RFC3339, or Unix epoch seconds", s)
}