feat: add lab-monitoring MCP server for Prometheus and Alertmanager
New MCP server that queries live Prometheus and Alertmanager HTTP APIs with 8 tools: list_alerts, get_alert, search_metrics, get_metric_metadata, query (PromQL), list_targets, list_silences, and create_silence. Extends the MCP core with ModeCustom and NewGenericServer for servers that don't require a database. Includes CLI with direct commands (alerts, query, targets, metrics), NixOS module, and comprehensive httptest-based tests. Bumps existing binaries to 0.2.1 due to shared internal/mcp change. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
434
internal/monitoring/handlers.go
Normal file
434
internal/monitoring/handlers.go
Normal file
@@ -0,0 +1,434 @@
|
||||
package monitoring
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"git.t-juice.club/torjus/labmcp/internal/mcp"
|
||||
)
|
||||
|
||||
// RegisterHandlers registers all monitoring tool handlers on the MCP server.
|
||||
func RegisterHandlers(server *mcp.Server, prom *PrometheusClient, am *AlertmanagerClient) {
|
||||
server.RegisterTool(listAlertsTool(), makeListAlertsHandler(am))
|
||||
server.RegisterTool(getAlertTool(), makeGetAlertHandler(am))
|
||||
server.RegisterTool(searchMetricsTool(), makeSearchMetricsHandler(prom))
|
||||
server.RegisterTool(getMetricMetadataTool(), makeGetMetricMetadataHandler(prom))
|
||||
server.RegisterTool(queryTool(), makeQueryHandler(prom))
|
||||
server.RegisterTool(listTargetsTool(), makeListTargetsHandler(prom))
|
||||
server.RegisterTool(listSilencesTool(), makeListSilencesHandler(am))
|
||||
server.RegisterTool(createSilenceTool(), makeCreateSilenceHandler(am))
|
||||
}
|
||||
|
||||
// Tool definitions
|
||||
|
||||
func listAlertsTool() mcp.Tool {
|
||||
return mcp.Tool{
|
||||
Name: "list_alerts",
|
||||
Description: "List alerts from Alertmanager with optional filters",
|
||||
InputSchema: mcp.InputSchema{
|
||||
Type: "object",
|
||||
Properties: map[string]mcp.Property{
|
||||
"state": {
|
||||
Type: "string",
|
||||
Description: "Filter by alert state: 'active', 'suppressed', or 'unprocessed'",
|
||||
Enum: []string{"active", "suppressed", "unprocessed"},
|
||||
},
|
||||
"severity": {
|
||||
Type: "string",
|
||||
Description: "Filter by severity label (e.g., 'critical', 'warning')",
|
||||
},
|
||||
"receiver": {
|
||||
Type: "string",
|
||||
Description: "Filter by receiver name",
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func getAlertTool() mcp.Tool {
|
||||
return mcp.Tool{
|
||||
Name: "get_alert",
|
||||
Description: "Get full details for a specific alert by fingerprint",
|
||||
InputSchema: mcp.InputSchema{
|
||||
Type: "object",
|
||||
Properties: map[string]mcp.Property{
|
||||
"fingerprint": {
|
||||
Type: "string",
|
||||
Description: "Alert fingerprint identifier",
|
||||
},
|
||||
},
|
||||
Required: []string{"fingerprint"},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func searchMetricsTool() mcp.Tool {
|
||||
return mcp.Tool{
|
||||
Name: "search_metrics",
|
||||
Description: "Search Prometheus metric names with optional substring filter, enriched with metadata (type, help text)",
|
||||
InputSchema: mcp.InputSchema{
|
||||
Type: "object",
|
||||
Properties: map[string]mcp.Property{
|
||||
"query": {
|
||||
Type: "string",
|
||||
Description: "Substring to filter metric names (e.g., 'cpu', 'memory', 'node_'). Empty returns all metrics.",
|
||||
},
|
||||
"limit": {
|
||||
Type: "integer",
|
||||
Description: "Maximum number of results (default: 50)",
|
||||
Default: 50,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func getMetricMetadataTool() mcp.Tool {
|
||||
return mcp.Tool{
|
||||
Name: "get_metric_metadata",
|
||||
Description: "Get type, help text, and unit for a specific Prometheus metric",
|
||||
InputSchema: mcp.InputSchema{
|
||||
Type: "object",
|
||||
Properties: map[string]mcp.Property{
|
||||
"metric": {
|
||||
Type: "string",
|
||||
Description: "Metric name (e.g., 'node_cpu_seconds_total')",
|
||||
},
|
||||
},
|
||||
Required: []string{"metric"},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func queryTool() mcp.Tool {
|
||||
return mcp.Tool{
|
||||
Name: "query",
|
||||
Description: "Execute an instant PromQL query against Prometheus. Supports aggregations like avg_over_time(metric[1h]), rate(), sum(), etc.",
|
||||
InputSchema: mcp.InputSchema{
|
||||
Type: "object",
|
||||
Properties: map[string]mcp.Property{
|
||||
"promql": {
|
||||
Type: "string",
|
||||
Description: "PromQL expression to evaluate (e.g., 'up', 'rate(http_requests_total[5m])', 'avg_over_time(node_load1[1h])')",
|
||||
},
|
||||
},
|
||||
Required: []string{"promql"},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func listTargetsTool() mcp.Tool {
|
||||
return mcp.Tool{
|
||||
Name: "list_targets",
|
||||
Description: "List Prometheus scrape targets with health status, grouped by job",
|
||||
InputSchema: mcp.InputSchema{
|
||||
Type: "object",
|
||||
Properties: map[string]mcp.Property{},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func listSilencesTool() mcp.Tool {
|
||||
return mcp.Tool{
|
||||
Name: "list_silences",
|
||||
Description: "List active and pending alert silences from Alertmanager",
|
||||
InputSchema: mcp.InputSchema{
|
||||
Type: "object",
|
||||
Properties: map[string]mcp.Property{},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func createSilenceTool() mcp.Tool {
|
||||
return mcp.Tool{
|
||||
Name: "create_silence",
|
||||
Description: `Create a new silence in Alertmanager. IMPORTANT: Always confirm with the user before creating a silence, showing them the matchers, duration, and reason.`,
|
||||
InputSchema: mcp.InputSchema{
|
||||
Type: "object",
|
||||
Properties: map[string]mcp.Property{
|
||||
"matchers": {
|
||||
Type: "string",
|
||||
Description: `JSON array of matchers, e.g. [{"name":"alertname","value":"TargetDown","isRegex":false}]`,
|
||||
},
|
||||
"duration": {
|
||||
Type: "string",
|
||||
Description: "Silence duration in Go duration format (e.g., '2h', '30m', '1h30m')",
|
||||
},
|
||||
"author": {
|
||||
Type: "string",
|
||||
Description: "Author of the silence",
|
||||
},
|
||||
"comment": {
|
||||
Type: "string",
|
||||
Description: "Reason for the silence",
|
||||
},
|
||||
},
|
||||
Required: []string{"matchers", "duration", "author", "comment"},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Handler constructors
|
||||
|
||||
func makeListAlertsHandler(am *AlertmanagerClient) mcp.ToolHandler {
|
||||
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
|
||||
filters := AlertFilters{}
|
||||
|
||||
if state, ok := args["state"].(string); ok && state != "" {
|
||||
switch state {
|
||||
case "active":
|
||||
active := true
|
||||
filters.Active = &active
|
||||
silenced := false
|
||||
filters.Silenced = &silenced
|
||||
inhibited := false
|
||||
filters.Inhibited = &inhibited
|
||||
case "suppressed":
|
||||
active := false
|
||||
filters.Active = &active
|
||||
case "unprocessed":
|
||||
unprocessed := true
|
||||
filters.Unprocessed = &unprocessed
|
||||
}
|
||||
}
|
||||
|
||||
if severity, ok := args["severity"].(string); ok && severity != "" {
|
||||
filters.Filter = append(filters.Filter, fmt.Sprintf(`severity="%s"`, severity))
|
||||
}
|
||||
|
||||
if receiver, ok := args["receiver"].(string); ok && receiver != "" {
|
||||
filters.Receiver = receiver
|
||||
}
|
||||
|
||||
alerts, err := am.ListAlerts(ctx, filters)
|
||||
if err != nil {
|
||||
return mcp.ErrorContent(fmt.Errorf("failed to list alerts: %w", err)), nil
|
||||
}
|
||||
|
||||
return mcp.CallToolResult{
|
||||
Content: []mcp.Content{mcp.TextContent(formatAlerts(alerts))},
|
||||
}, nil
|
||||
}
|
||||
}
|
||||
|
||||
func makeGetAlertHandler(am *AlertmanagerClient) mcp.ToolHandler {
|
||||
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
|
||||
fingerprint, _ := args["fingerprint"].(string)
|
||||
if fingerprint == "" {
|
||||
return mcp.ErrorContent(fmt.Errorf("fingerprint is required")), nil
|
||||
}
|
||||
|
||||
// Fetch all alerts and find the one matching the fingerprint
|
||||
alerts, err := am.ListAlerts(ctx, AlertFilters{})
|
||||
if err != nil {
|
||||
return mcp.ErrorContent(fmt.Errorf("failed to fetch alerts: %w", err)), nil
|
||||
}
|
||||
|
||||
for _, a := range alerts {
|
||||
if a.Fingerprint == fingerprint {
|
||||
return mcp.CallToolResult{
|
||||
Content: []mcp.Content{mcp.TextContent(formatAlerts([]Alert{a}))},
|
||||
}, nil
|
||||
}
|
||||
}
|
||||
|
||||
return mcp.ErrorContent(fmt.Errorf("alert with fingerprint '%s' not found", fingerprint)), nil
|
||||
}
|
||||
}
|
||||
|
||||
func makeSearchMetricsHandler(prom *PrometheusClient) mcp.ToolHandler {
|
||||
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
|
||||
query, _ := args["query"].(string)
|
||||
limit := 50
|
||||
if l, ok := args["limit"].(float64); ok && l > 0 {
|
||||
limit = int(l)
|
||||
}
|
||||
|
||||
// Get all metric names
|
||||
allNames, err := prom.LabelValues(ctx, "__name__")
|
||||
if err != nil {
|
||||
return mcp.ErrorContent(fmt.Errorf("failed to fetch metric names: %w", err)), nil
|
||||
}
|
||||
|
||||
// Filter by substring
|
||||
var matched []string
|
||||
queryLower := strings.ToLower(query)
|
||||
for _, name := range allNames {
|
||||
if query == "" || strings.Contains(strings.ToLower(name), queryLower) {
|
||||
matched = append(matched, name)
|
||||
if len(matched) >= limit {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fetch metadata for matched metrics
|
||||
metadata, err := prom.Metadata(ctx, "")
|
||||
if err != nil {
|
||||
// Non-fatal: proceed without metadata
|
||||
metadata = nil
|
||||
}
|
||||
|
||||
return mcp.CallToolResult{
|
||||
Content: []mcp.Content{mcp.TextContent(formatMetricSearch(matched, metadata))},
|
||||
}, nil
|
||||
}
|
||||
}
|
||||
|
||||
func makeGetMetricMetadataHandler(prom *PrometheusClient) mcp.ToolHandler {
|
||||
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
|
||||
metric, _ := args["metric"].(string)
|
||||
if metric == "" {
|
||||
return mcp.ErrorContent(fmt.Errorf("metric is required")), nil
|
||||
}
|
||||
|
||||
metadata, err := prom.Metadata(ctx, metric)
|
||||
if err != nil {
|
||||
return mcp.ErrorContent(fmt.Errorf("failed to fetch metadata: %w", err)), nil
|
||||
}
|
||||
|
||||
metas := metadata[metric]
|
||||
return mcp.CallToolResult{
|
||||
Content: []mcp.Content{mcp.TextContent(formatMetricMetadata(metric, metas))},
|
||||
}, nil
|
||||
}
|
||||
}
|
||||
|
||||
func makeQueryHandler(prom *PrometheusClient) mcp.ToolHandler {
|
||||
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
|
||||
promql, _ := args["promql"].(string)
|
||||
if promql == "" {
|
||||
return mcp.ErrorContent(fmt.Errorf("promql is required")), nil
|
||||
}
|
||||
|
||||
data, err := prom.Query(ctx, promql, time.Time{})
|
||||
if err != nil {
|
||||
return mcp.ErrorContent(fmt.Errorf("query failed: %w", err)), nil
|
||||
}
|
||||
|
||||
var result string
|
||||
switch data.ResultType {
|
||||
case "vector":
|
||||
result = formatInstantVector(data.Result)
|
||||
case "scalar":
|
||||
if len(data.Result) > 0 && len(data.Result[0].Value) >= 2 {
|
||||
if v, ok := data.Result[0].Value[1].(string); ok {
|
||||
result = fmt.Sprintf("**Scalar result:** %s", v)
|
||||
}
|
||||
}
|
||||
if result == "" {
|
||||
result = "Scalar query returned no value."
|
||||
}
|
||||
default:
|
||||
result = fmt.Sprintf("Result type: %s\n\n%s", data.ResultType, formatInstantVector(data.Result))
|
||||
}
|
||||
|
||||
return mcp.CallToolResult{
|
||||
Content: []mcp.Content{mcp.TextContent(result)},
|
||||
}, nil
|
||||
}
|
||||
}
|
||||
|
||||
func makeListTargetsHandler(prom *PrometheusClient) mcp.ToolHandler {
|
||||
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
|
||||
data, err := prom.Targets(ctx)
|
||||
if err != nil {
|
||||
return mcp.ErrorContent(fmt.Errorf("failed to fetch targets: %w", err)), nil
|
||||
}
|
||||
|
||||
return mcp.CallToolResult{
|
||||
Content: []mcp.Content{mcp.TextContent(formatTargets(data))},
|
||||
}, nil
|
||||
}
|
||||
}
|
||||
|
||||
func makeListSilencesHandler(am *AlertmanagerClient) mcp.ToolHandler {
|
||||
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
|
||||
silences, err := am.ListSilences(ctx)
|
||||
if err != nil {
|
||||
return mcp.ErrorContent(fmt.Errorf("failed to fetch silences: %w", err)), nil
|
||||
}
|
||||
|
||||
// Filter to active/pending only
|
||||
var filtered []Silence
|
||||
for _, s := range silences {
|
||||
if s.Status != nil && (s.Status.State == "active" || s.Status.State == "pending") {
|
||||
filtered = append(filtered, s)
|
||||
}
|
||||
}
|
||||
|
||||
return mcp.CallToolResult{
|
||||
Content: []mcp.Content{mcp.TextContent(formatSilences(filtered))},
|
||||
}, nil
|
||||
}
|
||||
}
|
||||
|
||||
func makeCreateSilenceHandler(am *AlertmanagerClient) mcp.ToolHandler {
|
||||
return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
|
||||
matchersJSON, _ := args["matchers"].(string)
|
||||
if matchersJSON == "" {
|
||||
return mcp.ErrorContent(fmt.Errorf("matchers is required")), nil
|
||||
}
|
||||
|
||||
durationStr, _ := args["duration"].(string)
|
||||
if durationStr == "" {
|
||||
return mcp.ErrorContent(fmt.Errorf("duration is required")), nil
|
||||
}
|
||||
|
||||
author, _ := args["author"].(string)
|
||||
if author == "" {
|
||||
return mcp.ErrorContent(fmt.Errorf("author is required")), nil
|
||||
}
|
||||
|
||||
comment, _ := args["comment"].(string)
|
||||
if comment == "" {
|
||||
return mcp.ErrorContent(fmt.Errorf("comment is required")), nil
|
||||
}
|
||||
|
||||
// Parse matchers
|
||||
var matchers []Matcher
|
||||
if err := parseJSON(matchersJSON, &matchers); err != nil {
|
||||
return mcp.ErrorContent(fmt.Errorf("invalid matchers JSON: %w", err)), nil
|
||||
}
|
||||
|
||||
// Parse duration
|
||||
duration, err := time.ParseDuration(durationStr)
|
||||
if err != nil {
|
||||
return mcp.ErrorContent(fmt.Errorf("invalid duration: %w", err)), nil
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
silence := Silence{
|
||||
Matchers: matchers,
|
||||
StartsAt: now,
|
||||
EndsAt: now.Add(duration),
|
||||
CreatedBy: author,
|
||||
Comment: comment,
|
||||
}
|
||||
|
||||
id, err := am.CreateSilence(ctx, silence)
|
||||
if err != nil {
|
||||
return mcp.ErrorContent(fmt.Errorf("failed to create silence: %w", err)), nil
|
||||
}
|
||||
|
||||
var sb strings.Builder
|
||||
sb.WriteString("Silence created successfully.\n\n")
|
||||
sb.WriteString(fmt.Sprintf("**ID:** %s\n", id))
|
||||
sb.WriteString(fmt.Sprintf("**Expires:** %s\n", silence.EndsAt.Format(time.RFC3339)))
|
||||
sb.WriteString(fmt.Sprintf("**Author:** %s\n", author))
|
||||
sb.WriteString(fmt.Sprintf("**Comment:** %s\n", comment))
|
||||
|
||||
return mcp.CallToolResult{
|
||||
Content: []mcp.Content{mcp.TextContent(sb.String())},
|
||||
}, nil
|
||||
}
|
||||
}
|
||||
|
||||
// parseJSON decodes the JSON document held in s into the value pointed to by
// v, returning whatever error json.Unmarshal reports.
func parseJSON(s string, v interface{}) error {
	raw := []byte(s)
	return json.Unmarshal(raw, v)
}
|
||||
Reference in New Issue
Block a user