feat: add lab-monitoring MCP server for Prometheus and Alertmanager
New MCP server that queries live Prometheus and Alertmanager HTTP APIs with 8 tools: list_alerts, get_alert, search_metrics, get_metric_metadata, query (PromQL), list_targets, list_silences, and create_silence. Extends the MCP core with ModeCustom and NewGenericServer for servers that don't require a database. Includes CLI with direct commands (alerts, query, targets, metrics), NixOS module, and comprehensive httptest-based tests. Bumps existing binaries to 0.2.1 due to shared internal/mcp change. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -20,7 +20,7 @@ import (
|
||||
|
||||
const (
|
||||
defaultDatabase = "sqlite://hm-options.db"
|
||||
version = "0.2.0"
|
||||
version = "0.2.1"
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
369
cmd/lab-monitoring/main.go
Normal file
369
cmd/lab-monitoring/main.go
Normal file
@@ -0,0 +1,369 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"os/signal"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/urfave/cli/v2"
|
||||
|
||||
"git.t-juice.club/torjus/labmcp/internal/mcp"
|
||||
"git.t-juice.club/torjus/labmcp/internal/monitoring"
|
||||
)
|
||||
|
||||
// version is the version string of the lab-monitoring binary; main passes it
// to the CLI application as its Version field.
const version = "0.1.0"
|
||||
|
||||
func main() {
|
||||
app := &cli.App{
|
||||
Name: "lab-monitoring",
|
||||
Usage: "MCP server for Prometheus and Alertmanager monitoring",
|
||||
Version: version,
|
||||
Flags: []cli.Flag{
|
||||
&cli.StringFlag{
|
||||
Name: "prometheus-url",
|
||||
Usage: "Prometheus base URL",
|
||||
EnvVars: []string{"PROMETHEUS_URL"},
|
||||
Value: "http://localhost:9090",
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "alertmanager-url",
|
||||
Usage: "Alertmanager base URL",
|
||||
EnvVars: []string{"ALERTMANAGER_URL"},
|
||||
Value: "http://localhost:9093",
|
||||
},
|
||||
},
|
||||
Commands: []*cli.Command{
|
||||
serveCommand(),
|
||||
alertsCommand(),
|
||||
queryCommand(),
|
||||
targetsCommand(),
|
||||
metricsCommand(),
|
||||
},
|
||||
}
|
||||
|
||||
if err := app.Run(os.Args); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
func serveCommand() *cli.Command {
|
||||
return &cli.Command{
|
||||
Name: "serve",
|
||||
Usage: "Run MCP server for lab monitoring",
|
||||
Flags: []cli.Flag{
|
||||
&cli.StringFlag{
|
||||
Name: "transport",
|
||||
Aliases: []string{"t"},
|
||||
Usage: "Transport type: 'stdio' or 'http'",
|
||||
Value: "stdio",
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "http-address",
|
||||
Usage: "HTTP listen address",
|
||||
Value: "127.0.0.1:8084",
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "http-endpoint",
|
||||
Usage: "HTTP endpoint path",
|
||||
Value: "/mcp",
|
||||
},
|
||||
&cli.StringSliceFlag{
|
||||
Name: "allowed-origins",
|
||||
Usage: "Allowed Origin headers for CORS",
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "tls-cert",
|
||||
Usage: "TLS certificate file",
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "tls-key",
|
||||
Usage: "TLS key file",
|
||||
},
|
||||
&cli.DurationFlag{
|
||||
Name: "session-ttl",
|
||||
Usage: "Session TTL for HTTP transport",
|
||||
Value: 30 * time.Minute,
|
||||
},
|
||||
},
|
||||
Action: func(c *cli.Context) error {
|
||||
return runServe(c)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func alertsCommand() *cli.Command {
|
||||
return &cli.Command{
|
||||
Name: "alerts",
|
||||
Usage: "List alerts from Alertmanager",
|
||||
Flags: []cli.Flag{
|
||||
&cli.StringFlag{
|
||||
Name: "state",
|
||||
Usage: "Filter by state: active, suppressed, unprocessed",
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "severity",
|
||||
Usage: "Filter by severity label",
|
||||
},
|
||||
},
|
||||
Action: func(c *cli.Context) error {
|
||||
return runAlerts(c)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func queryCommand() *cli.Command {
|
||||
return &cli.Command{
|
||||
Name: "query",
|
||||
Usage: "Execute an instant PromQL query",
|
||||
ArgsUsage: "<promql>",
|
||||
Action: func(c *cli.Context) error {
|
||||
if c.NArg() < 1 {
|
||||
return fmt.Errorf("promql expression required")
|
||||
}
|
||||
return runQuery(c, c.Args().First())
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func targetsCommand() *cli.Command {
|
||||
return &cli.Command{
|
||||
Name: "targets",
|
||||
Usage: "List scrape targets",
|
||||
Action: func(c *cli.Context) error {
|
||||
return runTargets(c)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func metricsCommand() *cli.Command {
|
||||
return &cli.Command{
|
||||
Name: "metrics",
|
||||
Usage: "Search metric names",
|
||||
ArgsUsage: "<search>",
|
||||
Flags: []cli.Flag{
|
||||
&cli.IntFlag{
|
||||
Name: "limit",
|
||||
Aliases: []string{"n"},
|
||||
Usage: "Maximum number of results",
|
||||
Value: 50,
|
||||
},
|
||||
},
|
||||
Action: func(c *cli.Context) error {
|
||||
query := ""
|
||||
if c.NArg() > 0 {
|
||||
query = c.Args().First()
|
||||
}
|
||||
return runMetrics(c, query)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func runServe(c *cli.Context) error {
|
||||
ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
||||
defer cancel()
|
||||
|
||||
logger := log.New(os.Stderr, "[mcp] ", log.LstdFlags)
|
||||
config := mcp.DefaultMonitoringConfig()
|
||||
server := mcp.NewGenericServer(logger, config)
|
||||
|
||||
prom := monitoring.NewPrometheusClient(c.String("prometheus-url"))
|
||||
am := monitoring.NewAlertmanagerClient(c.String("alertmanager-url"))
|
||||
monitoring.RegisterHandlers(server, prom, am)
|
||||
|
||||
transport := c.String("transport")
|
||||
switch transport {
|
||||
case "stdio":
|
||||
logger.Println("Starting lab-monitoring MCP server on stdio...")
|
||||
return server.Run(ctx, os.Stdin, os.Stdout)
|
||||
|
||||
case "http":
|
||||
httpConfig := mcp.HTTPConfig{
|
||||
Address: c.String("http-address"),
|
||||
Endpoint: c.String("http-endpoint"),
|
||||
AllowedOrigins: c.StringSlice("allowed-origins"),
|
||||
SessionTTL: c.Duration("session-ttl"),
|
||||
TLSCertFile: c.String("tls-cert"),
|
||||
TLSKeyFile: c.String("tls-key"),
|
||||
}
|
||||
httpTransport := mcp.NewHTTPTransport(server, httpConfig)
|
||||
return httpTransport.Run(ctx)
|
||||
|
||||
default:
|
||||
return fmt.Errorf("unknown transport: %s (use 'stdio' or 'http')", transport)
|
||||
}
|
||||
}
|
||||
|
||||
func runAlerts(c *cli.Context) error {
|
||||
ctx := context.Background()
|
||||
am := monitoring.NewAlertmanagerClient(c.String("alertmanager-url"))
|
||||
|
||||
filters := monitoring.AlertFilters{}
|
||||
if state := c.String("state"); state != "" {
|
||||
switch state {
|
||||
case "active":
|
||||
active := true
|
||||
filters.Active = &active
|
||||
silenced := false
|
||||
filters.Silenced = &silenced
|
||||
inhibited := false
|
||||
filters.Inhibited = &inhibited
|
||||
case "suppressed":
|
||||
active := false
|
||||
filters.Active = &active
|
||||
case "unprocessed":
|
||||
unprocessed := true
|
||||
filters.Unprocessed = &unprocessed
|
||||
}
|
||||
}
|
||||
if severity := c.String("severity"); severity != "" {
|
||||
filters.Filter = append(filters.Filter, fmt.Sprintf(`severity="%s"`, severity))
|
||||
}
|
||||
|
||||
alerts, err := am.ListAlerts(ctx, filters)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to list alerts: %w", err)
|
||||
}
|
||||
|
||||
if len(alerts) == 0 {
|
||||
fmt.Println("No alerts found.")
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, a := range alerts {
|
||||
state := a.Status.State
|
||||
severity := a.Labels["severity"]
|
||||
name := a.Labels["alertname"]
|
||||
fmt.Printf("[%s] %s (severity=%s, fingerprint=%s)\n", state, name, severity, a.Fingerprint)
|
||||
for k, v := range a.Annotations {
|
||||
fmt.Printf(" %s: %s\n", k, v)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func runQuery(c *cli.Context, promql string) error {
|
||||
ctx := context.Background()
|
||||
prom := monitoring.NewPrometheusClient(c.String("prometheus-url"))
|
||||
|
||||
data, err := prom.Query(ctx, promql, time.Time{})
|
||||
if err != nil {
|
||||
return fmt.Errorf("query failed: %w", err)
|
||||
}
|
||||
|
||||
for _, r := range data.Result {
|
||||
labels := ""
|
||||
for k, v := range r.Metric {
|
||||
if labels != "" {
|
||||
labels += ", "
|
||||
}
|
||||
labels += fmt.Sprintf("%s=%q", k, v)
|
||||
}
|
||||
value := ""
|
||||
if len(r.Value) >= 2 {
|
||||
if v, ok := r.Value[1].(string); ok {
|
||||
value = v
|
||||
}
|
||||
}
|
||||
fmt.Printf("{%s} %s\n", labels, value)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func runTargets(c *cli.Context) error {
|
||||
ctx := context.Background()
|
||||
prom := monitoring.NewPrometheusClient(c.String("prometheus-url"))
|
||||
|
||||
data, err := prom.Targets(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to fetch targets: %w", err)
|
||||
}
|
||||
|
||||
if len(data.ActiveTargets) == 0 {
|
||||
fmt.Println("No active targets.")
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, t := range data.ActiveTargets {
|
||||
job := t.Labels["job"]
|
||||
instance := t.Labels["instance"]
|
||||
fmt.Printf("[%s] %s/%s (last scrape: %s, duration: %.3fs)\n",
|
||||
t.Health, job, instance, t.LastScrape.Format("15:04:05"), t.LastScrapeDuration)
|
||||
if t.LastError != "" {
|
||||
fmt.Printf(" error: %s\n", t.LastError)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func runMetrics(c *cli.Context, query string) error {
|
||||
ctx := context.Background()
|
||||
prom := monitoring.NewPrometheusClient(c.String("prometheus-url"))
|
||||
|
||||
names, err := prom.LabelValues(ctx, "__name__")
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to fetch metric names: %w", err)
|
||||
}
|
||||
|
||||
limit := c.Int("limit")
|
||||
count := 0
|
||||
for _, name := range names {
|
||||
if query != "" {
|
||||
// Simple case-insensitive substring match
|
||||
if !containsIgnoreCase(name, query) {
|
||||
continue
|
||||
}
|
||||
}
|
||||
fmt.Println(name)
|
||||
count++
|
||||
if count >= limit {
|
||||
fmt.Printf("... (showing %d of matching metrics, use --limit to see more)\n", limit)
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if count == 0 {
|
||||
fmt.Printf("No metrics found matching '%s'\n", query)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// containsIgnoreCase reports whether substr occurs within s when the ASCII
// letters A-Z are treated as equal to a-z. All other bytes, including the
// bytes of multi-byte UTF-8 characters, must match exactly. An empty substr is
// contained in every string, including the empty string.
//
// The comparison walks the strings byte by byte. Ranging over a string with
// `for i := range s` visits only the first byte of each rune, which would
// leave continuation bytes out of the comparison and make distinct non-ASCII
// characters that share a lead byte compare as equal.
func containsIgnoreCase(s, substr string) bool {
	n := len(substr)
	for start := 0; start+n <= len(s); start++ {
		if equalFoldASCIIBytes(s[start:start+n], substr) {
			return true
		}
	}
	return false
}

// equalFoldASCIIBytes reports whether a and b have the same length and are
// byte-wise equal after folding ASCII upper-case letters to lower case.
func equalFoldASCIIBytes(a, b string) bool {
	if len(a) != len(b) {
		return false
	}
	for i := 0; i < len(a); i++ {
		if lowerASCIIByte(a[i]) != lowerASCIIByte(b[i]) {
			return false
		}
	}
	return true
}

// lowerASCIIByte maps 'A'..'Z' to 'a'..'z' and returns every other byte
// unchanged.
func lowerASCIIByte(b byte) byte {
	if b >= 'A' && b <= 'Z' {
		return b + ('a' - 'A')
	}
	return b
}
|
||||
@@ -19,7 +19,7 @@ import (
|
||||
|
||||
const (
|
||||
defaultDatabase = "sqlite://nixos-options.db"
|
||||
version = "0.2.0"
|
||||
version = "0.2.1"
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
@@ -20,7 +20,7 @@ import (
|
||||
|
||||
const (
|
||||
defaultDatabase = "sqlite://nixpkgs-search.db"
|
||||
version = "0.2.0"
|
||||
version = "0.2.1"
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
Reference in New Issue
Block a user