Add 3 opt-in Loki tools (query_logs, list_labels, list_label_values) that are registered when LOKI_URL is configured. Includes Loki HTTP client, CLI commands (logs, labels), NixOS module option, formatting, and tests. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
587 lines
13 KiB
Go
587 lines
13 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"os/signal"
|
|
"syscall"
|
|
"time"
|
|
|
|
"github.com/urfave/cli/v2"
|
|
|
|
"git.t-juice.club/torjus/labmcp/internal/mcp"
|
|
"git.t-juice.club/torjus/labmcp/internal/monitoring"
|
|
)
|
|
|
|
const version = "0.2.0"
|
|
|
|
func main() {
|
|
app := &cli.App{
|
|
Name: "lab-monitoring",
|
|
Usage: "MCP server for Prometheus and Alertmanager monitoring",
|
|
Version: version,
|
|
Flags: []cli.Flag{
|
|
&cli.StringFlag{
|
|
Name: "prometheus-url",
|
|
Usage: "Prometheus base URL",
|
|
EnvVars: []string{"PROMETHEUS_URL"},
|
|
Value: "http://localhost:9090",
|
|
},
|
|
&cli.StringFlag{
|
|
Name: "alertmanager-url",
|
|
Usage: "Alertmanager base URL",
|
|
EnvVars: []string{"ALERTMANAGER_URL"},
|
|
Value: "http://localhost:9093",
|
|
},
|
|
&cli.StringFlag{
|
|
Name: "loki-url",
|
|
Usage: "Loki base URL (optional, enables log query tools)",
|
|
EnvVars: []string{"LOKI_URL"},
|
|
},
|
|
},
|
|
Commands: []*cli.Command{
|
|
serveCommand(),
|
|
alertsCommand(),
|
|
queryCommand(),
|
|
targetsCommand(),
|
|
metricsCommand(),
|
|
logsCommand(),
|
|
labelsCommand(),
|
|
},
|
|
}
|
|
|
|
if err := app.Run(os.Args); err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
}
|
|
|
|
func serveCommand() *cli.Command {
|
|
return &cli.Command{
|
|
Name: "serve",
|
|
Usage: "Run MCP server for lab monitoring",
|
|
Flags: []cli.Flag{
|
|
&cli.StringFlag{
|
|
Name: "transport",
|
|
Aliases: []string{"t"},
|
|
Usage: "Transport type: 'stdio' or 'http'",
|
|
Value: "stdio",
|
|
},
|
|
&cli.StringFlag{
|
|
Name: "http-address",
|
|
Usage: "HTTP listen address",
|
|
Value: "127.0.0.1:8084",
|
|
},
|
|
&cli.StringFlag{
|
|
Name: "http-endpoint",
|
|
Usage: "HTTP endpoint path",
|
|
Value: "/mcp",
|
|
},
|
|
&cli.StringSliceFlag{
|
|
Name: "allowed-origins",
|
|
Usage: "Allowed Origin headers for CORS",
|
|
},
|
|
&cli.StringFlag{
|
|
Name: "tls-cert",
|
|
Usage: "TLS certificate file",
|
|
},
|
|
&cli.StringFlag{
|
|
Name: "tls-key",
|
|
Usage: "TLS key file",
|
|
},
|
|
&cli.DurationFlag{
|
|
Name: "session-ttl",
|
|
Usage: "Session TTL for HTTP transport",
|
|
Value: 30 * time.Minute,
|
|
},
|
|
&cli.BoolFlag{
|
|
Name: "enable-silences",
|
|
Usage: "Enable the create_silence tool (write operation, disabled by default)",
|
|
},
|
|
},
|
|
Action: func(c *cli.Context) error {
|
|
return runServe(c)
|
|
},
|
|
}
|
|
}
|
|
|
|
func alertsCommand() *cli.Command {
|
|
return &cli.Command{
|
|
Name: "alerts",
|
|
Usage: "List alerts from Alertmanager",
|
|
Flags: []cli.Flag{
|
|
&cli.StringFlag{
|
|
Name: "state",
|
|
Usage: "Filter by state: active, suppressed, unprocessed",
|
|
},
|
|
&cli.StringFlag{
|
|
Name: "severity",
|
|
Usage: "Filter by severity label",
|
|
},
|
|
},
|
|
Action: func(c *cli.Context) error {
|
|
return runAlerts(c)
|
|
},
|
|
}
|
|
}
|
|
|
|
func queryCommand() *cli.Command {
|
|
return &cli.Command{
|
|
Name: "query",
|
|
Usage: "Execute an instant PromQL query",
|
|
ArgsUsage: "<promql>",
|
|
Action: func(c *cli.Context) error {
|
|
if c.NArg() < 1 {
|
|
return fmt.Errorf("promql expression required")
|
|
}
|
|
return runQuery(c, c.Args().First())
|
|
},
|
|
}
|
|
}
|
|
|
|
func targetsCommand() *cli.Command {
|
|
return &cli.Command{
|
|
Name: "targets",
|
|
Usage: "List scrape targets",
|
|
Action: func(c *cli.Context) error {
|
|
return runTargets(c)
|
|
},
|
|
}
|
|
}
|
|
|
|
func metricsCommand() *cli.Command {
|
|
return &cli.Command{
|
|
Name: "metrics",
|
|
Usage: "Search metric names",
|
|
ArgsUsage: "<search>",
|
|
Flags: []cli.Flag{
|
|
&cli.IntFlag{
|
|
Name: "limit",
|
|
Aliases: []string{"n"},
|
|
Usage: "Maximum number of results",
|
|
Value: 50,
|
|
},
|
|
},
|
|
Action: func(c *cli.Context) error {
|
|
query := ""
|
|
if c.NArg() > 0 {
|
|
query = c.Args().First()
|
|
}
|
|
return runMetrics(c, query)
|
|
},
|
|
}
|
|
}
|
|
|
|
func runServe(c *cli.Context) error {
|
|
ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
|
defer cancel()
|
|
|
|
logger := log.New(os.Stderr, "[mcp] ", log.LstdFlags)
|
|
config := mcp.DefaultMonitoringConfig()
|
|
|
|
prom := monitoring.NewPrometheusClient(c.String("prometheus-url"))
|
|
am := monitoring.NewAlertmanagerClient(c.String("alertmanager-url"))
|
|
|
|
var loki *monitoring.LokiClient
|
|
if lokiURL := c.String("loki-url"); lokiURL != "" {
|
|
loki = monitoring.NewLokiClient(lokiURL)
|
|
}
|
|
|
|
config.InstructionsFunc = func() string {
|
|
return monitoring.AlertSummary(am)
|
|
}
|
|
|
|
server := mcp.NewGenericServer(logger, config)
|
|
opts := monitoring.HandlerOptions{
|
|
EnableSilences: c.Bool("enable-silences"),
|
|
}
|
|
monitoring.RegisterHandlers(server, prom, am, loki, opts)
|
|
|
|
transport := c.String("transport")
|
|
switch transport {
|
|
case "stdio":
|
|
logger.Println("Starting lab-monitoring MCP server on stdio...")
|
|
return server.Run(ctx, os.Stdin, os.Stdout)
|
|
|
|
case "http":
|
|
httpConfig := mcp.HTTPConfig{
|
|
Address: c.String("http-address"),
|
|
Endpoint: c.String("http-endpoint"),
|
|
AllowedOrigins: c.StringSlice("allowed-origins"),
|
|
SessionTTL: c.Duration("session-ttl"),
|
|
TLSCertFile: c.String("tls-cert"),
|
|
TLSKeyFile: c.String("tls-key"),
|
|
}
|
|
httpTransport := mcp.NewHTTPTransport(server, httpConfig)
|
|
return httpTransport.Run(ctx)
|
|
|
|
default:
|
|
return fmt.Errorf("unknown transport: %s (use 'stdio' or 'http')", transport)
|
|
}
|
|
}
|
|
|
|
func runAlerts(c *cli.Context) error {
|
|
ctx := context.Background()
|
|
am := monitoring.NewAlertmanagerClient(c.String("alertmanager-url"))
|
|
|
|
filters := monitoring.AlertFilters{}
|
|
if state := c.String("state"); state != "" {
|
|
switch state {
|
|
case "active":
|
|
active := true
|
|
filters.Active = &active
|
|
silenced := false
|
|
filters.Silenced = &silenced
|
|
inhibited := false
|
|
filters.Inhibited = &inhibited
|
|
case "suppressed":
|
|
active := false
|
|
filters.Active = &active
|
|
case "unprocessed":
|
|
unprocessed := true
|
|
filters.Unprocessed = &unprocessed
|
|
}
|
|
}
|
|
if severity := c.String("severity"); severity != "" {
|
|
filters.Filter = append(filters.Filter, fmt.Sprintf(`severity="%s"`, severity))
|
|
}
|
|
|
|
alerts, err := am.ListAlerts(ctx, filters)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to list alerts: %w", err)
|
|
}
|
|
|
|
if len(alerts) == 0 {
|
|
fmt.Println("No alerts found.")
|
|
return nil
|
|
}
|
|
|
|
for _, a := range alerts {
|
|
state := a.Status.State
|
|
severity := a.Labels["severity"]
|
|
name := a.Labels["alertname"]
|
|
fmt.Printf("[%s] %s (severity=%s, fingerprint=%s)\n", state, name, severity, a.Fingerprint)
|
|
for k, v := range a.Annotations {
|
|
fmt.Printf(" %s: %s\n", k, v)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func runQuery(c *cli.Context, promql string) error {
|
|
ctx := context.Background()
|
|
prom := monitoring.NewPrometheusClient(c.String("prometheus-url"))
|
|
|
|
data, err := prom.Query(ctx, promql, time.Time{})
|
|
if err != nil {
|
|
return fmt.Errorf("query failed: %w", err)
|
|
}
|
|
|
|
for _, r := range data.Result {
|
|
labels := ""
|
|
for k, v := range r.Metric {
|
|
if labels != "" {
|
|
labels += ", "
|
|
}
|
|
labels += fmt.Sprintf("%s=%q", k, v)
|
|
}
|
|
value := ""
|
|
if len(r.Value) >= 2 {
|
|
if v, ok := r.Value[1].(string); ok {
|
|
value = v
|
|
}
|
|
}
|
|
fmt.Printf("{%s} %s\n", labels, value)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func runTargets(c *cli.Context) error {
|
|
ctx := context.Background()
|
|
prom := monitoring.NewPrometheusClient(c.String("prometheus-url"))
|
|
|
|
data, err := prom.Targets(ctx)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to fetch targets: %w", err)
|
|
}
|
|
|
|
if len(data.ActiveTargets) == 0 {
|
|
fmt.Println("No active targets.")
|
|
return nil
|
|
}
|
|
|
|
for _, t := range data.ActiveTargets {
|
|
job := t.Labels["job"]
|
|
instance := t.Labels["instance"]
|
|
fmt.Printf("[%s] %s/%s (last scrape: %s, duration: %.3fs)\n",
|
|
t.Health, job, instance, t.LastScrape.Format("15:04:05"), t.LastScrapeDuration)
|
|
if t.LastError != "" {
|
|
fmt.Printf(" error: %s\n", t.LastError)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func runMetrics(c *cli.Context, query string) error {
|
|
ctx := context.Background()
|
|
prom := monitoring.NewPrometheusClient(c.String("prometheus-url"))
|
|
|
|
names, err := prom.LabelValues(ctx, "__name__")
|
|
if err != nil {
|
|
return fmt.Errorf("failed to fetch metric names: %w", err)
|
|
}
|
|
|
|
limit := c.Int("limit")
|
|
count := 0
|
|
for _, name := range names {
|
|
if query != "" {
|
|
// Simple case-insensitive substring match
|
|
if !containsIgnoreCase(name, query) {
|
|
continue
|
|
}
|
|
}
|
|
fmt.Println(name)
|
|
count++
|
|
if count >= limit {
|
|
fmt.Printf("... (showing %d of matching metrics, use --limit to see more)\n", limit)
|
|
break
|
|
}
|
|
}
|
|
|
|
if count == 0 {
|
|
fmt.Printf("No metrics found matching '%s'\n", query)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func logsCommand() *cli.Command {
|
|
return &cli.Command{
|
|
Name: "logs",
|
|
Usage: "Query logs from Loki using LogQL",
|
|
ArgsUsage: "<logql>",
|
|
Flags: []cli.Flag{
|
|
&cli.StringFlag{
|
|
Name: "start",
|
|
Usage: "Start time: relative duration (e.g., '1h'), RFC3339, or Unix epoch",
|
|
Value: "1h",
|
|
},
|
|
&cli.StringFlag{
|
|
Name: "end",
|
|
Usage: "End time: relative duration, RFC3339, or Unix epoch",
|
|
Value: "now",
|
|
},
|
|
&cli.IntFlag{
|
|
Name: "limit",
|
|
Aliases: []string{"n"},
|
|
Usage: "Maximum number of entries",
|
|
Value: 100,
|
|
},
|
|
&cli.StringFlag{
|
|
Name: "direction",
|
|
Usage: "Sort order: 'backward' (newest first) or 'forward' (oldest first)",
|
|
Value: "backward",
|
|
},
|
|
},
|
|
Action: func(c *cli.Context) error {
|
|
if c.NArg() < 1 {
|
|
return fmt.Errorf("LogQL expression required")
|
|
}
|
|
return runLogs(c, c.Args().First())
|
|
},
|
|
}
|
|
}
|
|
|
|
func labelsCommand() *cli.Command {
|
|
return &cli.Command{
|
|
Name: "labels",
|
|
Usage: "List labels from Loki, or values for a specific label",
|
|
Flags: []cli.Flag{
|
|
&cli.StringFlag{
|
|
Name: "values",
|
|
Usage: "Get values for this label name instead of listing labels",
|
|
},
|
|
},
|
|
Action: func(c *cli.Context) error {
|
|
return runLabels(c)
|
|
},
|
|
}
|
|
}
|
|
|
|
func runLogs(c *cli.Context, logql string) error {
|
|
lokiURL := c.String("loki-url")
|
|
if lokiURL == "" {
|
|
return fmt.Errorf("--loki-url or LOKI_URL is required for log queries")
|
|
}
|
|
|
|
ctx := context.Background()
|
|
loki := monitoring.NewLokiClient(lokiURL)
|
|
|
|
now := time.Now()
|
|
start, err := parseCLITime(c.String("start"), now.Add(-time.Hour))
|
|
if err != nil {
|
|
return fmt.Errorf("invalid start time: %w", err)
|
|
}
|
|
end, err := parseCLITime(c.String("end"), now)
|
|
if err != nil {
|
|
return fmt.Errorf("invalid end time: %w", err)
|
|
}
|
|
|
|
data, err := loki.QueryRange(ctx, logql, start, end, c.Int("limit"), c.String("direction"))
|
|
if err != nil {
|
|
return fmt.Errorf("log query failed: %w", err)
|
|
}
|
|
|
|
totalEntries := 0
|
|
for _, stream := range data.Result {
|
|
totalEntries += len(stream.Values)
|
|
}
|
|
|
|
if totalEntries == 0 {
|
|
fmt.Println("No log entries found.")
|
|
return nil
|
|
}
|
|
|
|
for _, stream := range data.Result {
|
|
// Print stream labels
|
|
labels := ""
|
|
for k, v := range stream.Stream {
|
|
if labels != "" {
|
|
labels += ", "
|
|
}
|
|
labels += fmt.Sprintf("%s=%q", k, v)
|
|
}
|
|
fmt.Printf("--- {%s} ---\n", labels)
|
|
|
|
for _, entry := range stream.Values {
|
|
ts := formatCLITimestamp(entry[0])
|
|
fmt.Printf("[%s] %s\n", ts, entry[1])
|
|
}
|
|
fmt.Println()
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func runLabels(c *cli.Context) error {
|
|
lokiURL := c.String("loki-url")
|
|
if lokiURL == "" {
|
|
return fmt.Errorf("--loki-url or LOKI_URL is required for label queries")
|
|
}
|
|
|
|
ctx := context.Background()
|
|
loki := monitoring.NewLokiClient(lokiURL)
|
|
|
|
if label := c.String("values"); label != "" {
|
|
values, err := loki.LabelValues(ctx, label)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to list label values: %w", err)
|
|
}
|
|
if len(values) == 0 {
|
|
fmt.Printf("No values found for label '%s'.\n", label)
|
|
return nil
|
|
}
|
|
for _, v := range values {
|
|
fmt.Println(v)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
labels, err := loki.Labels(ctx)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to list labels: %w", err)
|
|
}
|
|
if len(labels) == 0 {
|
|
fmt.Println("No labels found.")
|
|
return nil
|
|
}
|
|
for _, label := range labels {
|
|
fmt.Println(label)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// parseCLITime parses a time string for CLI use. Handles "now", relative durations,
|
|
// RFC3339, and Unix epoch seconds.
|
|
func parseCLITime(s string, defaultTime time.Time) (time.Time, error) {
|
|
if s == "now" || s == "" {
|
|
return time.Now(), nil
|
|
}
|
|
|
|
// Try as relative duration
|
|
if d, err := time.ParseDuration(s); err == nil {
|
|
return time.Now().Add(-d), nil
|
|
}
|
|
|
|
// Try as RFC3339
|
|
if t, err := time.Parse(time.RFC3339, s); err == nil {
|
|
return t, nil
|
|
}
|
|
|
|
// Try as Unix epoch seconds
|
|
var epoch int64
|
|
validDigits := true
|
|
for _, c := range s {
|
|
if c >= '0' && c <= '9' {
|
|
epoch = epoch*10 + int64(c-'0')
|
|
} else {
|
|
validDigits = false
|
|
break
|
|
}
|
|
}
|
|
if validDigits && len(s) > 0 {
|
|
return time.Unix(epoch, 0), nil
|
|
}
|
|
|
|
return defaultTime, fmt.Errorf("cannot parse time '%s'", s)
|
|
}
|
|
|
|
// formatCLITimestamp converts a nanosecond Unix timestamp string to a readable format.
|
|
func formatCLITimestamp(nsStr string) string {
|
|
var ns int64
|
|
for _, c := range nsStr {
|
|
if c >= '0' && c <= '9' {
|
|
ns = ns*10 + int64(c-'0')
|
|
}
|
|
}
|
|
t := time.Unix(0, ns)
|
|
return t.Local().Format("2006-01-02 15:04:05")
|
|
}
|
|
|
|
func containsIgnoreCase(s, substr string) bool {
|
|
sLower := make([]byte, len(s))
|
|
subLower := make([]byte, len(substr))
|
|
for i := range s {
|
|
if s[i] >= 'A' && s[i] <= 'Z' {
|
|
sLower[i] = s[i] + 32
|
|
} else {
|
|
sLower[i] = s[i]
|
|
}
|
|
}
|
|
for i := range substr {
|
|
if substr[i] >= 'A' && substr[i] <= 'Z' {
|
|
subLower[i] = substr[i] + 32
|
|
} else {
|
|
subLower[i] = substr[i]
|
|
}
|
|
}
|
|
|
|
for i := 0; i <= len(sLower)-len(subLower); i++ {
|
|
match := true
|
|
for j := range subLower {
|
|
if sLower[i+j] != subLower[j] {
|
|
match = false
|
|
break
|
|
}
|
|
}
|
|
if match {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|