diff --git a/.mcp.json b/.mcp.json index 4705ce8..e22d3d1 100644 --- a/.mcp.json +++ b/.mcp.json @@ -25,6 +25,19 @@ "env": { "NIXPKGS_SEARCH_DATABASE": "sqlite://:memory:" } + }, + "lab-monitoring": { + "command": "nix", + "args": [ + "run", + ".#lab-monitoring", + "--", + "serve" + ], + "env": { + "PROMETHEUS_URL": "http://localhost:9090", + "ALERTMANAGER_URL": "http://localhost:9093" + } } } } diff --git a/CLAUDE.md b/CLAUDE.md index 8b66d40..0971009 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -20,7 +20,12 @@ Search and query NixOS configuration options. Uses nixpkgs as source. ### Home Manager Options (`hm-options`) Search and query Home Manager configuration options. Uses home-manager repository as source. -All servers share the same architecture: +### Lab Monitoring (`lab-monitoring`) +Query Prometheus metrics and Alertmanager alerts. Unlike other servers, this queries live HTTP APIs — no database or indexing needed. +- 8 tools: list/get alerts, search metrics, get metadata, PromQL query, list targets, list/create silences +- Configurable Prometheus and Alertmanager URLs via flags or environment variables + +The nixpkgs/options/hm servers share a database-backed architecture: - Full-text search across option/package names and descriptions - Query specific options/packages with full metadata - Index multiple revisions (by git hash or channel name) @@ -38,8 +43,9 @@ All servers share the same architecture: ## Project Status **Complete and maintained** - All core features implemented: -- Full MCP servers with 6 tools each -- PostgreSQL and SQLite backends with FTS +- Full MCP servers (6 tools each for nixpkgs/options, 8 tools for monitoring) +- PostgreSQL and SQLite backends with FTS (for nixpkgs/options servers) +- Live API queries for Prometheus/Alertmanager (monitoring server) - NixOS modules for deployment - CLI for manual operations - Comprehensive test suite @@ -53,8 +59,10 @@ labmcp/ │ │ └── main.go # Combined options+packages CLI (primary) │ ├── nixos-options/ │ │ └── main.go # NixOS options CLI (legacy) -│ └── hm-options/ -│ └── main.go # Home Manager options CLI +│ ├── hm-options/ +│ │ └── main.go # Home Manager options CLI +│ └── lab-monitoring/ +│ └── main.go # Prometheus/Alertmanager CLI ├── internal/ │ ├── database/ │ │ ├── interface.go # Store interface (options + packages) @@ -82,15 +90,23 @@ labmcp/ │ │ ├── indexer.go # Home Manager indexing │ │ ├── types.go # Channel aliases, extensions │ │ └── *_test.go # Indexer tests -│ └── packages/ -│ ├── indexer.go # Nix packages indexing -│ ├── parser.go # nix-env JSON parsing -│ ├── types.go # Package types, channel aliases -│ └── *_test.go # Parser tests +│ ├── packages/ +│ │ ├── indexer.go # Nix packages indexing +│ │ ├── parser.go # nix-env JSON parsing +│ │ ├── types.go # Package types, channel aliases +│ │ └── *_test.go # Parser tests +│ └── monitoring/ +│ ├── types.go # Prometheus/Alertmanager API types +│ ├── prometheus.go # Prometheus HTTP client +│ ├── alertmanager.go # Alertmanager HTTP client +│ ├── handlers.go # MCP tool definitions + handlers +│ ├── format.go # Markdown formatting utilities +│ └── *_test.go # Tests (httptest-based) ├── nix/ -│ ├── module.nix # NixOS module for nixos-options -│ ├── hm-options-module.nix # NixOS module for hm-options -│ └── package.nix # Parameterized Nix package +│ ├── module.nix # NixOS module for nixos-options +│ ├── hm-options-module.nix # NixOS module for hm-options +│ ├── lab-monitoring-module.nix # NixOS module for lab-monitoring +│ └── package.nix # Parameterized Nix package ├── testdata/ │ └── options-sample.json # Test fixture ├── flake.nix @@ -124,6 +140,19 @@ labmcp/ | `list_revisions` | List all indexed revisions | | `delete_revision` | Delete an indexed revision | +### Monitoring Server (lab-monitoring) + +| Tool | Description | +|------|-------------| +| `list_alerts` | List alerts with optional filters (state, severity, receiver) | +| `get_alert` | Get full details for a specific alert by fingerprint | +| `search_metrics` | Search metric names with substring filter, enriched with metadata | +| `get_metric_metadata` | Get type, help text, and unit for a specific metric | +| `query` | Execute instant PromQL query | +| `list_targets` | List scrape targets with health status | +| `list_silences` | List active/pending silences | +| `create_silence` | Create a silence (confirms with user first) | + ## Key Implementation Details ### Database @@ -212,6 +241,17 @@ hm-options delete # Delete indexed revision hm-options --version # Show version ``` +### lab-monitoring +```bash +lab-monitoring serve # Run MCP server on STDIO +lab-monitoring serve --transport http # Run MCP server on HTTP +lab-monitoring alerts # List alerts +lab-monitoring alerts --state active # Filter by state +lab-monitoring query 'up' # Instant PromQL query +lab-monitoring targets # List scrape targets +lab-monitoring metrics node # Search metric names +``` + ### Channel Aliases **nixpkgs-search/nixos-options**: `nixos-unstable`, `nixos-stable`, `nixos-24.11`, `nixos-24.05`, etc. @@ -220,6 +260,9 @@ hm-options --version # Show version ## Notes for Claude +### Planning +When creating implementation plans, the first step should usually be to **checkout an appropriately named feature branch** (e.g., `git checkout -b feature/lab-monitoring`). This keeps work isolated and makes PRs cleaner. + ### Development Workflow - **Always run `go fmt ./...` before committing Go code** - **Run Go commands using `nix develop -c`** (e.g., `nix develop -c go test ./...`) @@ -257,7 +300,8 @@ Version is defined in multiple places that must stay in sync: - `cmd/nixpkgs-search/main.go` - `cmd/nixos-options/main.go` - `cmd/hm-options/main.go` -- `internal/mcp/server.go` (in `DefaultNixOSConfig`, `DefaultHomeManagerConfig`, `DefaultNixpkgsPackagesConfig`) +- `cmd/lab-monitoring/main.go` +- `internal/mcp/server.go` (in `DefaultNixOSConfig`, `DefaultHomeManagerConfig`, `DefaultNixpkgsPackagesConfig`, `DefaultMonitoringConfig`) - `nix/package.nix` ### User Preferences @@ -284,6 +328,7 @@ nix develop -c go test -bench=. -benchtime=1x -timeout=30m ./internal/homemanage nix build .#nixpkgs-search nix build .#nixos-options nix build .#hm-options +nix build .#lab-monitoring # Run directly nix run .#nixpkgs-search -- options serve @@ -291,6 +336,7 @@ nix run .#nixpkgs-search -- packages serve nix run .#nixpkgs-search -- index nixos-unstable nix run .#hm-options -- serve nix run .#hm-options -- index hm-unstable +nix run .#lab-monitoring -- serve ``` ### Indexing Performance diff --git a/TODO.md b/TODO.md index f55726c..9cd60bf 100644 --- a/TODO.md +++ b/TODO.md @@ -15,6 +15,7 @@ ## New MCP Servers - [x] `nixpkgs-packages` - Index and search nixpkgs packages (implemented in `nixpkgs-search packages`) +- [x] `lab-monitoring` - Query Prometheus and Alertmanager APIs (8 tools, no database required) ## Nice to Have diff --git a/cmd/hm-options/main.go b/cmd/hm-options/main.go index 0a04107..8a1b76f 100644 --- a/cmd/hm-options/main.go +++ b/cmd/hm-options/main.go @@ -20,7 +20,7 @@ import ( const ( defaultDatabase = "sqlite://hm-options.db" - version = "0.2.0" + version = "0.2.1" ) func main() { diff --git a/cmd/lab-monitoring/main.go b/cmd/lab-monitoring/main.go new file mode 100644 index 0000000..410782a --- /dev/null +++ b/cmd/lab-monitoring/main.go @@ -0,0 +1,369 @@ +package main + +import ( + "context" + "fmt" + "log" + "os" + "os/signal" + "syscall" + "time" + + "github.com/urfave/cli/v2" + + "git.t-juice.club/torjus/labmcp/internal/mcp" + "git.t-juice.club/torjus/labmcp/internal/monitoring" +) + +const version = "0.1.0" + +func main() { + app := &cli.App{ + Name: "lab-monitoring", + Usage: "MCP server for Prometheus and Alertmanager monitoring", + Version: version, + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "prometheus-url", + Usage: "Prometheus base URL", + EnvVars: []string{"PROMETHEUS_URL"}, + Value: "http://localhost:9090", + }, + &cli.StringFlag{ + Name: "alertmanager-url", + Usage: "Alertmanager base URL", + EnvVars: []string{"ALERTMANAGER_URL"}, + Value: "http://localhost:9093", + }, + }, + Commands: []*cli.Command{ + serveCommand(), + alertsCommand(), + queryCommand(), + targetsCommand(), + metricsCommand(), + }, + } + + if err := app.Run(os.Args); err != nil { + log.Fatal(err) + } +} + +func serveCommand() *cli.Command { + return &cli.Command{ + Name: "serve", + Usage: "Run MCP server for lab monitoring", + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "transport", + Aliases: []string{"t"}, + Usage: "Transport type: 'stdio' or 'http'", + Value: "stdio", + }, + &cli.StringFlag{ + Name: "http-address", + Usage: "HTTP listen address", + Value: "127.0.0.1:8084", + }, + &cli.StringFlag{ + Name: "http-endpoint", + Usage: "HTTP endpoint path", + Value: "/mcp", + }, + &cli.StringSliceFlag{ + Name: "allowed-origins", + Usage: "Allowed Origin headers for CORS", + }, + &cli.StringFlag{ + Name: "tls-cert", + Usage: "TLS certificate file", + }, + &cli.StringFlag{ + Name: "tls-key", + Usage: "TLS key file", + }, + &cli.DurationFlag{ + Name: "session-ttl", + Usage: "Session TTL for HTTP transport", + Value: 30 * time.Minute, + }, + }, + Action: func(c *cli.Context) error { + return runServe(c) + }, + } +} + +func alertsCommand() *cli.Command { + return &cli.Command{ + Name: "alerts", + Usage: "List alerts from Alertmanager", + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "state", + Usage: "Filter by state: active, suppressed, unprocessed", + }, + &cli.StringFlag{ + Name: "severity", + Usage: "Filter by severity label", + }, + }, + Action: func(c *cli.Context) error { + return runAlerts(c) + }, + } +} + +func queryCommand() *cli.Command { + return &cli.Command{ + Name: "query", + Usage: "Execute an instant PromQL query", + ArgsUsage: "", + Action: func(c *cli.Context) error { + if c.NArg() < 1 { + return fmt.Errorf("promql expression required") + } + return runQuery(c, c.Args().First()) + }, + } +} + +func targetsCommand() *cli.Command { + return &cli.Command{ + Name: "targets", + Usage: "List scrape targets", + Action: func(c *cli.Context) error { + return runTargets(c) + }, + } +} + +func metricsCommand() *cli.Command { + return &cli.Command{ + Name: "metrics", + Usage: "Search metric names", + ArgsUsage: "", + Flags: []cli.Flag{ + &cli.IntFlag{ + Name: "limit", + Aliases: []string{"n"}, + Usage: "Maximum number of results", + Value: 50, + }, + }, + Action: func(c *cli.Context) error { + query := "" + if c.NArg() > 0 { + query = c.Args().First() + } + return runMetrics(c, query) + }, + } +} + +func runServe(c *cli.Context) error { + ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + defer cancel() + + logger := log.New(os.Stderr, "[mcp] ", log.LstdFlags) + config := mcp.DefaultMonitoringConfig() + server := mcp.NewGenericServer(logger, config) + + prom := monitoring.NewPrometheusClient(c.String("prometheus-url")) + am := monitoring.NewAlertmanagerClient(c.String("alertmanager-url")) + monitoring.RegisterHandlers(server, prom, am) + + transport := c.String("transport") + switch transport { + case "stdio": + logger.Println("Starting lab-monitoring MCP server on stdio...") + return server.Run(ctx, os.Stdin, os.Stdout) + + case "http": + httpConfig := mcp.HTTPConfig{ + Address: c.String("http-address"), + Endpoint: c.String("http-endpoint"), + AllowedOrigins: c.StringSlice("allowed-origins"), + SessionTTL: c.Duration("session-ttl"), + TLSCertFile: c.String("tls-cert"), + TLSKeyFile: c.String("tls-key"), + } + httpTransport := mcp.NewHTTPTransport(server, httpConfig) + return httpTransport.Run(ctx) + + default: + return fmt.Errorf("unknown transport: %s (use 'stdio' or 'http')", transport) + } +} + +func runAlerts(c *cli.Context) error { + ctx := context.Background() + am := monitoring.NewAlertmanagerClient(c.String("alertmanager-url")) + + filters := monitoring.AlertFilters{} + if state := c.String("state"); state != "" { + switch state { + case "active": + active := true + filters.Active = &active + silenced := false + filters.Silenced = &silenced + inhibited := false + filters.Inhibited = &inhibited + case "suppressed": + active := false + filters.Active = &active + case "unprocessed": + unprocessed := true + filters.Unprocessed = &unprocessed + } + } + if severity := c.String("severity"); severity != "" { + filters.Filter = append(filters.Filter, fmt.Sprintf(`severity="%s"`, severity)) + } + + alerts, err := am.ListAlerts(ctx, filters) + if err != nil { + return fmt.Errorf("failed to list alerts: %w", err) + } + + if len(alerts) == 0 { + fmt.Println("No alerts found.") + return nil + } + + for _, a := range alerts { + state := a.Status.State + severity := a.Labels["severity"] + name := a.Labels["alertname"] + fmt.Printf("[%s] %s (severity=%s, fingerprint=%s)\n", state, name, severity, a.Fingerprint) + for k, v := range a.Annotations { + fmt.Printf(" %s: %s\n", k, v) + } + } + + return nil +} + +func runQuery(c *cli.Context, promql string) error { + ctx := context.Background() + prom := monitoring.NewPrometheusClient(c.String("prometheus-url")) + + data, err := prom.Query(ctx, promql, time.Time{}) + if err != nil { + return fmt.Errorf("query failed: %w", err) + } + + for _, r := range data.Result { + labels := "" + for k, v := range r.Metric { + if labels != "" { + labels += ", " + } + labels += fmt.Sprintf("%s=%q", k, v) + } + value := "" + if len(r.Value) >= 2 { + if v, ok := r.Value[1].(string); ok { + value = v + } + } + fmt.Printf("{%s} %s\n", labels, value) + } + + return nil +} + +func runTargets(c *cli.Context) error { + ctx := context.Background() + prom := monitoring.NewPrometheusClient(c.String("prometheus-url")) + + data, err := prom.Targets(ctx) + if err != nil { + return fmt.Errorf("failed to fetch targets: %w", err) + } + + if len(data.ActiveTargets) == 0 { + fmt.Println("No active targets.") + return nil + } + + for _, t := range data.ActiveTargets { + job := t.Labels["job"] + instance := t.Labels["instance"] + fmt.Printf("[%s] %s/%s (last scrape: %s, duration: %.3fs)\n", + t.Health, job, instance, t.LastScrape.Format("15:04:05"), t.LastScrapeDuration) + if t.LastError != "" { + fmt.Printf(" error: %s\n", t.LastError) + } + } + + return nil +} + +func runMetrics(c *cli.Context, query string) error { + ctx := context.Background() + prom := monitoring.NewPrometheusClient(c.String("prometheus-url")) + + names, err := prom.LabelValues(ctx, "__name__") + if err != nil { + return fmt.Errorf("failed to fetch metric names: %w", err) + } + + limit := c.Int("limit") + count := 0 + for _, name := range names { + if query != "" { + // Simple case-insensitive substring match + if !containsIgnoreCase(name, query) { + continue + } + } + fmt.Println(name) + count++ + if count >= limit { + fmt.Printf("... (showing %d of matching metrics, use --limit to see more)\n", limit) + break + } + } + + if count == 0 { + fmt.Printf("No metrics found matching '%s'\n", query) + } + + return nil +} + +func containsIgnoreCase(s, substr string) bool { + sLower := make([]byte, len(s)) + subLower := make([]byte, len(substr)) + for i := range s { + if s[i] >= 'A' && s[i] <= 'Z' { + sLower[i] = s[i] + 32 + } else { + sLower[i] = s[i] + } + } + for i := range substr { + if substr[i] >= 'A' && substr[i] <= 'Z' { + subLower[i] = substr[i] + 32 + } else { + subLower[i] = substr[i] + } + } + + for i := 0; i <= len(sLower)-len(subLower); i++ { + match := true + for j := range subLower { + if sLower[i+j] != subLower[j] { + match = false + break + } + } + if match { + return true + } + } + return false +} diff --git a/cmd/nixos-options/main.go b/cmd/nixos-options/main.go index eb4ed94..5ef9efd 100644 --- a/cmd/nixos-options/main.go +++ b/cmd/nixos-options/main.go @@ -19,7 +19,7 @@ import ( const ( defaultDatabase = "sqlite://nixos-options.db" - version = "0.2.0" + version = "0.2.1" ) func main() { diff --git a/cmd/nixpkgs-search/main.go b/cmd/nixpkgs-search/main.go index 8fe9713..47b0e20 100644 --- a/cmd/nixpkgs-search/main.go +++ b/cmd/nixpkgs-search/main.go @@ -20,7 +20,7 @@ import ( const ( defaultDatabase = "sqlite://nixpkgs-search.db" - version = "0.2.0" + version = "0.2.1" ) func main() { diff --git a/flake.nix b/flake.nix index b478954..ec8f061 100644 --- a/flake.nix +++ b/flake.nix @@ -33,6 +33,13 @@ mainProgram = "nixpkgs-search"; description = "Search nixpkgs options and packages"; }; + lab-monitoring = pkgs.callPackage ./nix/package.nix { + src = ./.; + pname = "lab-monitoring"; + subPackage = "cmd/lab-monitoring"; + mainProgram = "lab-monitoring"; + description = "MCP server for Prometheus and Alertmanager monitoring"; + }; default = self.packages.${system}.nixpkgs-search; }); @@ -73,6 +80,10 @@ imports = [ ./nix/hm-options-module.nix ]; services.hm-options-mcp.package = lib.mkDefault self.packages.${pkgs.system}.hm-options; }; + lab-monitoring-mcp = { pkgs, ... }: { + imports = [ ./nix/lab-monitoring-module.nix ]; + services.lab-monitoring.package = lib.mkDefault self.packages.${pkgs.system}.lab-monitoring; + }; default = self.nixosModules.nixpkgs-search-mcp; }; }; diff --git a/internal/mcp/server.go b/internal/mcp/server.go index 2fce554..0a97f75 100644 --- a/internal/mcp/server.go +++ b/internal/mcp/server.go @@ -18,6 +18,8 @@ const ( ModeOptions ServerMode = "options" // ModePackages exposes only package-related tools. ModePackages ServerMode = "packages" + // ModeCustom exposes externally registered tools (no database required). + ModeCustom ServerMode = "custom" ) // ServerConfig contains configuration for the MCP server. @@ -40,7 +42,7 @@ type ServerConfig struct { func DefaultNixOSConfig() ServerConfig { return ServerConfig{ Name: "nixos-options", - Version: "0.2.0", + Version: "0.2.1", DefaultChannel: "nixos-stable", SourceName: "nixpkgs", Mode: ModeOptions, @@ -60,7 +62,7 @@ This ensures option documentation matches the nixpkgs version the project actual func DefaultNixpkgsPackagesConfig() ServerConfig { return ServerConfig{ Name: "nixpkgs-packages", - Version: "0.2.0", + Version: "0.2.1", DefaultChannel: "nixos-stable", SourceName: "nixpkgs", Mode: ModePackages, @@ -74,11 +76,29 @@ This ensures package information matches the nixpkgs version the project actuall } } +// DefaultMonitoringConfig returns the default configuration for the lab monitoring server. +func DefaultMonitoringConfig() ServerConfig { + return ServerConfig{ + Name: "lab-monitoring", + Version: "0.1.0", + Mode: ModeCustom, + Instructions: `Lab Monitoring MCP Server - Query Prometheus metrics and Alertmanager alerts. + +Tools for querying your monitoring stack: +- Search and query Prometheus metrics with PromQL +- List and inspect alerts from Alertmanager +- View scrape target health status +- Manage alert silences + +All queries are executed against live Prometheus and Alertmanager HTTP APIs.`, + } +} + // DefaultHomeManagerConfig returns the default configuration for Home Manager options server. func DefaultHomeManagerConfig() ServerConfig { return ServerConfig{ Name: "hm-options", - Version: "0.2.0", + Version: "0.2.1", DefaultChannel: "hm-stable", SourceName: "home-manager", Mode: ModeOptions, @@ -99,6 +119,7 @@ type Server struct { store database.Store config ServerConfig tools map[string]ToolHandler + toolDefs []Tool initialized bool logger *log.Logger } @@ -106,7 +127,7 @@ type Server struct { // ToolHandler is a function that handles a tool call. type ToolHandler func(ctx context.Context, args map[string]interface{}) (CallToolResult, error) -// NewServer creates a new MCP server with the given configuration. +// NewServer creates a new MCP server with a database store. func NewServer(store database.Store, logger *log.Logger, config ServerConfig) *Server { if logger == nil { logger = log.New(io.Discard, "", 0) @@ -121,6 +142,25 @@ func NewServer(store database.Store, logger *log.Logger, config ServerConfig) *S return s } +// NewGenericServer creates a new MCP server without a database store. +// Use RegisterTool to add tools externally. +func NewGenericServer(logger *log.Logger, config ServerConfig) *Server { + if logger == nil { + logger = log.New(io.Discard, "", 0) + } + return &Server{ + config: config, + tools: make(map[string]ToolHandler), + logger: logger, + } +} + +// RegisterTool registers an externally defined tool with its handler. +func (s *Server) RegisterTool(tool Tool, handler ToolHandler) { + s.toolDefs = append(s.toolDefs, tool) + s.tools[tool.Name] = handler +} + // registerTools registers all available tools. func (s *Server) registerTools() { // Tools will be implemented in handlers.go @@ -237,6 +277,11 @@ func (s *Server) handleToolsList(req *Request) *Response { // getToolDefinitions returns the tool definitions. func (s *Server) getToolDefinitions() []Tool { + // For custom mode, return externally registered tools + if s.config.Mode == ModeCustom { + return s.toolDefs + } + // For packages mode, return package tools if s.config.Mode == ModePackages { return s.getPackageToolDefinitions() diff --git a/internal/monitoring/alertmanager.go b/internal/monitoring/alertmanager.go new file mode 100644 index 0000000..aad5671 --- /dev/null +++ b/internal/monitoring/alertmanager.go @@ -0,0 +1,153 @@ +package monitoring + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "time" +) + +// AlertmanagerClient is an HTTP client for the Alertmanager API v2. +type AlertmanagerClient struct { + baseURL string + httpClient *http.Client +} + +// NewAlertmanagerClient creates a new Alertmanager API client. +func NewAlertmanagerClient(baseURL string) *AlertmanagerClient { + return &AlertmanagerClient{ + baseURL: strings.TrimRight(baseURL, "/"), + httpClient: &http.Client{ + Timeout: 30 * time.Second, + }, + } +} + +// ListAlerts returns alerts matching the given filters. +func (c *AlertmanagerClient) ListAlerts(ctx context.Context, filters AlertFilters) ([]Alert, error) { + params := url.Values{} + + if filters.Active != nil { + params.Set("active", fmt.Sprintf("%t", *filters.Active)) + } + if filters.Silenced != nil { + params.Set("silenced", fmt.Sprintf("%t", *filters.Silenced)) + } + if filters.Inhibited != nil { + params.Set("inhibited", fmt.Sprintf("%t", *filters.Inhibited)) + } + if filters.Unprocessed != nil { + params.Set("unprocessed", fmt.Sprintf("%t", *filters.Unprocessed)) + } + if filters.Receiver != "" { + params.Set("receiver", filters.Receiver) + } + for _, f := range filters.Filter { + params.Add("filter", f) + } + + u := c.baseURL + "/api/v2/alerts" + if len(params) > 0 { + u += "?" + params.Encode() + } + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("request failed: %w", err) + } + defer resp.Body.Close() //nolint:errcheck // cleanup on exit + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read response: %w", err) + } + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("unexpected status %d: %s", resp.StatusCode, string(body)) + } + + var alerts []Alert + if err := json.Unmarshal(body, &alerts); err != nil { + return nil, fmt.Errorf("failed to parse alerts: %w", err) + } + + return alerts, nil +} + +// ListSilences returns all silences. +func (c *AlertmanagerClient) ListSilences(ctx context.Context) ([]Silence, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+"/api/v2/silences", nil) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("request failed: %w", err) + } + defer resp.Body.Close() //nolint:errcheck // cleanup on exit + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read response: %w", err) + } + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("unexpected status %d: %s", resp.StatusCode, string(body)) + } + + var silences []Silence + if err := json.Unmarshal(body, &silences); err != nil { + return nil, fmt.Errorf("failed to parse silences: %w", err) + } + + return silences, nil +} + +// CreateSilence creates a new silence and returns the silence ID. +func (c *AlertmanagerClient) CreateSilence(ctx context.Context, silence Silence) (string, error) { + data, err := json.Marshal(silence) + if err != nil { + return "", fmt.Errorf("failed to marshal silence: %w", err) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+"/api/v2/silences", bytes.NewReader(data)) + if err != nil { + return "", fmt.Errorf("failed to create request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + + resp, err := c.httpClient.Do(req) + if err != nil { + return "", fmt.Errorf("request failed: %w", err) + } + defer resp.Body.Close() //nolint:errcheck // cleanup on exit + + body, err := io.ReadAll(resp.Body) + if err != nil { + return "", fmt.Errorf("failed to read response: %w", err) + } + + if resp.StatusCode != http.StatusOK { + return "", fmt.Errorf("unexpected status %d: %s", resp.StatusCode, string(body)) + } + + var result struct { + SilenceID string `json:"silenceID"` + } + if err := json.Unmarshal(body, &result); err != nil { + return "", fmt.Errorf("failed to parse response: %w", err) + } + + return result.SilenceID, nil +} diff --git a/internal/monitoring/alertmanager_test.go b/internal/monitoring/alertmanager_test.go new file mode 100644 index 0000000..d58c2cb --- /dev/null +++ b/internal/monitoring/alertmanager_test.go @@ -0,0 +1,175 @@ +package monitoring + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" +) + +func TestAlertmanagerClient_ListAlerts(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/v2/alerts" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`[ + { + "annotations": {"summary": "Target is down"}, + "endsAt": "2024-01-01T01:00:00Z", + "fingerprint": "abc123", + "receivers": [{"name": "default"}], + "startsAt": "2024-01-01T00:00:00Z", + "status": {"inhibitedBy": [], "silencedBy": [], "state": "active"}, + "updatedAt": "2024-01-01T00:00:00Z", + "generatorURL": "http://prometheus:9090/graph", + "labels": {"alertname": "TargetDown", "severity": "critical", "instance": "node1:9100"} + } + ]`)) + })) + defer srv.Close() + + client := NewAlertmanagerClient(srv.URL) + alerts, err := client.ListAlerts(context.Background(), AlertFilters{}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(alerts) != 1 { + t.Fatalf("expected 1 alert, got %d", len(alerts)) + } + if alerts[0].Fingerprint != "abc123" { + t.Errorf("expected fingerprint=abc123, got %s", alerts[0].Fingerprint) + } + if alerts[0].Labels["alertname"] != "TargetDown" { + t.Errorf("expected alertname=TargetDown, got %s", alerts[0].Labels["alertname"]) + } + if alerts[0].Status.State != "active" { + t.Errorf("expected state=active, got %s", alerts[0].Status.State) + } +} + +func TestAlertmanagerClient_ListAlertsWithFilters(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + q := r.URL.Query() + if q.Get("active") != "true" { + t.Errorf("expected active=true, got %s", q.Get("active")) + } + if q.Get("silenced") != "false" { + t.Errorf("expected silenced=false, got %s", q.Get("silenced")) + } + + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`[]`)) + })) + defer srv.Close() + + client := NewAlertmanagerClient(srv.URL) + active := true + silenced := false + _, err := client.ListAlerts(context.Background(), AlertFilters{ + Active: &active, + Silenced: &silenced, + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestAlertmanagerClient_ListSilences(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/v2/silences" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`[ + { + "id": "silence-1", + "matchers": [{"name": "alertname", "value": "TargetDown", "isRegex": false}], + "startsAt": "2024-01-01T00:00:00Z", + "endsAt": "2024-01-01T02:00:00Z", + "createdBy": "admin", + "comment": "Maintenance window", + "status": {"state": "active"} + } + ]`)) + })) + defer srv.Close() + + client := NewAlertmanagerClient(srv.URL) + silences, err := client.ListSilences(context.Background()) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(silences) != 1 { + t.Fatalf("expected 1 silence, got %d", len(silences)) + } + if silences[0].ID != "silence-1" { + t.Errorf("expected id=silence-1, got %s", silences[0].ID) + } + if silences[0].CreatedBy != "admin" { + t.Errorf("expected createdBy=admin, got %s", silences[0].CreatedBy) + } +} + +func TestAlertmanagerClient_CreateSilence(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + t.Errorf("expected POST, got %s", r.Method) + } + if r.URL.Path != "/api/v2/silences" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + if r.Header.Get("Content-Type") != "application/json" { + t.Errorf("expected Content-Type=application/json, got %s", r.Header.Get("Content-Type")) + } + + var silence Silence + if err := json.NewDecoder(r.Body).Decode(&silence); err != nil { + t.Fatalf("failed to decode request body: %v", err) + } + if silence.CreatedBy != "admin" { + t.Errorf("expected createdBy=admin, got %s", silence.CreatedBy) + } + + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"silenceID": "new-silence-id"}`)) + })) + defer srv.Close() + + client := NewAlertmanagerClient(srv.URL) + id, err := client.CreateSilence(context.Background(), Silence{ + Matchers: []Matcher{ + {Name: "alertname", Value: "TargetDown", IsRegex: false}, + }, + StartsAt: time.Now(), + EndsAt: time.Now().Add(2 * time.Hour), + CreatedBy: "admin", + Comment: "Test silence", + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if id != "new-silence-id" { + t.Errorf("expected id=new-silence-id, got %s", id) + } +} + +func TestAlertmanagerClient_HTTPError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + _, _ = w.Write([]byte("internal error")) + })) + defer srv.Close() + + client := NewAlertmanagerClient(srv.URL) + _, err := client.ListAlerts(context.Background(), AlertFilters{}) + if err == nil { + t.Fatal("expected error, got nil") + } +} diff --git a/internal/monitoring/format.go b/internal/monitoring/format.go new file mode 100644 index 0000000..a5d4b66 --- /dev/null +++ b/internal/monitoring/format.go @@ -0,0 +1,317 @@ +package monitoring + +import ( + "fmt" + "sort" + "strings" + "time" +) + +const maxRows = 100 + +// formatInstantVector formats instant vector results as a markdown table. +func formatInstantVector(results []PromInstantVector) string { + if len(results) == 0 { + return "No results." + } + + // Collect all label keys across results (excluding __name__) + labelKeys := collectLabelKeys(results) + + var sb strings.Builder + + // Header + sb.WriteString("| ") + if _, ok := results[0].Metric["__name__"]; ok { + sb.WriteString("Metric | ") + } + for _, key := range labelKeys { + sb.WriteString(key) + sb.WriteString(" | ") + } + sb.WriteString("Value |\n") + + // Separator + sb.WriteString("| ") + if _, ok := results[0].Metric["__name__"]; ok { + sb.WriteString("--- | ") + } + for range labelKeys { + sb.WriteString("--- | ") + } + sb.WriteString("--- |\n") + + // Rows + truncated := false + for i, r := range results { + if i >= maxRows { + truncated = true + break + } + sb.WriteString("| ") + if _, ok := results[0].Metric["__name__"]; ok { + sb.WriteString(r.Metric["__name__"]) + sb.WriteString(" | ") + } + for _, key := range labelKeys { + sb.WriteString(r.Metric[key]) + sb.WriteString(" | ") + } + // Value is at index 1 of the value tuple + if len(r.Value) >= 2 { + if v, ok := r.Value[1].(string); ok { + sb.WriteString(v) + } + } + sb.WriteString(" |\n") + } + + if truncated { + sb.WriteString(fmt.Sprintf("\n*Showing %d of %d results (truncated)*\n", maxRows, len(results))) + } + + return sb.String() +} + +// collectLabelKeys returns sorted label keys across all results, excluding __name__. +func collectLabelKeys(results []PromInstantVector) []string { + keySet := make(map[string]struct{}) + for _, r := range results { + for k := range r.Metric { + if k != "__name__" { + keySet[k] = struct{}{} + } + } + } + keys := make([]string, 0, len(keySet)) + for k := range keySet { + keys = append(keys, k) + } + sort.Strings(keys) + return keys +} + +// formatAlerts formats alerts as grouped markdown. +func formatAlerts(alerts []Alert) string { + if len(alerts) == 0 { + return "No alerts found." + } + + // Group by alertname + groups := make(map[string][]Alert) + var order []string + for _, a := range alerts { + name := a.Labels["alertname"] + if _, exists := groups[name]; !exists { + order = append(order, name) + } + groups[name] = append(groups[name], a) + } + + var sb strings.Builder + sb.WriteString(fmt.Sprintf("**%d alert(s)**\n\n", len(alerts))) + + for _, name := range order { + group := groups[name] + sb.WriteString(fmt.Sprintf("## %s (%d)\n\n", name, len(group))) + + for i, a := range group { + if i >= maxRows { + sb.WriteString(fmt.Sprintf("*... and %d more*\n", len(group)-maxRows)) + break + } + + sb.WriteString(fmt.Sprintf("**State:** %s | **Severity:** %s\n", a.Status.State, a.Labels["severity"])) + + // Labels (excluding alertname and severity) + var labels []string + for k, v := range a.Labels { + if k != "alertname" && k != "severity" { + labels = append(labels, fmt.Sprintf("%s=%s", k, v)) + } + } + sort.Strings(labels) + if len(labels) > 0 { + sb.WriteString(fmt.Sprintf("**Labels:** %s\n", strings.Join(labels, ", "))) + } + + // Annotations + for k, v := range a.Annotations { + sb.WriteString(fmt.Sprintf("**%s:** %s\n", k, v)) + } + + sb.WriteString(fmt.Sprintf("**Fingerprint:** %s\n", a.Fingerprint)) + sb.WriteString(fmt.Sprintf("**Started:** %s\n", a.StartsAt.Format(time.RFC3339))) + + if len(a.Status.SilencedBy) > 0 { + sb.WriteString(fmt.Sprintf("**Silenced by:** %s\n", strings.Join(a.Status.SilencedBy, ", "))) + } + if len(a.Status.InhibitedBy) > 0 { + sb.WriteString(fmt.Sprintf("**Inhibited by:** %s\n", strings.Join(a.Status.InhibitedBy, ", "))) + } + + sb.WriteString("\n") + } + } + + return sb.String() +} + +// formatTargets formats targets as grouped markdown. +func formatTargets(targets *PromTargetsData) string { + if targets == nil || len(targets.ActiveTargets) == 0 { + return "No active targets." + } + + // Group by job + groups := make(map[string][]PromTarget) + var order []string + for _, t := range targets.ActiveTargets { + job := t.Labels["job"] + if _, exists := groups[job]; !exists { + order = append(order, job) + } + groups[job] = append(groups[job], t) + } + sort.Strings(order) + + var sb strings.Builder + sb.WriteString(fmt.Sprintf("**%d active target(s)**\n\n", len(targets.ActiveTargets))) + + // Count health statuses + healthCounts := make(map[string]int) + for _, t := range targets.ActiveTargets { + healthCounts[t.Health]++ + } + var healthParts []string + for h, c := range healthCounts { + healthParts = append(healthParts, fmt.Sprintf("%s: %d", h, c)) + } + sort.Strings(healthParts) + sb.WriteString(fmt.Sprintf("**Health summary:** %s\n\n", strings.Join(healthParts, ", "))) + + for _, job := range order { + group := groups[job] + sb.WriteString(fmt.Sprintf("## %s (%d targets)\n\n", job, len(group))) + + sb.WriteString("| Instance | Health | Last Scrape | Duration | Error |\n") + sb.WriteString("| --- | --- | --- | --- | --- |\n") + + for _, t := range group { + instance := t.Labels["instance"] + lastScrape := t.LastScrape.Format("15:04:05") + duration := fmt.Sprintf("%.3fs", t.LastScrapeDuration) + lastErr := t.LastError + if lastErr == "" { + lastErr = "-" + } + sb.WriteString(fmt.Sprintf("| %s | %s | %s | %s | %s |\n", + instance, t.Health, lastScrape, duration, lastErr)) + } + sb.WriteString("\n") + } + + return sb.String() +} + +// formatSilences formats silences as markdown. +func formatSilences(silences []Silence) string { + if len(silences) == 0 { + return "No silences found." + } + + var sb strings.Builder + sb.WriteString(fmt.Sprintf("**%d silence(s)**\n\n", len(silences))) + + for _, s := range silences { + state := "unknown" + if s.Status != nil { + state = s.Status.State + } + sb.WriteString(fmt.Sprintf("## Silence %s [%s]\n\n", s.ID, state)) + + // Matchers + var matchers []string + for _, m := range s.Matchers { + op := "=" + if m.IsRegex { + op = "=~" + } + if m.IsEqual != nil && !*m.IsEqual { + if m.IsRegex { + op = "!~" + } else { + op = "!=" + } + } + matchers = append(matchers, fmt.Sprintf("%s%s%s", m.Name, op, m.Value)) + } + sb.WriteString(fmt.Sprintf("**Matchers:** %s\n", strings.Join(matchers, ", "))) + sb.WriteString(fmt.Sprintf("**Created by:** %s\n", s.CreatedBy)) + sb.WriteString(fmt.Sprintf("**Comment:** %s\n", s.Comment)) + sb.WriteString(fmt.Sprintf("**Starts:** %s\n", s.StartsAt.Format(time.RFC3339))) + sb.WriteString(fmt.Sprintf("**Ends:** %s\n", s.EndsAt.Format(time.RFC3339))) + sb.WriteString("\n") + } + + return sb.String() +} + +// formatMetricSearch formats metric search results. +func formatMetricSearch(names []string, metadata map[string][]PromMetadata) string { + if len(names) == 0 { + return "No metrics found matching the search." + } + + var sb strings.Builder + sb.WriteString(fmt.Sprintf("**%d metric(s) found**\n\n", len(names))) + + sb.WriteString("| Metric | Type | Help |\n") + sb.WriteString("| --- | --- | --- |\n") + + truncated := false + for i, name := range names { + if i >= maxRows { + truncated = true + break + } + metaType := "" + help := "" + if metas, ok := metadata[name]; ok && len(metas) > 0 { + metaType = metas[0].Type + help = metas[0].Help + if len(help) > 100 { + help = help[:100] + "..." + } + } + sb.WriteString(fmt.Sprintf("| %s | %s | %s |\n", name, metaType, help)) + } + + if truncated { + sb.WriteString(fmt.Sprintf("\n*Showing %d of %d metrics (truncated)*\n", maxRows, len(names))) + } + + return sb.String() +} + +// formatMetricMetadata formats metadata for a single metric. +func formatMetricMetadata(name string, metas []PromMetadata) string { + if len(metas) == 0 { + return fmt.Sprintf("No metadata found for metric '%s'.", name) + } + + var sb strings.Builder + sb.WriteString(fmt.Sprintf("# %s\n\n", name)) + + for _, m := range metas { + sb.WriteString(fmt.Sprintf("**Type:** %s\n", m.Type)) + if m.Help != "" { + sb.WriteString(fmt.Sprintf("**Help:** %s\n", m.Help)) + } + if m.Unit != "" { + sb.WriteString(fmt.Sprintf("**Unit:** %s\n", m.Unit)) + } + } + + return sb.String() +} diff --git a/internal/monitoring/handlers.go b/internal/monitoring/handlers.go new file mode 100644 index 0000000..79e6237 --- /dev/null +++ b/internal/monitoring/handlers.go @@ -0,0 +1,434 @@ +package monitoring + +import ( + "context" + "encoding/json" + "fmt" + "strings" + "time" + + "git.t-juice.club/torjus/labmcp/internal/mcp" +) + +// RegisterHandlers registers all monitoring tool handlers on the MCP server. +func RegisterHandlers(server *mcp.Server, prom *PrometheusClient, am *AlertmanagerClient) { + server.RegisterTool(listAlertsTool(), makeListAlertsHandler(am)) + server.RegisterTool(getAlertTool(), makeGetAlertHandler(am)) + server.RegisterTool(searchMetricsTool(), makeSearchMetricsHandler(prom)) + server.RegisterTool(getMetricMetadataTool(), makeGetMetricMetadataHandler(prom)) + server.RegisterTool(queryTool(), makeQueryHandler(prom)) + server.RegisterTool(listTargetsTool(), makeListTargetsHandler(prom)) + server.RegisterTool(listSilencesTool(), makeListSilencesHandler(am)) + server.RegisterTool(createSilenceTool(), makeCreateSilenceHandler(am)) +} + +// Tool definitions + +func listAlertsTool() mcp.Tool { + return mcp.Tool{ + Name: "list_alerts", + Description: "List alerts from Alertmanager with optional filters", + InputSchema: mcp.InputSchema{ + Type: "object", + Properties: map[string]mcp.Property{ + "state": { + Type: "string", + Description: "Filter by alert state: 'active', 'suppressed', or 'unprocessed'", + Enum: []string{"active", "suppressed", "unprocessed"}, + }, + "severity": { + Type: "string", + Description: "Filter by severity label (e.g., 'critical', 'warning')", + }, + "receiver": { + Type: "string", + Description: "Filter by receiver name", + }, + }, + }, + } +} + +func getAlertTool() mcp.Tool { + return mcp.Tool{ + Name: "get_alert", + Description: "Get full details for a specific alert by fingerprint", + InputSchema: mcp.InputSchema{ + Type: "object", + Properties: map[string]mcp.Property{ + "fingerprint": { + Type: "string", + Description: "Alert fingerprint identifier", + }, + }, + Required: []string{"fingerprint"}, + }, + } +} + +func searchMetricsTool() mcp.Tool { + return mcp.Tool{ + Name: "search_metrics", + Description: "Search Prometheus metric names with optional substring filter, enriched with metadata (type, help text)", + InputSchema: mcp.InputSchema{ + Type: "object", + Properties: map[string]mcp.Property{ + "query": { + Type: "string", + Description: "Substring to filter metric names (e.g., 'cpu', 'memory', 'node_'). Empty returns all metrics.", + }, + "limit": { + Type: "integer", + Description: "Maximum number of results (default: 50)", + Default: 50, + }, + }, + }, + } +} + +func getMetricMetadataTool() mcp.Tool { + return mcp.Tool{ + Name: "get_metric_metadata", + Description: "Get type, help text, and unit for a specific Prometheus metric", + InputSchema: mcp.InputSchema{ + Type: "object", + Properties: map[string]mcp.Property{ + "metric": { + Type: "string", + Description: "Metric name (e.g., 'node_cpu_seconds_total')", + }, + }, + Required: []string{"metric"}, + }, + } +} + +func queryTool() mcp.Tool { + return mcp.Tool{ + Name: "query", + Description: "Execute an instant PromQL query against Prometheus. Supports aggregations like avg_over_time(metric[1h]), rate(), sum(), etc.", + InputSchema: mcp.InputSchema{ + Type: "object", + Properties: map[string]mcp.Property{ + "promql": { + Type: "string", + Description: "PromQL expression to evaluate (e.g., 'up', 'rate(http_requests_total[5m])', 'avg_over_time(node_load1[1h])')", + }, + }, + Required: []string{"promql"}, + }, + } +} + +func listTargetsTool() mcp.Tool { + return mcp.Tool{ + Name: "list_targets", + Description: "List Prometheus scrape targets with health status, grouped by job", + InputSchema: mcp.InputSchema{ + Type: "object", + Properties: map[string]mcp.Property{}, + }, + } +} + +func listSilencesTool() mcp.Tool { + return mcp.Tool{ + Name: "list_silences", + Description: "List active and pending alert silences from Alertmanager", + InputSchema: mcp.InputSchema{ + Type: "object", + Properties: map[string]mcp.Property{}, + }, + } +} + +func createSilenceTool() mcp.Tool { + return mcp.Tool{ + Name: "create_silence", + Description: `Create a new silence in Alertmanager. IMPORTANT: Always confirm with the user before creating a silence, showing them the matchers, duration, and reason.`, + InputSchema: mcp.InputSchema{ + Type: "object", + Properties: map[string]mcp.Property{ + "matchers": { + Type: "string", + Description: `JSON array of matchers, e.g. [{"name":"alertname","value":"TargetDown","isRegex":false}]`, + }, + "duration": { + Type: "string", + Description: "Silence duration in Go duration format (e.g., '2h', '30m', '1h30m')", + }, + "author": { + Type: "string", + Description: "Author of the silence", + }, + "comment": { + Type: "string", + Description: "Reason for the silence", + }, + }, + Required: []string{"matchers", "duration", "author", "comment"}, + }, + } +} + +// Handler constructors + +func makeListAlertsHandler(am *AlertmanagerClient) mcp.ToolHandler { + return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) { + filters := AlertFilters{} + + if state, ok := args["state"].(string); ok && state != "" { + switch state { + case "active": + active := true + filters.Active = &active + silenced := false + filters.Silenced = &silenced + inhibited := false + filters.Inhibited = &inhibited + case "suppressed": + active := false + filters.Active = &active + case "unprocessed": + unprocessed := true + filters.Unprocessed = &unprocessed + } + } + + if severity, ok := args["severity"].(string); ok && severity != "" { + filters.Filter = append(filters.Filter, fmt.Sprintf(`severity="%s"`, severity)) + } + + if receiver, ok := args["receiver"].(string); ok && receiver != "" { + filters.Receiver = receiver + } + + alerts, err := am.ListAlerts(ctx, filters) + if err != nil { + return mcp.ErrorContent(fmt.Errorf("failed to list alerts: %w", err)), nil + } + + return mcp.CallToolResult{ + Content: []mcp.Content{mcp.TextContent(formatAlerts(alerts))}, + }, nil + } +} + +func makeGetAlertHandler(am *AlertmanagerClient) mcp.ToolHandler { + return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) { + fingerprint, _ := args["fingerprint"].(string) + if fingerprint == "" { + return mcp.ErrorContent(fmt.Errorf("fingerprint is required")), nil + } + + // Fetch all alerts and find the one matching the fingerprint + alerts, err := am.ListAlerts(ctx, AlertFilters{}) + if err != nil { + return mcp.ErrorContent(fmt.Errorf("failed to fetch alerts: %w", err)), nil + } + + for _, a := range alerts { + if a.Fingerprint == fingerprint { + return mcp.CallToolResult{ + Content: []mcp.Content{mcp.TextContent(formatAlerts([]Alert{a}))}, + }, nil + } + } + + return mcp.ErrorContent(fmt.Errorf("alert with fingerprint '%s' not found", fingerprint)), nil + } +} + +func makeSearchMetricsHandler(prom *PrometheusClient) mcp.ToolHandler { + return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) { + query, _ := args["query"].(string) + limit := 50 + if l, ok := args["limit"].(float64); ok && l > 0 { + limit = int(l) + } + + // Get all metric names + allNames, err := prom.LabelValues(ctx, "__name__") + if err != nil { + return mcp.ErrorContent(fmt.Errorf("failed to fetch metric names: %w", err)), nil + } + + // Filter by substring + var matched []string + queryLower := strings.ToLower(query) + for _, name := range allNames { + if query == "" || strings.Contains(strings.ToLower(name), queryLower) { + matched = append(matched, name) + if len(matched) >= limit { + break + } + } + } + + // Fetch metadata for matched metrics + metadata, err := prom.Metadata(ctx, "") + if err != nil { + // Non-fatal: proceed without metadata + metadata = nil + } + + return mcp.CallToolResult{ + Content: []mcp.Content{mcp.TextContent(formatMetricSearch(matched, metadata))}, + }, nil + } +} + +func makeGetMetricMetadataHandler(prom *PrometheusClient) mcp.ToolHandler { + return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) { + metric, _ := args["metric"].(string) + if metric == "" { + return mcp.ErrorContent(fmt.Errorf("metric is required")), nil + } + + metadata, err := prom.Metadata(ctx, metric) + if err != nil { + return mcp.ErrorContent(fmt.Errorf("failed to fetch metadata: %w", err)), nil + } + + metas := metadata[metric] + return mcp.CallToolResult{ + Content: []mcp.Content{mcp.TextContent(formatMetricMetadata(metric, metas))}, + }, nil + } +} + +func makeQueryHandler(prom *PrometheusClient) mcp.ToolHandler { + return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) { + promql, _ := args["promql"].(string) + if promql == "" { + return mcp.ErrorContent(fmt.Errorf("promql is required")), nil + } + + data, err := prom.Query(ctx, promql, time.Time{}) + if err != nil { + return mcp.ErrorContent(fmt.Errorf("query failed: %w", err)), nil + } + + var result string + switch data.ResultType { + case "vector": + result = formatInstantVector(data.Result) + case "scalar": + if len(data.Result) > 0 && len(data.Result[0].Value) >= 2 { + if v, ok := data.Result[0].Value[1].(string); ok { + result = fmt.Sprintf("**Scalar result:** %s", v) + } + } + if result == "" { + result = "Scalar query returned no value." + } + default: + result = fmt.Sprintf("Result type: %s\n\n%s", data.ResultType, formatInstantVector(data.Result)) + } + + return mcp.CallToolResult{ + Content: []mcp.Content{mcp.TextContent(result)}, + }, nil + } +} + +func makeListTargetsHandler(prom *PrometheusClient) mcp.ToolHandler { + return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) { + data, err := prom.Targets(ctx) + if err != nil { + return mcp.ErrorContent(fmt.Errorf("failed to fetch targets: %w", err)), nil + } + + return mcp.CallToolResult{ + Content: []mcp.Content{mcp.TextContent(formatTargets(data))}, + }, nil + } +} + +func makeListSilencesHandler(am *AlertmanagerClient) mcp.ToolHandler { + return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) { + silences, err := am.ListSilences(ctx) + if err != nil { + return mcp.ErrorContent(fmt.Errorf("failed to fetch silences: %w", err)), nil + } + + // Filter to active/pending only + var filtered []Silence + for _, s := range silences { + if s.Status != nil && (s.Status.State == "active" || s.Status.State == "pending") { + filtered = append(filtered, s) + } + } + + return mcp.CallToolResult{ + Content: []mcp.Content{mcp.TextContent(formatSilences(filtered))}, + }, nil + } +} + +func makeCreateSilenceHandler(am *AlertmanagerClient) mcp.ToolHandler { + return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) { + matchersJSON, _ := args["matchers"].(string) + if matchersJSON == "" { + return mcp.ErrorContent(fmt.Errorf("matchers is required")), nil + } + + durationStr, _ := args["duration"].(string) + if durationStr == "" { + return mcp.ErrorContent(fmt.Errorf("duration is required")), nil + } + + author, _ := args["author"].(string) + if author == "" { + return mcp.ErrorContent(fmt.Errorf("author is required")), nil + } + + comment, _ := args["comment"].(string) + if comment == "" { + return mcp.ErrorContent(fmt.Errorf("comment is required")), nil + } + + // Parse matchers + var matchers []Matcher + if err := parseJSON(matchersJSON, &matchers); err != nil { + return mcp.ErrorContent(fmt.Errorf("invalid matchers JSON: %w", err)), nil + } + + // Parse duration + duration, err := time.ParseDuration(durationStr) + if err != nil { + return mcp.ErrorContent(fmt.Errorf("invalid duration: %w", err)), nil + } + + now := time.Now() + silence := Silence{ + Matchers: matchers, + StartsAt: now, + EndsAt: now.Add(duration), + CreatedBy: author, + Comment: comment, + } + + id, err := am.CreateSilence(ctx, silence) + if err != nil { + return mcp.ErrorContent(fmt.Errorf("failed to create silence: %w", err)), nil + } + + var sb strings.Builder + sb.WriteString("Silence created successfully.\n\n") + sb.WriteString(fmt.Sprintf("**ID:** %s\n", id)) + sb.WriteString(fmt.Sprintf("**Expires:** %s\n", silence.EndsAt.Format(time.RFC3339))) + sb.WriteString(fmt.Sprintf("**Author:** %s\n", author)) + sb.WriteString(fmt.Sprintf("**Comment:** %s\n", comment)) + + return mcp.CallToolResult{ + Content: []mcp.Content{mcp.TextContent(sb.String())}, + }, nil + } +} + +// parseJSON is a helper to unmarshal JSON from a string. +func parseJSON(s string, v interface{}) error { + return json.Unmarshal([]byte(s), v) +} diff --git a/internal/monitoring/handlers_test.go b/internal/monitoring/handlers_test.go new file mode 100644 index 0000000..69435e8 --- /dev/null +++ b/internal/monitoring/handlers_test.go @@ -0,0 +1,378 @@ +package monitoring + +import ( + "context" + "encoding/json" + "io" + "log" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "git.t-juice.club/torjus/labmcp/internal/mcp" +) + +// setupTestServer creates a test MCP server with monitoring handlers backed by test HTTP servers. +func setupTestServer(t *testing.T, promHandler, amHandler http.HandlerFunc) (*mcp.Server, func()) { + t.Helper() + + promSrv := httptest.NewServer(promHandler) + amSrv := httptest.NewServer(amHandler) + + logger := log.New(io.Discard, "", 0) + config := mcp.DefaultMonitoringConfig() + server := mcp.NewGenericServer(logger, config) + + prom := NewPrometheusClient(promSrv.URL) + am := NewAlertmanagerClient(amSrv.URL) + RegisterHandlers(server, prom, am) + + cleanup := func() { + promSrv.Close() + amSrv.Close() + } + + return server, cleanup +} + +func TestHandler_ListAlerts(t *testing.T) { + server, cleanup := setupTestServer(t, + nil, + func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`[ + { + "annotations": {"summary": "Node is down"}, + "endsAt": "2024-01-01T01:00:00Z", + "fingerprint": "fp1", + "receivers": [{"name": "default"}], + "startsAt": "2024-01-01T00:00:00Z", + "status": {"inhibitedBy": [], "silencedBy": [], "state": "active"}, + "updatedAt": "2024-01-01T00:00:00Z", + "generatorURL": "", + "labels": {"alertname": "NodeDown", "severity": "critical"} + } + ]`)) + }, + ) + defer cleanup() + + result := callTool(t, server, "list_alerts", map[string]interface{}{}) + if result.IsError { + t.Fatalf("unexpected error: %s", result.Content[0].Text) + } + if !strings.Contains(result.Content[0].Text, "NodeDown") { + t.Errorf("expected output to contain 'NodeDown', got: %s", result.Content[0].Text) + } + if !strings.Contains(result.Content[0].Text, "1 alert") { + t.Errorf("expected output to contain '1 alert', got: %s", result.Content[0].Text) + } +} + +func TestHandler_GetAlert(t *testing.T) { + server, cleanup := setupTestServer(t, + nil, + func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`[ + { + "annotations": {"summary": "Found it"}, + "endsAt": "2024-01-01T01:00:00Z", + "fingerprint": "target-fp", + "receivers": [{"name": "default"}], + "startsAt": "2024-01-01T00:00:00Z", + "status": {"inhibitedBy": [], "silencedBy": [], "state": "active"}, + "updatedAt": "2024-01-01T00:00:00Z", + "generatorURL": "", + "labels": {"alertname": "TestAlert", "severity": "warning"} + }, + { + "annotations": {}, + "endsAt": "2024-01-01T01:00:00Z", + "fingerprint": "other-fp", + "receivers": [{"name": "default"}], + "startsAt": "2024-01-01T00:00:00Z", + "status": {"inhibitedBy": [], "silencedBy": [], "state": "active"}, + "updatedAt": "2024-01-01T00:00:00Z", + "generatorURL": "", + "labels": {"alertname": "OtherAlert", "severity": "info"} + } + ]`)) + }, + ) + defer cleanup() + + result := callTool(t, server, "get_alert", map[string]interface{}{ + "fingerprint": "target-fp", + }) + if result.IsError { + t.Fatalf("unexpected error: %s", result.Content[0].Text) + } + if !strings.Contains(result.Content[0].Text, "TestAlert") { + t.Errorf("expected output to contain 'TestAlert', got: %s", result.Content[0].Text) + } +} + +func TestHandler_GetAlertNotFound(t *testing.T) { + server, cleanup := setupTestServer(t, + nil, + func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`[]`)) + }, + ) + defer cleanup() + + result := callTool(t, server, "get_alert", map[string]interface{}{ + "fingerprint": "nonexistent", + }) + if !result.IsError { + t.Error("expected error result for nonexistent fingerprint") + } +} + +func TestHandler_Query(t *testing.T) { + server, cleanup := setupTestServer(t, + func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/v1/query" { + http.NotFound(w, r) + return + } + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{ + "status": "success", + "data": { + "resultType": "vector", + "result": [ + { + "metric": {"__name__": "up", "job": "node"}, + "value": [1234567890, "1"] + } + ] + } + }`)) + }, + nil, + ) + defer cleanup() + + result := callTool(t, server, "query", map[string]interface{}{ + "promql": "up", + }) + if result.IsError { + t.Fatalf("unexpected error: %s", result.Content[0].Text) + } + if !strings.Contains(result.Content[0].Text, "node") { + t.Errorf("expected output to contain 'node', got: %s", result.Content[0].Text) + } +} + +func TestHandler_ListTargets(t *testing.T) { + server, cleanup := setupTestServer(t, + func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/v1/targets" { + http.NotFound(w, r) + return + } + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{ + "status": "success", + "data": { + "activeTargets": [ + { + "labels": {"instance": "localhost:9090", "job": "prometheus"}, + "scrapePool": "prometheus", + "scrapeUrl": "http://localhost:9090/metrics", + "globalUrl": "http://localhost:9090/metrics", + "lastError": "", + "lastScrape": "2024-01-01T00:00:00Z", + "lastScrapeDuration": 0.015, + "health": "up", + "scrapeInterval": "15s", + "scrapeTimeout": "10s" + } + ], + "droppedTargets": [] + } + }`)) + }, + nil, + ) + defer cleanup() + + result := callTool(t, server, "list_targets", map[string]interface{}{}) + if result.IsError { + t.Fatalf("unexpected error: %s", result.Content[0].Text) + } + if !strings.Contains(result.Content[0].Text, "prometheus") { + t.Errorf("expected output to contain 'prometheus', got: %s", result.Content[0].Text) + } +} + +func TestHandler_SearchMetrics(t *testing.T) { + server, cleanup := setupTestServer(t, + func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch r.URL.Path { + case "/api/v1/label/__name__/values": + _, _ = w.Write([]byte(`{ + "status": "success", + "data": ["node_cpu_seconds_total", "node_memory_MemTotal_bytes", "up"] + }`)) + case "/api/v1/metadata": + _, _ = w.Write([]byte(`{ + "status": "success", + "data": { + "node_cpu_seconds_total": [{"type": "counter", "help": "CPU time", "unit": ""}], + "node_memory_MemTotal_bytes": [{"type": "gauge", "help": "Total memory", "unit": "bytes"}] + } + }`)) + default: + http.NotFound(w, r) + } + }, + nil, + ) + defer cleanup() + + result := callTool(t, server, "search_metrics", map[string]interface{}{ + "query": "node", + }) + if result.IsError { + t.Fatalf("unexpected error: %s", result.Content[0].Text) + } + if !strings.Contains(result.Content[0].Text, "node_cpu") { + t.Errorf("expected output to contain 'node_cpu', got: %s", result.Content[0].Text) + } + // "up" should be filtered out since it doesn't match "node" + if strings.Contains(result.Content[0].Text, "| up |") { + t.Errorf("expected 'up' to be filtered out, got: %s", result.Content[0].Text) + } +} + +func TestHandler_ListSilences(t *testing.T) { + server, cleanup := setupTestServer(t, + nil, + func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/v2/silences" { + http.NotFound(w, r) + return + } + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`[ + { + "id": "s1", + "matchers": [{"name": "alertname", "value": "Test", "isRegex": false}], + "startsAt": "2024-01-01T00:00:00Z", + "endsAt": "2024-01-01T02:00:00Z", + "createdBy": "admin", + "comment": "Testing", + "status": {"state": "active"} + }, + { + "id": "s2", + "matchers": [{"name": "job", "value": "node", "isRegex": false}], + "startsAt": "2023-01-01T00:00:00Z", + "endsAt": "2023-01-01T02:00:00Z", + "createdBy": "admin", + "comment": "Old", + "status": {"state": "expired"} + } + ]`)) + }, + ) + defer cleanup() + + result := callTool(t, server, "list_silences", map[string]interface{}{}) + if result.IsError { + t.Fatalf("unexpected error: %s", result.Content[0].Text) + } + // Should show active silence but filter out expired + if !strings.Contains(result.Content[0].Text, "s1") { + t.Errorf("expected active silence s1 in output, got: %s", result.Content[0].Text) + } + if strings.Contains(result.Content[0].Text, "s2") { + t.Errorf("expected expired silence s2 to be filtered out, got: %s", result.Content[0].Text) + } +} + +func TestHandler_ToolCount(t *testing.T) { + server, cleanup := setupTestServer(t, + func(w http.ResponseWriter, r *http.Request) {}, + func(w http.ResponseWriter, r *http.Request) {}, + ) + defer cleanup() + + // Send a tools/list request + req := &mcp.Request{ + JSONRPC: "2.0", + ID: 1, + Method: "tools/list", + } + resp := server.HandleRequest(context.Background(), req) + if resp == nil { + t.Fatal("expected response, got nil") + } + if resp.Error != nil { + t.Fatalf("unexpected error: %s", resp.Error.Message) + } + + resultJSON, err := json.Marshal(resp.Result) + if err != nil { + t.Fatalf("failed to marshal result: %v", err) + } + + var listResult mcp.ListToolsResult + if err := json.Unmarshal(resultJSON, &listResult); err != nil { + t.Fatalf("failed to unmarshal result: %v", err) + } + + if len(listResult.Tools) != 8 { + t.Errorf("expected 8 tools, got %d", len(listResult.Tools)) + for _, tool := range listResult.Tools { + t.Logf(" tool: %s", tool.Name) + } + } +} + +// callTool is a test helper that calls a tool through the MCP server. +func callTool(t *testing.T, server *mcp.Server, name string, args map[string]interface{}) mcp.CallToolResult { + t.Helper() + + params := mcp.CallToolParams{ + Name: name, + Arguments: args, + } + paramsJSON, err := json.Marshal(params) + if err != nil { + t.Fatalf("failed to marshal params: %v", err) + } + + req := &mcp.Request{ + JSONRPC: "2.0", + ID: 1, + Method: "tools/call", + Params: paramsJSON, + } + + resp := server.HandleRequest(context.Background(), req) + if resp == nil { + t.Fatal("expected response, got nil") + } + if resp.Error != nil { + t.Fatalf("JSON-RPC error: %s", resp.Error.Message) + } + + resultJSON, err := json.Marshal(resp.Result) + if err != nil { + t.Fatalf("failed to marshal result: %v", err) + } + + var result mcp.CallToolResult + if err := json.Unmarshal(resultJSON, &result); err != nil { + t.Fatalf("failed to unmarshal result: %v", err) + } + + return result +} diff --git a/internal/monitoring/prometheus.go b/internal/monitoring/prometheus.go new file mode 100644 index 0000000..0bf5902 --- /dev/null +++ b/internal/monitoring/prometheus.go @@ -0,0 +1,135 @@ +package monitoring + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "time" +) + +// PrometheusClient is an HTTP client for the Prometheus API. +type PrometheusClient struct { + baseURL string + httpClient *http.Client +} + +// NewPrometheusClient creates a new Prometheus API client. +func NewPrometheusClient(baseURL string) *PrometheusClient { + return &PrometheusClient{ + baseURL: strings.TrimRight(baseURL, "/"), + httpClient: &http.Client{ + Timeout: 30 * time.Second, + }, + } +} + +// Query executes an instant PromQL query. If ts is zero, the current time is used. +func (c *PrometheusClient) Query(ctx context.Context, promql string, ts time.Time) (*PromQueryData, error) { + params := url.Values{} + params.Set("query", promql) + if !ts.IsZero() { + params.Set("time", fmt.Sprintf("%d", ts.Unix())) + } + + body, err := c.get(ctx, "/api/v1/query", params) + if err != nil { + return nil, fmt.Errorf("query failed: %w", err) + } + + var data PromQueryData + if err := json.Unmarshal(body, &data); err != nil { + return nil, fmt.Errorf("failed to parse query data: %w", err) + } + return &data, nil +} + +// LabelValues returns all values for a given label name. +func (c *PrometheusClient) LabelValues(ctx context.Context, label string) ([]string, error) { + path := fmt.Sprintf("/api/v1/label/%s/values", url.PathEscape(label)) + body, err := c.get(ctx, path, nil) + if err != nil { + return nil, fmt.Errorf("label values failed: %w", err) + } + + var values []string + if err := json.Unmarshal(body, &values); err != nil { + return nil, fmt.Errorf("failed to parse label values: %w", err) + } + return values, nil +} + +// Metadata returns metadata for metrics. If metric is empty, returns metadata for all metrics. +func (c *PrometheusClient) Metadata(ctx context.Context, metric string) (map[string][]PromMetadata, error) { + params := url.Values{} + if metric != "" { + params.Set("metric", metric) + } + + body, err := c.get(ctx, "/api/v1/metadata", params) + if err != nil { + return nil, fmt.Errorf("metadata failed: %w", err) + } + + var metadata map[string][]PromMetadata + if err := json.Unmarshal(body, &metadata); err != nil { + return nil, fmt.Errorf("failed to parse metadata: %w", err) + } + return metadata, nil +} + +// Targets returns the current scrape targets. +func (c *PrometheusClient) Targets(ctx context.Context) (*PromTargetsData, error) { + body, err := c.get(ctx, "/api/v1/targets", nil) + if err != nil { + return nil, fmt.Errorf("targets failed: %w", err) + } + + var data PromTargetsData + if err := json.Unmarshal(body, &data); err != nil { + return nil, fmt.Errorf("failed to parse targets data: %w", err) + } + return &data, nil +} + +// get performs a GET request and returns the "data" field from the Prometheus response envelope. +func (c *PrometheusClient) get(ctx context.Context, path string, params url.Values) (json.RawMessage, error) { + u := c.baseURL + path + if len(params) > 0 { + u += "?" + params.Encode() + } + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("request failed: %w", err) + } + defer resp.Body.Close() //nolint:errcheck // cleanup on exit + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read response: %w", err) + } + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("unexpected status %d: %s", resp.StatusCode, string(body)) + } + + var promResp PromResponse + if err := json.Unmarshal(body, &promResp); err != nil { + return nil, fmt.Errorf("failed to parse response: %w", err) + } + + if promResp.Status != "success" { + return nil, fmt.Errorf("prometheus error (%s): %s", promResp.ErrorType, promResp.Error) + } + + return promResp.Data, nil +} diff --git a/internal/monitoring/prometheus_test.go b/internal/monitoring/prometheus_test.go new file mode 100644 index 0000000..e5712cc --- /dev/null +++ b/internal/monitoring/prometheus_test.go @@ -0,0 +1,209 @@ +package monitoring + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" +) + +func TestPrometheusClient_Query(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/v1/query" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + if r.URL.Query().Get("query") != "up" { + t.Errorf("unexpected query param: %s", r.URL.Query().Get("query")) + } + + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{ + "status": "success", + "data": { + "resultType": "vector", + "result": [ + { + "metric": {"__name__": "up", "job": "prometheus", "instance": "localhost:9090"}, + "value": [1234567890, "1"] + }, + { + "metric": {"__name__": "up", "job": "node", "instance": "localhost:9100"}, + "value": [1234567890, "0"] + } + ] + } + }`)) + })) + defer srv.Close() + + client := NewPrometheusClient(srv.URL) + data, err := client.Query(context.Background(), "up", time.Time{}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if data.ResultType != "vector" { + t.Errorf("expected resultType=vector, got %s", data.ResultType) + } + if len(data.Result) != 2 { + t.Fatalf("expected 2 results, got %d", len(data.Result)) + } + if data.Result[0].Metric["job"] != "prometheus" { + t.Errorf("expected job=prometheus, got %s", data.Result[0].Metric["job"]) + } +} + +func TestPrometheusClient_QueryError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{ + "status": "error", + "errorType": "bad_data", + "error": "invalid expression" + }`)) + })) + defer srv.Close() + + client := NewPrometheusClient(srv.URL) + _, err := client.Query(context.Background(), "invalid{", time.Time{}) + if err == nil { + t.Fatal("expected error, got nil") + } + if !contains(err.Error(), "invalid expression") { + t.Errorf("expected error to contain 'invalid expression', got: %s", err.Error()) + } +} + +func TestPrometheusClient_LabelValues(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/v1/label/__name__/values" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{ + "status": "success", + "data": ["up", "node_cpu_seconds_total", "prometheus_build_info"] + }`)) + })) + defer srv.Close() + + client := NewPrometheusClient(srv.URL) + values, err := client.LabelValues(context.Background(), "__name__") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(values) != 3 { + t.Fatalf("expected 3 values, got %d", len(values)) + } + if values[0] != "up" { + t.Errorf("expected first value=up, got %s", values[0]) + } +} + +func TestPrometheusClient_Metadata(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/v1/metadata" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{ + "status": "success", + "data": { + "up": [{"type": "gauge", "help": "Whether the target is up.", "unit": ""}], + "node_cpu_seconds_total": [{"type": "counter", "help": "CPU seconds spent.", "unit": "seconds"}] + } + }`)) + })) + defer srv.Close() + + client := NewPrometheusClient(srv.URL) + metadata, err := client.Metadata(context.Background(), "") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(metadata) != 2 { + t.Fatalf("expected 2 metrics, got %d", len(metadata)) + } + if metadata["up"][0].Type != "gauge" { + t.Errorf("expected up type=gauge, got %s", metadata["up"][0].Type) + } +} + +func TestPrometheusClient_Targets(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/v1/targets" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{ + "status": "success", + "data": { + "activeTargets": [ + { + "labels": {"instance": "localhost:9090", "job": "prometheus"}, + "scrapePool": "prometheus", + "scrapeUrl": "http://localhost:9090/metrics", + "globalUrl": "http://localhost:9090/metrics", + "lastError": "", + "lastScrape": "2024-01-01T00:00:00Z", + "lastScrapeDuration": 0.01, + "health": "up", + "scrapeInterval": "15s", + "scrapeTimeout": "10s" + } + ], + "droppedTargets": [] + } + }`)) + })) + defer srv.Close() + + client := NewPrometheusClient(srv.URL) + data, err := client.Targets(context.Background()) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(data.ActiveTargets) != 1 { + t.Fatalf("expected 1 active target, got %d", len(data.ActiveTargets)) + } + if data.ActiveTargets[0].Health != "up" { + t.Errorf("expected health=up, got %s", data.ActiveTargets[0].Health) + } +} + +func TestPrometheusClient_HTTPError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + _, _ = w.Write([]byte("internal error")) + })) + defer srv.Close() + + client := NewPrometheusClient(srv.URL) + _, err := client.Query(context.Background(), "up", time.Time{}) + if err == nil { + t.Fatal("expected error, got nil") + } + if !contains(err.Error(), "500") { + t.Errorf("expected error to contain status code, got: %s", err.Error()) + } +} + +func contains(s, substr string) bool { + return len(s) >= len(substr) && searchString(s, substr) +} + +func searchString(s, substr string) bool { + for i := 0; i <= len(s)-len(substr); i++ { + if s[i:i+len(substr)] == substr { + return true + } + } + return false +} diff --git a/internal/monitoring/types.go b/internal/monitoring/types.go new file mode 100644 index 0000000..002700f --- /dev/null +++ b/internal/monitoring/types.go @@ -0,0 +1,120 @@ +package monitoring + +import ( + "encoding/json" + "time" +) + +// Prometheus API response types + +// PromResponse is the standard Prometheus API response envelope. +type PromResponse struct { + Status string `json:"status"` + Data json.RawMessage `json:"data,omitempty"` + ErrorType string `json:"errorType,omitempty"` + Error string `json:"error,omitempty"` +} + +// PromQueryData represents the data field for query results. +type PromQueryData struct { + ResultType string `json:"resultType"` + Result []PromInstantVector `json:"result"` +} + +// PromInstantVector represents a single instant vector result. +type PromInstantVector struct { + Metric map[string]string `json:"metric"` + Value [2]interface{} `json:"value"` // [timestamp, value_string] +} + +// PromScalar represents a scalar query result. +type PromScalar [2]interface{} // [timestamp, value_string] + +// PromMetadata represents metadata for a single metric. +type PromMetadata struct { + Type string `json:"type"` + Help string `json:"help"` + Unit string `json:"unit"` +} + +// PromTarget represents a single scrape target. +type PromTarget struct { + DiscoveredLabels map[string]string `json:"discoveredLabels"` + Labels map[string]string `json:"labels"` + ScrapePool string `json:"scrapePool"` + ScrapeURL string `json:"scrapeUrl"` + GlobalURL string `json:"globalUrl"` + LastError string `json:"lastError"` + LastScrape time.Time `json:"lastScrape"` + LastScrapeDuration float64 `json:"lastScrapeDuration"` + Health string `json:"health"` + ScrapeInterval string `json:"scrapeInterval"` + ScrapeTimeout string `json:"scrapeTimeout"` +} + +// PromTargetsData represents the data field for targets results. +type PromTargetsData struct { + ActiveTargets []PromTarget `json:"activeTargets"` + DroppedTargets []PromTarget `json:"droppedTargets"` +} + +// Alertmanager API response types + +// Alert represents an alert from the Alertmanager API v2. +type Alert struct { + Annotations map[string]string `json:"annotations"` + EndsAt time.Time `json:"endsAt"` + Fingerprint string `json:"fingerprint"` + Receivers []AlertReceiver `json:"receivers"` + StartsAt time.Time `json:"startsAt"` + Status AlertStatus `json:"status"` + UpdatedAt time.Time `json:"updatedAt"` + GeneratorURL string `json:"generatorURL"` + Labels map[string]string `json:"labels"` +} + +// AlertReceiver represents an alert receiver. +type AlertReceiver struct { + Name string `json:"name"` +} + +// AlertStatus represents the status of an alert. +type AlertStatus struct { + InhibitedBy []string `json:"inhibitedBy"` + SilencedBy []string `json:"silencedBy"` + State string `json:"state"` // "active", "suppressed", "unprocessed" +} + +// AlertFilters contains filters for listing alerts. +type AlertFilters struct { + Active *bool + Silenced *bool + Inhibited *bool + Unprocessed *bool + Filter []string // PromQL-style label matchers, e.g. {severity="critical"} + Receiver string +} + +// Silence represents a silence from the Alertmanager API v2. +type Silence struct { + ID string `json:"id,omitempty"` + Matchers []Matcher `json:"matchers"` + StartsAt time.Time `json:"startsAt"` + EndsAt time.Time `json:"endsAt"` + CreatedBy string `json:"createdBy"` + Comment string `json:"comment"` + Status *SilenceStatus `json:"status,omitempty"` +} + +// SilenceStatus represents the status of a silence. +type SilenceStatus struct { + State string `json:"state"` // "active", "pending", "expired" +} + +// Matcher represents a label matcher for silences. +type Matcher struct { + Name string `json:"name"` + Value string `json:"value"` + IsRegex bool `json:"isRegex"` + IsEqual *bool `json:"isEqual,omitempty"` +} diff --git a/nix/lab-monitoring-module.nix b/nix/lab-monitoring-module.nix new file mode 100644 index 0000000..6a15dbe --- /dev/null +++ b/nix/lab-monitoring-module.nix @@ -0,0 +1,139 @@ +{ config, lib, pkgs, ... }: + +let + cfg = config.services.lab-monitoring; + + mkHttpFlags = httpCfg: lib.concatStringsSep " " ([ + "--transport http" + "--http-address '${httpCfg.address}'" + "--http-endpoint '${httpCfg.endpoint}'" + "--session-ttl '${httpCfg.sessionTTL}'" + ] ++ lib.optionals (httpCfg.allowedOrigins != []) ( + map (origin: "--allowed-origins '${origin}'") httpCfg.allowedOrigins + ) ++ lib.optionals httpCfg.tls.enable [ + "--tls-cert '${httpCfg.tls.certFile}'" + "--tls-key '${httpCfg.tls.keyFile}'" + ]); +in +{ + options.services.lab-monitoring = { + enable = lib.mkEnableOption "Lab Monitoring MCP server"; + + package = lib.mkPackageOption pkgs "lab-monitoring" { }; + + prometheusUrl = lib.mkOption { + type = lib.types.str; + default = "http://localhost:9090"; + description = "Prometheus base URL."; + }; + + alertmanagerUrl = lib.mkOption { + type = lib.types.str; + default = "http://localhost:9093"; + description = "Alertmanager base URL."; + }; + + http = { + address = lib.mkOption { + type = lib.types.str; + default = "127.0.0.1:8084"; + description = "HTTP listen address for the MCP server."; + }; + + endpoint = lib.mkOption { + type = lib.types.str; + default = "/mcp"; + description = "HTTP endpoint path for MCP requests."; + }; + + allowedOrigins = lib.mkOption { + type = lib.types.listOf lib.types.str; + default = [ ]; + description = "Allowed Origin headers for CORS."; + }; + + sessionTTL = lib.mkOption { + type = lib.types.str; + default = "30m"; + description = "Session TTL for HTTP transport."; + }; + + tls = { + enable = lib.mkEnableOption "TLS for HTTP transport"; + + certFile = lib.mkOption { + type = lib.types.nullOr lib.types.path; + default = null; + description = "Path to TLS certificate file."; + }; + + keyFile = lib.mkOption { + type = lib.types.nullOr lib.types.path; + default = null; + description = "Path to TLS private key file."; + }; + }; + }; + + openFirewall = lib.mkOption { + type = lib.types.bool; + default = false; + description = "Whether to open the firewall for the MCP HTTP server."; + }; + }; + + config = lib.mkIf cfg.enable { + assertions = [ + { + assertion = !cfg.http.tls.enable || (cfg.http.tls.certFile != null && cfg.http.tls.keyFile != null); + message = "services.lab-monitoring.http.tls: both certFile and keyFile must be set when TLS is enabled"; + } + ]; + + systemd.services.lab-monitoring = { + description = "Lab Monitoring MCP Server"; + wantedBy = [ "multi-user.target" ]; + after = [ "network.target" ]; + + environment = { + PROMETHEUS_URL = cfg.prometheusUrl; + ALERTMANAGER_URL = cfg.alertmanagerUrl; + }; + + script = let + httpFlags = mkHttpFlags cfg.http; + in '' + exec ${cfg.package}/bin/lab-monitoring serve ${httpFlags} + ''; + + serviceConfig = { + Type = "simple"; + DynamicUser = true; + Restart = "on-failure"; + RestartSec = "5s"; + + # Hardening + NoNewPrivileges = true; + ProtectSystem = "strict"; + ProtectHome = true; + PrivateTmp = true; + PrivateDevices = true; + ProtectKernelTunables = true; + ProtectKernelModules = true; + ProtectControlGroups = true; + RestrictNamespaces = true; + RestrictRealtime = true; + RestrictSUIDSGID = true; + MemoryDenyWriteExecute = true; + LockPersonality = true; + }; + }; + + networking.firewall = lib.mkIf cfg.openFirewall (let + addressParts = lib.splitString ":" cfg.http.address; + port = lib.toInt (lib.last addressParts); + in { + allowedTCPPorts = [ port ]; + }); + }; +} diff --git a/nix/package.nix b/nix/package.nix index 4583e9f..a20e6bd 100644 --- a/nix/package.nix +++ b/nix/package.nix @@ -7,7 +7,7 @@ buildGoModule { inherit pname src; - version = "0.2.0"; + version = "0.2.1"; vendorHash = "sha256-D0KIxQC9ctIAaHBFTvkhBE06uOZwDUcIw8471Ug2doY=";