feat: add lab-monitoring MCP server for Prometheus and Alertmanager

New MCP server that queries live Prometheus and Alertmanager HTTP APIs
with 8 tools: list_alerts, get_alert, search_metrics, get_metric_metadata,
query (PromQL), list_targets, list_silences, and create_silence.

Extends the MCP core with ModeCustom and NewGenericServer for servers
that don't require a database. Includes CLI with direct commands
(alerts, query, targets, metrics), NixOS module, and comprehensive
httptest-based tests.

Bumps existing binaries to 0.2.1 due to shared internal/mcp change.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-02-04 23:11:53 +01:00
parent 0bd4ed778a
commit 1755364bba
19 changed files with 2567 additions and 22 deletions

View File

@@ -25,6 +25,19 @@
"env": {
"NIXPKGS_SEARCH_DATABASE": "sqlite://:memory:"
}
},
"lab-monitoring": {
"command": "nix",
"args": [
"run",
".#lab-monitoring",
"--",
"serve"
],
"env": {
"PROMETHEUS_URL": "http://localhost:9090",
"ALERTMANAGER_URL": "http://localhost:9093"
}
}
}
}

View File

@@ -20,7 +20,12 @@ Search and query NixOS configuration options. Uses nixpkgs as source.
### Home Manager Options (`hm-options`)
Search and query Home Manager configuration options. Uses home-manager repository as source.
All servers share the same architecture:
### Lab Monitoring (`lab-monitoring`)
Query Prometheus metrics and Alertmanager alerts. Unlike other servers, this queries live HTTP APIs — no database or indexing needed.
- 8 tools: list/get alerts, search metrics, get metadata, PromQL query, list targets, list/create silences
- Configurable Prometheus and Alertmanager URLs via flags or environment variables
The nixpkgs/options/hm servers share a database-backed architecture:
- Full-text search across option/package names and descriptions
- Query specific options/packages with full metadata
- Index multiple revisions (by git hash or channel name)
@@ -38,8 +43,9 @@ All servers share the same architecture:
## Project Status
**Complete and maintained** - All core features implemented:
- Full MCP servers with 6 tools each
- PostgreSQL and SQLite backends with FTS
- Full MCP servers (6 tools each for nixpkgs/options, 8 tools for monitoring)
- PostgreSQL and SQLite backends with FTS (for nixpkgs/options servers)
- Live API queries for Prometheus/Alertmanager (monitoring server)
- NixOS modules for deployment
- CLI for manual operations
- Comprehensive test suite
@@ -53,8 +59,10 @@ labmcp/
│ │ └── main.go # Combined options+packages CLI (primary)
│ ├── nixos-options/
│ │ └── main.go # NixOS options CLI (legacy)
── hm-options/
└── main.go # Home Manager options CLI
── hm-options/
└── main.go # Home Manager options CLI
│ └── lab-monitoring/
│ └── main.go # Prometheus/Alertmanager CLI
├── internal/
│ ├── database/
│ │ ├── interface.go # Store interface (options + packages)
@@ -82,14 +90,22 @@ labmcp/
│ │ ├── indexer.go # Home Manager indexing
│ │ ├── types.go # Channel aliases, extensions
│ │ └── *_test.go # Indexer tests
── packages/
├── indexer.go # Nix packages indexing
├── parser.go # nix-env JSON parsing
├── types.go # Package types, channel aliases
└── *_test.go # Parser tests
── packages/
├── indexer.go # Nix packages indexing
├── parser.go # nix-env JSON parsing
├── types.go # Package types, channel aliases
└── *_test.go # Parser tests
│ └── monitoring/
│ ├── types.go # Prometheus/Alertmanager API types
│ ├── prometheus.go # Prometheus HTTP client
│ ├── alertmanager.go # Alertmanager HTTP client
│ ├── handlers.go # MCP tool definitions + handlers
│ ├── format.go # Markdown formatting utilities
│ └── *_test.go # Tests (httptest-based)
├── nix/
│ ├── module.nix # NixOS module for nixos-options
│ ├── hm-options-module.nix # NixOS module for hm-options
│ ├── lab-monitoring-module.nix # NixOS module for lab-monitoring
│ └── package.nix # Parameterized Nix package
├── testdata/
│ └── options-sample.json # Test fixture
@@ -124,6 +140,19 @@ labmcp/
| `list_revisions` | List all indexed revisions |
| `delete_revision` | Delete an indexed revision |
### Monitoring Server (lab-monitoring)
| Tool | Description |
|------|-------------|
| `list_alerts` | List alerts with optional filters (state, severity, receiver) |
| `get_alert` | Get full details for a specific alert by fingerprint |
| `search_metrics` | Search metric names with substring filter, enriched with metadata |
| `get_metric_metadata` | Get type, help text, and unit for a specific metric |
| `query` | Execute instant PromQL query |
| `list_targets` | List scrape targets with health status |
| `list_silences` | List active/pending silences |
| `create_silence` | Create a silence (confirms with user first) |
## Key Implementation Details
### Database
@@ -212,6 +241,17 @@ hm-options delete <revision> # Delete indexed revision
hm-options --version # Show version
```
### lab-monitoring
```bash
lab-monitoring serve # Run MCP server on STDIO
lab-monitoring serve --transport http # Run MCP server on HTTP
lab-monitoring alerts # List alerts
lab-monitoring alerts --state active # Filter by state
lab-monitoring query 'up' # Instant PromQL query
lab-monitoring targets # List scrape targets
lab-monitoring metrics node # Search metric names
```
### Channel Aliases
**nixpkgs-search/nixos-options**: `nixos-unstable`, `nixos-stable`, `nixos-24.11`, `nixos-24.05`, etc.
@@ -220,6 +260,9 @@ hm-options --version # Show version
## Notes for Claude
### Planning
When creating implementation plans, the first step should usually be to **checkout an appropriately named feature branch** (e.g., `git checkout -b feature/lab-monitoring`). This keeps work isolated and makes PRs cleaner.
### Development Workflow
- **Always run `go fmt ./...` before committing Go code**
- **Run Go commands using `nix develop -c`** (e.g., `nix develop -c go test ./...`)
@@ -257,7 +300,8 @@ Version is defined in multiple places that must stay in sync:
- `cmd/nixpkgs-search/main.go`
- `cmd/nixos-options/main.go`
- `cmd/hm-options/main.go`
- `internal/mcp/server.go` (in `DefaultNixOSConfig`, `DefaultHomeManagerConfig`, `DefaultNixpkgsPackagesConfig`)
- `cmd/lab-monitoring/main.go`
- `internal/mcp/server.go` (in `DefaultNixOSConfig`, `DefaultHomeManagerConfig`, `DefaultNixpkgsPackagesConfig`, `DefaultMonitoringConfig`)
- `nix/package.nix`
### User Preferences
@@ -284,6 +328,7 @@ nix develop -c go test -bench=. -benchtime=1x -timeout=30m ./internal/homemanage
nix build .#nixpkgs-search
nix build .#nixos-options
nix build .#hm-options
nix build .#lab-monitoring
# Run directly
nix run .#nixpkgs-search -- options serve
@@ -291,6 +336,7 @@ nix run .#nixpkgs-search -- packages serve
nix run .#nixpkgs-search -- index nixos-unstable
nix run .#hm-options -- serve
nix run .#hm-options -- index hm-unstable
nix run .#lab-monitoring -- serve
```
### Indexing Performance

View File

@@ -15,6 +15,7 @@
## New MCP Servers
- [x] `nixpkgs-packages` - Index and search nixpkgs packages (implemented in `nixpkgs-search packages`)
- [x] `lab-monitoring` - Query Prometheus and Alertmanager APIs (8 tools, no database required)
## Nice to Have

View File

@@ -20,7 +20,7 @@ import (
const (
defaultDatabase = "sqlite://hm-options.db"
version = "0.2.0"
version = "0.2.1"
)
func main() {

369
cmd/lab-monitoring/main.go Normal file
View File

@@ -0,0 +1,369 @@
package main
import (
	"context"
	"fmt"
	"log"
	"os"
	"os/signal"
	"sort"
	"strings"
	"syscall"
	"time"

	"github.com/urfave/cli/v2"

	"git.t-juice.club/torjus/labmcp/internal/mcp"
	"git.t-juice.club/torjus/labmcp/internal/monitoring"
)
const version = "0.1.0"
// main configures the lab-monitoring CLI application and runs it,
// exiting with a non-zero status if any command fails.
func main() {
	globalFlags := []cli.Flag{
		&cli.StringFlag{
			Name:    "prometheus-url",
			Usage:   "Prometheus base URL",
			EnvVars: []string{"PROMETHEUS_URL"},
			Value:   "http://localhost:9090",
		},
		&cli.StringFlag{
			Name:    "alertmanager-url",
			Usage:   "Alertmanager base URL",
			EnvVars: []string{"ALERTMANAGER_URL"},
			Value:   "http://localhost:9093",
		},
	}
	commands := []*cli.Command{
		serveCommand(),
		alertsCommand(),
		queryCommand(),
		targetsCommand(),
		metricsCommand(),
	}
	app := &cli.App{
		Name:     "lab-monitoring",
		Usage:    "MCP server for Prometheus and Alertmanager monitoring",
		Version:  version,
		Flags:    globalFlags,
		Commands: commands,
	}
	if err := app.Run(os.Args); err != nil {
		log.Fatal(err)
	}
}
// serveCommand returns the "serve" subcommand, which runs the MCP server
// over stdio (default) or HTTP. The HTTP-specific flags (listen address,
// endpoint path, CORS origins, TLS cert/key, session TTL) are only read
// when --transport=http is selected; see runServe.
func serveCommand() *cli.Command {
	return &cli.Command{
		Name:  "serve",
		Usage: "Run MCP server for lab monitoring",
		Flags: []cli.Flag{
			&cli.StringFlag{
				Name:    "transport",
				Aliases: []string{"t"},
				Usage:   "Transport type: 'stdio' or 'http'",
				Value:   "stdio",
			},
			&cli.StringFlag{
				Name:  "http-address",
				Usage: "HTTP listen address",
				Value: "127.0.0.1:8084",
			},
			&cli.StringFlag{
				Name:  "http-endpoint",
				Usage: "HTTP endpoint path",
				Value: "/mcp",
			},
			&cli.StringSliceFlag{
				Name:  "allowed-origins",
				Usage: "Allowed Origin headers for CORS",
			},
			&cli.StringFlag{
				Name:  "tls-cert",
				Usage: "TLS certificate file",
			},
			&cli.StringFlag{
				Name:  "tls-key",
				Usage: "TLS key file",
			},
			&cli.DurationFlag{
				Name:  "session-ttl",
				Usage: "Session TTL for HTTP transport",
				Value: 30 * time.Minute,
			},
		},
		Action: func(c *cli.Context) error {
			return runServe(c)
		},
	}
}
// alertsCommand returns the "alerts" subcommand for listing Alertmanager
// alerts, optionally filtered by state or severity label.
func alertsCommand() *cli.Command {
	stateFlag := &cli.StringFlag{
		Name:  "state",
		Usage: "Filter by state: active, suppressed, unprocessed",
	}
	severityFlag := &cli.StringFlag{
		Name:  "severity",
		Usage: "Filter by severity label",
	}
	return &cli.Command{
		Name:   "alerts",
		Usage:  "List alerts from Alertmanager",
		Flags:  []cli.Flag{stateFlag, severityFlag},
		Action: runAlerts,
	}
}
// queryCommand returns the "query" subcommand, which evaluates a single
// instant PromQL expression given as a positional argument.
func queryCommand() *cli.Command {
	return &cli.Command{
		Name:      "query",
		Usage:     "Execute an instant PromQL query",
		ArgsUsage: "<promql>",
		Action: func(c *cli.Context) error {
			if c.NArg() == 0 {
				return fmt.Errorf("promql expression required")
			}
			expr := c.Args().First()
			return runQuery(c, expr)
		},
	}
}
// targetsCommand returns the "targets" subcommand for listing Prometheus
// scrape targets and their health.
func targetsCommand() *cli.Command {
	return &cli.Command{
		Name:   "targets",
		Usage:  "List scrape targets",
		Action: runTargets,
	}
}
// metricsCommand returns the "metrics" subcommand, which searches metric
// names by an optional substring argument, capped by --limit.
func metricsCommand() *cli.Command {
	limitFlag := &cli.IntFlag{
		Name:    "limit",
		Aliases: []string{"n"},
		Usage:   "Maximum number of results",
		Value:   50,
	}
	return &cli.Command{
		Name:      "metrics",
		Usage:     "Search metric names",
		ArgsUsage: "<search>",
		Flags:     []cli.Flag{limitFlag},
		Action: func(c *cli.Context) error {
			var query string
			if c.NArg() > 0 {
				query = c.Args().First()
			}
			return runMetrics(c, query)
		},
	}
}
// runServe starts the MCP server using the transport selected via the
// --transport flag: "stdio" (default) or "http". It blocks until the
// context is cancelled by SIGINT/SIGTERM or the transport exits.
func runServe(c *cli.Context) error {
	ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer cancel()

	logger := log.New(os.Stderr, "[mcp] ", log.LstdFlags)

	// Build a database-less MCP server and register the monitoring tools
	// against live Prometheus/Alertmanager clients.
	server := mcp.NewGenericServer(logger, mcp.DefaultMonitoringConfig())
	promClient := monitoring.NewPrometheusClient(c.String("prometheus-url"))
	amClient := monitoring.NewAlertmanagerClient(c.String("alertmanager-url"))
	monitoring.RegisterHandlers(server, promClient, amClient)

	switch transport := c.String("transport"); transport {
	case "stdio":
		logger.Println("Starting lab-monitoring MCP server on stdio...")
		return server.Run(ctx, os.Stdin, os.Stdout)
	case "http":
		cfg := mcp.HTTPConfig{
			Address:        c.String("http-address"),
			Endpoint:       c.String("http-endpoint"),
			AllowedOrigins: c.StringSlice("allowed-origins"),
			SessionTTL:     c.Duration("session-ttl"),
			TLSCertFile:    c.String("tls-cert"),
			TLSKeyFile:     c.String("tls-key"),
		}
		return mcp.NewHTTPTransport(server, cfg).Run(ctx)
	default:
		return fmt.Errorf("unknown transport: %s (use 'stdio' or 'http')", transport)
	}
}
// runAlerts lists alerts from Alertmanager on stdout.
//
// The --state flag maps to Alertmanager API v2 query parameters:
//   - "active":      active only (explicitly not silenced, not inhibited)
//   - "suppressed":  non-active alerts
//   - "unprocessed": alerts not yet processed
//
// The --severity flag is translated into a label matcher filter.
//
// Fix: annotations are printed in sorted key order so the output is
// deterministic across runs (Go map iteration order is randomized).
func runAlerts(c *cli.Context) error {
	ctx := context.Background()
	am := monitoring.NewAlertmanagerClient(c.String("alertmanager-url"))
	filters := monitoring.AlertFilters{}
	if state := c.String("state"); state != "" {
		switch state {
		case "active":
			active := true
			filters.Active = &active
			silenced := false
			filters.Silenced = &silenced
			inhibited := false
			filters.Inhibited = &inhibited
		case "suppressed":
			active := false
			filters.Active = &active
		case "unprocessed":
			unprocessed := true
			filters.Unprocessed = &unprocessed
		}
	}
	if severity := c.String("severity"); severity != "" {
		filters.Filter = append(filters.Filter, fmt.Sprintf(`severity="%s"`, severity))
	}
	alerts, err := am.ListAlerts(ctx, filters)
	if err != nil {
		return fmt.Errorf("failed to list alerts: %w", err)
	}
	if len(alerts) == 0 {
		fmt.Println("No alerts found.")
		return nil
	}
	for _, a := range alerts {
		state := a.Status.State
		severity := a.Labels["severity"]
		name := a.Labels["alertname"]
		fmt.Printf("[%s] %s (severity=%s, fingerprint=%s)\n", state, name, severity, a.Fingerprint)
		// Sort annotation keys for stable, reproducible output.
		keys := make([]string, 0, len(a.Annotations))
		for k := range a.Annotations {
			keys = append(keys, k)
		}
		sort.Strings(keys)
		for _, k := range keys {
			fmt.Printf(" %s: %s\n", k, a.Annotations[k])
		}
	}
	return nil
}
// runQuery executes an instant PromQL query and prints one line per
// sample in the form {label="value", ...} <value>.
//
// Fix: label pairs are now emitted in sorted name order, making the
// output deterministic (Go map iteration order is randomized).
func runQuery(c *cli.Context, promql string) error {
	ctx := context.Background()
	prom := monitoring.NewPrometheusClient(c.String("prometheus-url"))
	// Zero time requests an evaluation at "now" on the Prometheus side.
	data, err := prom.Query(ctx, promql, time.Time{})
	if err != nil {
		return fmt.Errorf("query failed: %w", err)
	}
	for _, r := range data.Result {
		names := make([]string, 0, len(r.Metric))
		for k := range r.Metric {
			names = append(names, k)
		}
		sort.Strings(names)
		parts := make([]string, 0, len(names))
		for _, k := range names {
			parts = append(parts, fmt.Sprintf("%s=%q", k, r.Metric[k]))
		}
		// The sample value tuple is [timestamp, value]; the value is a string.
		value := ""
		if len(r.Value) >= 2 {
			if v, ok := r.Value[1].(string); ok {
				value = v
			}
		}
		fmt.Printf("{%s} %s\n", strings.Join(parts, ", "), value)
	}
	return nil
}
// runTargets prints all active Prometheus scrape targets with their
// health state, last scrape time, and scrape duration; any scrape error
// is shown on a follow-up line.
func runTargets(c *cli.Context) error {
	ctx := context.Background()
	prom := monitoring.NewPrometheusClient(c.String("prometheus-url"))
	data, err := prom.Targets(ctx)
	if err != nil {
		return fmt.Errorf("failed to fetch targets: %w", err)
	}
	if len(data.ActiveTargets) == 0 {
		fmt.Println("No active targets.")
		return nil
	}
	for _, target := range data.ActiveTargets {
		fmt.Printf("[%s] %s/%s (last scrape: %s, duration: %.3fs)\n",
			target.Health,
			target.Labels["job"],
			target.Labels["instance"],
			target.LastScrape.Format("15:04:05"),
			target.LastScrapeDuration)
		if target.LastError != "" {
			fmt.Printf(" error: %s\n", target.LastError)
		}
	}
	return nil
}
// runMetrics lists metric names from Prometheus (via the __name__ label),
// optionally filtered by a case-insensitive substring, printing at most
// --limit names before truncating with a notice.
func runMetrics(c *cli.Context, query string) error {
	ctx := context.Background()
	prom := monitoring.NewPrometheusClient(c.String("prometheus-url"))
	names, err := prom.LabelValues(ctx, "__name__")
	if err != nil {
		return fmt.Errorf("failed to fetch metric names: %w", err)
	}

	limit := c.Int("limit")
	printed := 0
	for _, name := range names {
		// Apply the optional case-insensitive substring filter.
		if query != "" && !containsIgnoreCase(name, query) {
			continue
		}
		fmt.Println(name)
		printed++
		if printed >= limit {
			fmt.Printf("... (showing %d of matching metrics, use --limit to see more)\n", limit)
			break
		}
	}
	if printed == 0 {
		fmt.Printf("No metrics found matching '%s'\n", query)
	}
	return nil
}
// containsIgnoreCase reports whether substr occurs in s, comparing ASCII
// letters case-insensitively. Only A-Z/a-z are folded — matching the
// original byte-wise behavior — so non-ASCII characters must match
// exactly. An empty substr always matches.
//
// Fix: replaces a hand-rolled O(n*m) scan with stdlib strings.Contains
// over ASCII-lowered copies (same results, simpler and faster).
func containsIgnoreCase(s, substr string) bool {
	return strings.Contains(lowerASCII(s), lowerASCII(substr))
}

// lowerASCII returns s with its ASCII uppercase letters lowered; all
// other characters are left untouched.
func lowerASCII(s string) string {
	return strings.Map(func(r rune) rune {
		if r >= 'A' && r <= 'Z' {
			return r + ('a' - 'A')
		}
		return r
	}, s)
}

View File

@@ -19,7 +19,7 @@ import (
const (
defaultDatabase = "sqlite://nixos-options.db"
version = "0.2.0"
version = "0.2.1"
)
func main() {

View File

@@ -20,7 +20,7 @@ import (
const (
defaultDatabase = "sqlite://nixpkgs-search.db"
version = "0.2.0"
version = "0.2.1"
)
func main() {

View File

@@ -33,6 +33,13 @@
mainProgram = "nixpkgs-search";
description = "Search nixpkgs options and packages";
};
lab-monitoring = pkgs.callPackage ./nix/package.nix {
src = ./.;
pname = "lab-monitoring";
subPackage = "cmd/lab-monitoring";
mainProgram = "lab-monitoring";
description = "MCP server for Prometheus and Alertmanager monitoring";
};
default = self.packages.${system}.nixpkgs-search;
});
@@ -73,6 +80,10 @@
imports = [ ./nix/hm-options-module.nix ];
services.hm-options-mcp.package = lib.mkDefault self.packages.${pkgs.system}.hm-options;
};
lab-monitoring-mcp = { pkgs, ... }: {
imports = [ ./nix/lab-monitoring-module.nix ];
services.lab-monitoring.package = lib.mkDefault self.packages.${pkgs.system}.lab-monitoring;
};
default = self.nixosModules.nixpkgs-search-mcp;
};
};

View File

@@ -18,6 +18,8 @@ const (
ModeOptions ServerMode = "options"
// ModePackages exposes only package-related tools.
ModePackages ServerMode = "packages"
// ModeCustom exposes externally registered tools (no database required).
ModeCustom ServerMode = "custom"
)
// ServerConfig contains configuration for the MCP server.
@@ -40,7 +42,7 @@ type ServerConfig struct {
func DefaultNixOSConfig() ServerConfig {
return ServerConfig{
Name: "nixos-options",
Version: "0.2.0",
Version: "0.2.1",
DefaultChannel: "nixos-stable",
SourceName: "nixpkgs",
Mode: ModeOptions,
@@ -60,7 +62,7 @@ This ensures option documentation matches the nixpkgs version the project actual
func DefaultNixpkgsPackagesConfig() ServerConfig {
return ServerConfig{
Name: "nixpkgs-packages",
Version: "0.2.0",
Version: "0.2.1",
DefaultChannel: "nixos-stable",
SourceName: "nixpkgs",
Mode: ModePackages,
@@ -74,11 +76,29 @@ This ensures package information matches the nixpkgs version the project actuall
}
}
// DefaultMonitoringConfig returns the default configuration for the lab
// monitoring server. Unlike the database-backed configs in this file, it
// uses ModeCustom: tools are registered externally via RegisterTool and no
// DefaultChannel/SourceName is set.
func DefaultMonitoringConfig() ServerConfig {
	return ServerConfig{
		Name:    "lab-monitoring",
		Version: "0.1.0", // versioned independently of the 0.2.x database-backed servers
		Mode:    ModeCustom,
		Instructions: `Lab Monitoring MCP Server - Query Prometheus metrics and Alertmanager alerts.
Tools for querying your monitoring stack:
- Search and query Prometheus metrics with PromQL
- List and inspect alerts from Alertmanager
- View scrape target health status
- Manage alert silences
All queries are executed against live Prometheus and Alertmanager HTTP APIs.`,
	}
}
// DefaultHomeManagerConfig returns the default configuration for Home Manager options server.
func DefaultHomeManagerConfig() ServerConfig {
return ServerConfig{
Name: "hm-options",
Version: "0.2.0",
Version: "0.2.1",
DefaultChannel: "hm-stable",
SourceName: "home-manager",
Mode: ModeOptions,
@@ -99,6 +119,7 @@ type Server struct {
store database.Store
config ServerConfig
tools map[string]ToolHandler
toolDefs []Tool
initialized bool
logger *log.Logger
}
@@ -106,7 +127,7 @@ type Server struct {
// ToolHandler is a function that handles a tool call.
type ToolHandler func(ctx context.Context, args map[string]interface{}) (CallToolResult, error)
// NewServer creates a new MCP server with the given configuration.
// NewServer creates a new MCP server with a database store.
func NewServer(store database.Store, logger *log.Logger, config ServerConfig) *Server {
if logger == nil {
logger = log.New(io.Discard, "", 0)
@@ -121,6 +142,25 @@ func NewServer(store database.Store, logger *log.Logger, config ServerConfig) *S
return s
}
// NewGenericServer creates a new MCP server without a database store.
// Use RegisterTool to add tools externally. A nil logger is replaced with
// one that discards all output.
func NewGenericServer(logger *log.Logger, config ServerConfig) *Server {
	srv := &Server{
		config: config,
		tools:  map[string]ToolHandler{},
		logger: logger,
	}
	if srv.logger == nil {
		srv.logger = log.New(io.Discard, "", 0)
	}
	return srv
}
// RegisterTool registers an externally defined tool with its handler.
// The definition is appended (in registration order) for tools/list, and
// the handler is stored under the tool's name for dispatch.
func (s *Server) RegisterTool(tool Tool, handler ToolHandler) {
	s.tools[tool.Name] = handler
	s.toolDefs = append(s.toolDefs, tool)
}
// registerTools registers all available tools.
func (s *Server) registerTools() {
// Tools will be implemented in handlers.go
@@ -237,6 +277,11 @@ func (s *Server) handleToolsList(req *Request) *Response {
// getToolDefinitions returns the tool definitions.
func (s *Server) getToolDefinitions() []Tool {
// For custom mode, return externally registered tools
if s.config.Mode == ModeCustom {
return s.toolDefs
}
// For packages mode, return package tools
if s.config.Mode == ModePackages {
return s.getPackageToolDefinitions()

View File

@@ -0,0 +1,153 @@
package monitoring
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"
)
// AlertmanagerClient is an HTTP client for the Alertmanager API v2.
type AlertmanagerClient struct {
baseURL string
httpClient *http.Client
}
// NewAlertmanagerClient creates a new Alertmanager API client.
func NewAlertmanagerClient(baseURL string) *AlertmanagerClient {
return &AlertmanagerClient{
baseURL: strings.TrimRight(baseURL, "/"),
httpClient: &http.Client{
Timeout: 30 * time.Second,
},
}
}
// ListAlerts returns alerts matching the given filters.
//
// Filters translate to /api/v2/alerts query parameters: the boolean
// pointers (Active, Silenced, Inhibited, Unprocessed) are only sent when
// non-nil, Receiver is sent as-is, and each Filter entry becomes a
// repeated "filter" parameter.
func (c *AlertmanagerClient) ListAlerts(ctx context.Context, filters AlertFilters) ([]Alert, error) {
	params := url.Values{}
	setBool := func(key string, v *bool) {
		if v != nil {
			params.Set(key, fmt.Sprintf("%t", *v))
		}
	}
	setBool("active", filters.Active)
	setBool("silenced", filters.Silenced)
	setBool("inhibited", filters.Inhibited)
	setBool("unprocessed", filters.Unprocessed)
	if filters.Receiver != "" {
		params.Set("receiver", filters.Receiver)
	}
	for _, f := range filters.Filter {
		params.Add("filter", f)
	}

	endpoint := c.baseURL + "/api/v2/alerts"
	if len(params) > 0 {
		endpoint += "?" + params.Encode()
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}
	resp, err := c.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close() //nolint:errcheck // cleanup on exit

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response: %w", err)
	}
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status %d: %s", resp.StatusCode, string(body))
	}

	var alerts []Alert
	if err := json.Unmarshal(body, &alerts); err != nil {
		return nil, fmt.Errorf("failed to parse alerts: %w", err)
	}
	return alerts, nil
}
// ListSilences returns all silences known to Alertmanager, regardless of
// their state.
func (c *AlertmanagerClient) ListSilences(ctx context.Context) ([]Silence, error) {
	endpoint := c.baseURL + "/api/v2/silences"
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}
	resp, err := c.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close() //nolint:errcheck // cleanup on exit

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response: %w", err)
	}
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status %d: %s", resp.StatusCode, string(body))
	}

	var silences []Silence
	if err := json.Unmarshal(body, &silences); err != nil {
		return nil, fmt.Errorf("failed to parse silences: %w", err)
	}
	return silences, nil
}
// CreateSilence creates a new silence and returns the silence ID assigned
// by Alertmanager. The response body is expected to be a JSON object of
// the form {"silenceID": "..."}.
func (c *AlertmanagerClient) CreateSilence(ctx context.Context, silence Silence) (string, error) {
	payload, err := json.Marshal(silence)
	if err != nil {
		return "", fmt.Errorf("failed to marshal silence: %w", err)
	}
	endpoint := c.baseURL + "/api/v2/silences"
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(payload))
	if err != nil {
		return "", fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close() //nolint:errcheck // cleanup on exit

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("failed to read response: %w", err)
	}
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("unexpected status %d: %s", resp.StatusCode, string(body))
	}

	var result struct {
		SilenceID string `json:"silenceID"`
	}
	if err := json.Unmarshal(body, &result); err != nil {
		return "", fmt.Errorf("failed to parse response: %w", err)
	}
	return result.SilenceID, nil
}

View File

@@ -0,0 +1,175 @@
package monitoring
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"testing"
"time"
)
// TestAlertmanagerClient_ListAlerts verifies that an unfiltered ListAlerts
// call requests /api/v2/alerts and decodes the fingerprint, labels, and
// status state from the Alertmanager API v2 alert JSON shape.
func TestAlertmanagerClient_ListAlerts(t *testing.T) {
	// Fake Alertmanager that returns a single active alert fixture.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/api/v2/alerts" {
			t.Errorf("unexpected path: %s", r.URL.Path)
		}
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(`[
{
"annotations": {"summary": "Target is down"},
"endsAt": "2024-01-01T01:00:00Z",
"fingerprint": "abc123",
"receivers": [{"name": "default"}],
"startsAt": "2024-01-01T00:00:00Z",
"status": {"inhibitedBy": [], "silencedBy": [], "state": "active"},
"updatedAt": "2024-01-01T00:00:00Z",
"generatorURL": "http://prometheus:9090/graph",
"labels": {"alertname": "TargetDown", "severity": "critical", "instance": "node1:9100"}
}
]`))
	}))
	defer srv.Close()
	client := NewAlertmanagerClient(srv.URL)
	alerts, err := client.ListAlerts(context.Background(), AlertFilters{})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(alerts) != 1 {
		t.Fatalf("expected 1 alert, got %d", len(alerts))
	}
	if alerts[0].Fingerprint != "abc123" {
		t.Errorf("expected fingerprint=abc123, got %s", alerts[0].Fingerprint)
	}
	if alerts[0].Labels["alertname"] != "TargetDown" {
		t.Errorf("expected alertname=TargetDown, got %s", alerts[0].Labels["alertname"])
	}
	if alerts[0].Status.State != "active" {
		t.Errorf("expected state=active, got %s", alerts[0].Status.State)
	}
}
func TestAlertmanagerClient_ListAlertsWithFilters(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
q := r.URL.Query()
if q.Get("active") != "true" {
t.Errorf("expected active=true, got %s", q.Get("active"))
}
if q.Get("silenced") != "false" {
t.Errorf("expected silenced=false, got %s", q.Get("silenced"))
}
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`[]`))
}))
defer srv.Close()
client := NewAlertmanagerClient(srv.URL)
active := true
silenced := false
_, err := client.ListAlerts(context.Background(), AlertFilters{
Active: &active,
Silenced: &silenced,
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
}
// TestAlertmanagerClient_ListSilences verifies that ListSilences requests
// /api/v2/silences and decodes the silence ID and creator from the API v2
// silence JSON shape.
func TestAlertmanagerClient_ListSilences(t *testing.T) {
	// Fake Alertmanager that returns a single active silence fixture.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/api/v2/silences" {
			t.Errorf("unexpected path: %s", r.URL.Path)
		}
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(`[
{
"id": "silence-1",
"matchers": [{"name": "alertname", "value": "TargetDown", "isRegex": false}],
"startsAt": "2024-01-01T00:00:00Z",
"endsAt": "2024-01-01T02:00:00Z",
"createdBy": "admin",
"comment": "Maintenance window",
"status": {"state": "active"}
}
]`))
	}))
	defer srv.Close()
	client := NewAlertmanagerClient(srv.URL)
	silences, err := client.ListSilences(context.Background())
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(silences) != 1 {
		t.Fatalf("expected 1 silence, got %d", len(silences))
	}
	if silences[0].ID != "silence-1" {
		t.Errorf("expected id=silence-1, got %s", silences[0].ID)
	}
	if silences[0].CreatedBy != "admin" {
		t.Errorf("expected createdBy=admin, got %s", silences[0].CreatedBy)
	}
}
// TestAlertmanagerClient_CreateSilence verifies that CreateSilence POSTs a
// JSON-encoded silence to /api/v2/silences with the correct Content-Type
// and returns the silenceID from the response body.
func TestAlertmanagerClient_CreateSilence(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.Method != http.MethodPost {
			t.Errorf("expected POST, got %s", r.Method)
		}
		if r.URL.Path != "/api/v2/silences" {
			t.Errorf("unexpected path: %s", r.URL.Path)
		}
		if r.Header.Get("Content-Type") != "application/json" {
			t.Errorf("expected Content-Type=application/json, got %s", r.Header.Get("Content-Type"))
		}
		// Round-trip the request body to check the marshalled silence.
		var silence Silence
		if err := json.NewDecoder(r.Body).Decode(&silence); err != nil {
			t.Fatalf("failed to decode request body: %v", err)
		}
		if silence.CreatedBy != "admin" {
			t.Errorf("expected createdBy=admin, got %s", silence.CreatedBy)
		}
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(`{"silenceID": "new-silence-id"}`))
	}))
	defer srv.Close()
	client := NewAlertmanagerClient(srv.URL)
	id, err := client.CreateSilence(context.Background(), Silence{
		Matchers: []Matcher{
			{Name: "alertname", Value: "TargetDown", IsRegex: false},
		},
		StartsAt:  time.Now(),
		EndsAt:    time.Now().Add(2 * time.Hour),
		CreatedBy: "admin",
		Comment:   "Test silence",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if id != "new-silence-id" {
		t.Errorf("expected id=new-silence-id, got %s", id)
	}
}
// TestAlertmanagerClient_HTTPError verifies that a non-200 response is
// surfaced as an error instead of being parsed as alerts.
func TestAlertmanagerClient_HTTPError(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusInternalServerError)
		_, _ = w.Write([]byte("internal error"))
	}))
	defer srv.Close()

	client := NewAlertmanagerClient(srv.URL)
	if _, err := client.ListAlerts(context.Background(), AlertFilters{}); err == nil {
		t.Fatal("expected error, got nil")
	}
}

View File

@@ -0,0 +1,317 @@
package monitoring
import (
"fmt"
"sort"
"strings"
"time"
)
const maxRows = 100
// formatInstantVector formats instant vector results as a markdown table.
//
// Columns are the union of label names across all results (sorted,
// excluding __name__), plus a leading Metric column when the first result
// carries __name__, plus a trailing Value column. Output is capped at
// maxRows data rows with a truncation notice.
//
// Fix: whether to emit the Metric column is now decided once up front;
// the original re-evaluated results[0].Metric["__name__"] in three
// places, including inside the row loop.
func formatInstantVector(results []PromInstantVector) string {
	if len(results) == 0 {
		return "No results."
	}
	// Collect all label keys across results (excluding __name__)
	labelKeys := collectLabelKeys(results)
	// Loop-invariant: the Metric column follows the first result only.
	_, hasName := results[0].Metric["__name__"]

	var sb strings.Builder
	// Header
	sb.WriteString("| ")
	if hasName {
		sb.WriteString("Metric | ")
	}
	for _, key := range labelKeys {
		sb.WriteString(key)
		sb.WriteString(" | ")
	}
	sb.WriteString("Value |\n")
	// Separator
	sb.WriteString("| ")
	if hasName {
		sb.WriteString("--- | ")
	}
	for range labelKeys {
		sb.WriteString("--- | ")
	}
	sb.WriteString("--- |\n")
	// Rows
	truncated := false
	for i, r := range results {
		if i >= maxRows {
			truncated = true
			break
		}
		sb.WriteString("| ")
		if hasName {
			sb.WriteString(r.Metric["__name__"])
			sb.WriteString(" | ")
		}
		for _, key := range labelKeys {
			sb.WriteString(r.Metric[key])
			sb.WriteString(" | ")
		}
		// Value is at index 1 of the value tuple
		if len(r.Value) >= 2 {
			if v, ok := r.Value[1].(string); ok {
				sb.WriteString(v)
			}
		}
		sb.WriteString(" |\n")
	}
	if truncated {
		sb.WriteString(fmt.Sprintf("\n*Showing %d of %d results (truncated)*\n", maxRows, len(results)))
	}
	return sb.String()
}
// collectLabelKeys returns sorted label keys across all results, excluding __name__.
func collectLabelKeys(results []PromInstantVector) []string {
	seen := map[string]struct{}{}
	keys := []string{}
	for _, result := range results {
		for key := range result.Metric {
			if key == "__name__" {
				continue
			}
			if _, dup := seen[key]; dup {
				continue
			}
			seen[key] = struct{}{}
			keys = append(keys, key)
		}
	}
	sort.Strings(keys)
	return keys
}
// formatAlerts formats alerts as markdown, grouped by alertname in
// first-seen order; each group is capped at maxRows entries.
//
// Fix: annotations are now printed in sorted key order so the output is
// deterministic (Go map iteration order is randomized); labels were
// already sorted.
func formatAlerts(alerts []Alert) string {
	if len(alerts) == 0 {
		return "No alerts found."
	}
	// Group by alertname, remembering first-seen order for stable headings.
	groups := make(map[string][]Alert)
	var order []string
	for _, a := range alerts {
		name := a.Labels["alertname"]
		if _, exists := groups[name]; !exists {
			order = append(order, name)
		}
		groups[name] = append(groups[name], a)
	}
	var sb strings.Builder
	sb.WriteString(fmt.Sprintf("**%d alert(s)**\n\n", len(alerts)))
	for _, name := range order {
		group := groups[name]
		sb.WriteString(fmt.Sprintf("## %s (%d)\n\n", name, len(group)))
		for i, a := range group {
			if i >= maxRows {
				sb.WriteString(fmt.Sprintf("*... and %d more*\n", len(group)-maxRows))
				break
			}
			sb.WriteString(fmt.Sprintf("**State:** %s | **Severity:** %s\n", a.Status.State, a.Labels["severity"]))
			// Labels (excluding alertname and severity), sorted.
			var labels []string
			for k, v := range a.Labels {
				if k != "alertname" && k != "severity" {
					labels = append(labels, fmt.Sprintf("%s=%s", k, v))
				}
			}
			sort.Strings(labels)
			if len(labels) > 0 {
				sb.WriteString(fmt.Sprintf("**Labels:** %s\n", strings.Join(labels, ", ")))
			}
			// Annotations in sorted key order for deterministic output.
			annKeys := make([]string, 0, len(a.Annotations))
			for k := range a.Annotations {
				annKeys = append(annKeys, k)
			}
			sort.Strings(annKeys)
			for _, k := range annKeys {
				sb.WriteString(fmt.Sprintf("**%s:** %s\n", k, a.Annotations[k]))
			}
			sb.WriteString(fmt.Sprintf("**Fingerprint:** %s\n", a.Fingerprint))
			sb.WriteString(fmt.Sprintf("**Started:** %s\n", a.StartsAt.Format(time.RFC3339)))
			if len(a.Status.SilencedBy) > 0 {
				sb.WriteString(fmt.Sprintf("**Silenced by:** %s\n", strings.Join(a.Status.SilencedBy, ", ")))
			}
			if len(a.Status.InhibitedBy) > 0 {
				sb.WriteString(fmt.Sprintf("**Inhibited by:** %s\n", strings.Join(a.Status.InhibitedBy, ", ")))
			}
			sb.WriteString("\n")
		}
	}
	return sb.String()
}
// formatTargets renders active scrape targets as markdown tables, one table
// per job, preceded by an overall health summary.
func formatTargets(targets *PromTargetsData) string {
	if targets == nil || len(targets.ActiveTargets) == 0 {
		return "No active targets."
	}
	// Bucket targets by job and tally health states in a single pass,
	// remembering the order in which jobs first appear.
	byJob := make(map[string][]PromTarget)
	healthCounts := make(map[string]int)
	var jobs []string
	for _, target := range targets.ActiveTargets {
		job := target.Labels["job"]
		if _, seen := byJob[job]; !seen {
			jobs = append(jobs, job)
		}
		byJob[job] = append(byJob[job], target)
		healthCounts[target.Health]++
	}
	sort.Strings(jobs)
	var out strings.Builder
	out.WriteString(fmt.Sprintf("**%d active target(s)**\n\n", len(targets.ActiveTargets)))
	summary := make([]string, 0, len(healthCounts))
	for health, count := range healthCounts {
		summary = append(summary, fmt.Sprintf("%s: %d", health, count))
	}
	sort.Strings(summary)
	out.WriteString(fmt.Sprintf("**Health summary:** %s\n\n", strings.Join(summary, ", ")))
	for _, job := range jobs {
		group := byJob[job]
		out.WriteString(fmt.Sprintf("## %s (%d targets)\n\n", job, len(group)))
		out.WriteString("| Instance | Health | Last Scrape | Duration | Error |\n")
		out.WriteString("| --- | --- | --- | --- | --- |\n")
		for _, target := range group {
			lastErr := target.LastError
			if lastErr == "" {
				lastErr = "-"
			}
			out.WriteString(fmt.Sprintf("| %s | %s | %s | %s | %s |\n",
				target.Labels["instance"],
				target.Health,
				target.LastScrape.Format("15:04:05"),
				fmt.Sprintf("%.3fs", target.LastScrapeDuration),
				lastErr))
		}
		out.WriteString("\n")
	}
	return out.String()
}
// formatSilences renders silences as markdown, one section per silence.
func formatSilences(silences []Silence) string {
	if len(silences) == 0 {
		return "No silences found."
	}
	var out strings.Builder
	out.WriteString(fmt.Sprintf("**%d silence(s)**\n\n", len(silences)))
	for _, sil := range silences {
		state := "unknown"
		if sil.Status != nil {
			state = sil.Status.State
		}
		out.WriteString(fmt.Sprintf("## Silence %s [%s]\n\n", sil.ID, state))
		// Render matchers in expression form; the operator follows from the
		// regex flag and the (optional) equality flag: =, !=, =~, or !~.
		matchers := make([]string, 0, len(sil.Matchers))
		for _, m := range sil.Matchers {
			negated := m.IsEqual != nil && !*m.IsEqual
			var op string
			switch {
			case m.IsRegex && negated:
				op = "!~"
			case m.IsRegex:
				op = "=~"
			case negated:
				op = "!="
			default:
				op = "="
			}
			matchers = append(matchers, fmt.Sprintf("%s%s%s", m.Name, op, m.Value))
		}
		out.WriteString(fmt.Sprintf("**Matchers:** %s\n", strings.Join(matchers, ", ")))
		out.WriteString(fmt.Sprintf("**Created by:** %s\n", sil.CreatedBy))
		out.WriteString(fmt.Sprintf("**Comment:** %s\n", sil.Comment))
		out.WriteString(fmt.Sprintf("**Starts:** %s\n", sil.StartsAt.Format(time.RFC3339)))
		out.WriteString(fmt.Sprintf("**Ends:** %s\n", sil.EndsAt.Format(time.RFC3339)))
		out.WriteString("\n")
	}
	return out.String()
}
// formatMetricSearch renders metric search results as a markdown table of
// name, type, and help text, capped at maxRows rows.
func formatMetricSearch(names []string, metadata map[string][]PromMetadata) string {
	if len(names) == 0 {
		return "No metrics found matching the search."
	}
	var sb strings.Builder
	sb.WriteString(fmt.Sprintf("**%d metric(s) found**\n\n", len(names)))
	sb.WriteString("| Metric | Type | Help |\n")
	sb.WriteString("| --- | --- | --- |\n")
	truncated := false
	for i, name := range names {
		if i >= maxRows {
			truncated = true
			break
		}
		metaType := ""
		help := ""
		if metas, ok := metadata[name]; ok && len(metas) > 0 {
			metaType = metas[0].Type
			// Keep the table well-formed: flatten newlines and escape pipes,
			// then truncate on rune boundaries (byte slicing could split a
			// multi-byte UTF-8 character mid-sequence).
			help = strings.NewReplacer("\n", " ", "|", "\\|").Replace(metas[0].Help)
			if runes := []rune(help); len(runes) > 100 {
				help = string(runes[:100]) + "..."
			}
		}
		sb.WriteString(fmt.Sprintf("| %s | %s | %s |\n", name, metaType, help))
	}
	if truncated {
		sb.WriteString(fmt.Sprintf("\n*Showing %d of %d metrics (truncated)*\n", maxRows, len(names)))
	}
	return sb.String()
}
// formatMetricMetadata renders every metadata entry for a single metric.
func formatMetricMetadata(name string, metas []PromMetadata) string {
	if len(metas) == 0 {
		return fmt.Sprintf("No metadata found for metric '%s'.", name)
	}
	var out strings.Builder
	out.WriteString(fmt.Sprintf("# %s\n\n", name))
	for _, meta := range metas {
		out.WriteString(fmt.Sprintf("**Type:** %s\n", meta.Type))
		if meta.Help != "" {
			out.WriteString(fmt.Sprintf("**Help:** %s\n", meta.Help))
		}
		if meta.Unit != "" {
			out.WriteString(fmt.Sprintf("**Unit:** %s\n", meta.Unit))
		}
	}
	return out.String()
}

View File

@@ -0,0 +1,434 @@
package monitoring
import (
	"context"
	"encoding/json"
	"fmt"
	"strconv"
	"strings"
	"time"

	"git.t-juice.club/torjus/labmcp/internal/mcp"
)
// RegisterHandlers registers all monitoring tool handlers on the MCP server.
func RegisterHandlers(server *mcp.Server, prom *PrometheusClient, am *AlertmanagerClient) {
	registrations := []struct {
		tool    mcp.Tool
		handler mcp.ToolHandler
	}{
		{listAlertsTool(), makeListAlertsHandler(am)},
		{getAlertTool(), makeGetAlertHandler(am)},
		{searchMetricsTool(), makeSearchMetricsHandler(prom)},
		{getMetricMetadataTool(), makeGetMetricMetadataHandler(prom)},
		{queryTool(), makeQueryHandler(prom)},
		{listTargetsTool(), makeListTargetsHandler(prom)},
		{listSilencesTool(), makeListSilencesHandler(am)},
		{createSilenceTool(), makeCreateSilenceHandler(am)},
	}
	for _, reg := range registrations {
		server.RegisterTool(reg.tool, reg.handler)
	}
}
// Tool definitions
// listAlertsTool describes the list_alerts tool and its optional filters.
func listAlertsTool() mcp.Tool {
	props := map[string]mcp.Property{
		"state": {
			Type:        "string",
			Description: "Filter by alert state: 'active', 'suppressed', or 'unprocessed'",
			Enum:        []string{"active", "suppressed", "unprocessed"},
		},
		"severity": {
			Type:        "string",
			Description: "Filter by severity label (e.g., 'critical', 'warning')",
		},
		"receiver": {
			Type:        "string",
			Description: "Filter by receiver name",
		},
	}
	return mcp.Tool{
		Name:        "list_alerts",
		Description: "List alerts from Alertmanager with optional filters",
		InputSchema: mcp.InputSchema{Type: "object", Properties: props},
	}
}
// getAlertTool describes the get_alert tool; fingerprint is mandatory.
func getAlertTool() mcp.Tool {
	schema := mcp.InputSchema{
		Type: "object",
		Properties: map[string]mcp.Property{
			"fingerprint": {
				Type:        "string",
				Description: "Alert fingerprint identifier",
			},
		},
		Required: []string{"fingerprint"},
	}
	return mcp.Tool{
		Name:        "get_alert",
		Description: "Get full details for a specific alert by fingerprint",
		InputSchema: schema,
	}
}
// searchMetricsTool describes the search_metrics tool; both parameters are
// optional.
func searchMetricsTool() mcp.Tool {
	schema := mcp.InputSchema{
		Type: "object",
		Properties: map[string]mcp.Property{
			"query": {
				Type:        "string",
				Description: "Substring to filter metric names (e.g., 'cpu', 'memory', 'node_'). Empty returns all metrics.",
			},
			"limit": {
				Type:        "integer",
				Description: "Maximum number of results (default: 50)",
				Default:     50,
			},
		},
	}
	return mcp.Tool{
		Name:        "search_metrics",
		Description: "Search Prometheus metric names with optional substring filter, enriched with metadata (type, help text)",
		InputSchema: schema,
	}
}
// getMetricMetadataTool describes the get_metric_metadata tool; metric is
// mandatory.
func getMetricMetadataTool() mcp.Tool {
	schema := mcp.InputSchema{
		Type: "object",
		Properties: map[string]mcp.Property{
			"metric": {
				Type:        "string",
				Description: "Metric name (e.g., 'node_cpu_seconds_total')",
			},
		},
		Required: []string{"metric"},
	}
	return mcp.Tool{
		Name:        "get_metric_metadata",
		Description: "Get type, help text, and unit for a specific Prometheus metric",
		InputSchema: schema,
	}
}
// queryTool describes the query tool for instant PromQL evaluation.
func queryTool() mcp.Tool {
	schema := mcp.InputSchema{
		Type: "object",
		Properties: map[string]mcp.Property{
			"promql": {
				Type:        "string",
				Description: "PromQL expression to evaluate (e.g., 'up', 'rate(http_requests_total[5m])', 'avg_over_time(node_load1[1h])')",
			},
		},
		Required: []string{"promql"},
	}
	return mcp.Tool{
		Name:        "query",
		Description: "Execute an instant PromQL query against Prometheus. Supports aggregations like avg_over_time(metric[1h]), rate(), sum(), etc.",
		InputSchema: schema,
	}
}
// listTargetsTool describes the list_targets tool; it takes no parameters.
func listTargetsTool() mcp.Tool {
	schema := mcp.InputSchema{
		Type:       "object",
		Properties: map[string]mcp.Property{},
	}
	return mcp.Tool{
		Name:        "list_targets",
		Description: "List Prometheus scrape targets with health status, grouped by job",
		InputSchema: schema,
	}
}
// listSilencesTool describes the list_silences tool; it takes no parameters.
func listSilencesTool() mcp.Tool {
	schema := mcp.InputSchema{
		Type:       "object",
		Properties: map[string]mcp.Property{},
	}
	return mcp.Tool{
		Name:        "list_silences",
		Description: "List active and pending alert silences from Alertmanager",
		InputSchema: schema,
	}
}
// createSilenceTool describes the create_silence tool; all four parameters
// are mandatory.
func createSilenceTool() mcp.Tool {
	props := map[string]mcp.Property{
		"matchers": {
			Type:        "string",
			Description: `JSON array of matchers, e.g. [{"name":"alertname","value":"TargetDown","isRegex":false}]`,
		},
		"duration": {
			Type:        "string",
			Description: "Silence duration in Go duration format (e.g., '2h', '30m', '1h30m')",
		},
		"author": {
			Type:        "string",
			Description: "Author of the silence",
		},
		"comment": {
			Type:        "string",
			Description: "Reason for the silence",
		},
	}
	return mcp.Tool{
		Name:        "create_silence",
		Description: `Create a new silence in Alertmanager. IMPORTANT: Always confirm with the user before creating a silence, showing them the matchers, duration, and reason.`,
		InputSchema: mcp.InputSchema{
			Type:       "object",
			Properties: props,
			Required:   []string{"matchers", "duration", "author", "comment"},
		},
	}
}
// Handler constructors
// makeListAlertsHandler returns the list_alerts handler, which translates the
// tool arguments into Alertmanager alert filters.
func makeListAlertsHandler(am *AlertmanagerClient) mcp.ToolHandler {
	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		filters := AlertFilters{}
		if state, ok := args["state"].(string); ok && state != "" {
			switch state {
			case "active":
				// Active only: exclude silenced and inhibited alerts.
				active := true
				filters.Active = &active
				silenced := false
				filters.Silenced = &silenced
				inhibited := false
				filters.Inhibited = &inhibited
			case "suppressed":
				active := false
				filters.Active = &active
			case "unprocessed":
				unprocessed := true
				filters.Unprocessed = &unprocessed
			}
		}
		if severity, ok := args["severity"].(string); ok && severity != "" {
			// Quote the value so quotes or backslashes in the input cannot
			// break out of the matcher expression.
			filters.Filter = append(filters.Filter, "severity="+strconv.Quote(severity))
		}
		if receiver, ok := args["receiver"].(string); ok && receiver != "" {
			filters.Receiver = receiver
		}
		alerts, err := am.ListAlerts(ctx, filters)
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to list alerts: %w", err)), nil
		}
		return mcp.CallToolResult{
			Content: []mcp.Content{mcp.TextContent(formatAlerts(alerts))},
		}, nil
	}
}
// makeGetAlertHandler returns the get_alert handler. It fetches the full
// alert list and scans it for the requested fingerprint.
func makeGetAlertHandler(am *AlertmanagerClient) mcp.ToolHandler {
	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		fingerprint, _ := args["fingerprint"].(string)
		if fingerprint == "" {
			return mcp.ErrorContent(fmt.Errorf("fingerprint is required")), nil
		}
		alerts, err := am.ListAlerts(ctx, AlertFilters{})
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to fetch alerts: %w", err)), nil
		}
		for _, alert := range alerts {
			if alert.Fingerprint != fingerprint {
				continue
			}
			return mcp.CallToolResult{
				Content: []mcp.Content{mcp.TextContent(formatAlerts([]Alert{alert}))},
			}, nil
		}
		return mcp.ErrorContent(fmt.Errorf("alert with fingerprint '%s' not found", fingerprint)), nil
	}
}
// makeSearchMetricsHandler returns the search_metrics handler, which filters
// all known metric names by substring and enriches matches with metadata.
func makeSearchMetricsHandler(prom *PrometheusClient) mcp.ToolHandler {
	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		query, _ := args["query"].(string)
		limit := 50
		if raw, ok := args["limit"].(float64); ok && raw > 0 {
			limit = int(raw)
		}
		allNames, err := prom.LabelValues(ctx, "__name__")
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to fetch metric names: %w", err)), nil
		}
		// Case-insensitive substring match; an empty query matches everything.
		needle := strings.ToLower(query)
		var matched []string
		for _, name := range allNames {
			if query != "" && !strings.Contains(strings.ToLower(name), needle) {
				continue
			}
			matched = append(matched, name)
			if len(matched) >= limit {
				break
			}
		}
		// Metadata enrichment is best-effort: on failure, render names only.
		metadata, err := prom.Metadata(ctx, "")
		if err != nil {
			metadata = nil
		}
		return mcp.CallToolResult{
			Content: []mcp.Content{mcp.TextContent(formatMetricSearch(matched, metadata))},
		}, nil
	}
}
// makeGetMetricMetadataHandler returns the get_metric_metadata handler.
func makeGetMetricMetadataHandler(prom *PrometheusClient) mcp.ToolHandler {
	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		metricName, _ := args["metric"].(string)
		if metricName == "" {
			return mcp.ErrorContent(fmt.Errorf("metric is required")), nil
		}
		metadata, err := prom.Metadata(ctx, metricName)
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to fetch metadata: %w", err)), nil
		}
		return mcp.CallToolResult{
			Content: []mcp.Content{mcp.TextContent(formatMetricMetadata(metricName, metadata[metricName]))},
		}, nil
	}
}
// makeQueryHandler returns the query handler, which evaluates an instant
// PromQL expression and renders the result as markdown.
func makeQueryHandler(prom *PrometheusClient) mcp.ToolHandler {
	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		promql, _ := args["promql"].(string)
		if promql == "" {
			return mcp.ErrorContent(fmt.Errorf("promql is required")), nil
		}
		// A zero timestamp makes the client omit the time parameter.
		data, err := prom.Query(ctx, promql, time.Time{})
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("query failed: %w", err)), nil
		}
		var rendered string
		switch data.ResultType {
		case "vector":
			rendered = formatInstantVector(data.Result)
		case "scalar":
			if len(data.Result) > 0 && len(data.Result[0].Value) >= 2 {
				if val, ok := data.Result[0].Value[1].(string); ok {
					rendered = fmt.Sprintf("**Scalar result:** %s", val)
				}
			}
			if rendered == "" {
				rendered = "Scalar query returned no value."
			}
		default:
			rendered = fmt.Sprintf("Result type: %s\n\n%s", data.ResultType, formatInstantVector(data.Result))
		}
		return mcp.CallToolResult{
			Content: []mcp.Content{mcp.TextContent(rendered)},
		}, nil
	}
}
// makeListTargetsHandler returns the list_targets handler.
func makeListTargetsHandler(prom *PrometheusClient) mcp.ToolHandler {
	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		targets, err := prom.Targets(ctx)
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to fetch targets: %w", err)), nil
		}
		content := []mcp.Content{mcp.TextContent(formatTargets(targets))}
		return mcp.CallToolResult{Content: content}, nil
	}
}
// makeListSilencesHandler returns the list_silences handler. Only silences
// whose state is "active" or "pending" are shown; expired ones are dropped.
func makeListSilencesHandler(am *AlertmanagerClient) mcp.ToolHandler {
	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		silences, err := am.ListSilences(ctx)
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to fetch silences: %w", err)), nil
		}
		relevant := make([]Silence, 0, len(silences))
		for _, sil := range silences {
			if sil.Status == nil {
				continue
			}
			switch sil.Status.State {
			case "active", "pending":
				relevant = append(relevant, sil)
			}
		}
		return mcp.CallToolResult{
			Content: []mcp.Content{mcp.TextContent(formatSilences(relevant))},
		}, nil
	}
}
// makeCreateSilenceHandler returns the create_silence handler. It validates
// all four arguments, creates the silence via Alertmanager, and reports the
// new silence's ID and expiry.
func makeCreateSilenceHandler(am *AlertmanagerClient) mcp.ToolHandler {
	return func(ctx context.Context, args map[string]interface{}) (mcp.CallToolResult, error) {
		matchersJSON, _ := args["matchers"].(string)
		if matchersJSON == "" {
			return mcp.ErrorContent(fmt.Errorf("matchers is required")), nil
		}
		durationStr, _ := args["duration"].(string)
		if durationStr == "" {
			return mcp.ErrorContent(fmt.Errorf("duration is required")), nil
		}
		author, _ := args["author"].(string)
		if author == "" {
			return mcp.ErrorContent(fmt.Errorf("author is required")), nil
		}
		comment, _ := args["comment"].(string)
		if comment == "" {
			return mcp.ErrorContent(fmt.Errorf("comment is required")), nil
		}
		// Parse and validate matchers; an empty list would not constrain the
		// silence to any alert, so reject it explicitly.
		var matchers []Matcher
		if err := parseJSON(matchersJSON, &matchers); err != nil {
			return mcp.ErrorContent(fmt.Errorf("invalid matchers JSON: %w", err)), nil
		}
		if len(matchers) == 0 {
			return mcp.ErrorContent(fmt.Errorf("matchers must contain at least one matcher")), nil
		}
		// Parse and validate duration; zero or negative would produce a
		// silence that is already expired at creation time.
		duration, err := time.ParseDuration(durationStr)
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("invalid duration: %w", err)), nil
		}
		if duration <= 0 {
			return mcp.ErrorContent(fmt.Errorf("duration must be positive, got %s", durationStr)), nil
		}
		now := time.Now()
		silence := Silence{
			Matchers:  matchers,
			StartsAt:  now,
			EndsAt:    now.Add(duration),
			CreatedBy: author,
			Comment:   comment,
		}
		id, err := am.CreateSilence(ctx, silence)
		if err != nil {
			return mcp.ErrorContent(fmt.Errorf("failed to create silence: %w", err)), nil
		}
		var sb strings.Builder
		sb.WriteString("Silence created successfully.\n\n")
		sb.WriteString(fmt.Sprintf("**ID:** %s\n", id))
		sb.WriteString(fmt.Sprintf("**Expires:** %s\n", silence.EndsAt.Format(time.RFC3339)))
		sb.WriteString(fmt.Sprintf("**Author:** %s\n", author))
		sb.WriteString(fmt.Sprintf("**Comment:** %s\n", comment))
		return mcp.CallToolResult{
			Content: []mcp.Content{mcp.TextContent(sb.String())},
		}, nil
	}
}
// parseJSON unmarshals the JSON document in raw into target.
func parseJSON(raw string, target interface{}) error {
	return json.Unmarshal([]byte(raw), target)
}

View File

@@ -0,0 +1,378 @@
package monitoring
import (
"context"
"encoding/json"
"io"
"log"
"net/http"
"net/http/httptest"
"strings"
"testing"
"git.t-juice.club/torjus/labmcp/internal/mcp"
)
// setupTestServer builds an MCP server wired to httptest-backed Prometheus
// and Alertmanager endpoints. The returned cleanup closes both test servers.
func setupTestServer(t *testing.T, promHandler, amHandler http.HandlerFunc) (*mcp.Server, func()) {
	t.Helper()
	promBackend := httptest.NewServer(promHandler)
	amBackend := httptest.NewServer(amHandler)
	server := mcp.NewGenericServer(log.New(io.Discard, "", 0), mcp.DefaultMonitoringConfig())
	RegisterHandlers(server, NewPrometheusClient(promBackend.URL), NewAlertmanagerClient(amBackend.URL))
	return server, func() {
		promBackend.Close()
		amBackend.Close()
	}
}
// TestHandler_ListAlerts verifies that list_alerts renders an alert served by
// the Alertmanager backend, including its alertname and the alert count.
func TestHandler_ListAlerts(t *testing.T) {
	server, cleanup := setupTestServer(t,
		nil, // Prometheus backend is unused by this tool
		func(w http.ResponseWriter, r *http.Request) {
			// Serve one active alert in Alertmanager v2 list format.
			w.Header().Set("Content-Type", "application/json")
			_, _ = w.Write([]byte(`[
				{
					"annotations": {"summary": "Node is down"},
					"endsAt": "2024-01-01T01:00:00Z",
					"fingerprint": "fp1",
					"receivers": [{"name": "default"}],
					"startsAt": "2024-01-01T00:00:00Z",
					"status": {"inhibitedBy": [], "silencedBy": [], "state": "active"},
					"updatedAt": "2024-01-01T00:00:00Z",
					"generatorURL": "",
					"labels": {"alertname": "NodeDown", "severity": "critical"}
				}
			]`))
		},
	)
	defer cleanup()
	result := callTool(t, server, "list_alerts", map[string]interface{}{})
	if result.IsError {
		t.Fatalf("unexpected error: %s", result.Content[0].Text)
	}
	if !strings.Contains(result.Content[0].Text, "NodeDown") {
		t.Errorf("expected output to contain 'NodeDown', got: %s", result.Content[0].Text)
	}
	if !strings.Contains(result.Content[0].Text, "1 alert") {
		t.Errorf("expected output to contain '1 alert', got: %s", result.Content[0].Text)
	}
}
// TestHandler_GetAlert verifies that get_alert selects the alert whose
// fingerprint matches the argument out of a multi-alert response.
func TestHandler_GetAlert(t *testing.T) {
	server, cleanup := setupTestServer(t,
		nil, // Prometheus backend is unused by this tool
		func(w http.ResponseWriter, r *http.Request) {
			// Serve two alerts; only "target-fp" should be returned.
			w.Header().Set("Content-Type", "application/json")
			_, _ = w.Write([]byte(`[
				{
					"annotations": {"summary": "Found it"},
					"endsAt": "2024-01-01T01:00:00Z",
					"fingerprint": "target-fp",
					"receivers": [{"name": "default"}],
					"startsAt": "2024-01-01T00:00:00Z",
					"status": {"inhibitedBy": [], "silencedBy": [], "state": "active"},
					"updatedAt": "2024-01-01T00:00:00Z",
					"generatorURL": "",
					"labels": {"alertname": "TestAlert", "severity": "warning"}
				},
				{
					"annotations": {},
					"endsAt": "2024-01-01T01:00:00Z",
					"fingerprint": "other-fp",
					"receivers": [{"name": "default"}],
					"startsAt": "2024-01-01T00:00:00Z",
					"status": {"inhibitedBy": [], "silencedBy": [], "state": "active"},
					"updatedAt": "2024-01-01T00:00:00Z",
					"generatorURL": "",
					"labels": {"alertname": "OtherAlert", "severity": "info"}
				}
			]`))
		},
	)
	defer cleanup()
	result := callTool(t, server, "get_alert", map[string]interface{}{
		"fingerprint": "target-fp",
	})
	if result.IsError {
		t.Fatalf("unexpected error: %s", result.Content[0].Text)
	}
	if !strings.Contains(result.Content[0].Text, "TestAlert") {
		t.Errorf("expected output to contain 'TestAlert', got: %s", result.Content[0].Text)
	}
}
// TestHandler_GetAlertNotFound verifies that get_alert produces an error
// result when no alert matches the requested fingerprint.
func TestHandler_GetAlertNotFound(t *testing.T) {
	server, cleanup := setupTestServer(t,
		nil, // Prometheus backend is unused by this tool
		func(w http.ResponseWriter, r *http.Request) {
			// Alertmanager returns an empty alert list.
			w.Header().Set("Content-Type", "application/json")
			_, _ = w.Write([]byte(`[]`))
		},
	)
	defer cleanup()
	result := callTool(t, server, "get_alert", map[string]interface{}{
		"fingerprint": "nonexistent",
	})
	if !result.IsError {
		t.Error("expected error result for nonexistent fingerprint")
	}
}
// TestHandler_Query verifies that the query tool hits /api/v1/query and
// renders the returned instant vector, including its label values.
func TestHandler_Query(t *testing.T) {
	server, cleanup := setupTestServer(t,
		func(w http.ResponseWriter, r *http.Request) {
			if r.URL.Path != "/api/v1/query" {
				http.NotFound(w, r)
				return
			}
			// One-sample vector result for the expression "up".
			w.Header().Set("Content-Type", "application/json")
			_, _ = w.Write([]byte(`{
				"status": "success",
				"data": {
					"resultType": "vector",
					"result": [
						{
							"metric": {"__name__": "up", "job": "node"},
							"value": [1234567890, "1"]
						}
					]
				}
			}`))
		},
		nil, // Alertmanager backend is unused by this tool
	)
	defer cleanup()
	result := callTool(t, server, "query", map[string]interface{}{
		"promql": "up",
	})
	if result.IsError {
		t.Fatalf("unexpected error: %s", result.Content[0].Text)
	}
	if !strings.Contains(result.Content[0].Text, "node") {
		t.Errorf("expected output to contain 'node', got: %s", result.Content[0].Text)
	}
}
// TestHandler_ListTargets verifies that list_targets hits /api/v1/targets and
// renders the active target's job name.
func TestHandler_ListTargets(t *testing.T) {
	server, cleanup := setupTestServer(t,
		func(w http.ResponseWriter, r *http.Request) {
			if r.URL.Path != "/api/v1/targets" {
				http.NotFound(w, r)
				return
			}
			// One healthy active target, no dropped targets.
			w.Header().Set("Content-Type", "application/json")
			_, _ = w.Write([]byte(`{
				"status": "success",
				"data": {
					"activeTargets": [
						{
							"labels": {"instance": "localhost:9090", "job": "prometheus"},
							"scrapePool": "prometheus",
							"scrapeUrl": "http://localhost:9090/metrics",
							"globalUrl": "http://localhost:9090/metrics",
							"lastError": "",
							"lastScrape": "2024-01-01T00:00:00Z",
							"lastScrapeDuration": 0.015,
							"health": "up",
							"scrapeInterval": "15s",
							"scrapeTimeout": "10s"
						}
					],
					"droppedTargets": []
				}
			}`))
		},
		nil, // Alertmanager backend is unused by this tool
	)
	defer cleanup()
	result := callTool(t, server, "list_targets", map[string]interface{}{})
	if result.IsError {
		t.Fatalf("unexpected error: %s", result.Content[0].Text)
	}
	if !strings.Contains(result.Content[0].Text, "prometheus") {
		t.Errorf("expected output to contain 'prometheus', got: %s", result.Content[0].Text)
	}
}
// TestHandler_SearchMetrics verifies that search_metrics filters metric names
// by substring and enriches matches with metadata, excluding non-matches.
func TestHandler_SearchMetrics(t *testing.T) {
	server, cleanup := setupTestServer(t,
		func(w http.ResponseWriter, r *http.Request) {
			w.Header().Set("Content-Type", "application/json")
			// The handler first lists all metric names, then fetches metadata.
			switch r.URL.Path {
			case "/api/v1/label/__name__/values":
				_, _ = w.Write([]byte(`{
					"status": "success",
					"data": ["node_cpu_seconds_total", "node_memory_MemTotal_bytes", "up"]
				}`))
			case "/api/v1/metadata":
				_, _ = w.Write([]byte(`{
					"status": "success",
					"data": {
						"node_cpu_seconds_total": [{"type": "counter", "help": "CPU time", "unit": ""}],
						"node_memory_MemTotal_bytes": [{"type": "gauge", "help": "Total memory", "unit": "bytes"}]
					}
				}`))
			default:
				http.NotFound(w, r)
			}
		},
		nil, // Alertmanager backend is unused by this tool
	)
	defer cleanup()
	result := callTool(t, server, "search_metrics", map[string]interface{}{
		"query": "node",
	})
	if result.IsError {
		t.Fatalf("unexpected error: %s", result.Content[0].Text)
	}
	if !strings.Contains(result.Content[0].Text, "node_cpu") {
		t.Errorf("expected output to contain 'node_cpu', got: %s", result.Content[0].Text)
	}
	// "up" should be filtered out since it doesn't match "node"
	if strings.Contains(result.Content[0].Text, "| up |") {
		t.Errorf("expected 'up' to be filtered out, got: %s", result.Content[0].Text)
	}
}
// TestHandler_ListSilences verifies that list_silences shows active silences
// and filters out expired ones.
func TestHandler_ListSilences(t *testing.T) {
	server, cleanup := setupTestServer(t,
		nil, // Prometheus backend is unused by this tool
		func(w http.ResponseWriter, r *http.Request) {
			if r.URL.Path != "/api/v2/silences" {
				http.NotFound(w, r)
				return
			}
			// One active silence (s1) and one expired silence (s2).
			w.Header().Set("Content-Type", "application/json")
			_, _ = w.Write([]byte(`[
				{
					"id": "s1",
					"matchers": [{"name": "alertname", "value": "Test", "isRegex": false}],
					"startsAt": "2024-01-01T00:00:00Z",
					"endsAt": "2024-01-01T02:00:00Z",
					"createdBy": "admin",
					"comment": "Testing",
					"status": {"state": "active"}
				},
				{
					"id": "s2",
					"matchers": [{"name": "job", "value": "node", "isRegex": false}],
					"startsAt": "2023-01-01T00:00:00Z",
					"endsAt": "2023-01-01T02:00:00Z",
					"createdBy": "admin",
					"comment": "Old",
					"status": {"state": "expired"}
				}
			]`))
		},
	)
	defer cleanup()
	result := callTool(t, server, "list_silences", map[string]interface{}{})
	if result.IsError {
		t.Fatalf("unexpected error: %s", result.Content[0].Text)
	}
	// Should show active silence but filter out expired
	if !strings.Contains(result.Content[0].Text, "s1") {
		t.Errorf("expected active silence s1 in output, got: %s", result.Content[0].Text)
	}
	if strings.Contains(result.Content[0].Text, "s2") {
		t.Errorf("expected expired silence s2 to be filtered out, got: %s", result.Content[0].Text)
	}
}
// TestHandler_ToolCount verifies that RegisterHandlers exposes exactly the 8
// monitoring tools through tools/list.
func TestHandler_ToolCount(t *testing.T) {
	server, cleanup := setupTestServer(t,
		func(w http.ResponseWriter, r *http.Request) {},
		func(w http.ResponseWriter, r *http.Request) {},
	)
	defer cleanup()
	resp := server.HandleRequest(context.Background(), &mcp.Request{
		JSONRPC: "2.0",
		ID:      1,
		Method:  "tools/list",
	})
	if resp == nil {
		t.Fatal("expected response, got nil")
	}
	if resp.Error != nil {
		t.Fatalf("unexpected error: %s", resp.Error.Message)
	}
	// Round-trip the generic result through JSON to decode it as a typed
	// ListToolsResult.
	raw, err := json.Marshal(resp.Result)
	if err != nil {
		t.Fatalf("failed to marshal result: %v", err)
	}
	var listResult mcp.ListToolsResult
	if err := json.Unmarshal(raw, &listResult); err != nil {
		t.Fatalf("failed to unmarshal result: %v", err)
	}
	if got := len(listResult.Tools); got != 8 {
		t.Errorf("expected 8 tools, got %d", got)
		for _, tool := range listResult.Tools {
			t.Logf(" tool: %s", tool.Name)
		}
	}
}
// callTool invokes a tool through the server's JSON-RPC interface and decodes
// the result, failing the test on any transport-level error.
func callTool(t *testing.T, server *mcp.Server, name string, args map[string]interface{}) mcp.CallToolResult {
	t.Helper()
	paramsJSON, err := json.Marshal(mcp.CallToolParams{Name: name, Arguments: args})
	if err != nil {
		t.Fatalf("failed to marshal params: %v", err)
	}
	resp := server.HandleRequest(context.Background(), &mcp.Request{
		JSONRPC: "2.0",
		ID:      1,
		Method:  "tools/call",
		Params:  paramsJSON,
	})
	if resp == nil {
		t.Fatal("expected response, got nil")
	}
	if resp.Error != nil {
		t.Fatalf("JSON-RPC error: %s", resp.Error.Message)
	}
	// Round-trip the generic result through JSON to decode it as a typed
	// CallToolResult.
	raw, err := json.Marshal(resp.Result)
	if err != nil {
		t.Fatalf("failed to marshal result: %v", err)
	}
	var result mcp.CallToolResult
	if err := json.Unmarshal(raw, &result); err != nil {
		t.Fatalf("failed to unmarshal result: %v", err)
	}
	return result
}

View File

@@ -0,0 +1,135 @@
package monitoring
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"
)
// PrometheusClient is an HTTP client for the Prometheus API.
type PrometheusClient struct {
	baseURL    string       // base URL with any trailing slash stripped
	httpClient *http.Client // client used for all API requests (30s timeout)
}
// NewPrometheusClient creates a new Prometheus API client for the given base
// URL. A trailing slash is stripped so API paths can be appended directly.
func NewPrometheusClient(baseURL string) *PrometheusClient {
	client := &http.Client{Timeout: 30 * time.Second}
	return &PrometheusClient{
		baseURL:    strings.TrimRight(baseURL, "/"),
		httpClient: client,
	}
}
// Query executes an instant PromQL query. If ts is zero, the time parameter
// is omitted and the current time is used.
func (c *PrometheusClient) Query(ctx context.Context, promql string, ts time.Time) (*PromQueryData, error) {
	params := url.Values{"query": []string{promql}}
	if !ts.IsZero() {
		params.Set("time", fmt.Sprintf("%d", ts.Unix()))
	}
	body, err := c.get(ctx, "/api/v1/query", params)
	if err != nil {
		return nil, fmt.Errorf("query failed: %w", err)
	}
	data := new(PromQueryData)
	if err := json.Unmarshal(body, data); err != nil {
		return nil, fmt.Errorf("failed to parse query data: %w", err)
	}
	return data, nil
}
// LabelValues returns all values for a given label name.
func (c *PrometheusClient) LabelValues(ctx context.Context, label string) ([]string, error) {
	// The label name is part of the URL path, so it must be path-escaped.
	body, err := c.get(ctx, "/api/v1/label/"+url.PathEscape(label)+"/values", nil)
	if err != nil {
		return nil, fmt.Errorf("label values failed: %w", err)
	}
	var values []string
	if err := json.Unmarshal(body, &values); err != nil {
		return nil, fmt.Errorf("failed to parse label values: %w", err)
	}
	return values, nil
}
// Metadata returns metadata for metrics. If metric is empty, metadata for
// all metrics is returned.
func (c *PrometheusClient) Metadata(ctx context.Context, metric string) (map[string][]PromMetadata, error) {
	var params url.Values
	if metric != "" {
		params = url.Values{"metric": []string{metric}}
	}
	body, err := c.get(ctx, "/api/v1/metadata", params)
	if err != nil {
		return nil, fmt.Errorf("metadata failed: %w", err)
	}
	metadata := make(map[string][]PromMetadata)
	if err := json.Unmarshal(body, &metadata); err != nil {
		return nil, fmt.Errorf("failed to parse metadata: %w", err)
	}
	return metadata, nil
}
// Targets returns the current scrape targets.
func (c *PrometheusClient) Targets(ctx context.Context) (*PromTargetsData, error) {
	body, err := c.get(ctx, "/api/v1/targets", nil)
	if err != nil {
		return nil, fmt.Errorf("targets failed: %w", err)
	}
	data := new(PromTargetsData)
	if err := json.Unmarshal(body, data); err != nil {
		return nil, fmt.Errorf("failed to parse targets data: %w", err)
	}
	return data, nil
}
// get performs a GET request against the Prometheus API and unwraps the
// standard {status, data, errorType, error} response envelope, returning the
// raw "data" payload on success.
func (c *PrometheusClient) get(ctx context.Context, path string, params url.Values) (json.RawMessage, error) {
	endpoint := c.baseURL + path
	if len(params) > 0 {
		endpoint += "?" + params.Encode()
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}
	resp, err := c.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close() //nolint:errcheck // cleanup on exit
	payload, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response: %w", err)
	}
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status %d: %s", resp.StatusCode, string(payload))
	}
	var envelope PromResponse
	if err := json.Unmarshal(payload, &envelope); err != nil {
		return nil, fmt.Errorf("failed to parse response: %w", err)
	}
	// Prometheus can report an application-level error even on HTTP 200.
	if envelope.Status != "success" {
		return nil, fmt.Errorf("prometheus error (%s): %s", envelope.ErrorType, envelope.Error)
	}
	return envelope.Data, nil
}

View File

@@ -0,0 +1,209 @@
package monitoring
import (
"context"
"net/http"
"net/http/httptest"
"testing"
"time"
)
// TestPrometheusClient_Query verifies the request path and query parameter as
// well as decoding of a two-sample vector result.
func TestPrometheusClient_Query(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/api/v1/query" {
			t.Errorf("unexpected path: %s", r.URL.Path)
		}
		if r.URL.Query().Get("query") != "up" {
			t.Errorf("unexpected query param: %s", r.URL.Query().Get("query"))
		}
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(`{
			"status": "success",
			"data": {
				"resultType": "vector",
				"result": [
					{
						"metric": {"__name__": "up", "job": "prometheus", "instance": "localhost:9090"},
						"value": [1234567890, "1"]
					},
					{
						"metric": {"__name__": "up", "job": "node", "instance": "localhost:9100"},
						"value": [1234567890, "0"]
					}
				]
			}
		}`))
	}))
	defer srv.Close()
	client := NewPrometheusClient(srv.URL)
	// Zero timestamp: the "time" parameter is omitted from the request.
	data, err := client.Query(context.Background(), "up", time.Time{})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if data.ResultType != "vector" {
		t.Errorf("expected resultType=vector, got %s", data.ResultType)
	}
	if len(data.Result) != 2 {
		t.Fatalf("expected 2 results, got %d", len(data.Result))
	}
	if data.Result[0].Metric["job"] != "prometheus" {
		t.Errorf("expected job=prometheus, got %s", data.Result[0].Metric["job"])
	}
}
// TestPrometheusClient_QueryError verifies that an application-level error
// envelope (status "error" on HTTP 200) surfaces the error message.
func TestPrometheusClient_QueryError(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(`{
			"status": "error",
			"errorType": "bad_data",
			"error": "invalid expression"
		}`))
	}))
	defer srv.Close()
	client := NewPrometheusClient(srv.URL)
	_, err := client.Query(context.Background(), "invalid{", time.Time{})
	if err == nil {
		t.Fatal("expected error, got nil")
	}
	if !contains(err.Error(), "invalid expression") {
		t.Errorf("expected error to contain 'invalid expression', got: %s", err.Error())
	}
}
// TestPrometheusClient_LabelValues verifies the per-label request path and
// decoding of the string-array response.
func TestPrometheusClient_LabelValues(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/api/v1/label/__name__/values" {
			t.Errorf("unexpected path: %s", r.URL.Path)
		}
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(`{
			"status": "success",
			"data": ["up", "node_cpu_seconds_total", "prometheus_build_info"]
		}`))
	}))
	defer srv.Close()
	client := NewPrometheusClient(srv.URL)
	values, err := client.LabelValues(context.Background(), "__name__")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(values) != 3 {
		t.Fatalf("expected 3 values, got %d", len(values))
	}
	if values[0] != "up" {
		t.Errorf("expected first value=up, got %s", values[0])
	}
}
// TestPrometheusClient_Metadata verifies that metric metadata is fetched
// from /api/v1/metadata and decoded into the per-metric map.
func TestPrometheusClient_Metadata(t *testing.T) {
	backend := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if got := r.URL.Path; got != "/api/v1/metadata" {
			t.Errorf("unexpected path: %s", got)
		}
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(`{
			"status": "success",
			"data": {
				"up": [{"type": "gauge", "help": "Whether the target is up.", "unit": ""}],
				"node_cpu_seconds_total": [{"type": "counter", "help": "CPU seconds spent.", "unit": "seconds"}]
			}
		}`))
	}))
	defer backend.Close()

	pc := NewPrometheusClient(backend.URL)
	meta, err := pc.Metadata(context.Background(), "")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if got := len(meta); got != 2 {
		t.Fatalf("expected 2 metrics, got %d", got)
	}
	if got := meta["up"][0].Type; got != "gauge" {
		t.Errorf("expected up type=gauge, got %s", got)
	}
}
// TestPrometheusClient_Targets verifies that scrape targets are requested
// from /api/v1/targets and that active-target fields are decoded.
func TestPrometheusClient_Targets(t *testing.T) {
	backend := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if got := r.URL.Path; got != "/api/v1/targets" {
			t.Errorf("unexpected path: %s", got)
		}
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(`{
			"status": "success",
			"data": {
				"activeTargets": [
					{
						"labels": {"instance": "localhost:9090", "job": "prometheus"},
						"scrapePool": "prometheus",
						"scrapeUrl": "http://localhost:9090/metrics",
						"globalUrl": "http://localhost:9090/metrics",
						"lastError": "",
						"lastScrape": "2024-01-01T00:00:00Z",
						"lastScrapeDuration": 0.01,
						"health": "up",
						"scrapeInterval": "15s",
						"scrapeTimeout": "10s"
					}
				],
				"droppedTargets": []
			}
		}`))
	}))
	defer backend.Close()

	pc := NewPrometheusClient(backend.URL)
	res, err := pc.Targets(context.Background())
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if got := len(res.ActiveTargets); got != 1 {
		t.Fatalf("expected 1 active target, got %d", got)
	}
	if got := res.ActiveTargets[0].Health; got != "up" {
		t.Errorf("expected health=up, got %s", got)
	}
}
// TestPrometheusClient_HTTPError verifies that a non-200 HTTP response is
// reported as an error that mentions the status code.
func TestPrometheusClient_HTTPError(t *testing.T) {
	backend := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusInternalServerError)
		_, _ = w.Write([]byte("internal error"))
	}))
	defer backend.Close()

	pc := NewPrometheusClient(backend.URL)
	if _, err := pc.Query(context.Background(), "up", time.Time{}); err == nil {
		t.Fatal("expected error, got nil")
	} else if !contains(err.Error(), "500") {
		t.Errorf("expected error to contain status code, got: %s", err.Error())
	}
}
// contains reports whether substr occurs anywhere within s.
//
// It delegates directly to searchString: its loop bound
// (i <= len(s)-len(substr)) already returns false whenever substr is
// longer than s, so the previous explicit length guard was redundant.
func contains(s, substr string) bool {
	return searchString(s, substr)
}
// searchString reports whether substr appears as a contiguous substring
// of s, using a naive scan over every candidate start position. An empty
// substr always matches (at offset 0).
func searchString(s, substr string) bool {
	n, m := len(s), len(substr)
	for start := 0; start+m <= n; start++ {
		if s[start:start+m] == substr {
			return true
		}
	}
	return false
}

View File

@@ -0,0 +1,120 @@
package monitoring
import (
"encoding/json"
"time"
)
// Prometheus API response types

// PromResponse is the standard Prometheus API response envelope.
// Status is "success" or "error". On success, Data carries the payload
// and is kept raw so each endpoint can decode it into its own shape;
// on error, ErrorType and Error describe the failure.
type PromResponse struct {
	Status    string          `json:"status"`
	Data      json.RawMessage `json:"data,omitempty"`
	ErrorType string          `json:"errorType,omitempty"`
	Error     string          `json:"error,omitempty"`
}

// PromQueryData represents the data field for query results.
type PromQueryData struct {
	ResultType string              `json:"resultType"` // e.g. "vector"
	Result     []PromInstantVector `json:"result"`
}

// PromInstantVector represents a single instant vector result: one
// label set (including __name__) paired with its sampled value.
type PromInstantVector struct {
	Metric map[string]string `json:"metric"`
	Value  [2]interface{}    `json:"value"` // [timestamp, value_string]
}

// PromScalar represents a scalar query result.
type PromScalar [2]interface{} // [timestamp, value_string]

// PromMetadata represents metadata for a single metric. Unit may be
// empty when the metric declares none.
type PromMetadata struct {
	Type string `json:"type"` // e.g. "gauge", "counter"
	Help string `json:"help"`
	Unit string `json:"unit"`
}

// PromTarget represents a single scrape target as returned by the
// /api/v1/targets endpoint.
type PromTarget struct {
	DiscoveredLabels map[string]string `json:"discoveredLabels"`
	Labels           map[string]string `json:"labels"`
	ScrapePool       string            `json:"scrapePool"`
	ScrapeURL        string            `json:"scrapeUrl"`
	GlobalURL        string            `json:"globalUrl"`
	// LastError is empty when the most recent scrape succeeded.
	LastError  string    `json:"lastError"`
	LastScrape time.Time `json:"lastScrape"`
	// presumably seconds — confirm against the Prometheus API docs
	LastScrapeDuration float64 `json:"lastScrapeDuration"`
	Health             string  `json:"health"`         // e.g. "up"
	ScrapeInterval     string  `json:"scrapeInterval"` // duration string, e.g. "15s"
	ScrapeTimeout      string  `json:"scrapeTimeout"`
}

// PromTargetsData represents the data field for targets results.
type PromTargetsData struct {
	ActiveTargets  []PromTarget `json:"activeTargets"`
	DroppedTargets []PromTarget `json:"droppedTargets"`
}
// Alertmanager API response types

// Alert represents an alert from the Alertmanager API v2.
type Alert struct {
	Annotations map[string]string `json:"annotations"`
	EndsAt      time.Time         `json:"endsAt"`
	// Fingerprint identifies the alert — presumably a stable hash of its
	// label set; confirm against the Alertmanager API spec.
	Fingerprint  string          `json:"fingerprint"`
	Receivers    []AlertReceiver `json:"receivers"`
	StartsAt     time.Time       `json:"startsAt"`
	Status       AlertStatus     `json:"status"`
	UpdatedAt    time.Time       `json:"updatedAt"`
	GeneratorURL string          `json:"generatorURL"`
	Labels       map[string]string `json:"labels"`
}

// AlertReceiver represents an alert receiver.
type AlertReceiver struct {
	Name string `json:"name"`
}

// AlertStatus represents the status of an alert, including which
// silences/inhibitions (by ID) are currently muting it.
type AlertStatus struct {
	InhibitedBy []string `json:"inhibitedBy"`
	SilencedBy  []string `json:"silencedBy"`
	State       string   `json:"state"` // "active", "suppressed", "unprocessed"
}

// AlertFilters contains filters for listing alerts. Nil pointer fields
// mean "do not filter on this attribute".
type AlertFilters struct {
	Active      *bool
	Silenced    *bool
	Inhibited   *bool
	Unprocessed *bool
	Filter      []string // PromQL-style label matchers, e.g. {severity="critical"}
	Receiver    string   // restrict to alerts routed to this receiver
}

// Silence represents a silence from the Alertmanager API v2.
// ID and Status are server-assigned and therefore omitted when
// creating a new silence.
type Silence struct {
	ID        string         `json:"id,omitempty"`
	Matchers  []Matcher      `json:"matchers"`
	StartsAt  time.Time      `json:"startsAt"`
	EndsAt    time.Time      `json:"endsAt"`
	CreatedBy string         `json:"createdBy"`
	Comment   string         `json:"comment"`
	Status    *SilenceStatus `json:"status,omitempty"`
}

// SilenceStatus represents the status of a silence.
type SilenceStatus struct {
	State string `json:"state"` // "active", "pending", "expired"
}

// Matcher represents a label matcher for silences.
type Matcher struct {
	Name    string `json:"name"`
	Value   string `json:"value"`
	IsRegex bool   `json:"isRegex"`
	// IsEqual is a pointer so nil omits the field entirely, letting
	// Alertmanager apply its own default — TODO confirm the default.
	IsEqual *bool `json:"isEqual,omitempty"`
}

View File

@@ -0,0 +1,139 @@
# NixOS module for the lab-monitoring MCP server: exposes Prometheus /
# Alertmanager connection settings plus HTTP-transport options, and runs
# the server as a hardened systemd service.
{ config, lib, pkgs, ... }:
let
  cfg = config.services.lab-monitoring;

  # Render the CLI flags for the HTTP transport. Every user-supplied value
  # is shell-escaped with lib.escapeShellArg because the result is
  # interpolated into the service's shell script — the previous hand-rolled
  # '...' quoting broke (and was injectable) for values containing a
  # single quote.
  mkHttpFlags = httpCfg: lib.concatStringsSep " " ([
    "--transport http"
    "--http-address ${lib.escapeShellArg httpCfg.address}"
    "--http-endpoint ${lib.escapeShellArg httpCfg.endpoint}"
    "--session-ttl ${lib.escapeShellArg httpCfg.sessionTTL}"
  ] ++ lib.optionals (httpCfg.allowedOrigins != []) (
    map (origin: "--allowed-origins ${lib.escapeShellArg origin}") httpCfg.allowedOrigins
  ) ++ lib.optionals httpCfg.tls.enable [
    # "${...}" keeps the original path-to-string coercion semantics.
    "--tls-cert ${lib.escapeShellArg "${httpCfg.tls.certFile}"}"
    "--tls-key ${lib.escapeShellArg "${httpCfg.tls.keyFile}"}"
  ]);
in
{
  options.services.lab-monitoring = {
    enable = lib.mkEnableOption "Lab Monitoring MCP server";
    package = lib.mkPackageOption pkgs "lab-monitoring" { };
    prometheusUrl = lib.mkOption {
      type = lib.types.str;
      default = "http://localhost:9090";
      description = "Prometheus base URL.";
    };
    alertmanagerUrl = lib.mkOption {
      type = lib.types.str;
      default = "http://localhost:9093";
      description = "Alertmanager base URL.";
    };
    http = {
      address = lib.mkOption {
        type = lib.types.str;
        default = "127.0.0.1:8084";
        description = "HTTP listen address for the MCP server.";
      };
      endpoint = lib.mkOption {
        type = lib.types.str;
        default = "/mcp";
        description = "HTTP endpoint path for MCP requests.";
      };
      allowedOrigins = lib.mkOption {
        type = lib.types.listOf lib.types.str;
        default = [ ];
        description = "Allowed Origin headers for CORS.";
      };
      sessionTTL = lib.mkOption {
        type = lib.types.str;
        default = "30m";
        description = "Session TTL for HTTP transport.";
      };
      tls = {
        enable = lib.mkEnableOption "TLS for HTTP transport";
        certFile = lib.mkOption {
          type = lib.types.nullOr lib.types.path;
          default = null;
          description = "Path to TLS certificate file.";
        };
        keyFile = lib.mkOption {
          type = lib.types.nullOr lib.types.path;
          default = null;
          description = "Path to TLS private key file.";
        };
      };
    };
    openFirewall = lib.mkOption {
      type = lib.types.bool;
      default = false;
      description = "Whether to open the firewall for the MCP HTTP server.";
    };
  };

  config = lib.mkIf cfg.enable {
    assertions = [
      {
        assertion = !cfg.http.tls.enable || (cfg.http.tls.certFile != null && cfg.http.tls.keyFile != null);
        message = "services.lab-monitoring.http.tls: both certFile and keyFile must be set when TLS is enabled";
      }
    ];

    systemd.services.lab-monitoring = {
      description = "Lab Monitoring MCP Server";
      wantedBy = [ "multi-user.target" ];
      after = [ "network.target" ];
      # The server reads its backend URLs from the environment.
      environment = {
        PROMETHEUS_URL = cfg.prometheusUrl;
        ALERTMANAGER_URL = cfg.alertmanagerUrl;
      };
      script = let
        httpFlags = mkHttpFlags cfg.http;
      in ''
        exec ${cfg.package}/bin/lab-monitoring serve ${httpFlags}
      '';
      serviceConfig = {
        Type = "simple";
        DynamicUser = true;
        Restart = "on-failure";
        RestartSec = "5s";
        # Hardening
        NoNewPrivileges = true;
        ProtectSystem = "strict";
        ProtectHome = true;
        PrivateTmp = true;
        PrivateDevices = true;
        ProtectKernelTunables = true;
        ProtectKernelModules = true;
        ProtectControlGroups = true;
        RestrictNamespaces = true;
        RestrictRealtime = true;
        RestrictSUIDSGID = true;
        MemoryDenyWriteExecute = true;
        LockPersonality = true;
      };
    };

    # NOTE(review): assumes cfg.http.address ends in ":<port>" — taking the
    # last ':'-separated segment also handles bracketed IPv6 addresses, but
    # an address without an explicit port will fail toInt at eval time.
    networking.firewall = lib.mkIf cfg.openFirewall (let
      addressParts = lib.splitString ":" cfg.http.address;
      port = lib.toInt (lib.last addressParts);
    in {
      allowedTCPPorts = [ port ];
    });
  };
}

View File

@@ -7,7 +7,7 @@
buildGoModule {
inherit pname src;
version = "0.2.0";
version = "0.2.1";
vendorHash = "sha256-D0KIxQC9ctIAaHBFTvkhBE06uOZwDUcIw8471Ug2doY=";