feat: add Prometheus metrics endpoint and Docker image (PLAN.md 4.2)

Add an internal/metrics package with a dedicated Prometheus registry that
exposes SSH connection, auth attempt, session, and build info metrics. Wire
it into the SSH server (4 instrumentation points) and the web server
(/metrics endpoint).
Add dockerImage output to flake.nix via dockerTools.buildLayeredImage.
Bump version to 0.7.0.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-15 05:47:16 +01:00
parent b8fcbc7e10
commit ab07e6a8dc
14 changed files with 342 additions and 19 deletions

View File

@@ -15,6 +15,7 @@ import (
"git.t-juice.club/torjus/oubliette/internal/auth"
"git.t-juice.club/torjus/oubliette/internal/config"
"git.t-juice.club/torjus/oubliette/internal/detection"
"git.t-juice.club/torjus/oubliette/internal/metrics"
"git.t-juice.club/torjus/oubliette/internal/notify"
"git.t-juice.club/torjus/oubliette/internal/shell"
"git.t-juice.club/torjus/oubliette/internal/shell/adventure"
@@ -34,9 +35,10 @@ type Server struct {
connSem chan struct{} // semaphore limiting concurrent connections
shellRegistry *shell.Registry
notifier notify.Sender
metrics *metrics.Metrics
}
func New(cfg config.Config, store storage.Store, logger *slog.Logger) (*Server, error) {
func New(cfg config.Config, store storage.Store, logger *slog.Logger, m *metrics.Metrics) (*Server, error) {
registry := shell.NewRegistry()
if err := registry.Register(bash.NewBashShell(), 1); err != nil {
return nil, fmt.Errorf("registering bash shell: %w", err)
@@ -59,6 +61,7 @@ func New(cfg config.Config, store storage.Store, logger *slog.Logger) (*Server,
connSem: make(chan struct{}, cfg.SSH.MaxConnections),
shellRegistry: registry,
notifier: notify.NewSender(cfg.Notify.Webhooks, logger),
metrics: m,
}
hostKey, err := loadOrGenerateHostKey(cfg.SSH.HostKeyPath)
@@ -102,11 +105,16 @@ func (s *Server) ListenAndServe(ctx context.Context) error {
// Enforce max concurrent connections.
select {
case s.connSem <- struct{}{}:
s.metrics.SSHConnectionsActive.Inc()
go func() {
defer func() { <-s.connSem }()
defer func() {
<-s.connSem
s.metrics.SSHConnectionsActive.Dec()
}()
s.handleConn(conn)
}()
default:
s.metrics.SSHConnectionsTotal.WithLabelValues("rejected_max_connections").Inc()
s.logger.Warn("max connections reached, rejecting", "remote_addr", conn.RemoteAddr())
conn.Close()
}
@@ -118,11 +126,13 @@ func (s *Server) handleConn(conn net.Conn) {
sshConn, chans, reqs, err := ssh.NewServerConn(conn, s.sshConfig)
if err != nil {
s.metrics.SSHConnectionsTotal.WithLabelValues("rejected_handshake").Inc()
s.logger.Debug("SSH handshake failed", "remote_addr", conn.RemoteAddr(), "err", err)
return
}
defer sshConn.Close()
s.metrics.SSHConnectionsTotal.WithLabelValues("accepted").Inc()
s.logger.Info("SSH connection established",
"remote_addr", sshConn.RemoteAddr(),
"user", sshConn.User(),
@@ -171,11 +181,16 @@ func (s *Server) handleSession(channel ssh.Channel, requests <-chan *ssh.Request
}
ip := extractIP(conn.RemoteAddr())
sessionStart := time.Now()
sessionID, err := s.store.CreateSession(context.Background(), ip, conn.User(), selectedShell.Name())
if err != nil {
s.logger.Error("failed to create session", "err", err)
} else {
s.metrics.SessionsTotal.WithLabelValues(selectedShell.Name()).Inc()
s.metrics.SessionsActive.Inc()
defer func() {
s.metrics.SessionsActive.Dec()
s.metrics.SessionDuration.Observe(time.Since(sessionStart).Seconds())
if err := s.store.EndSession(context.Background(), sessionID, time.Now()); err != nil {
s.logger.Error("failed to end session", "err", err)
}
@@ -318,6 +333,12 @@ func (s *Server) passwordCallback(conn ssh.ConnMetadata, password []byte) (*ssh.
ip := extractIP(conn.RemoteAddr())
d := s.authenticator.Authenticate(ip, conn.User(), string(password))
if d.Accepted {
s.metrics.AuthAttemptsTotal.WithLabelValues("accepted", d.Reason).Inc()
} else {
s.metrics.AuthAttemptsTotal.WithLabelValues("rejected", d.Reason).Inc()
}
s.logger.Info("auth attempt",
"remote_addr", conn.RemoteAddr(),
"username", conn.User(),