feat: add human detection scoring and webhook notifications

Implement phase 2.1 (human detection) and 2.2 (notifications):

- Detection scorer computes 0.0-1.0 human likelihood from keystroke timing variance, special key usage, typing speed, command diversity, and session duration
- Webhook notifier sends JSON POST to configured endpoints with deduplication, custom headers, and event filtering
- RecordingChannel gains an event callback for feeding keystrokes to the scorer without coupling shell and detection packages
- Server wires scorer into session lifecycle with periodic updates and threshold-based notification triggers
- Web UI shows human score in session tables with highlighting
- New config sections: [detection] and [[notify.webhooks]]

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-14 21:28:11 +01:00
parent 96c8476f77
commit 0ad6f4cb6a
13 changed files with 1060 additions and 32 deletions
--- a/internal/detection/scorer.go
+++ b/internal/detection/scorer.go
@@ -0,0 +1,259 @@
+package detection
+
+import (
+	"math"
+	"slices"
+	"sync"
+	"time"
+)
+
+// Direction values accepted by RecordEvent.
+const (
+	DirInput  = iota // 0: client → server (keystrokes)
+	DirOutput        // 1: server → client (shell output)
+)
+
+// Signal weights for the composite score computed by Scorer.Score.
+// Each signal is scored 0–1 and multiplied by its weight; the weights sum
+// to 1.0, so the weighted sum is itself in the 0–1 range.
+const (
+	weightTimingVariance   = 0.30
+	weightSpecialKeys      = 0.20
+	weightTypingSpeed      = 0.20
+	weightCommandDiversity = 0.15
+	weightSessionDuration  = 0.15
+)
+
+// Scorer accumulates keystroke events and derives a 0.0–1.0 human-likelihood
+// score from several independent signals. Create one with NewScorer.
+// RecordEvent and Score take mu before touching any other field.
+type Scorer struct {
+	mu sync.Mutex
+
+	// inputTimes tracks input timestamps; delays holds the accepted gaps
+	// between consecutive input timestamps.
+	inputTimes []time.Time
+	delays     []time.Duration
+
+	// specialKeys counts input bytes for which isSpecialKey is true.
+	specialKeys int
+
+	// currentCmd buffers the line currently being typed; commands is the
+	// set of distinct non-empty lines completed so far.
+	currentCmd []byte
+	commands   map[string]struct{}
+
+	// firstInput and lastInput are the timestamps of the first and the most
+	// recent input event.
+	firstInput, lastInput time.Time
+}
+
+// NewScorer returns an empty Scorer ready to record events. The command set
+// is allocated up front so RecordEvent can insert into it directly.
+func NewScorer() *Scorer {
+	s := new(Scorer)
+	s.commands = map[string]struct{}{}
+	return s
+}
+
+// RecordEvent feeds one chunk of session traffic into the scorer.
+//
+// ts is the time the chunk was observed, direction is DirInput (0) for
+// client input or DirOutput (1) for server output, and data holds the raw
+// bytes. Chunks whose direction is not DirInput are ignored. The method
+// takes the scorer's mutex, so it may be called from multiple goroutines.
+//
+// Every byte of a chunk shares ts, so a chunk contributes at most one
+// inter-keystroke delay: the gap to the previously recorded input. Gaps
+// that are not positive (same instant or out-of-order timestamps) or that
+// are 30s or longer are discarded. An empty input chunk still updates the
+// first/last input timestamps.
+func (s *Scorer) RecordEvent(ts time.Time, direction int, data []byte) {
+	if direction != DirInput {
+		return // only analyze input
+	}
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if s.firstInput.IsZero() {
+		s.firstInput = ts
+	}
+	s.lastInput = ts
+
+	if len(data) == 0 {
+		return
+	}
+
+	// Bytes after the first would only ever see a zero gap, so the delay is
+	// computed once per chunk rather than once per byte.
+	if n := len(s.inputTimes); n > 0 {
+		if delay := ts.Sub(s.inputTimes[n-1]); delay > 0 && delay < 30*time.Second {
+			s.delays = append(s.delays, delay)
+		}
+	}
+	// Only the newest timestamp is read back here, and Score only checks
+	// whether the slice is empty, so keep a single entry instead of growing
+	// by one per input byte.
+	s.inputTimes = append(s.inputTimes[:0], ts)
+
+	for _, b := range data {
+		if isSpecialKey(b) {
+			s.specialKeys++
+		}
+
+		switch {
+		case b == '\r' || b == '\n':
+			// End of line: remember the finished command, if any.
+			if len(s.currentCmd) > 0 {
+				s.commands[string(s.currentCmd)] = struct{}{}
+			}
+			s.currentCmd = s.currentCmd[:0]
+		case b == 0x7f || b == 0x08:
+			// DEL/BS erases the last buffered byte.
+			if n := len(s.currentCmd); n > 0 {
+				s.currentCmd = s.currentCmd[:n-1]
+			}
+		case b >= 0x20:
+			// Printable ASCII or a byte >= 0x80: part of the command.
+			s.currentCmd = append(s.currentCmd, b)
+		}
+	}
+}
+
+// Score computes the composite human likelihood score (0.0–1.0).
+// Thread-safe.
+func (s *Scorer) Score() float64 {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if len(s.inputTimes) == 0 {
+		return 0
+	}
+
+	tv := s.timingVarianceScore()
+	sk := s.specialKeysScore()
+	ts := s.typingSpeedScore()
+	cd := s.commandDiversityScore()
+	sd := s.sessionDurationScore()
+
+	score := tv*weightTimingVariance +
+		sk*weightSpecialKeys +
+		ts*weightTypingSpeed +
+		cd*weightCommandDiversity +
+		sd*weightSessionDuration
+
+	return clamp(score, 0, 1)
+}
+
+// timingVarianceScore returns 0–1 based on coefficient of variation of inter-key delays.
+// Bots have CV ≈ 0 (instant or uniform), humans have CV ≥ 0.6.
+func (s *Scorer) timingVarianceScore() float64 {
+	if len(s.delays) < 3 {
+		return 0
+	}
+
+	mean := meanDuration(s.delays)
+	if mean == 0 {
+		return 0
+	}
+
+	variance := 0.0
+	for _, d := range s.delays {
+		diff := float64(d) - float64(mean)
+		variance += diff * diff
+	}
+	variance /= float64(len(s.delays))
+	stddev := math.Sqrt(variance)
+	cv := stddev / float64(mean)
+
+	// Map CV to 0–1: CV of 0.6+ is fully human-like.
+	return clamp(cv/0.6, 0, 1)
+}
+
+// specialKeysScore returns 0–1 based on count of special key presses.
+// Scripts almost never generate backspace/tab/ctrl characters.
+func (s *Scorer) specialKeysScore() float64 {
+	// 5+ special keys → full score.
+	return clamp(float64(s.specialKeys)/5.0, 0, 1)
+}
+
+// typingSpeedScore rates the median inter-key delay on a 0–1 scale:
+//
+//   - under 5ms scores 0 (paste or script),
+//   - 5–30ms ramps linearly from 0 up to 1,
+//   - 30–300ms scores 1 (human range),
+//   - above 300ms scores 0.7 (very slow, still possibly human).
+//
+// It returns 0 when fewer than two delays have been recorded. It does not
+// lock; Score calls it with s.mu held.
+func (s *Scorer) typingSpeedScore() float64 {
+	if len(s.delays) < 2 {
+		return 0
+	}
+
+	ms := float64(medianDuration(s.delays)) / float64(time.Millisecond)
+	switch {
+	case ms < 5:
+		return 0
+	case ms < 30:
+		return clamp((ms-5)/25, 0, 1)
+	case ms <= 300:
+		return 1.0
+	default:
+		return 0.7
+	}
+}
+
+// commandDiversityScore scales the number of distinct commands entered
+// linearly onto 0–1, reaching 1 at three unique commands.
+func (s *Scorer) commandDiversityScore() float64 {
+	unique := float64(len(s.commands))
+	return clamp(unique/3.0, 0, 1)
+}
+
+// sessionDurationScore returns 0–1 based on active input duration.
+func (s *Scorer) sessionDurationScore() float64 {
+	if s.firstInput.IsZero() || s.lastInput.IsZero() {
+		return 0
+	}
+	dur := s.lastInput.Sub(s.firstInput)
+	// 10s+ of active input → full score.
+	return clamp(float64(dur)/float64(10*time.Second), 0, 1)
+}
+
+// isSpecialKey returns true for non-printable keys that humans commonly use.
+func isSpecialKey(b byte) bool {
+	switch b {
+	case 0x7f, // DEL (backspace in most terminals)
+		0x08, // BS
+		0x09, // TAB
+		0x03, // Ctrl-C
+		0x04, // Ctrl-D
+		0x1b: // ESC (arrow keys start with ESC)
+		return true
+	}
+	return false
+}
+
+// clamp limits v to the closed interval [lo, hi]. It assumes lo <= hi.
+func clamp(v, lo, hi float64) float64 {
+	if v < lo {
+		v = lo
+	} else if v > hi {
+		v = hi
+	}
+	return v
+}
+
+// meanDuration returns the arithmetic mean of ds, or 0 for an empty slice.
+// The division is integer division on nanoseconds, so the result is
+// truncated toward zero.
+func meanDuration(ds []time.Duration) time.Duration {
+	count := time.Duration(len(ds))
+	if count == 0 {
+		return 0
+	}
+
+	var total time.Duration
+	for i := range ds {
+		total += ds[i]
+	}
+	return total / count
+}
+
+// medianDuration returns the median of ds, or 0 for an empty slice. For an
+// even number of values it is the mean of the two middle values (integer
+// division). ds itself is left untouched; a private copy is sorted.
+func medianDuration(ds []time.Duration) time.Duration {
+	if len(ds) == 0 {
+		return 0
+	}
+
+	ordered := append([]time.Duration(nil), ds...)
+	sortDurations(ordered)
+
+	mid := len(ordered) / 2
+	if len(ordered)%2 == 1 {
+		return ordered[mid]
+	}
+	return (ordered[mid-1] + ordered[mid]) / 2
+}
+
+// sortDurations sorts ds in place in ascending order.
+//
+// It delegates to slices.Sort rather than a hand-written insertion sort:
+// medianDuration passes it a copy of the scorer's delay history, which grows
+// with the session, so quadratic sorting is not a safe assumption.
+func sortDurations(ds []time.Duration) {
+	slices.Sort(ds)
+}
--- a/internal/detection/scorer_test.go
+++ b/internal/detection/scorer_test.go
@@ -0,0 +1,155 @@
+package detection
+
+import (
+	"sync"
+	"testing"
+	"time"
+)
+
+// TestScorer_EmptyInput verifies that a scorer with no recorded events
+// scores exactly 0.
+func TestScorer_EmptyInput(t *testing.T) {
+	if got := NewScorer().Score(); got != 0 {
+		t.Errorf("empty scorer: got %f, want 0", got)
+	}
+}
+
+// TestScorer_SingleKeystroke verifies that a single printable input byte is
+// not enough to move the score off 0.
+func TestScorer_SingleKeystroke(t *testing.T) {
+	scorer := NewScorer()
+	scorer.RecordEvent(time.Now(), DirInput, []byte{'a'})
+
+	if got := scorer.Score(); got != 0 {
+		t.Errorf("single keystroke: got %f, want 0", got)
+	}
+}
+
+func TestScorer_BotLikeInput(t *testing.T) {
+	// Simulate a bot: paste entire commands with uniform tiny delays, no special keys.
+	s := NewScorer()
+	now := time.Now()
+
+	// Bot pastes "cat /etc/passwd\r" all at once with perfectly uniform timing.
+	for range 3 {
+		cmd := []byte("cat /etc/passwd\r")
+		for _, b := range cmd {
+			s.RecordEvent(now, DirInput, []byte{b})
+			now = now.Add(100 * time.Microsecond) // ~0.1ms uniform delay = paste
+		}
+	}
+
+	score := s.Score()
+	if score >= 0.3 {
+		t.Errorf("bot-like input: got %f, want < 0.3", score)
+	}
+}
+
+// TestScorer_HumanLikeInput simulates an interactive user: several different
+// commands typed at differing speeds, a backspace correction, a tab
+// completion and a two-second pause after each burst of keys. It expects a
+// score above 0.6.
+func TestScorer_HumanLikeInput(t *testing.T) {
+	s := NewScorer()
+	now := time.Now()
+
+	type burst struct {
+		text  string
+		delay time.Duration // base delay between keys
+	}
+
+	bursts := []burst{
+		{"ls -la\r", 80 * time.Millisecond},
+		{"cat /etc/paswd", 120 * time.Millisecond}, // typo
+		{"\x7f", 200 * time.Millisecond},           // backspace
+		{"wd\r", 90 * time.Millisecond},            // correction
+		{"whoami\r", 100 * time.Millisecond},
+		{"uname -a\r", 150 * time.Millisecond},
+		{"\t", 300 * time.Millisecond}, // tab completion
+		{"pwd\r", 70 * time.Millisecond},
+	}
+
+	for _, b := range bursts {
+		// Keys within a burst are evenly spaced at 1.3× the base delay (this
+		// is a fixed offset, not random jitter); the timing variation comes
+		// from the differing base delays and the pauses between bursts.
+		step := b.delay + time.Duration(float64(b.delay)*0.3)
+		for _, key := range []byte(b.text) {
+			s.RecordEvent(now, DirInput, []byte{key})
+			now = now.Add(step)
+		}
+		// Pause between bursts (thinking time).
+		now = now.Add(2 * time.Second)
+	}
+
+	if score := s.Score(); score <= 0.6 {
+		t.Errorf("human-like input: got %f, want > 0.6", score)
+	}
+}
+
+// TestScorer_OutputIgnored verifies that DirOutput events never contribute
+// to the score: after 100 output-only events it is still 0.
+func TestScorer_OutputIgnored(t *testing.T) {
+	scorer := NewScorer()
+
+	ts := time.Now()
+	for range 100 {
+		scorer.RecordEvent(ts, DirOutput, []byte("some output\n"))
+		ts = ts.Add(10 * time.Millisecond)
+	}
+
+	if got := scorer.Score(); got != 0 {
+		t.Errorf("output-only: got %f, want 0", got)
+	}
+}
+
+// TestScorer_ThreadSafety records input from ten goroutines while another
+// goroutine repeatedly calls Score, then checks that the final score lies
+// within [0, 1].
+func TestScorer_ThreadSafety(t *testing.T) {
+	const (
+		writers         = 10
+		eventsPerWriter = 100
+		scoreReads      = 50
+	)
+	scorer := NewScorer()
+	base := time.Now()
+
+	var wg sync.WaitGroup
+	wg.Add(writers + 1) // all writers plus the single reader
+
+	for w := 0; w < writers; w++ {
+		go func(id int) {
+			defer wg.Done()
+			for n := 0; n < eventsPerWriter; n++ {
+				at := base.Add(time.Duration(id*100+n) * time.Millisecond)
+				scorer.RecordEvent(at, DirInput, []byte("a"))
+			}
+		}(w)
+	}
+
+	// Reader running concurrently with the writers.
+	go func() {
+		defer wg.Done()
+		for n := 0; n < scoreReads; n++ {
+			_ = scorer.Score()
+		}
+	}()
+
+	wg.Wait()
+
+	// Reaching this point means nothing panicked; the score must be valid.
+	if got := scorer.Score(); got < 0 || got > 1 {
+		t.Errorf("concurrent score out of range: %f", got)
+	}
+}
+
+// TestScorer_CommandDiversity types four distinct commands, one key every
+// 100ms with an extra second after each command, and expects a score of at
+// least 0.4.
+func TestScorer_CommandDiversity(t *testing.T) {
+	scorer := NewScorer()
+	ts := time.Now()
+
+	for _, line := range [...]string{"ls\r", "pwd\r", "id\r", "whoami\r"} {
+		for i := 0; i < len(line); i++ {
+			scorer.RecordEvent(ts, DirInput, []byte{line[i]})
+			ts = ts.Add(100 * time.Millisecond)
+		}
+		ts = ts.Add(time.Second)
+	}
+
+	// Four unique commands, human-range timing and several seconds of input
+	// should add up to a meaningful score.
+	if got := scorer.Score(); got < 0.4 {
+		t.Errorf("diverse commands: got %f, want >= 0.4", got)
+	}
+}