From 6e08ba9720fee84d3d7bf78df81951a4e12950b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Mon, 9 Feb 2026 21:41:29 +0100 Subject: [PATCH] ansible: restructure with dynamic inventory from flake - Move playbooks/ to ansible/playbooks/ - Add dynamic inventory script that extracts hosts from flake - Groups by tier (tier_test, tier_prod) and role (role_dns, etc.) - Reads homelab.host.* options for metadata - Add static inventory for non-flake hosts (Proxmox) - Add ansible.cfg with inventory path and SSH optimizations - Add group_vars/all.yml for common variables - Add restart-service.yml playbook for restarting systemd services - Update provision-approle.yml with single-host safeguard - Add ANSIBLE_CONFIG to devshell for automatic inventory discovery - Add ansible = "false" label to template2 to exclude from inventory - Update CLAUDE.md to reference ansible/README.md for details Co-Authored-By: Claude Opus 4.5 --- CLAUDE.md | 37 ++-- ansible/README.md | 116 +++++++++++++ ansible/ansible.cfg | 16 ++ ansible/inventory/dynamic_flake.py | 158 ++++++++++++++++++ ansible/inventory/group_vars/all.yml | 5 + ansible/inventory/static.yml | 10 ++ .../playbooks}/build-and-deploy-template.yml | 6 +- .../playbooks}/provision-approle.yml | 52 ++++-- ansible/playbooks/restart-service.yml | 40 +++++ .../playbooks}/run-upgrade.yml | 0 flake.nix | 1 + hosts/template2/configuration.nix | 1 + playbooks/inventory.ini | 5 - 13 files changed, 403 insertions(+), 44 deletions(-) create mode 100644 ansible/README.md create mode 100644 ansible/ansible.cfg create mode 100755 ansible/inventory/dynamic_flake.py create mode 100644 ansible/inventory/group_vars/all.yml create mode 100644 ansible/inventory/static.yml rename {playbooks => ansible/playbooks}/build-and-deploy-template.yml (96%) rename {playbooks => ansible/playbooks}/provision-approle.yml (54%) create mode 100644 ansible/playbooks/restart-service.yml rename {playbooks => ansible/playbooks}/run-upgrade.yml (100%) 
delete mode 100644 playbooks/inventory.ini diff --git a/CLAUDE.md b/CLAUDE.md index 664420a..af57dec 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -114,6 +114,12 @@ nix develop -c tofu -chdir=terraform/vault apply cd terraform && tofu plan ``` +### Ansible + +Ansible configuration and playbooks are in `/ansible/`. See [ansible/README.md](ansible/README.md) for inventory groups, available playbooks, and usage examples. + +The devshell sets `ANSIBLE_CONFIG` automatically, so no `-i` flag is needed. + ### Secrets Management Secrets are managed by OpenBao (Vault) using AppRole authentication. Most hosts use the @@ -279,7 +285,10 @@ The `current_rev` label contains the git commit hash of the deployed flake confi - `/docs/` - Documentation and plans - `plans/` - Future plans and proposals - `plans/completed/` - Completed plans (moved here when done) -- `/playbooks/` - Ansible playbooks for fleet management +- `/ansible/` - Ansible configuration and playbooks + - `ansible.cfg` - Ansible configuration (inventory path, defaults) + - `inventory/` - Dynamic and static inventory sources + - `playbooks/` - Ansible playbooks for fleet management ### Configuration Inheritance @@ -303,24 +312,11 @@ All hosts automatically get: - Custom root CA trust - DNS zone auto-registration via `homelab.dns` options -### Active Hosts +### Hosts -Production servers: -- `ns1`, `ns2` - Primary/secondary DNS servers (10.69.13.5/6) -- `vault01` - OpenBao (Vault) secrets server + PKI CA -- `ha1` - Home Assistant + Zigbee2MQTT + Mosquitto -- `http-proxy` - Reverse proxy -- `monitoring01` - Full observability stack (Prometheus, Grafana, Loki, Tempo, Pyroscope) -- `jelly01` - Jellyfin media server -- `nix-cache01` - Binary cache server + GitHub Actions runner -- `pgdb1` - PostgreSQL database -- `nats1` - NATS messaging server +Host configurations are in `/hosts//`. See `flake.nix` for the complete list of `nixosConfigurations`. 
-Test/staging hosts: -- `testvm01`, `testvm02`, `testvm03` - Test-tier VMs for branch testing and deployment validation - -Template hosts: -- `template1`, `template2` - Base templates for cloning new hosts +Use `nix flake show` or `nix develop -c ansible-inventory --graph` to list all hosts. ### Flake Inputs @@ -351,7 +347,7 @@ Most hosts use OpenBao (Vault) for secrets: - `extractKey` option extracts a single key from vault JSON as a plain file - Secrets fetched at boot by `vault-secret-.service` systemd units - Fallback to cached secrets in `/var/lib/vault/cache/` when Vault is unreachable -- Provision AppRole credentials: `nix develop -c ansible-playbook playbooks/provision-approle.yml -e hostname=` +- Provision AppRole credentials: `nix develop -c ansible-playbook ansible/playbooks/provision-approle.yml -l ` ### Auto-Upgrade System @@ -375,7 +371,7 @@ Template VMs are built from `hosts/template2` and deployed to Proxmox using Ansi ```bash # Build NixOS image and deploy to Proxmox as template -nix develop -c ansible-playbook -i playbooks/inventory.ini playbooks/build-and-deploy-template.yml +nix develop -c ansible-playbook ansible/playbooks/build-and-deploy-template.yml ``` This playbook: @@ -450,7 +446,7 @@ This means: - `tofu plan` won't show spurious changes for Proxmox-managed defaults **When rebuilding the template:** -1. Run `nix develop -c ansible-playbook -i playbooks/inventory.ini playbooks/build-and-deploy-template.yml` +1. Run `nix develop -c ansible-playbook ansible/playbooks/build-and-deploy-template.yml` 2. Update `default_template_name` in `terraform/variables.tf` if the name changed 3. Run `tofu plan` - should show no VM recreations (only template name in state) 4. Run `tofu apply` - updates state without touching existing VMs @@ -533,6 +529,7 @@ The `modules/homelab/` directory defines custom options used across hosts for au - `priority` - Alerting priority: `high` or `low`. Controls alerting thresholds for the host. 
- `role` - Primary role designation (e.g., `dns`, `database`, `bastion`, `vault`) - `labels` - Free-form key-value metadata for host categorization + - `ansible = "false"` - Exclude host from Ansible dynamic inventory **DNS options (`homelab.dns.*`):** - `enable` (default: `true`) - Include host in DNS zone generation diff --git a/ansible/README.md b/ansible/README.md new file mode 100644 index 0000000..c36e30d --- /dev/null +++ b/ansible/README.md @@ -0,0 +1,116 @@ +# Ansible Configuration + +This directory contains Ansible configuration for fleet management tasks. + +## Structure + +``` +ansible/ +├── ansible.cfg # Ansible configuration +├── inventory/ +│ ├── dynamic_flake.py # Dynamic inventory from NixOS flake +│ ├── static.yml # Non-flake hosts (Proxmox, etc.) +│ └── group_vars/ +│ └── all.yml # Common variables +└── playbooks/ + ├── build-and-deploy-template.yml + ├── provision-approle.yml + ├── restart-service.yml + └── run-upgrade.yml +``` + +## Usage + +The devshell automatically configures `ANSIBLE_CONFIG`, so commands work without extra flags: + +```bash +# List inventory groups +nix develop -c ansible-inventory --graph + +# List hosts in a specific group +nix develop -c ansible-inventory --list | jq '.role_dns' + +# Run a playbook +nix develop -c ansible-playbook ansible/playbooks/run-upgrade.yml -l tier_test +``` + +## Inventory + +The inventory combines dynamic and static sources automatically. + +### Dynamic Inventory (from flake) + +The `dynamic_flake.py` script extracts hosts from the NixOS flake using `homelab.host.*` options: + +**Groups generated:** +- `flake_hosts` - All NixOS hosts from the flake +- `tier_test`, `tier_prod` - By `homelab.host.tier` +- `role_dns`, `role_vault`, `role_monitoring`, etc. 
- By `homelab.host.role` + +**Host variables set:** +- `tier` - Deployment tier (test/prod) +- `role` - Host role +- `short_hostname` - Hostname without domain + +### Static Inventory + +Non-flake hosts are defined in `inventory/static.yml`: + +- `proxmox` - Proxmox hypervisors + +## Playbooks + +| Playbook | Description | Example | +|----------|-------------|---------| +| `run-upgrade.yml` | Trigger nixos-upgrade on hosts | `-l tier_prod` | +| `restart-service.yml` | Restart a systemd service | `-l role_dns -e service=unbound` | +| `provision-approle.yml` | Deploy Vault credentials (single host only) | `-l testvm01` | +| `build-and-deploy-template.yml` | Build and deploy Proxmox template | (no limit needed) | + +### Examples + +```bash +# Restart unbound on all DNS servers +nix develop -c ansible-playbook ansible/playbooks/restart-service.yml \ + -l role_dns -e service=unbound + +# Trigger upgrade on all test hosts +nix develop -c ansible-playbook ansible/playbooks/run-upgrade.yml -l tier_test + +# Provision Vault credentials for a specific host +nix develop -c ansible-playbook ansible/playbooks/provision-approle.yml -l testvm01 + +# Build and deploy Proxmox template +nix develop -c ansible-playbook ansible/playbooks/build-and-deploy-template.yml +``` + +## Excluding Flake Hosts + +To exclude a flake host from the dynamic inventory, add the `ansible = "false"` label in the host's configuration: + +```nix +homelab.host.labels.ansible = "false"; +``` + +Hosts with `homelab.dns.enable = false` are also excluded automatically. 
+ +## Adding Non-Flake Hosts + +Edit `inventory/static.yml` to add hosts not managed by the NixOS flake: + +```yaml +all: + children: + my_group: + hosts: + host1.example.com: + ansible_user: admin +``` + +## Common Variables + +Variables in `inventory/group_vars/all.yml` apply to all hosts: + +- `ansible_user` - Default SSH user (root) +- `domain` - Domain name (home.2rjus.net) +- `vault_addr` - Vault server URL diff --git a/ansible/ansible.cfg b/ansible/ansible.cfg new file mode 100644 index 0000000..686ad13 --- /dev/null +++ b/ansible/ansible.cfg @@ -0,0 +1,16 @@ +[defaults] +inventory = inventory/ +remote_user = root +host_key_checking = False + +# Reduce SSH connection overhead +forks = 10 +pipelining = True + +# Output formatting +stdout_callback = yaml +callbacks_enabled = profile_tasks + +[ssh_connection] +# Reuse SSH connections +ssh_args = -o ControlMaster=auto -o ControlPersist=60s diff --git a/ansible/inventory/dynamic_flake.py b/ansible/inventory/dynamic_flake.py new file mode 100755 index 0000000..b34c50d --- /dev/null +++ b/ansible/inventory/dynamic_flake.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python3 +""" +Dynamic Ansible inventory script that extracts host information from the NixOS flake. + +Generates groups: + - flake_hosts: All hosts defined in the flake + - tier_test, tier_prod: Hosts by deployment tier + - role_: Hosts by role (dns, vault, monitoring, etc.) 
+
+Usage:
+    ./dynamic_flake.py --list      # Return full inventory
+    ./dynamic_flake.py --host X    # Return host vars (not used, but required by Ansible)
+"""
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+
+def get_flake_dir() -> Path:
+    """Return the repository root, i.e. two levels above this script's directory."""
+    script_dir = Path(__file__).resolve().parent
+    # ansible/inventory/dynamic_flake.py -> repo root
+    return script_dir.parent.parent
+
+
+def evaluate_flake() -> dict:
+    """Run `nix eval --json` over nixosConfigurations and return per-host metadata; exits 1 on failure."""
+    flake_dir = get_flake_dir()
+
+    # Nix expression to extract relevant config from each host
+    nix_expr = """
+    configs: builtins.mapAttrs (name: cfg: {
+      hostname = cfg.config.networking.hostName;
+      domain = cfg.config.networking.domain or "home.2rjus.net";
+      tier = cfg.config.homelab.host.tier;
+      role = cfg.config.homelab.host.role;
+      labels = cfg.config.homelab.host.labels;
+      dns_enabled = cfg.config.homelab.dns.enable;
+    }) configs
+    """
+
+    try:
+        result = subprocess.run(
+            [
+                "nix",
+                "eval",
+                "--json",
+                f"{flake_dir}#nixosConfigurations",
+                "--apply",
+                nix_expr,
+            ],
+            capture_output=True,
+            text=True,
+            check=True,
+            cwd=flake_dir,
+        )
+        return json.loads(result.stdout)
+    except subprocess.CalledProcessError as e:
+        print(f"Error evaluating flake: {e.stderr}", file=sys.stderr)
+        sys.exit(1)
+    except json.JSONDecodeError as e:
+        print(f"Error parsing nix output: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+def sanitize_group_name(name: str) -> str:
+    """Sanitize a string for use as an Ansible group name.
+
+    Every character other than ASCII alphanumerics and underscores is replaced by an underscore.
+    """
+    return "".join(c if c.isascii() and (c.isalnum() or c == "_") else "_" for c in name)
+
+
+def build_inventory(hosts_data: dict) -> dict:
+    """Build the Ansible `--list` structure (groups plus `_meta.hostvars`) from evaluated host data."""
+    inventory = {
+        "_meta": {"hostvars": {}},
+        "flake_hosts": {"hosts": []},
+    }
+
+    # Track groups we need to create
+    tier_groups: dict[str, list[str]] = {}
+    role_groups: dict[str, list[str]] = {}
+
+    for _config_name, host_info in hosts_data.items():
+        hostname = host_info["hostname"]
+        domain = host_info["domain"] or "home.2rjus.net"  # Nix `or` does not catch a null domain
+        tier = host_info["tier"]
+        role = host_info["role"]
+        labels = host_info["labels"]
+        dns_enabled = host_info["dns_enabled"]
+
+        # Skip hosts that have DNS disabled (like templates)
+        if not dns_enabled:
+            continue
+
+        # Skip hosts with ansible = "false" label
+        if labels.get("ansible") == "false":
+            continue
+
+        fqdn = f"{hostname}.{domain}"
+
+        # Add to flake_hosts group
+        inventory["flake_hosts"]["hosts"].append(fqdn)
+
+        # Add host variables
+        inventory["_meta"]["hostvars"][fqdn] = {
+            "tier": tier,
+            "role": role,
+            "short_hostname": hostname,
+        }
+
+        # Group by tier
+        tier_group = f"tier_{sanitize_group_name(tier)}"
+        if tier_group not in tier_groups:
+            tier_groups[tier_group] = []
+        tier_groups[tier_group].append(fqdn)
+
+        # Group by role (if set)
+        if role:
+            role_group = f"role_{sanitize_group_name(role)}"
+            if role_group not in role_groups:
+                role_groups[role_group] = []
+            role_groups[role_group].append(fqdn)
+
+    # Add tier groups to inventory
+    for group_name, hosts in tier_groups.items():
+        inventory[group_name] = {"hosts": hosts}
+
+    # Add role groups to inventory
+    for group_name, hosts in role_groups.items():
+        inventory[group_name] = {"hosts": hosts}
+
+    return inventory
+
+
+def main():
+    if len(sys.argv) < 2:
+        print("Usage: dynamic_flake.py --list | --host <hostname>", file=sys.stderr)
+        sys.exit(1)
+
+    if sys.argv[1] == "--list":
+        hosts_data = evaluate_flake()
+        inventory = build_inventory(hosts_data)
+        print(json.dumps(inventory, indent=2))
+    elif sys.argv[1] == "--host":
+        # Ansible calls
this to get vars for a specific host + # We provide all vars in _meta.hostvars, so just return empty + print(json.dumps({})) + else: + print(f"Unknown option: {sys.argv[1]}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/ansible/inventory/group_vars/all.yml b/ansible/inventory/group_vars/all.yml new file mode 100644 index 0000000..7b3f3a6 --- /dev/null +++ b/ansible/inventory/group_vars/all.yml @@ -0,0 +1,5 @@ +# Common variables for all hosts + +ansible_user: root +domain: home.2rjus.net +vault_addr: https://vault01.home.2rjus.net:8200 diff --git a/ansible/inventory/static.yml b/ansible/inventory/static.yml new file mode 100644 index 0000000..03d6e66 --- /dev/null +++ b/ansible/inventory/static.yml @@ -0,0 +1,10 @@ +# Static inventory for non-flake hosts +# +# Hosts defined here are merged with the dynamic flake inventory. +# Use this for infrastructure that isn't managed by NixOS. + +all: + children: + proxmox: + hosts: + pve1.home.2rjus.net: diff --git a/playbooks/build-and-deploy-template.yml b/ansible/playbooks/build-and-deploy-template.yml similarity index 96% rename from playbooks/build-and-deploy-template.yml rename to ansible/playbooks/build-and-deploy-template.yml index 5cb9206..4631925 100644 --- a/playbooks/build-and-deploy-template.yml +++ b/ansible/playbooks/build-and-deploy-template.yml @@ -15,13 +15,13 @@ - name: Build NixOS image ansible.builtin.command: cmd: "nixos-rebuild build-image --image-variant proxmox --flake .#template2" - chdir: "{{ playbook_dir }}/.." + chdir: "{{ playbook_dir }}/../.." 
register: build_result changed_when: true - name: Find built image file ansible.builtin.find: - paths: "{{ playbook_dir}}/../result" + paths: "{{ playbook_dir}}/../../result" patterns: "*.vma.zst" recurse: true register: image_files @@ -105,7 +105,7 @@ gather_facts: false vars: - terraform_dir: "{{ playbook_dir }}/../terraform" + terraform_dir: "{{ playbook_dir }}/../../terraform" tasks: - name: Get image filename from earlier play diff --git a/playbooks/provision-approle.yml b/ansible/playbooks/provision-approle.yml similarity index 54% rename from playbooks/provision-approle.yml rename to ansible/playbooks/provision-approle.yml index d422e68..b9c12df 100644 --- a/playbooks/provision-approle.yml +++ b/ansible/playbooks/provision-approle.yml @@ -1,7 +1,27 @@ --- -# Provision OpenBao AppRole credentials to an existing host -# Usage: nix develop -c ansible-playbook playbooks/provision-approle.yml -e hostname=ha1 +# Provision OpenBao AppRole credentials to a host +# +# Usage: ansible-playbook ansible/playbooks/provision-approle.yml -l # Requires: BAO_ADDR and BAO_TOKEN environment variables set +# +# IMPORTANT: This playbook must target exactly one host to prevent +# accidentally regenerating credentials for multiple hosts. + +- name: Validate single host target + hosts: all + gather_facts: false + + tasks: + - name: Fail if targeting multiple hosts + ansible.builtin.fail: + msg: | + This playbook must target exactly one host. + Use: ansible-playbook provision-approle.yml -l + + Targeting multiple hosts would regenerate credentials for all of them, + potentially breaking existing services. 
+      when: ansible_play_hosts | length != 1
+      run_once: true
 
 - name: Fetch AppRole credentials from OpenBao
   hosts: localhost
@@ -9,18 +29,17 @@
   gather_facts: false
 
   vars:
-    vault_addr: "{{ lookup('env', 'BAO_ADDR') | default('https://vault01.home.2rjus.net:8200', true) }}"
-    domain: "home.2rjus.net"
+    target_host: "{{ query('ansible.builtin.inventory_hostnames', ansible_limit | default('all')) | first }}"  # groups['all'] ignores --limit; resolve the -l pattern instead
+    target_hostname: "{{ hostvars[target_host]['short_hostname'] | default(target_host.split('.')[0]) }}"
 
   tasks:
-    - name: Validate hostname is provided
-      ansible.builtin.fail:
-        msg: "hostname variable is required. Use: -e hostname="
-      when: hostname is not defined
+    - name: Display target host
+      ansible.builtin.debug:
+        msg: "Provisioning AppRole credentials for: {{ target_hostname }}"
 
     - name: Get role-id for host
       ansible.builtin.command:
-        cmd: "bao read -field=role_id auth/approle/role/{{ hostname }}/role-id"
+        cmd: "bao read -field=role_id auth/approle/role/{{ target_hostname }}/role-id"
       environment:
         BAO_ADDR: "{{ vault_addr }}"
         BAO_SKIP_VERIFY: "1"
@@ -29,25 +48,26 @@
 
     - name: Generate secret-id for host
       ansible.builtin.command:
-        cmd: "bao write -field=secret_id -f auth/approle/role/{{ hostname }}/secret-id"
+        cmd: "bao write -field=secret_id -f auth/approle/role/{{ target_hostname }}/secret-id"
       environment:
         BAO_ADDR: "{{ vault_addr }}"
         BAO_SKIP_VERIFY: "1"
       register: secret_id_result
       changed_when: true
 
-    - name: Add target host to inventory
-      ansible.builtin.add_host:
-        name: "{{ hostname }}.{{ domain }}"
-        groups: vault_target
-        ansible_user: root
+    - name: Store credentials for next play
+      ansible.builtin.set_fact:
         vault_role_id: "{{ role_id_result.stdout }}"
         vault_secret_id: "{{ secret_id_result.stdout }}"
 
 - name: Deploy AppRole credentials to host
-  hosts: vault_target
+  hosts: all
   gather_facts: false
 
+  vars:
+    vault_role_id: "{{ hostvars['localhost']['vault_role_id'] }}"
+    vault_secret_id: "{{ hostvars['localhost']['vault_secret_id'] }}"
+
   tasks:
     - name: Create AppRole directory
       ansible.builtin.file:
diff --git
a/ansible/playbooks/restart-service.yml b/ansible/playbooks/restart-service.yml new file mode 100644 index 0000000..458197a --- /dev/null +++ b/ansible/playbooks/restart-service.yml @@ -0,0 +1,40 @@ +--- +# Restart a systemd service on target hosts +# +# Usage examples: +# # Restart unbound on all DNS servers +# ansible-playbook restart-service.yml -l role_dns -e service=unbound +# +# # Restart nginx on a specific host +# ansible-playbook restart-service.yml -l http-proxy.home.2rjus.net -e service=nginx +# +# # Restart promtail on all prod hosts +# ansible-playbook restart-service.yml -l tier_prod -e service=promtail + +- name: Restart systemd service + hosts: all + gather_facts: false + + tasks: + - name: Validate service name provided + ansible.builtin.fail: + msg: | + The 'service' variable is required. + Usage: ansible-playbook restart-service.yml -l -e service= + + Examples: + -e service=nginx + -e service=unbound + -e service=promtail + when: service is not defined + run_once: true + + - name: Restart {{ service }} + ansible.builtin.systemd: + name: "{{ service }}" + state: restarted + register: restart_result + + - name: Display result + ansible.builtin.debug: + msg: "Service {{ service }} restarted on {{ inventory_hostname }}" diff --git a/playbooks/run-upgrade.yml b/ansible/playbooks/run-upgrade.yml similarity index 100% rename from playbooks/run-upgrade.yml rename to ansible/playbooks/run-upgrade.yml diff --git a/flake.nix b/flake.nix index 861f002..74f9eec 100644 --- a/flake.nix +++ b/flake.nix @@ -221,6 +221,7 @@ (pkgs.callPackage ./scripts/create-host { }) homelab-deploy.packages.${pkgs.system}.default ]; + ANSIBLE_CONFIG = "./ansible/ansible.cfg"; }; } ); diff --git a/hosts/template2/configuration.nix b/hosts/template2/configuration.nix index ea7def3..f4ef9d3 100644 --- a/hosts/template2/configuration.nix +++ b/hosts/template2/configuration.nix @@ -35,6 +35,7 @@ homelab.host = { tier = "test"; priority = "low"; + labels.ansible = "false"; # Exclude 
from Ansible inventory }; boot.loader.grub.enable = true; diff --git a/playbooks/inventory.ini b/playbooks/inventory.ini deleted file mode 100644 index d8c057d..0000000 --- a/playbooks/inventory.ini +++ /dev/null @@ -1,5 +0,0 @@ -[proxmox] -pve1.home.2rjus.net - -[proxmox:vars] -ansible_user=root