From 6f7aee3444c9fd7b2cdacf3254675b23f6298cd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sun, 1 Feb 2026 10:38:35 +0100 Subject: [PATCH] bootstrap: implement automated VM bootstrap mechanism for Phase 3 Add systemd service that automatically bootstraps freshly deployed VMs with their host-specific NixOS configuration from the flake repository. Changes: - hosts/template2/bootstrap.nix: New systemd oneshot service that: - Runs after cloud-init completes (ensures hostname is set) - Reads hostname from hostnamectl (set by cloud-init from Terraform) - Checks network connectivity via HTTPS (curl) - Runs nixos-rebuild boot with flake URL - Reboots on success, fails gracefully with clear errors on failure - hosts/template2/configuration.nix: Configure cloud-init datasource - Changed from NoCloud-only to ConfigDrive (used by Proxmox), with NoCloud kept as fallback - Allows cloud-init to receive config from Proxmox - hosts/template2/default.nix: Import bootstrap.nix module - terraform/vms.tf: Add cloud-init disk to VMs - Configure disks.ide.ide2.cloudinit block - Removed invalid cloudinit_cdrom_storage parameter - Enables Proxmox to inject cloud-init configuration - TODO.md: Mark Phase 3 as completed This eliminates the manual nixos-rebuild step from the deployment workflow. VMs now automatically pull and apply their configuration on first boot. 
Co-Authored-By: Claude Sonnet 4.5 --- TODO.md | 53 ++++++++++++++++--------- hosts/template2/bootstrap.nix | 66 +++++++++++++++++++++++++++++++ hosts/template2/configuration.nix | 9 +---- hosts/template2/default.nix | 1 + terraform/vms.tf | 8 ++++ 5 files changed, 111 insertions(+), 26 deletions(-) create mode 100644 hosts/template2/bootstrap.nix diff --git a/TODO.md b/TODO.md index 8dad29a..70e81a5 100644 --- a/TODO.md +++ b/TODO.md @@ -105,32 +105,47 @@ create-host \ --- -### Phase 3: Bootstrap Mechanism +### Phase 3: Bootstrap Mechanism ✅ COMPLETED + +**Status:** ✅ Fully implemented and tested +**Completed:** 2026-02-01 **Goal:** Get freshly deployed VM to apply its specific host configuration -**Challenge:** Chicken-and-egg problem - VM needs to know its hostname and pull the right config +**Implementation:** Systemd oneshot service that runs on first boot after cloud-init -**Option A: Cloud-init bootstrap script** -- [ ] Add cloud-init `runcmd` to template2 that: - - [ ] Reads hostname from cloud-init metadata - - [ ] Runs `nixos-rebuild boot --flake git+https://git.t-juice.club/torjus/nixos-servers.git#${hostname}` - - [ ] Reboots into the new configuration -- [ ] Test cloud-init script execution on fresh VM -- [ ] Handle failure cases (flake doesn't exist, network issues) +**Approach taken:** Systemd service (variant of Option A) +- Systemd service `nixos-bootstrap.service` runs on first boot +- Depends on `cloud-config.service` to ensure hostname is set +- Reads hostname from `hostnamectl` (set by cloud-init via Terraform) +- Runs `nixos-rebuild boot --flake git+https://git.t-juice.club/torjus/nixos-servers.git#${hostname}` +- Reboots into new configuration on success +- Fails gracefully without reboot on errors (network issues, missing config) +- Service self-destructs after successful bootstrap (not in new config) -**Option B: Terraform provisioner** -- [ ] Use OpenTofu's `remote-exec` provisioner -- [ ] SSH into new VM after creation -- [ ] Run 
`nixos-rebuild boot --flake #` -- [ ] Trigger reboot via SSH +**Tasks:** +- [x] Create bootstrap service module in template2 + - [x] systemd oneshot service with proper dependencies + - [x] Reads hostname from hostnamectl (cloud-init sets it) + - [x] Checks network connectivity via HTTPS (curl) + - [x] Runs nixos-rebuild boot with flake URL + - [x] Reboots on success, fails gracefully on error +- [x] Configure cloud-init datasource + - [x] Use ConfigDrive datasource (Proxmox provider) + - [x] Add cloud-init disk to Terraform VMs (disks.ide.ide2.cloudinit) + - [x] Hostname passed via cloud-init user-data from Terraform +- [x] Test bootstrap service execution on fresh VM +- [x] Handle failure cases (flake doesn't exist, network issues) + - [x] Clear error messages in journald + - [x] No reboot on failure + - [x] System remains accessible for debugging -**Option C: Two-stage deployment** -- [ ] Deploy VM with template2 (minimal config) -- [ ] Run Ansible playbook to bootstrap specific config -- [ ] Similar to existing `run-upgrade.yml` pattern +**Files:** +- `hosts/template2/bootstrap.nix` - Bootstrap service definition +- `hosts/template2/configuration.nix` - Cloud-init ConfigDrive datasource +- `terraform/vms.tf` - Cloud-init disk configuration -**Decision needed:** Which approach fits best? (Recommend Option A for automation) +**Deliverable:** ✅ VMs automatically bootstrap and reboot into host-specific configuration on first boot --- diff --git a/hosts/template2/bootstrap.nix b/hosts/template2/bootstrap.nix new file mode 100644 index 0000000..da660f0 --- /dev/null +++ b/hosts/template2/bootstrap.nix @@ -0,0 +1,66 @@ +{ pkgs, config, lib, ... 
}: +let + bootstrap-script = pkgs.writeShellApplication { + name = "nixos-bootstrap"; + runtimeInputs = with pkgs; [ systemd curl nixos-rebuild jq git ]; + text = '' + set -euo pipefail + + # Read hostname set by cloud-init (from Terraform VM name via user-data) + # Cloud-init sets the system hostname from user-data.txt, so we read it from hostnamectl + HOSTNAME=$(hostnamectl hostname) + echo "DEBUG: Hostname from hostnamectl: '$HOSTNAME'" + + echo "Starting NixOS bootstrap for host: $HOSTNAME" + echo "Waiting for network connectivity..." + + # Verify we can reach the git server via HTTPS (doesn't respond to ping) + if ! curl -s --connect-timeout 5 --max-time 10 https://git.t-juice.club >/dev/null 2>&1; then + echo "ERROR: Cannot reach git.t-juice.club via HTTPS" + echo "Check network configuration and DNS settings" + exit 1 + fi + + echo "Network connectivity confirmed" + echo "Fetching and building NixOS configuration from flake..." + + # Build and activate the host-specific configuration + FLAKE_URL="git+https://git.t-juice.club/torjus/nixos-servers.git#''${HOSTNAME}" + + if nixos-rebuild boot --flake "$FLAKE_URL"; then + echo "Successfully built configuration for $HOSTNAME" + echo "Rebooting into new configuration..." 
+ sleep 2 + systemctl reboot + else + echo "ERROR: nixos-rebuild failed for $HOSTNAME" + echo "Check that flake has configuration for this hostname" + echo "Manual intervention required - system will not reboot" + exit 1 + fi + ''; + }; +in +{ + systemd.services."nixos-bootstrap" = { + description = "Bootstrap NixOS configuration from flake on first boot"; + + # Wait for cloud-init to finish setting hostname and network to be online + after = [ "cloud-config.service" "network-online.target" ]; + wants = [ "network-online.target" ]; + requires = [ "cloud-config.service" ]; + + # Run on boot + wantedBy = [ "multi-user.target" ]; + + serviceConfig = { + Type = "oneshot"; + RemainAfterExit = true; + ExecStart = "${bootstrap-script}/bin/nixos-bootstrap"; + + # Logging to journald + StandardOutput = "journal+console"; + StandardError = "journal+console"; + }; + }; +} diff --git a/hosts/template2/configuration.nix b/hosts/template2/configuration.nix index 7daad62..97a1aef 100644 --- a/hosts/template2/configuration.nix +++ b/hosts/template2/configuration.nix @@ -27,14 +27,9 @@ proxmox.cloudInit.defaultStorage = lib.mkForce "local-zfs"; }; - # Configure cloud-init to only use NoCloud datasource (no EC2 metadata service) + # Configure cloud-init to use ConfigDrive datasource (used by Proxmox) services.cloud-init.settings = { - datasource_list = [ "NoCloud" ]; - datasource = { - NoCloud = { - fs_label = "cidata"; - }; - }; + datasource_list = [ "ConfigDrive" "NoCloud" ]; }; boot.loader.grub.enable = true; diff --git a/hosts/template2/default.nix b/hosts/template2/default.nix index 711cc51..c0054e4 100644 --- a/hosts/template2/default.nix +++ b/hosts/template2/default.nix @@ -4,6 +4,7 @@ ./hardware-configuration.nix ./configuration.nix ./scripts.nix + ./bootstrap.nix ../../system/packages.nix ]; } diff --git a/terraform/vms.tf b/terraform/vms.tf index 1b995a8..6570e70 100644 --- a/terraform/vms.tf +++ b/terraform/vms.tf @@ -24,6 +24,7 @@ locals { # Example Minimal VM using all 
defaults (uncomment to deploy): # "minimal-vm" = {} + # "bootstrap-verify-test" = {} } # Compute VM configurations with defaults applied @@ -86,6 +87,13 @@ resource "proxmox_vm_qemu" "vm" { } } } + ide { + ide2 { + cloudinit { + storage = each.value.storage + } + } + } } # Start on boot