bootstrap: implement automated VM bootstrap mechanism for Phase 3 #7
53
TODO.md
53
TODO.md
@@ -105,32 +105,47 @@ create-host \
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### Phase 3: Bootstrap Mechanism
|
### Phase 3: Bootstrap Mechanism ✅ COMPLETED
|
||||||
|
|
||||||
|
**Status:** ✅ Fully implemented and tested
|
||||||
|
**Completed:** 2025-02-01
|
||||||
|
|
||||||
**Goal:** Get freshly deployed VM to apply its specific host configuration
|
**Goal:** Get freshly deployed VM to apply its specific host configuration
|
||||||
|
|
||||||
**Challenge:** Chicken-and-egg problem - VM needs to know its hostname and pull the right config
|
**Implementation:** Systemd oneshot service that runs on first boot after cloud-init
|
||||||
|
|
||||||
**Option A: Cloud-init bootstrap script**
|
**Approach taken:** Systemd oneshot service in the template image (a variant of the originally proposed Option A, the cloud-init bootstrap script)
|
||||||
- [ ] Add cloud-init `runcmd` to template2 that:
|
- Systemd service `nixos-bootstrap.service` runs on first boot
|
||||||
- [ ] Reads hostname from cloud-init metadata
|
- Depends on `cloud-config.service` to ensure hostname is set
|
||||||
- [ ] Runs `nixos-rebuild boot --flake git+https://git.t-juice.club/torjus/nixos-servers.git#${hostname}`
|
- Reads hostname from `hostnamectl` (set by cloud-init via Terraform)
|
||||||
- [ ] Reboots into the new configuration
|
- Runs `nixos-rebuild boot --flake git+https://git.t-juice.club/torjus/nixos-servers.git#${hostname}`
|
||||||
- [ ] Test cloud-init script execution on fresh VM
|
- Reboots into new configuration on success
|
||||||
- [ ] Handle failure cases (flake doesn't exist, network issues)
|
- Fails gracefully without reboot on errors (network issues, missing config)
|
||||||
|
- Service removes itself after a successful bootstrap: it is defined only in template2, so it is absent from the host-specific configuration the VM reboots into
|
||||||
|
|
||||||
**Option B: Terraform provisioner**
|
**Tasks:**
|
||||||
- [ ] Use OpenTofu's `remote-exec` provisioner
|
- [x] Create bootstrap service module in template2
|
||||||
- [ ] SSH into new VM after creation
|
- [x] systemd oneshot service with proper dependencies
|
||||||
- [ ] Run `nixos-rebuild boot --flake <url>#<hostname>`
|
- [x] Reads hostname from hostnamectl (cloud-init sets it)
|
||||||
- [ ] Trigger reboot via SSH
|
- [x] Checks network connectivity via HTTPS (curl)
|
||||||
|
- [x] Runs nixos-rebuild boot with flake URL
|
||||||
|
- [x] Reboots on success, fails gracefully on error
|
||||||
|
- [x] Configure cloud-init datasource
|
||||||
|
- [x] Use ConfigDrive datasource (Proxmox provider)
|
||||||
|
- [x] Add cloud-init disk to Terraform VMs (disks.ide.ide2.cloudinit)
|
||||||
|
- [x] Hostname passed via cloud-init user-data from Terraform
|
||||||
|
- [x] Test bootstrap service execution on fresh VM
|
||||||
|
- [x] Handle failure cases (flake doesn't exist, network issues)
|
||||||
|
- [x] Clear error messages in journald
|
||||||
|
- [x] No reboot on failure
|
||||||
|
- [x] System remains accessible for debugging
|
||||||
|
|
||||||
**Option C: Two-stage deployment**
|
**Files:**
|
||||||
- [ ] Deploy VM with template2 (minimal config)
|
- `hosts/template2/bootstrap.nix` - Bootstrap service definition
|
||||||
- [ ] Run Ansible playbook to bootstrap specific config
|
- `hosts/template2/configuration.nix` - Cloud-init ConfigDrive datasource
|
||||||
- [ ] Similar to existing `run-upgrade.yml` pattern
|
- `terraform/vms.tf` - Cloud-init disk configuration
|
||||||
|
|
||||||
**Decision needed:** Which approach fits best? (Recommend Option A for automation)
|
**Deliverable:** ✅ VMs automatically bootstrap and reboot into host-specific configuration on first boot
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
66
hosts/template2/bootstrap.nix
Normal file
66
hosts/template2/bootstrap.nix
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
# First-boot bootstrap module for the template image.
#
# Defines a oneshot systemd service that, once cloud-init has applied its
# configuration, reads the machine's hostname, builds the matching NixOS
# configuration from the flake and reboots into it. On any failure the
# service exits non-zero WITHOUT rebooting so the VM stays reachable for
# debugging.
{ pkgs, config, lib, ... }:
let
  # Hostname configured for the image this module is part of. If the running
  # hostname still equals this value, cloud-init did not assign a
  # host-specific name.
  # NOTE(review): presumably rebuilding for the template's own name would
  # reboot straight back into this service (it is wantedBy multi-user.target)
  # and loop forever - confirm against the flake's outputs.
  templateHostname = config.networking.hostName;

  bootstrap-script = pkgs.writeShellApplication {
    name = "nixos-bootstrap";
    runtimeInputs = with pkgs; [ systemd curl nixos-rebuild jq git ];
    # writeShellApplication already enables errexit, nounset and pipefail,
    # so the script body does not need its own `set -euo pipefail`.
    text = ''
      # Read hostname set by cloud-init (from Terraform VM name via user-data)
      # Cloud-init sets the system hostname from user-data.txt, so we read it from hostnamectl
      HOSTNAME=$(hostnamectl hostname)
      TEMPLATE_HOSTNAME=${lib.escapeShellArg templateHostname}
      echo "DEBUG: Hostname from hostnamectl: '$HOSTNAME'"

      # Refuse to continue without a host-specific name: an empty name gives
      # an invalid flake attribute, and the template's own name would just
      # rebuild the template.
      if [ -z "$HOSTNAME" ] || [ "$HOSTNAME" = "$TEMPLATE_HOSTNAME" ]; then
        echo "ERROR: Hostname '$HOSTNAME' is empty or still the template hostname"
        echo "Check that cloud-init received user-data and set the hostname"
        echo "Manual intervention required - system will not reboot"
        exit 1
      fi

      echo "Starting NixOS bootstrap for host: $HOSTNAME"
      echo "Waiting for network connectivity..."

      # Verify we can reach the git server via HTTPS (doesn't respond to ping).
      # network-online.target does not guarantee DNS or routing are usable
      # yet, so retry for about a minute before giving up.
      attempts_left=12
      until curl -s --connect-timeout 5 --max-time 10 https://git.t-juice.club >/dev/null 2>&1; do
        attempts_left=$((attempts_left - 1))
        if [ "$attempts_left" -le 0 ]; then
          echo "ERROR: Cannot reach git.t-juice.club via HTTPS"
          echo "Check network configuration and DNS settings"
          exit 1
        fi
        sleep 5
      done

      echo "Network connectivity confirmed"
      echo "Fetching and building NixOS configuration from flake..."

      # Build the host-specific configuration and make it the boot default
      FLAKE_URL="git+https://git.t-juice.club/torjus/nixos-servers.git#''${HOSTNAME}"

      if nixos-rebuild boot --flake "$FLAKE_URL"; then
        echo "Successfully built configuration for $HOSTNAME"
        echo "Rebooting into new configuration..."
        # Short pause before rebooting (kept from the original script).
        sleep 2
        systemctl reboot
      else
        echo "ERROR: nixos-rebuild failed for $HOSTNAME"
        echo "Check that flake has configuration for this hostname"
        echo "Manual intervention required - system will not reboot"
        exit 1
      fi
    '';
  };
in
{
  systemd.services."nixos-bootstrap" = {
    description = "Bootstrap NixOS configuration from flake on first boot";

    # Wait for cloud-init to finish setting hostname and network to be online
    after = [ "cloud-config.service" "network-online.target" ];
    wants = [ "network-online.target" ];
    # Hard dependency: without cloud-init's config stage the hostname is not
    # the one assigned by Terraform, so bootstrapping would be pointless.
    requires = [ "cloud-config.service" ];

    # Run on boot
    wantedBy = [ "multi-user.target" ];

    serviceConfig = {
      Type = "oneshot";
      # Keep the unit "active" after the script exits so it is not started
      # again during the same boot.
      RemainAfterExit = true;
      ExecStart = "${bootstrap-script}/bin/nixos-bootstrap";

      # Log to journald and mirror to the console so failures are visible on
      # the VM console.
      StandardOutput = "journal+console";
      StandardError = "journal+console";
    };
  };
}
|
||||||
@@ -27,14 +27,9 @@
|
|||||||
proxmox.cloudInit.defaultStorage = lib.mkForce "local-zfs";
|
proxmox.cloudInit.defaultStorage = lib.mkForce "local-zfs";
|
||||||
};
|
};
|
||||||
|
|
||||||
# Configure cloud-init to only use NoCloud datasource (no EC2 metadata service)
|
# Configure cloud-init to try the ConfigDrive datasource first (used by Proxmox), then NoCloud
|
||||||
services.cloud-init.settings = {
|
services.cloud-init.settings = {
|
||||||
datasource_list = [ "NoCloud" ];
|
datasource_list = [ "ConfigDrive" "NoCloud" ];
|
||||||
datasource = {
|
|
||||||
NoCloud = {
|
|
||||||
fs_label = "cidata";
|
|
||||||
};
|
|
||||||
};
|
|
||||||
};
|
};
|
||||||
|
|
||||||
boot.loader.grub.enable = true;
|
boot.loader.grub.enable = true;
|
||||||
|
|||||||
@@ -4,6 +4,7 @@
|
|||||||
./hardware-configuration.nix
|
./hardware-configuration.nix
|
||||||
./configuration.nix
|
./configuration.nix
|
||||||
./scripts.nix
|
./scripts.nix
|
||||||
|
./bootstrap.nix
|
||||||
../../system/packages.nix
|
../../system/packages.nix
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ locals {
|
|||||||
|
|
||||||
# Example Minimal VM using all defaults (uncomment to deploy):
|
# Example Minimal VM using all defaults (uncomment to deploy):
|
||||||
# "minimal-vm" = {}
|
# "minimal-vm" = {}
|
||||||
|
# "bootstrap-verify-test" = {}
|
||||||
}
|
}
|
||||||
|
|
||||||
# Compute VM configurations with defaults applied
|
# Compute VM configurations with defaults applied
|
||||||
@@ -86,6 +87,13 @@ resource "proxmox_vm_qemu" "vm" {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
ide {
|
||||||
|
ide2 {
|
||||||
|
cloudinit {
|
||||||
|
storage = each.value.storage
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
# Start on boot
|
# Start on boot
|
||||||
|
|||||||
Reference in New Issue
Block a user