From a1ae766eb8e2ac3309a7de3525e4172ccde236ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sat, 7 Feb 2026 14:49:58 +0100 Subject: [PATCH 01/11] template2: show bootstrap progress on tty1 - Display bootstrap banner and live progress on tty1 instead of login prompt - Add custom getty greeting on other ttys indicating this is a bootstrap image - Disable getty on tty1 during bootstrap so output is visible Co-Authored-By: Claude Opus 4.5 --- hosts/template2/bootstrap.nix | 37 +++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/hosts/template2/bootstrap.nix b/hosts/template2/bootstrap.nix index 8aac949..07bd829 100644 --- a/hosts/template2/bootstrap.nix +++ b/hosts/template2/bootstrap.nix @@ -6,11 +6,18 @@ let text = '' set -euo pipefail + # Clear screen and show bootstrap banner + clear + echo "================================================================================" + echo " NIXOS BOOTSTRAP IN PROGRESS" + echo "================================================================================" + echo "" + # Read hostname set by cloud-init (from Terraform VM name via user-data) # Cloud-init sets the system hostname from user-data.txt, so we read it from hostnamectl HOSTNAME=$(hostnamectl hostname) - echo "DEBUG: Hostname from hostnamectl: '$HOSTNAME'" - + echo "Hostname: $HOSTNAME" + echo "" echo "Starting NixOS bootstrap for host: $HOSTNAME" echo "Waiting for network connectivity..." @@ -93,6 +100,21 @@ let }; in { + # Custom greeting line to indicate this is a bootstrap image + services.getty.greetingLine = lib.mkForce '' + ================================================================================ + BOOTSTRAP IMAGE - NixOS \V (\l) + ================================================================================ + + Bootstrap service is running. Logs are displayed on tty1. 
+ Check status: journalctl -fu nixos-bootstrap + ''; + + # Disable getty on tty1 so bootstrap output is visible + systemd.services."getty@tty1" = { + enable = false; + }; + systemd.services."nixos-bootstrap" = { description = "Bootstrap NixOS configuration from flake on first boot"; @@ -107,14 +129,17 @@ in serviceConfig = { Type = "oneshot"; RemainAfterExit = true; - ExecStart = "${bootstrap-script}/bin/nixos-bootstrap"; + ExecStart = lib.getExe bootstrap-script; # Read environment variables from cloud-init (set by cloud-init write_files) EnvironmentFile = "-/run/cloud-init-env"; - # Logging to journald - StandardOutput = "journal+console"; - StandardError = "journal+console"; + # Output to tty1 for visibility on console + StandardOutput = "tty"; + StandardError = "tty"; + TTYPath = "/dev/tty1"; + TTYReset = true; + TTYVHangup = true; }; }; } -- 2.49.1 From 6a3a51407e4d02bfcef02cb66d7fa0a2b3bc3337 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sat, 7 Feb 2026 14:55:58 +0100 Subject: [PATCH 02/11] playbooks: auto-update terraform template name after deploy Add a third play to build-and-deploy-template.yml that updates terraform/variables.tf with the new template name after deploying to Proxmox. Only updates if the template name has changed. 
Co-Authored-By: Claude Opus 4.5 --- playbooks/build-and-deploy-template.yml | 45 +++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/playbooks/build-and-deploy-template.yml b/playbooks/build-and-deploy-template.yml index fdf29bd..5cb9206 100644 --- a/playbooks/build-and-deploy-template.yml +++ b/playbooks/build-and-deploy-template.yml @@ -99,3 +99,48 @@ - name: Display success message ansible.builtin.debug: msg: "Template VM {{ template_vmid }} created successfully on {{ storage }}" + +- name: Update Terraform template name + hosts: localhost + gather_facts: false + + vars: + terraform_dir: "{{ playbook_dir }}/../terraform" + + tasks: + - name: Get image filename from earlier play + ansible.builtin.set_fact: + image_filename: "{{ hostvars['localhost']['image_filename'] }}" + + - name: Extract template name from image filename + ansible.builtin.set_fact: + new_template_name: "{{ image_filename | regex_replace('\\.vma\\.zst$', '') | regex_replace('^vzdump-qemu-', '') }}" + + - name: Read current Terraform variables file + ansible.builtin.slurp: + src: "{{ terraform_dir }}/variables.tf" + register: variables_tf_content + + - name: Extract current template name from variables.tf + ansible.builtin.set_fact: + current_template_name: "{{ (variables_tf_content.content | b64decode) | regex_search('variable \"default_template_name\"[^}]+default\\s*=\\s*\"([^\"]+)\"', '\\1') | first }}" + + - name: Check if template name has changed + ansible.builtin.set_fact: + template_name_changed: "{{ current_template_name != new_template_name }}" + + - name: Display template name status + ansible.builtin.debug: + msg: "Template name: {{ current_template_name }} -> {{ new_template_name }} ({{ 'changed' if template_name_changed else 'unchanged' }})" + + - name: Update default_template_name in variables.tf + ansible.builtin.replace: + path: "{{ terraform_dir }}/variables.tf" + regexp: '(variable "default_template_name"[^}]+default\s*=\s*)"[^"]+"' + replace: '\1"{{ 
new_template_name }}"' + when: template_name_changed + + - name: Display update result + ansible.builtin.debug: + msg: "Updated terraform/variables.tf with new template name: {{ new_template_name }}" + when: template_name_changed -- 2.49.1 From 0cf72ec19117b4954c4297f4a92a48e46ae37613 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sat, 7 Feb 2026 15:02:16 +0100 Subject: [PATCH 03/11] terraform: update template to nixos-25.11.20260203.e576e3c Co-Authored-By: Claude Opus 4.5 --- terraform/variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/variables.tf b/terraform/variables.tf index f0fae47..2006a4b 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -33,7 +33,7 @@ variable "default_target_node" { variable "default_template_name" { description = "Default template VM name to clone from" type = string - default = "nixos-25.11.20260131.41e216c" + default = "nixos-25.11.20260203.e576e3c" } variable "default_ssh_public_key" { -- 2.49.1 From 78e8d7a60036833eafcfa015c780bef0918aba39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sat, 7 Feb 2026 15:10:25 +0100 Subject: [PATCH 04/11] template2: add ncurses for clear command in bootstrap Co-Authored-By: Claude Opus 4.5 --- hosts/template2/bootstrap.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hosts/template2/bootstrap.nix b/hosts/template2/bootstrap.nix index 07bd829..441185c 100644 --- a/hosts/template2/bootstrap.nix +++ b/hosts/template2/bootstrap.nix @@ -2,7 +2,7 @@ let bootstrap-script = pkgs.writeShellApplication { name = "nixos-bootstrap"; - runtimeInputs = with pkgs; [ systemd curl nixos-rebuild jq git ]; + runtimeInputs = with pkgs; [ systemd curl nixos-rebuild jq git ncurses ]; text = '' set -euo pipefail -- 2.49.1 From 4ca3c8890f3c40db3ca6bd380ad03b062e8cea7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sat, 7 Feb 2026 15:14:57 +0100 Subject: [PATCH 05/11] 
terraform: add flake_branch and token for testvm01 Co-Authored-By: Claude Opus 4.5 --- terraform/vms.tf | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/terraform/vms.tf b/terraform/vms.tf index 5d33a73..5186314 100644 --- a/terraform/vms.tf +++ b/terraform/vms.tf @@ -39,10 +39,12 @@ locals { flake_branch = "vault-setup" # Bootstrap from this branch instead of master } "testvm01" = { - ip = "10.69.13.20/24" - cpu_cores = 2 - memory = 2048 - disk_size = "20G" + ip = "10.69.13.20/24" + cpu_cores = 2 + memory = 2048 + disk_size = "20G" + flake_branch = "improve-bootstrap-visibility" + vault_wrapped_token = "s.l88eUKBlieWrNrQrlO76uBnM" } "testvm02" = { ip = "10.69.13.21/24" -- 2.49.1 From 11261c46361be2c1735d1d0f3c74e8fd93ba030d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sat, 7 Feb 2026 15:24:39 +0100 Subject: [PATCH 06/11] template2: revert to journal+console output for bootstrap TTY output was causing nixos-rebuild to fail. Keep the custom greeting line to indicate bootstrap image, but use journal+console for reliable logging. 
Co-Authored-By: Claude Opus 4.5 --- hosts/template2/bootstrap.nix | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/hosts/template2/bootstrap.nix b/hosts/template2/bootstrap.nix index 441185c..7db16ca 100644 --- a/hosts/template2/bootstrap.nix +++ b/hosts/template2/bootstrap.nix @@ -2,12 +2,10 @@ let bootstrap-script = pkgs.writeShellApplication { name = "nixos-bootstrap"; - runtimeInputs = with pkgs; [ systemd curl nixos-rebuild jq git ncurses ]; + runtimeInputs = with pkgs; [ systemd curl nixos-rebuild jq git ]; text = '' set -euo pipefail - # Clear screen and show bootstrap banner - clear echo "================================================================================" echo " NIXOS BOOTSTRAP IN PROGRESS" echo "================================================================================" @@ -110,11 +108,6 @@ in Check status: journalctl -fu nixos-bootstrap ''; - # Disable getty on tty1 so bootstrap output is visible - systemd.services."getty@tty1" = { - enable = false; - }; - systemd.services."nixos-bootstrap" = { description = "Bootstrap NixOS configuration from flake on first boot"; @@ -134,12 +127,9 @@ in # Read environment variables from cloud-init (set by cloud-init write_files) EnvironmentFile = "-/run/cloud-init-env"; - # Output to tty1 for visibility on console - StandardOutput = "tty"; - StandardError = "tty"; - TTYPath = "/dev/tty1"; - TTYReset = true; - TTYVHangup = true; + # Log to journal and console + StandardOutput = "journal+console"; + StandardError = "journal+console"; }; }; } -- 2.49.1 From ae3039af195c9b65ea893a4880c2a47e4d46183e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sat, 7 Feb 2026 15:34:47 +0100 Subject: [PATCH 07/11] template2: send bootstrap status to Loki for remote monitoring Adds log_to_loki function that pushes structured log entries to Loki at key bootstrap stages (starting, network_ok, vault_*, building, success, failed). 
Enables querying bootstrap state via LogQL without console access. Co-Authored-By: Claude Opus 4.5 --- hosts/template2/bootstrap.nix | 54 +++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/hosts/template2/bootstrap.nix b/hosts/template2/bootstrap.nix index 7db16ca..ea2d107 100644 --- a/hosts/template2/bootstrap.nix +++ b/hosts/template2/bootstrap.nix @@ -6,6 +6,43 @@ let text = '' set -euo pipefail + LOKI_URL="http://monitoring01.home.2rjus.net:3100/loki/api/v1/push" + + # Send a log entry to Loki with bootstrap status + # Usage: log_to_loki <stage> <message> + # Fails silently if Loki is unreachable + log_to_loki() { + local stage="$1" + local message="$2" + local timestamp_ns + timestamp_ns="$(date +%s)000000000" + + local payload + payload=$(jq -n \ + --arg host "$HOSTNAME" \ + --arg stage "$stage" \ + --arg branch "''${BRANCH:-master}" \ + --arg ts "$timestamp_ns" \ + --arg msg "$message" \ + '{ + streams: [{ + stream: { + job: "bootstrap", + host: $host, + stage: $stage, + branch: $branch + }, + values: [[$ts, $msg]] + }] + }') + + curl -s --connect-timeout 2 --max-time 5 \ + -X POST \ + -H "Content-Type: application/json" \ + -d "$payload" \ + "$LOKI_URL" >/dev/null 2>&1 || true + } + echo "================================================================================" echo " NIXOS BOOTSTRAP IN PROGRESS" echo "================================================================================" @@ -14,19 +51,27 @@ let # Read hostname set by cloud-init (from Terraform VM name via user-data) # Cloud-init sets the system hostname from user-data.txt, so we read it from hostnamectl HOSTNAME=$(hostnamectl hostname) + # Read git branch from environment, default to master + BRANCH="''${NIXOS_FLAKE_BRANCH:-master}" + echo "Hostname: $HOSTNAME" echo "" echo "Starting NixOS bootstrap for host: $HOSTNAME" + + log_to_loki "starting" "Bootstrap starting for $HOSTNAME (branch: $BRANCH)" + echo "Waiting for network connectivity..." 
# Verify we can reach the git server via HTTPS (doesn't respond to ping) if ! curl -s --connect-timeout 5 --max-time 10 https://git.t-juice.club >/dev/null 2>&1; then echo "ERROR: Cannot reach git.t-juice.club via HTTPS" echo "Check network configuration and DNS settings" + log_to_loki "failed" "Network check failed - cannot reach git.t-juice.club" exit 1 fi echo "Network connectivity confirmed" + log_to_loki "network_ok" "Network connectivity confirmed" # Unwrap Vault token and store AppRole credentials (if provided) if [ -n "''${VAULT_WRAPPED_TOKEN:-}" ]; then @@ -55,6 +100,7 @@ let chmod 600 /var/lib/vault/approle/secret-id echo "Vault credentials unwrapped and stored successfully" + log_to_loki "vault_ok" "Vault credentials unwrapped and stored" else echo "WARNING: Failed to unwrap Vault token" if [ -n "$UNWRAP_RESPONSE" ]; then @@ -68,17 +114,17 @@ let echo "To regenerate token, run: create-host --hostname $HOSTNAME --force" echo "" echo "Vault secrets will not be available, but continuing bootstrap..." + log_to_loki "vault_warn" "Failed to unwrap Vault token - continuing without secrets" fi else echo "No Vault wrapped token provided (VAULT_WRAPPED_TOKEN not set)" echo "Skipping Vault credential setup" + log_to_loki "vault_skip" "No Vault token provided - skipping credential setup" fi echo "Fetching and building NixOS configuration from flake..." - - # Read git branch from environment, default to master - BRANCH="''${NIXOS_FLAKE_BRANCH:-master}" echo "Using git branch: $BRANCH" + log_to_loki "building" "Starting nixos-rebuild boot" # Build and activate the host-specific configuration FLAKE_URL="git+https://git.t-juice.club/torjus/nixos-servers.git?ref=$BRANCH#''${HOSTNAME}" @@ -86,12 +132,14 @@ let if nixos-rebuild boot --flake "$FLAKE_URL"; then echo "Successfully built configuration for $HOSTNAME" echo "Rebooting into new configuration..." 
+ log_to_loki "success" "Build successful - rebooting into new configuration" sleep 2 systemctl reboot else echo "ERROR: nixos-rebuild failed for $HOSTNAME" echo "Check that flake has configuration for this hostname" echo "Manual intervention required - system will not reboot" + log_to_loki "failed" "nixos-rebuild failed - manual intervention required" exit 1 fi ''; -- 2.49.1 From 09c9df1bbed9971fcc1aef04fd89be64b0a4b43d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sat, 7 Feb 2026 15:36:25 +0100 Subject: [PATCH 08/11] terraform: regenerate wrapped token for testvm01 Co-Authored-By: Claude Opus 4.5 --- terraform/vms.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/vms.tf b/terraform/vms.tf index 5186314..f1fe1c8 100644 --- a/terraform/vms.tf +++ b/terraform/vms.tf @@ -44,7 +44,7 @@ locals { memory = 2048 disk_size = "20G" flake_branch = "improve-bootstrap-visibility" - vault_wrapped_token = "s.l88eUKBlieWrNrQrlO76uBnM" + vault_wrapped_token = "s.l5q88wzXfEcr5SMDHmO6o96b" } "testvm02" = { ip = "10.69.13.21/24" -- 2.49.1 From a90d9c33d592afec8bbec9cdc24c0446daab6e3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sat, 7 Feb 2026 15:39:56 +0100 Subject: [PATCH 09/11] CLAUDE.md: prefer nix develop -c for devshell commands Co-Authored-By: Claude Opus 4.5 --- CLAUDE.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/CLAUDE.md b/CLAUDE.md index 809fdb7..c5a6912 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -61,10 +61,21 @@ Do not run `nix flake update`. Should only be done manually by user. ### Development Environment ```bash -# Enter development shell (provides ansible, python3) +# Enter development shell nix develop ``` +The devshell provides: `ansible`, `tofu` (OpenTofu), `vault` (OpenBao CLI), `create-host`, and `homelab-deploy`. 
+ +**Important:** When suggesting commands that use devshell tools, always use `nix develop -c <command>` syntax rather than assuming the user is already in a devshell. For example: +```bash +# Good - works regardless of current shell +nix develop -c tofu plan + +# Avoid - requires user to be in devshell +tofu plan +``` + ### Secrets Management Secrets are managed by OpenBao (Vault) using AppRole authentication. Most hosts use the -- 2.49.1 From f19ba2f4b6f573b28f74e9746a448673975cd960 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sat, 7 Feb 2026 15:41:59 +0100 Subject: [PATCH 10/11] CLAUDE.md: use tofu -chdir instead of cd Co-Authored-By: Claude Opus 4.5 --- CLAUDE.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index c5a6912..c2432a0 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -76,6 +76,16 @@ nix develop -c tofu plan tofu plan ``` +**OpenTofu:** Use the `-chdir` option instead of `cd` when running tofu commands in subdirectories: +```bash +# Good - uses -chdir option +nix develop -c tofu -chdir=terraform plan +nix develop -c tofu -chdir=terraform/vault apply + +# Avoid - changing directories +cd terraform && tofu plan +``` + ### Secrets Management Secrets are managed by OpenBao (Vault) using AppRole authentication. Most hosts use the -- 2.49.1 From eea000b33786213bfe042a3cd64ccdc7ba7b0652 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sat, 7 Feb 2026 15:57:51 +0100 Subject: [PATCH 11/11] CLAUDE.md: document bootstrap logs in Loki Co-Authored-By: Claude Opus 4.5 --- CLAUDE.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/CLAUDE.md b/CLAUDE.md index c2432a0..c23f823 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -161,11 +161,27 @@ The **lab-monitoring** MCP server can query logs from Loki. All hosts ship syste - `host` - Hostname (e.g., `ns1`, `ns2`, `monitoring01`, `ha1`). Use this label, not `hostname`. 
- `systemd_unit` - Systemd unit name (e.g., `nsd.service`, `prometheus.service`, `nixos-upgrade.service`) -- `job` - Either `systemd-journal` (most logs) or `varlog` (file-based logs like caddy access logs) +- `job` - Either `systemd-journal` (most logs), `varlog` (file-based logs), or `bootstrap` (VM bootstrap logs) - `filename` - For `varlog` job, the log file path (e.g., `/var/log/caddy/nix-cache.log`) Journal log entries are JSON-formatted with the actual log message in the `MESSAGE` field. Other useful fields include `PRIORITY` and `SYSLOG_IDENTIFIER`. +**Bootstrap Logs:** + +VMs provisioned from template2 send bootstrap progress directly to Loki via curl (before promtail is available). These logs use `job="bootstrap"` with additional labels: + +- `host` - Target hostname +- `branch` - Git branch being deployed +- `stage` - Bootstrap stage: `starting`, `network_ok`, `vault_ok`/`vault_skip`/`vault_warn`, `building`, `success`, `failed` + +Query bootstrap status: +``` +{job="bootstrap"} # All bootstrap logs +{job="bootstrap", host="testvm01"} # Specific host +{job="bootstrap", stage="failed"} # All failures +{job="bootstrap", stage=~"building|success"} # Track build progress +``` + **Example LogQL queries:** ``` # Logs from a specific service on a host -- 2.49.1