# Patch: bootstrap visibility for VMs provisioned from hosts/template2.
#
# Files touched:
#   CLAUDE.md                                 documents the devshell tools, `nix develop -c`,
#                                             `tofu -chdir`, and the Loki `bootstrap` job labels
#   hosts/template2/bootstrap.nix             adds log_to_loki stage reporting, a console banner,
#                                             a getty greeting, and lib.getExe for ExecStart
#   playbooks/build-and-deploy-template.yml   new play that rewrites default_template_name in
#                                             terraform/variables.tf from the built image name
#   terraform/variables.tf                    bumps default_template_name
#   terraform/vms.tf                          testvm01: sets flake_branch and vault_wrapped_token
#
# Review changes relative to the submitted patch (all in-place, hunk counts unchanged):
#   * CLAUDE.md: restored the stripped placeholder in `nix develop -c <command>`.
#   * bootstrap.nix: restored the stripped `<stage> <message>` in the log_to_loki usage comment.
#   * bootstrap.nix: log_to_loki now returns 0 if jq cannot build the payload; the script runs
#     under `set -euo pipefail`, so an unguarded failure there would abort the bootstrap even
#     though the helper is meant to be best-effort.
#   The `index` blob ids for CLAUDE.md and bootstrap.nix are those of the submitted patch and
#   no longer describe the post-image of these two files.
#
# NOTE(review): terraform/vms.tf commits a Vault wrapped token literal for testvm01 and points
# it at the feature branch `improve-bootstrap-visibility`. Confirm the token is already
# consumed/expired (or inject it via a sensitive variable) and that the branch pin is intended
# before merging.
#
# NOTE(review): indentation inside hunks was reconstructed; verify against the target tree
# (e.g. `git apply --check --ignore-whitespace`) before applying.
#
diff --git a/CLAUDE.md b/CLAUDE.md
index 809fdb7..c23f823 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -61,10 +61,31 @@ Do not run `nix flake update`. Should only be done manually by user.
 ### Development Environment
 
 ```bash
-# Enter development shell (provides ansible, python3)
+# Enter development shell
 nix develop
 ```
 
+The devshell provides: `ansible`, `tofu` (OpenTofu), `vault` (OpenBao CLI), `create-host`, and `homelab-deploy`.
+
+**Important:** When suggesting commands that use devshell tools, always use `nix develop -c <command>` syntax rather than assuming the user is already in a devshell. For example:
+```bash
+# Good - works regardless of current shell
+nix develop -c tofu plan
+
+# Avoid - requires user to be in devshell
+tofu plan
+```
+
+**OpenTofu:** Use the `-chdir` option instead of `cd` when running tofu commands in subdirectories:
+```bash
+# Good - uses -chdir option
+nix develop -c tofu -chdir=terraform plan
+nix develop -c tofu -chdir=terraform/vault apply
+
+# Avoid - changing directories
+cd terraform && tofu plan
+```
+
 ### Secrets Management
 
 Secrets are managed by OpenBao (Vault) using AppRole authentication. Most hosts use the
@@ -140,11 +161,27 @@ The **lab-monitoring** MCP server can query logs from Loki. All hosts ship syste
 
 - `host` - Hostname (e.g., `ns1`, `ns2`, `monitoring01`, `ha1`). Use this label, not `hostname`.
 - `systemd_unit` - Systemd unit name (e.g., `nsd.service`, `prometheus.service`, `nixos-upgrade.service`)
-- `job` - Either `systemd-journal` (most logs) or `varlog` (file-based logs like caddy access logs)
+- `job` - Either `systemd-journal` (most logs), `varlog` (file-based logs), or `bootstrap` (VM bootstrap logs)
 - `filename` - For `varlog` job, the log file path (e.g., `/var/log/caddy/nix-cache.log`)
 
 Journal log entries are JSON-formatted with the actual log message in the `MESSAGE` field. Other useful fields include `PRIORITY` and `SYSLOG_IDENTIFIER`.
 
+**Bootstrap Logs:**
+
+VMs provisioned from template2 send bootstrap progress directly to Loki via curl (before promtail is available). These logs use `job="bootstrap"` with additional labels:
+
+- `host` - Target hostname
+- `branch` - Git branch being deployed
+- `stage` - Bootstrap stage: `starting`, `network_ok`, `vault_ok`/`vault_skip`/`vault_warn`, `building`, `success`, `failed`
+
+Query bootstrap status:
+```
+{job="bootstrap"}  # All bootstrap logs
+{job="bootstrap", host="testvm01"}  # Specific host
+{job="bootstrap", stage="failed"}  # All failures
+{job="bootstrap", stage=~"building|success"}  # Track build progress
+```
+
 **Example LogQL queries:**
 ```
 # Logs from a specific service on a host
diff --git a/hosts/template2/bootstrap.nix b/hosts/template2/bootstrap.nix
index 8aac949..ea2d107 100644
--- a/hosts/template2/bootstrap.nix
+++ b/hosts/template2/bootstrap.nix
@@ -6,22 +6,72 @@ let
     text = ''
       set -euo pipefail
 
+      LOKI_URL="http://monitoring01.home.2rjus.net:3100/loki/api/v1/push"
+
+      # Send a log entry to Loki with bootstrap status
+      # Usage: log_to_loki <stage> <message>
+      # Best-effort: never aborts the bootstrap (Loki unreachable or payload build failure)
+      log_to_loki() {
+        local stage="$1"
+        local message="$2"
+        local timestamp_ns
+        timestamp_ns="$(date +%s)000000000"
+
+        local payload
+        payload=$(jq -n \
+          --arg host "$HOSTNAME" \
+          --arg stage "$stage" \
+          --arg branch "''${BRANCH:-master}" \
+          --arg ts "$timestamp_ns" \
+          --arg msg "$message" \
+          '{
+            streams: [{
+              stream: {
+                job: "bootstrap",
+                host: $host,
+                stage: $stage,
+                branch: $branch
+              },
+              values: [[$ts, $msg]]
+            }]
+          }') || return 0
+
+        curl -s --connect-timeout 2 --max-time 5 \
+          -X POST \
+          -H "Content-Type: application/json" \
+          -d "$payload" \
+          "$LOKI_URL" >/dev/null 2>&1 || true
+      }
+
+      echo "================================================================================"
+      echo " NIXOS BOOTSTRAP IN PROGRESS"
+      echo "================================================================================"
+      echo ""
+
       # Read hostname set by cloud-init (from Terraform VM name via user-data)
       # Cloud-init sets the system hostname from user-data.txt, so we read it from hostnamectl
       HOSTNAME=$(hostnamectl hostname)
-      echo "DEBUG: Hostname from hostnamectl: '$HOSTNAME'"
+      # Read git branch from environment, default to master
+      BRANCH="''${NIXOS_FLAKE_BRANCH:-master}"
+      echo "Hostname: $HOSTNAME"
+      echo ""
 
       echo "Starting NixOS bootstrap for host: $HOSTNAME"
+
+      log_to_loki "starting" "Bootstrap starting for $HOSTNAME (branch: $BRANCH)"
+
       echo "Waiting for network connectivity..."
 
       # Verify we can reach the git server via HTTPS (doesn't respond to ping)
       if ! curl -s --connect-timeout 5 --max-time 10 https://git.t-juice.club >/dev/null 2>&1; then
         echo "ERROR: Cannot reach git.t-juice.club via HTTPS"
         echo "Check network configuration and DNS settings"
+        log_to_loki "failed" "Network check failed - cannot reach git.t-juice.club"
         exit 1
       fi
 
       echo "Network connectivity confirmed"
+      log_to_loki "network_ok" "Network connectivity confirmed"
 
       # Unwrap Vault token and store AppRole credentials (if provided)
       if [ -n "''${VAULT_WRAPPED_TOKEN:-}" ]; then
@@ -50,6 +100,7 @@ let
           chmod 600 /var/lib/vault/approle/secret-id
 
           echo "Vault credentials unwrapped and stored successfully"
+          log_to_loki "vault_ok" "Vault credentials unwrapped and stored"
         else
           echo "WARNING: Failed to unwrap Vault token"
           if [ -n "$UNWRAP_RESPONSE" ]; then
@@ -63,17 +114,17 @@ let
           echo "To regenerate token, run: create-host --hostname $HOSTNAME --force"
           echo ""
           echo "Vault secrets will not be available, but continuing bootstrap..."
+          log_to_loki "vault_warn" "Failed to unwrap Vault token - continuing without secrets"
         fi
       else
         echo "No Vault wrapped token provided (VAULT_WRAPPED_TOKEN not set)"
         echo "Skipping Vault credential setup"
+        log_to_loki "vault_skip" "No Vault token provided - skipping credential setup"
       fi
 
       echo "Fetching and building NixOS configuration from flake..."
-
-      # Read git branch from environment, default to master
-      BRANCH="''${NIXOS_FLAKE_BRANCH:-master}"
       echo "Using git branch: $BRANCH"
+      log_to_loki "building" "Starting nixos-rebuild boot"
 
       # Build and activate the host-specific configuration
       FLAKE_URL="git+https://git.t-juice.club/torjus/nixos-servers.git?ref=$BRANCH#''${HOSTNAME}"
@@ -81,18 +132,30 @@ let
       if nixos-rebuild boot --flake "$FLAKE_URL"; then
         echo "Successfully built configuration for $HOSTNAME"
         echo "Rebooting into new configuration..."
+        log_to_loki "success" "Build successful - rebooting into new configuration"
         sleep 2
         systemctl reboot
       else
         echo "ERROR: nixos-rebuild failed for $HOSTNAME"
         echo "Check that flake has configuration for this hostname"
         echo "Manual intervention required - system will not reboot"
+        log_to_loki "failed" "nixos-rebuild failed - manual intervention required"
         exit 1
       fi
     '';
   };
 in
 {
+  # Custom greeting line to indicate this is a bootstrap image
+  services.getty.greetingLine = lib.mkForce ''
+    ================================================================================
+    BOOTSTRAP IMAGE - NixOS \V (\l)
+    ================================================================================
+
+    Bootstrap service is running. Logs are displayed on tty1.
+    Check status: journalctl -fu nixos-bootstrap
+  '';
+
   systemd.services."nixos-bootstrap" = {
     description = "Bootstrap NixOS configuration from flake on first boot";
 
@@ -107,12 +170,12 @@ in
     serviceConfig = {
       Type = "oneshot";
       RemainAfterExit = true;
-      ExecStart = "${bootstrap-script}/bin/nixos-bootstrap";
+      ExecStart = lib.getExe bootstrap-script;
 
       # Read environment variables from cloud-init (set by cloud-init write_files)
       EnvironmentFile = "-/run/cloud-init-env";
 
-      # Logging to journald
+      # Log to journal and console
       StandardOutput = "journal+console";
       StandardError = "journal+console";
     };
diff --git a/playbooks/build-and-deploy-template.yml b/playbooks/build-and-deploy-template.yml
index fdf29bd..5cb9206 100644
--- a/playbooks/build-and-deploy-template.yml
+++ b/playbooks/build-and-deploy-template.yml
@@ -99,3 +99,48 @@
     - name: Display success message
       ansible.builtin.debug:
         msg: "Template VM {{ template_vmid }} created successfully on {{ storage }}"
+
+- name: Update Terraform template name
+  hosts: localhost
+  gather_facts: false
+
+  vars:
+    terraform_dir: "{{ playbook_dir }}/../terraform"
+
+  tasks:
+    - name: Get image filename from earlier play
+      ansible.builtin.set_fact:
+        image_filename: "{{ hostvars['localhost']['image_filename'] }}"
+
+    - name: Extract template name from image filename
+      ansible.builtin.set_fact:
+        new_template_name: "{{ image_filename | regex_replace('\\.vma\\.zst$', '') | regex_replace('^vzdump-qemu-', '') }}"
+
+    - name: Read current Terraform variables file
+      ansible.builtin.slurp:
+        src: "{{ terraform_dir }}/variables.tf"
+      register: variables_tf_content
+
+    - name: Extract current template name from variables.tf
+      ansible.builtin.set_fact:
+        current_template_name: "{{ (variables_tf_content.content | b64decode) | regex_search('variable \"default_template_name\"[^}]+default\\s*=\\s*\"([^\"]+)\"', '\\1') | first }}"
+
+    - name: Check if template name has changed
+      ansible.builtin.set_fact:
+        template_name_changed: "{{ current_template_name != new_template_name }}"
+
+    - name: Display template name status
+      ansible.builtin.debug:
+        msg: "Template name: {{ current_template_name }} -> {{ new_template_name }} ({{ 'changed' if template_name_changed else 'unchanged' }})"
+
+    - name: Update default_template_name in variables.tf
+      ansible.builtin.replace:
+        path: "{{ terraform_dir }}/variables.tf"
+        regexp: '(variable "default_template_name"[^}]+default\s*=\s*)"[^"]+"'
+        replace: '\1"{{ new_template_name }}"'
+      when: template_name_changed
+
+    - name: Display update result
+      ansible.builtin.debug:
+        msg: "Updated terraform/variables.tf with new template name: {{ new_template_name }}"
+      when: template_name_changed
diff --git a/terraform/variables.tf b/terraform/variables.tf
index f0fae47..2006a4b 100644
--- a/terraform/variables.tf
+++ b/terraform/variables.tf
@@ -33,7 +33,7 @@ variable "default_target_node" {
 variable "default_template_name" {
   description = "Default template VM name to clone from"
   type        = string
-  default     = "nixos-25.11.20260131.41e216c"
+  default     = "nixos-25.11.20260203.e576e3c"
 }
 
 variable "default_ssh_public_key" {
diff --git a/terraform/vms.tf b/terraform/vms.tf
index 5d33a73..f1fe1c8 100644
--- a/terraform/vms.tf
+++ b/terraform/vms.tf
@@ -39,10 +39,12 @@ locals {
       flake_branch = "vault-setup" # Bootstrap from this branch instead of master
     }
     "testvm01" = {
-      ip        = "10.69.13.20/24"
-      cpu_cores = 2
-      memory    = 2048
-      disk_size = "20G"
+      ip                  = "10.69.13.20/24"
+      cpu_cores           = 2
+      memory              = 2048
+      disk_size           = "20G"
+      flake_branch        = "improve-bootstrap-visibility"
+      vault_wrapped_token = "s.l5q88wzXfEcr5SMDHmO6o96b"
     }
     "testvm02" = {
       ip        = "10.69.13.21/24"