diff --git a/README.md b/README.md index 1939988..c6b20e6 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,6 @@ NixOS Flake-based configuration repository for a homelab infrastructure. All hos | `monitoring01` | Prometheus, Grafana, Loki, Tempo, Pyroscope | | `jelly01` | Jellyfin media server | | `nix-cache01` | Nix binary cache | -| `pgdb1` | PostgreSQL | | `nats1` | NATS messaging | | `vault01` | OpenBao (Vault) secrets management | | `template1`, `template2` | VM templates for cloning new hosts | diff --git a/docs/plans/pgdb1-decommission.md b/docs/plans/pgdb1-decommission.md new file mode 100644 index 0000000..6b253a4 --- /dev/null +++ b/docs/plans/pgdb1-decommission.md @@ -0,0 +1,113 @@ +# pgdb1 Decommissioning Plan + +## Overview + +Decommission the pgdb1 PostgreSQL server. The only consumer was Open WebUI on gunter, which has been migrated to use a local PostgreSQL instance. + +## Pre-flight Verification + +Before proceeding, verify that gunter is no longer using pgdb1: + +1. Check Open WebUI on gunter is configured for local PostgreSQL (not 10.69.13.16) +2. Optionally: Check pgdb1 for recent connection activity: + ```bash + ssh pgdb1 'sudo -u postgres psql -c "SELECT * FROM pg_stat_activity WHERE datname IS NOT NULL;"' + ``` + +## Files to Remove + +### Host Configuration +- `hosts/pgdb1/default.nix` +- `hosts/pgdb1/configuration.nix` +- `hosts/pgdb1/hardware-configuration.nix` +- `hosts/pgdb1/` (directory) + +### Service Module +- `services/postgres/postgres.nix` +- `services/postgres/default.nix` +- `services/postgres/` (directory) + +Note: This service module is only used by pgdb1, so it can be removed entirely. 
+ +### Flake Entry +Remove from `flake.nix` (lines 131-139): +```nix +pgdb1 = nixpkgs.lib.nixosSystem { + inherit system; + specialArgs = { + inherit inputs self; + }; + modules = commonModules ++ [ + ./hosts/pgdb1 + ]; +}; +``` + +### Vault AppRole +Remove from `terraform/vault/approle.tf` (lines 69-73): +```hcl +"pgdb1" = { + paths = [ + "secret/data/hosts/pgdb1/*", + ] +} +``` + +### Monitoring Rules +Remove the entire `postgres_rules` group from `services/monitoring/rules.yml` (lines 359-384), which contains the `postgres_down`, `postgres_exporter_down`, and `postgres_high_connections` alerts. The group starts with: +```yaml +- name: postgres_rules + rules: + - alert: postgres_down + expr: node_systemd_unit_state{instance="pgdb1.home.2rjus.net:9100", name="postgresql.service", state="active"} == 0 + for: 5m + labels: + severity: critical +``` + +### Utility Scripts +Delete `rebuild-all.sh` entirely (obsolete script). + +## Execution Steps + +### Phase 1: Verification +- [ ] Confirm Open WebUI on gunter uses local PostgreSQL +- [ ] Verify no active connections to pgdb1 + +### Phase 2: Code Cleanup +- [ ] Create feature branch: `git checkout -b decommission-pgdb1` +- [ ] Remove `hosts/pgdb1/` directory +- [ ] Remove `services/postgres/` directory +- [ ] Remove pgdb1 entry from `flake.nix` and from the host table in `README.md` +- [ ] Remove the `postgres_rules` alert group from `services/monitoring/rules.yml` +- [ ] Delete `rebuild-all.sh` (obsolete) +- [ ] Run `nix flake check` to verify no broken references +- [ ] Commit changes + +### Phase 3: Terraform Cleanup +- [ ] Remove pgdb1 from `terraform/vault/approle.tf` +- [ ] Run `tofu plan` in `terraform/vault/` to preview changes +- [ ] Run `tofu apply` to remove the AppRole +- [ ] Commit terraform changes + +### Phase 4: Infrastructure Cleanup +- [ ] Shut down pgdb1 VM in Proxmox +- [ ] Delete the VM from Proxmox +- [ ] (Optional) Remove any DNS entries if not auto-generated + +### Phase 5: Finalize +- [ ] Merge feature branch to master +- [ ] Trigger auto-upgrade on DNS servers (ns1, ns2) to remove DNS entry +- [ ] Move this plan to `docs/plans/completed/` + +## Rollback + +If issues arise after 
decommissioning: +1. The VM can be recreated from template using the git history +2. Database data would need to be restored from backup (if any exists) + +## Notes + +- pgdb1 IP: 10.69.13.16 +- The postgres service allowed connections from gunter (10.69.30.105) +- No restic backup was configured for this host diff --git a/flake.nix b/flake.nix index ee38e58..fcda061 100644 --- a/flake.nix +++ b/flake.nix @@ -128,15 +128,6 @@ ./hosts/nix-cache01 ]; }; - pgdb1 = nixpkgs.lib.nixosSystem { - inherit system; - specialArgs = { - inherit inputs self; - }; - modules = commonModules ++ [ - ./hosts/pgdb1 - ]; - }; nats1 = nixpkgs.lib.nixosSystem { inherit system; specialArgs = { diff --git a/hosts/pgdb1/configuration.nix b/hosts/pgdb1/configuration.nix deleted file mode 100644 index 31299ba..0000000 --- a/hosts/pgdb1/configuration.nix +++ /dev/null @@ -1,66 +0,0 @@ -{ - pkgs, - ... -}: - -{ - imports = [ - ./hardware-configuration.nix - - ../../system - ../../common/vm - ]; - - nixpkgs.config.allowUnfree = true; - # Use the systemd-boot EFI boot loader. - boot.loader.grub = { - enable = true; - device = "/dev/sda"; - configurationLimit = 3; - }; - - networking.hostName = "pgdb1"; - networking.domain = "home.2rjus.net"; - networking.useNetworkd = true; - networking.useDHCP = false; - services.resolved.enable = true; - networking.nameservers = [ - "10.69.13.5" - "10.69.13.6" - ]; - - systemd.network.enable = true; - systemd.network.networks."ens18" = { - matchConfig.Name = "ens18"; - address = [ - "10.69.13.16/24" - ]; - routes = [ - { Gateway = "10.69.13.1"; } - ]; - linkConfig.RequiredForOnline = "routable"; - }; - time.timeZone = "Europe/Oslo"; - - nix.settings.experimental-features = [ - "nix-command" - "flakes" - ]; - nix.settings.tarball-ttl = 0; - environment.systemPackages = with pkgs; [ - vim - wget - git - ]; - - # Open ports in the firewall. - # networking.firewall.allowedTCPPorts = [ ... ]; - # networking.firewall.allowedUDPPorts = [ ... 
]; - # Or disable the firewall altogether. - networking.firewall.enable = false; - - vault.enable = true; - homelab.deploy.enable = true; - - system.stateVersion = "23.11"; # Did you read the comment? -} diff --git a/hosts/pgdb1/default.nix b/hosts/pgdb1/default.nix deleted file mode 100644 index 68dc978..0000000 --- a/hosts/pgdb1/default.nix +++ /dev/null @@ -1,7 +0,0 @@ -{ ... }: -{ - imports = [ - ./configuration.nix - ../../services/postgres - ]; -} diff --git a/hosts/pgdb1/hardware-configuration.nix b/hosts/pgdb1/hardware-configuration.nix deleted file mode 100644 index 48bf109..0000000 --- a/hosts/pgdb1/hardware-configuration.nix +++ /dev/null @@ -1,42 +0,0 @@ -{ - config, - lib, - pkgs, - modulesPath, - ... -}: - -{ - imports = [ - (modulesPath + "/profiles/qemu-guest.nix") - ]; - boot.initrd.availableKernelModules = [ - "ata_piix" - "uhci_hcd" - "virtio_pci" - "virtio_scsi" - "sd_mod" - "sr_mod" - ]; - boot.initrd.kernelModules = [ "dm-snapshot" ]; - boot.kernelModules = [ - "ptp_kvm" - ]; - boot.extraModulePackages = [ ]; - - fileSystems."/" = { - device = "/dev/disk/by-label/root"; - fsType = "xfs"; - }; - - swapDevices = [ { device = "/dev/disk/by-label/swap"; } ]; - - # Enables DHCP on each ethernet and wireless interface. In case of scripted networking - # (the default) this is the recommended approach. When using systemd-networkd it's - # still possible to use this option, but it's recommended to use it in conjunction - # with explicit per-interface declarations with `networking.interfaces..useDHCP`. 
- networking.useDHCP = lib.mkDefault true; - # networking.interfaces.ens18.useDHCP = lib.mkDefault true; - - nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux"; -} diff --git a/rebuild-all.sh b/rebuild-all.sh deleted file mode 100755 index 5dc14ab..0000000 --- a/rebuild-all.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# array of hosts -HOSTS=( - "ns1" - "ns2" - "ha1" - "http-proxy" - "jelly01" - "monitoring01" - "nix-cache01" - "pgdb1" -) - -for host in "${HOSTS[@]}"; do - echo "Rebuilding $host" - nixos-rebuild boot --flake .#${host} --target-host root@${host} -done diff --git a/services/monitoring/rules.yml b/services/monitoring/rules.yml index 2530d46..5724d8c 100644 --- a/services/monitoring/rules.yml +++ b/services/monitoring/rules.yml @@ -356,32 +356,6 @@ groups: annotations: summary: "Proxmox VM {{ $labels.id }} is stopped" description: "Proxmox VM {{ $labels.id }} ({{ $labels.name }}) has onboot=1 but is stopped." - - name: postgres_rules - rules: - - alert: postgres_down - expr: node_systemd_unit_state{instance="pgdb1.home.2rjus.net:9100", name="postgresql.service", state="active"} == 0 - for: 5m - labels: - severity: critical - annotations: - summary: "PostgreSQL not running on {{ $labels.instance }}" - description: "PostgreSQL has been down on {{ $labels.instance }} more than 5 minutes." - - alert: postgres_exporter_down - expr: up{job="postgres"} == 0 - for: 5m - labels: - severity: warning - annotations: - summary: "PostgreSQL exporter down on {{ $labels.instance }}" - description: "Cannot scrape PostgreSQL metrics from {{ $labels.instance }}." - - alert: postgres_high_connections - expr: pg_stat_activity_count / pg_settings_max_connections > 0.8 - for: 5m - labels: - severity: warning - annotations: - summary: "PostgreSQL connection pool near exhaustion on {{ $labels.instance }}" - description: "PostgreSQL is using over 80% of max_connections on {{ $labels.instance }}." 
- name: jellyfin_rules rules: - alert: jellyfin_down diff --git a/services/postgres/default.nix b/services/postgres/default.nix deleted file mode 100644 index 8fa5b28..0000000 --- a/services/postgres/default.nix +++ /dev/null @@ -1,6 +0,0 @@ -{ ... }: -{ - imports = [ - ./postgres.nix - ]; -} diff --git a/services/postgres/postgres.nix b/services/postgres/postgres.nix deleted file mode 100644 index c4b8d99..0000000 --- a/services/postgres/postgres.nix +++ /dev/null @@ -1,23 +0,0 @@ -{ pkgs, ... }: -{ - homelab.monitoring.scrapeTargets = [{ - job_name = "postgres"; - port = 9187; - }]; - - services.prometheus.exporters.postgres = { - enable = true; - runAsLocalSuperUser = true; # Use peer auth as postgres user - }; - - services.postgresql = { - enable = true; - enableJIT = true; - enableTCPIP = true; - extensions = ps: with ps; [ pgvector ]; - authentication = '' - # Allow access to everything from gunter - host all all 10.69.30.105/32 scram-sha-256 - ''; - }; -}