Better monitoring, bug fixes.

This commit is contained in:
Aner Zakobar
2026-05-10 13:44:27 +03:00
parent af744e819c
commit 09052e8aec
7 changed files with 174 additions and 46 deletions
+42
View File
@@ -128,6 +128,48 @@ homey-deploy-rpi-main
NixOS activates the new config on the Pi immediately, with an automatic
rollback if activation fails.
* Post-deploy setup
Some services require manual one-time configuration after the first deploy.
** Ntfy — push notifications
Ntfy's admin user is created automatically from sops on first start. You
still need to create a phone token and subscribe to the alerts topic.
1. Visit =https://ntfy.zakobar.com= and log in with the admin password
(=ntfy/admin_password= in =secrets/secrets.yaml=).
2. Go to *Account → Access Tokens → Create token* — give it a name (e.g.
"phone") and copy the token value.
3. In the [[https://ntfy.sh][Ntfy mobile app]]:
- *Server*: =https://ntfy.zakobar.com=
- *Access token*: the token you just created
4. Subscribe to the =alerts= topic in the app.
** Uptime Kuma — notifications (two-deploy process)
Uptime Kuma monitors are created automatically by the sync script on first
deploy, but notification channels must be configured in the UI before they
can be attached to monitors. This requires two deploys:
*Deploy 1* — services are up, monitors exist, but no notifications assigned yet.
Then, in the Uptime Kuma UI (=https://uptime.zakobar.com=):
1. Go to *Settings → Notifications → Add Notification*.
2. Choose *ntfy* as the type and fill in:
- *Server URL*: =https://ntfy.zakobar.com=
- *Topic*: =alerts=
- *Token*: use the admin token (or create a dedicated one in ntfy)
3. Save — you do *not* need to manually assign it to any monitor.
*Deploy 2* — run =homey-deploy-rpi-main= again. The sync script will detect
the newly configured notification channel and attach it to every monitor
automatically.
Any notifications added to Uptime Kuma in the future will also be picked up
on the next deploy.
* Backing up
Backups use [[https://restic.net/][restic]] and run automatically via systemd on a daily schedule.
+1 -1
View File
@@ -200,7 +200,7 @@ in
environment.etc."grafana/dashboards/node-exporter-full.json" = {
source = pkgs.fetchurl {
url = "https://grafana.com/api/dashboards/1860/revisions/37/download";
hash = "sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=";
hash = "sha256-1DE1aaanRHHeCOMWDGdOS1wBXxOF84UXAjJzT5Ek6mM=";
};
mode = "0444";
};
+4
View File
@@ -192,6 +192,10 @@ in
AUTHELIA_SESSION_SECRET_FILE = "/run/secrets/session_secret";
AUTHELIA_STORAGE_ENCRYPTION_KEY_FILE = "/run/secrets/storage_encryption_key";
AUTHELIA_AUTHENTICATION_BACKEND_LDAP_PASSWORD_FILE = "/run/secrets/ldap_ro_password";
# Changing this forces a container restart when the config changes.
# NixOS bind-mounts resolve symlinks at container start, so the running
# container would otherwise keep the old nix-store config until restarted.
NIXOS_CONFIG_HASH = builtins.hashString "sha256" autheliaConfig;
};
volumes = [
+7 -3
View File
@@ -170,9 +170,13 @@ in
# Uptime Kuma monitor for this service
# -----------------------------------------------------------------------
homey.monitoring.monitors = [{
name = "Nextcloud";
url = "https://nextcloud.${domain}/status.php";
interval = 60;
name = "Nextcloud";
url = "https://nextcloud.${domain}/status.php";
interval = 60;
keyword = "\"maintenance\":false";
# Nightly maintenance is expected — only alert if stuck for 4+ hours.
# 240 retries × 60s = 4 hours of consecutive failures before notifying.
maxretries = 240;
}];
systemd.services."podman-nextcloud" = {
+46 -13
View File
@@ -75,6 +75,46 @@ in
};
};
# Minimal config for the `ntfy user` CLI — the NixOS module puts its
# generated config in the nix store under an unpredictable path, so we
# write a separate file just containing the auth-file path. The server
# ignores this file (it uses the module-generated one via -c flag).
# Declares the file "ntfy-sh/user-cli.yml" under environment.etc; it is the
# config passed to the `ntfy user` CLI via --config by the admin-setup script.
environment.etc."ntfy-sh/user-cli.yml" = {
# Single key only: where the auth database lives under dataDir.
text = "auth-file: ${dataDir}/ntfy/auth.db\n";
# Read-only for everyone; the file holds a path, not a secret.
mode = "0444";
};
# Create ntfy data directories on the external HD before ntfy starts.
# Runs as a separate root service (outside ntfy-sh's restricted namespace)
# so it can access /mnt/data without hitting ReadWritePaths restrictions.
# One-shot helper unit: creates ${dataDir}/ntfy and its attachments
# subdirectory and hands ownership to the ntfy-sh user before ntfy-sh starts.
systemd.services.ntfy-sh-mkdir = {
description = "Create Ntfy data directories on external HD";
# Started as a dependency of ntfy-sh.service and ordered ahead of it.
wantedBy = [ "ntfy-sh.service" ];
before = [ "ntfy-sh.service" ];
# Must not run until mnt-data.mount is active; `requires` also means this
# unit is not started if that mount unit fails.
after = [ "mnt-data.mount" ];
requires = [ "mnt-data.mount" ];
serviceConfig = {
# oneshot + RemainAfterExit: the script runs to completion once and the
# unit then stays "active", so dependants ordered after it see it as up.
Type = "oneshot";
RemainAfterExit = true;
# The script aborts on the first failing command (set -euo pipefail).
# mkdir -p makes re-runs harmless; chown -R re-applies ownership to the
# whole ntfy tree on every run.
ExecStart = pkgs.writeShellScript "ntfy-mkdir" ''
set -euo pipefail
mkdir -p ${dataDir}/ntfy/attachments
chown -R ntfy-sh:ntfy-sh ${dataDir}/ntfy
chmod 0750 ${dataDir}/ntfy ${dataDir}/ntfy/attachments
'';
};
};
# Ensure ntfy-sh starts after the HD is mounted and dirs are ready.
# Also widen ReadWritePaths so ntfy-sh can write to the external HD path
# (the NixOS module restricts writes to /var/lib/ntfy-sh by default).
# Extra ordering, dependencies and write access layered onto the ntfy-sh unit.
systemd.services.ntfy-sh = {
# lib.mkAfter places these entries after list values defined elsewhere for
# the same option. The unit waits for both the mount and the mkdir helper.
after = lib.mkAfter [ "mnt-data.mount" "ntfy-sh-mkdir.service" ];
requires = lib.mkAfter [ "mnt-data.mount" "ntfy-sh-mkdir.service" ];
# Adds the ntfy directory under dataDir to the unit's writable paths.
serviceConfig.ReadWritePaths = lib.mkAfter [ "${dataDir}/ntfy" ];
};
# -----------------------------------------------------------------------
# Create the admin user on first start (idempotent)
# -----------------------------------------------------------------------
@@ -102,28 +142,21 @@ in
PASS=$(cat "$CREDENTIALS_DIRECTORY/ntfy_admin_password")
# ntfy user commands need the config file to find the auth database.
# The NixOS ntfy-sh module writes config to /etc/ntfy-sh/server.yml.
NTFY="${pkgs.ntfy-sh}/bin/ntfy user --config /etc/ntfy-sh/server.yml"
# Use the minimal CLI config (just has auth-file path).
NTFY="${pkgs.ntfy-sh}/bin/ntfy user --config /etc/ntfy-sh/user-cli.yml"
# ntfy user list exits non-zero if the user DB is empty/doesn't exist;
# grep exits non-zero if the pattern is missing. Either means no admin.
if $NTFY list 2>/dev/null | grep -qE "^admin\b"; then
# ntfy user list outputs a Unicode table; grep for admin in it.
# ntfy user add reads password + confirmation from stdin (two lines).
if $NTFY list 2>/dev/null | grep -qE "admin"; then
echo "ntfy-sh-setup: admin user already exists"
else
echo "$PASS" | $NTFY add --role=admin admin
printf '%s\n%s\n' "$PASS" "$PASS" | $NTFY add --role=admin admin
echo "ntfy-sh-setup: admin user created"
fi
'';
};
};
# Ensure ntfy-sh starts after the external HD is mounted
systemd.services.ntfy-sh = {
after = lib.mkAfter [ "mnt-data.mount" ];
requires = lib.mkAfter [ "mnt-data.mount" ];
};
# -----------------------------------------------------------------------
# Uptime Kuma monitor for this service
# -----------------------------------------------------------------------
+1 -1
View File
@@ -55,7 +55,7 @@ in
# -----------------------------------------------------------------------
homey.monitoring.monitors = [{
name = "phpLDAPadmin";
url = "http://localhost:${toString cfg.port}";
url = "http://phpldapadmin:80";
interval = 60;
}];
};
+73 -28
View File
@@ -36,8 +36,16 @@ let
monitorsJson = pkgs.writeText "uptime-kuma-monitors.json"
(builtins.toJSON config.homey.monitoring.monitors);
# Python environment for the monitor-sync script
pythonEnv = pkgs.python3.withPackages (ps: [ ps."uptime-kuma-api" ]);
# Python environment for the monitor-sync script.
# uptime-kuma-api's transitive deps (requests, socketio, websocket-client)
# are listed explicitly because withPackages doesn't always pull propagated
# deps transitively in all nixpkgs versions.
pythonEnv = pkgs.python3.withPackages (ps: [
ps."uptime-kuma-api"
ps.requests
ps."python-socketio"
ps."websocket-client"
]);
# Monitor-sync script: idempotent, hash-gated, uses Socket.IO API
syncScript = pkgs.writeText "uptime-kuma-sync.py" ''
@@ -48,6 +56,9 @@ let
Runs as a oneshot systemd service after podman-uptime-kuma.service.
Tracks a hash of the monitor list so it only re-syncs when the NixOS
config changes.
Uptime Kuma v1 has no REST API — everything is Socket.IO. Initial admin
creation uses api.setup() which raises if already done (we ignore that).
"""
import hashlib
import json
@@ -62,10 +73,11 @@ let
CREDS_DIR = os.environ.get("CREDENTIALS_DIRECTORY", "")
def wait_for_kuma(timeout=120):
"""Wait until Uptime Kuma HTTP responds (any non-5xx — just checks it's up)."""
deadline = time.time() + timeout
while time.time() < deadline:
try:
with urllib.request.urlopen(KUMA_URL + "/", timeout=5) as r:
with urllib.request.urlopen(KUMA_URL, timeout=5) as r:
if r.status < 500:
return True
except Exception:
@@ -103,36 +115,58 @@ let
api = UptimeKumaApi(KUMA_URL)
# Initial setup (creates admin user on first run; no-op if already done)
# Initial admin setup via Socket.IO — idempotent (raises if already done, ignore it)
try:
info = api.info()
if not info.get("isSetup", True):
api.setup("admin", password)
print("uptime-kuma-sync: initial admin user created")
api.setup("admin", password)
print("uptime-kuma-sync: initial admin user created")
except Exception as e:
print(f"uptime-kuma-sync: setup check: {e}", file=sys.stderr)
print(f"uptime-kuma-sync: setup skipped (already configured): {e}")
# Login
result = api.login("admin", password)
if not result.get("ok"):
print(f"uptime-kuma-sync: login failed: {result}", file=sys.stderr)
try:
api.login("admin", password)
except Exception as e:
print(f"uptime-kuma-sync: login failed: {e}", file=sys.stderr)
api.disconnect()
sys.exit(1)
# Sync monitors (add missing; skip existing by name)
# Collect all configured notification IDs so every monitor gets them.
notification_ids = [n["id"] for n in api.get_notifications()]
if notification_ids:
print(f"uptime-kuma-sync: attaching notifications: {notification_ids}")
# Sync monitors: add missing, update changed
try:
existing_names = {m["name"] for m in api.get_monitors()}
existing = {m["name"]: m for m in api.get_monitors()}
for m in monitors:
if m["name"] in existing_names:
print(f"uptime-kuma-sync: monitor exists, skipping: {m['name']}")
continue
api.add_monitor(
type=MonitorType.HTTP,
name=m["name"],
url=m["url"],
interval=m.get("interval", 60),
)
print(f"uptime-kuma-sync: created monitor: {m['name']}")
keyword = m.get("keyword")
maxretries = m.get("maxretries", 0)
monitor_type = MonitorType.KEYWORD if keyword else MonitorType.HTTP
extra = {"keyword": keyword} if keyword else {}
if m["name"] not in existing:
api.add_monitor(
type=monitor_type,
name=m["name"],
url=m["url"],
interval=m.get("interval", 60),
maxretries=maxretries,
notification_id_list={str(nid): True for nid in notification_ids},
**extra,
)
print(f"uptime-kuma-sync: created monitor: {m['name']}")
elif (existing[m["name"]].get("url") != m["url"]
or existing[m["name"]].get("keyword") != keyword
or existing[m["name"]].get("maxretries") != maxretries):
api.edit_monitor(
existing[m["name"]]["id"],
type=monitor_type,
url=m["url"],
interval=m.get("interval", 60),
maxretries=maxretries,
notification_id_list={str(nid): True for nid in notification_ids},
**extra,
)
print(f"uptime-kuma-sync: updated monitor: {m['name']}")
finally:
api.disconnect()
@@ -168,6 +202,16 @@ in
default = 60;
description = "Check interval in seconds.";
};
keyword = lib.mkOption {
type = lib.types.nullOr lib.types.str;
default = null;
description = "If set, use a keyword monitor that checks for this string in the response body.";
};
maxretries = lib.mkOption {
type = lib.types.int;
default = 0;
description = "Consecutive failures before a DOWN alert is sent. 0 = alert immediately.";
};
};
});
default = [];
@@ -217,13 +261,14 @@ in
"${dataDir}/uptime-kuma:/app/data"
];
# uptime-kuma image expects /app/data to be writable; no extra network
# needed since we reach it from the host on localhost.
# Join the homey network so monitors can reach other containers by name
# (e.g. phpldapadmin:80) without going through the host loopback.
extraOptions = [ "--network=homey" ];
};
systemd.services."podman-uptime-kuma" = {
after = lib.mkAfter [ "mnt-data.mount" ];
requires = lib.mkAfter [ "mnt-data.mount" ];
after = lib.mkAfter [ "mnt-data.mount" "podman-homey-network.service" ];
requires = lib.mkAfter [ "mnt-data.mount" "podman-homey-network.service" ];
};
# -----------------------------------------------------------------------