# flake/hosts/aws/tyo0/services/prometheus.nix
# Module arguments. `nodes` and `ports` are not standard NixOS module
# arguments; presumably they are injected by the flake (e.g. through
# specialArgs) — confirm against the flake's module wiring.
{
  lib,
  pkgs,
  config,
  nodes,
  ports,
  ...
}:
let
  # Hosts scraped by the per-host jobs: the `fqdn` attribute of every entry
  # in `nodes`, followed by pek0.ny4.dev, which is appended by hand.
  targets = lib.mapAttrsToList (_name: node: node.fqdn) nodes ++ [ "pek0.ny4.dev" ];
in
{
services.prometheus = {
# Core Prometheus server settings. The server binds to loopback only; the
# external URL is the hostname that the Caddy route in this module proxies
# to `ports.prometheus`.
enable = true;
listenAddress = "127.0.0.1";
port = ports.prometheus;
webExternalUrl = "https://prom.ny4.dev";
# Blackbox exporter, bound to loopback on `ports.blackbox`. It is scraped
# and used as the probe endpoint by the jobs in `scrapeConfigs`.
exporters.blackbox = {
  enable = true;
  listenAddress = "127.0.0.1";
  port = ports.blackbox;
  # Generated YAML config with a single module, `http_2xx`: an HTTP prober
  # that is configured to fail when the target is not served over SSL/TLS.
  configFile = (pkgs.formats.yaml { }).generate "config.yaml" {
    modules.http_2xx = {
      prober = "http";
      http.fail_if_not_ssl = true;
    };
  };
};
# Scrape jobs, in order:
#   - node_exporter / caddy: per-host HTTPS endpoints behind basic auth,
#     scraped on every host in `targets`.
#   - ntfy / forgejo / miniflux: application metrics over HTTPS, no auth.
#   - blackbox_exporter: the local blackbox exporter's own metrics.
#   - blackbox_probe: HTTP probes of public sites through the blackbox exporter.
scrapeConfigs =
  let
    # Credentials shared by the per-host jobs. The password is read from the
    # file that sops provides for the "prometheus/auth" secret.
    scrapeAuth = {
      username = "prometheus";
      password_file = config.sops.secrets."prometheus/auth".path;
    };
  in
  [
    {
      job_name = "node_exporter";
      scheme = "https";
      metrics_path = "/metrics";
      basic_auth = scrapeAuth;
      static_configs = lib.singleton { inherit targets; };
    }
    {
      job_name = "caddy";
      scheme = "https";
      metrics_path = "/caddy";
      basic_auth = scrapeAuth;
      static_configs = lib.singleton { inherit targets; };
    }
    {
      job_name = "ntfy";
      scheme = "https";
      metrics_path = "/metrics";
      static_configs = lib.singleton { targets = [ "ntfy.ny4.dev" ]; };
    }
    {
      job_name = "forgejo";
      scheme = "https";
      metrics_path = "/metrics";
      static_configs = lib.singleton { targets = [ "git.ny4.dev" ]; };
    }
    {
      job_name = "miniflux";
      scheme = "https";
      metrics_path = "/metrics";
      static_configs = lib.singleton { targets = [ "rss.ny4.dev" ]; };
    }
    {
      job_name = "blackbox_exporter";
      static_configs = lib.singleton { targets = [ "127.0.0.1:${toString ports.blackbox}" ]; };
    }
    {
      job_name = "blackbox_probe";
      metrics_path = "/probe";
      params = {
        # Must match a module name defined in the blackbox exporter config.
        module = [ "http_2xx" ];
      };
      static_configs = lib.singleton {
        targets = [
          "https://blog.ny4.dev"
          "https://cinny.ny4.dev"
          "https://element.ny4.dev"
          "https://git.ny4.dev"
          "https://id.ny4.dev"
          "https://ip.ny4.dev"
          "https://mastodon.ny4.dev"
          "https://matrix.ny4.dev"
          "https://ntfy.ny4.dev"
          "https://pb.ny4.dev"
          "https://reddit.ny4.dev"
          "https://rss.ny4.dev"
          "https://vault.ny4.dev"
        ];
      };
      # Standard blackbox relabelling: the listed URL becomes the `target`
      # query parameter and the `instance` label, and the actual scrape
      # address is rewritten to the local blackbox exporter.
      relabel_configs = [
        {
          source_labels = [ "__address__" ];
          target_label = "__param_target";
        }
        {
          source_labels = [ "__param_target" ];
          target_label = "instance";
        }
        {
          target_label = "__address__";
          replacement = "127.0.0.1:${toString ports.blackbox}";
        }
      ];
    }
  ];
# Alerting rules, serialised to a single JSON string holding one rule group
# named "metrics". NodeDown, HTTPDown and MemoryFull must hold for 5 minutes
# before firing; DiskFull and UnitFailed set no `for` duration.
rules = lib.singleton (
  builtins.toJSON {
    groups = lib.singleton {
      name = "metrics";
      rules = [
        {
          alert = "NodeDown";
          expr = ''up{job="node_exporter"} == 0'';
          for = "5m";
          annotations = {
            summary = "Node exporter down on {{ $labels.instance }}";
            description = "Node exporter on {{ $labels.instance }} has been down for more than 5 minutes.";
          };
        }
        {
          alert = "HTTPDown";
          # Fires both when the probe scrape itself fails and when the probe
          # runs but reports failure.
          expr = ''up{job="blackbox_probe"} == 0 or probe_success{job="blackbox_probe"} == 0'';
          for = "5m";
          annotations = {
            summary = "HTTP probe failure on {{ $labels.instance }}";
            description = "The HTTP blackbox probe on {{ $labels.instance }} has failed for more than 5 minutes.";
          };
        }
        {
          alert = "MemoryFull";
          expr = ''node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.1'';
          for = "5m";
          annotations = {
            summary = "Low available memory on {{ $labels.instance }}";
            description = "{{ $labels.instance }} has less than 10% available memory for more than 5 minutes.";
          };
        }
        {
          alert = "DiskFull";
          # Only the /, /persist and /mnt mountpoints are checked.
          expr = ''node_filesystem_avail_bytes{mountpoint=~"/|/persist|/mnt"} / node_filesystem_size_bytes < 0.1'';
          annotations = {
            summary = "Low disk space on {{ $labels.instance }}";
            description = "The disk {{ $labels.device }} mounted at {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 10% of empty space available.";
          };
        }
        {
          alert = "UnitFailed";
          expr = ''node_systemd_unit_state{state="failed"} == 1'';
          annotations = {
            summary = "Systemd unit {{ $labels.name }} failure on {{ $labels.instance }}";
            description = "The systemd unit {{ $labels.name }} on {{ $labels.instance }} has entered a {{ $labels.state }} state.";
          };
        }
      ];
    };
  }
);
# Send alerts to the Alertmanager instance on loopback at `ports.alertmanager`.
alertmanagers = lib.singleton {
  static_configs = lib.singleton { targets = [ "127.0.0.1:${toString ports.alertmanager}" ]; };
};
# Alertmanager, bound to loopback, with a single webhook receiver named
# "ntfy" that every alert is routed to.
alertmanager = {
  enable = true;
  # NOTE(review): config checking is disabled, presumably because the
  # webhook URL below is an unexpanded placeholder at build time — confirm.
  checkConfig = false;
  listenAddress = "127.0.0.1";
  port = ports.alertmanager;
  configuration = {
    receivers = lib.singleton {
      name = "ntfy";
      webhook_configs = lib.singleton {
        # https://docs.ntfy.sh/publish/#message-templating
        # Literal placeholder; presumably substituted at service start from
        # the unit's EnvironmentFile — confirm.
        url = "$ALERTMANAGER_WEBHOOK_URL";
      };
    };
    route = {
      receiver = "ntfy";
    };
  };
};
};
# Load the alertmanager unit's environment from the file rendered by the
# sops template "alertmanager/environment". Presumably this file defines
# ALERTMANAGER_WEBHOOK_URL — confirm against the template definition.
systemd.services."alertmanager".serviceConfig.EnvironmentFile =
  config.sops.templates."alertmanager/environment".path;
# Caddy route: requests for host prom.ny4.dev are reverse-proxied to the
# Prometheus server on loopback at `ports.prometheus`.
services.caddy.settings.apps.http.servers.srv0.routes = lib.singleton {
  match = lib.singleton { host = [ "prom.ny4.dev" ]; };
  handle = lib.singleton {
    handler = "reverse_proxy";
    upstreams = [ { dial = "127.0.0.1:${toString ports.prometheus}"; } ];
  };
};
}