1.0.0 Release IaaS

This commit is contained in:
2026-03-15 04:41:02 +09:00
commit a7365da431
292 changed files with 36059 additions and 0 deletions

View File

@@ -0,0 +1,32 @@
# Prometheus server configuration. Templated values (double-brace
# expressions) are substituted when this file is rendered.
global:
  # Scrape targets every 15 seconds (Prometheus default: 1 minute).
  scrape_interval: 15s
  # Evaluate rules every 15 seconds (Prometheus default: 1 minute).
  evaluation_interval: 15s
  # scrape_timeout is left at the global default (10s).

# Alertmanager configuration.
alerting:
  alertmanagers:
    - static_configs:
        # No Alertmanager target is configured. To enable alert delivery,
        # add entries such as "alertmanager:9093" to this list.
        - targets: []

# Rule files are loaded once and then evaluated periodically according to
# the global 'evaluation_interval'.
rule_files:
  - "/etc/prometheus/rules.yaml"

# Scrape configuration containing exactly one endpoint: Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to every time series
  # scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'.
    scheme: "https"
    tls_config:
      ca_file: "/etc/ssl/prometheus/ilnmors_root_ca.crt"
      # The target below is addressed as localhost, so the name used to
      # verify the server certificate is set explicitly here.
      server_name: "{{ infra_uri['prometheus']['domain'] }}"
    static_configs:
      - targets: ["localhost:9090"]
        # Each entry under `labels` is attached as
        # `label_name=<label_value>` to every time series scraped from
        # these targets.
        labels:
          instance: "{{ node['name'] }}"

View File

@@ -0,0 +1,38 @@
# Prometheus alerting rules (Jinja2 template). The for/endfor tags are kept
# at column 0 so that rendering does not disturb the YAML indentation.
groups:
  # One rule per host. The expression only checks that an `up` sample
  # carrying the instance label was received within the last 5 minutes;
  # the sample's value is not inspected. `or on() vector(0)` supplies a 0
  # when no such series exists, so a fully silent host still raises the alert.
  - name: node_exporters_heartbeat
    rules:
{% for instance in ['vmm', 'fw', 'infra', 'auth', 'app'] %}
      - alert: "{{ instance }}_node_exporter_down"
        expr: |
          (present_over_time(up{instance="{{ instance }}"}[5m]) or on() vector(0)) == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Exporter heartbeat is down: {{ instance }}"
          description: "{{ instance }} exporter is down for 5 mins"
{% endfor %}
  # Fires when pg_up was never 1 during the last 5 minutes. max_over_time
  # covers the case where the exporter keeps reporting pg_up == 0, and the
  # vector(0) fallback covers the case where the series is missing entirely.
  - name: postgresql_heartbeat
    rules:
      - alert: Postgresql_Down
        expr: |
          (max_over_time(pg_up{instance="infra", job="postgres"}[5m]) or on() vector(0)) == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Postgresql Heartbeat Lost: postgresql"
          description: "postgresql node is down for 5 mins."
  # One rule per certificate file. 2592000 seconds = 30 days; the condition
  # must hold for a full day before the alert fires.
  - name: Certificate_expiry_check
    rules:
{% for filename in ['root.crt', 'intermediate.crt', 'crowdsec.crt', 'blocky.crt', 'postgresql.crt', 'ldap.crt', 'prometheus.crt', 'loki.crt', 'dsm.crt'] %}
      - alert: "{{ filename | replace('.', '_') }}_is_expired_soon"
        expr: |
          max(x509_cert_not_after{filename="{{ filename }}"}) - time() < 2592000
        for: 1d
        labels:
          severity: critical
        annotations:
          summary: "{{ filename }} expires within 30 days"
          description: "{{ filename }} expires within 30 days."
{% endfor %}

View File

@@ -0,0 +1,9 @@
# TLS for the web endpoint: the server certificate and its private key.
tls_server_config:
  cert_file: '/etc/ssl/prometheus/prometheus.crt'
  key_file: '/etc/ssl/prometheus/prometheus.key'

# Optional basic authentication (disabled). Passwords are bcrypt hashes, see
# https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md#about-bcrypt
# basic_auth_users:
#   alice: $2y$10$mDwo.lAisC94iLAyP81MCesa29IzH37oigHC/42V2pdJlUprsJPze
#   bob: $2y$10$hLqFl9jSjoAAy95Z/zw8Ye8wkdMBM8c5Bn1ptYqP/AXyV0.oy0S8m

View File

@@ -0,0 +1,38 @@
# Podman Quadlet container unit for the Prometheus server.

[Quadlet]
# Do not let Quadlet add its default dependencies; the ordering this unit
# needs is declared explicitly in [Unit].
DefaultDependencies=false

[Unit]
Description=Prometheus
Wants=network-online.target
After=network-online.target

[Container]
ContainerName=prometheus
HostName=prometheus
Image=docker.io/prom/prometheus:{{ version['containers']['prometheus'] }}
Environment="TZ=Asia/Seoul"
PublishPort=9090:9090/tcp
# Host directories under the home directory (%h): the TSDB data directory
# is mounted read-write, configuration and TLS material read-only.
Volume=%h/containers/prometheus/data:/prometheus:rw
Volume=%h/containers/prometheus/etc:/etc/prometheus:ro
Volume=%h/containers/prometheus/ssl:/etc/ssl/prometheus:ro
# Command-line arguments passed to the container after the image name.
Exec=--config.file=/etc/prometheus/prometheus.yaml \
    --web.config.file=/etc/prometheus/web-config.yaml \
    --web.enable-remote-write-receiver \
    --storage.tsdb.path=/prometheus \
    --storage.tsdb.retention.time=30d \
    --storage.tsdb.retention.size=15GB \
    --storage.tsdb.wal-compression

[Service]
Restart=always
RestartSec=10s
TimeoutStopSec=120

[Install]
WantedBy=default.target