# Force-restart a container by name. `state: started` alone is a no-op for
# an already-running container; `restart: true` forces the bounce either way.
- name: Restart {{ container_name }}
  community.docker.docker_container:
    name: "{{ container_name }}"
    state: started
    restart: true
tasks:
  # Copy is idempotent: the handler only fires when the file content changed.
  - name: Deploy compose file
    ansible.builtin.copy:
      src: docker-compose.yml
      dest: /docker-root/docker-compose.yml
    notify: restart stack

handlers:
  # `docker compose up -d` reconciles only containers whose definition
  # changed, so this is safe to run repeatedly.
  - name: restart stack
    ansible.builtin.command:
      cmd: docker compose up -d
      chdir: /docker-root
# Reclaim space only when / has under 2 GiB (2147483648 bytes) free.
# NOTE: `-af --volumes` also deletes unused named volumes — destructive
# by design for this remediation path.
- name: Prune Docker garbage
  ansible.builtin.command: >
    docker system prune -af --volumes
  when: >
    ansible_facts.mounts
    | selectattr('mount', 'eq', '/')
    | map(attribute='size_available')
    | first < 2147483648
- name: Verify repo integrity
  ansible.builtin.command: >
    restic -r {{ restic_repo }} check
  environment:
    RESTIC_PASSWORD_FILE: "{{ restic_pw }}"
  register: check
  # read-only verification — never report "changed"
  changed_when: false
  # restic check exits 1 when the repository HAS errors; tolerating rc 1
  # would defeat the integrity check entirely. Only rc 0 is healthy.
  failed_when: check.rc != 0
- name: Read certificate metadata
  community.crypto.x509_certificate_info:
    path: /etc/ssl/certs/homelab.pem
  register: cert

# Fail when the cert expires within 14 days. Jinja/Ansible templates do not
# expose timedelta(), so compare remaining lifetime in seconds instead, and
# use now(utc=true) since not_after ('%Y%m%d%H%M%SZ') is a UTC timestamp.
- name: Fail when certificate is close to expiry
  ansible.builtin.fail:
    msg: "Expires {{ cert.not_after }}"
  when: >-
    ((cert.not_after | to_datetime('%Y%m%d%H%M%SZ')) - now(utc=true)).total_seconds()
    < (14 * 86400)
- name: Verify Grafana responds
  ansible.builtin.uri:
    url: "http://{{ ansible_host }}:3000/api/health"
    status_code: 200
    timeout: 5
  register: grafana_health
  # retries/delay are only honoured together with an `until` condition on
  # ansible-core < 2.16 — without it the probe fails on the first attempt.
  until: grafana_health is succeeded
  retries: 3
  delay: 5
- hosts: all
  tasks:
    # One-line health summary per host: free space on / and whole days of
    # uptime (integer division by 86400 seconds).
    - name: Fleet health summary
      ansible.builtin.debug:
        msg: >-
          {{ inventory_hostname }}:
          disk={{ ansible_facts.mounts
                  | selectattr('mount', 'eq', '/')
                  | map(attribute='size_available')
                  | first | human_readable }},
          uptime={{ ansible_facts.uptime_seconds | int // 86400 }}d
- hosts: monitoring
  serial: 1                 # upgrade one host at a time
  max_fail_percentage: 0    # abort the whole rollout on the first failure
  tasks:
    # Stop before copying — overwriting a running binary raises ETXTBSY.
    - name: Stop node_exporter
      ansible.builtin.systemd:
        name: node_exporter
        state: stopped
    - name: Install new binary
      ansible.builtin.copy:
        src: node_exporter
        dest: /usr/local/bin/node_exporter
        mode: "0755"
    - name: Start node_exporter
      ansible.builtin.systemd:
        name: node_exporter
        state: started
    - name: Verify metrics endpoint
      ansible.builtin.uri:
        url: "http://{{ ansible_host }}:9100/metrics"
        status_code: 200
      register: metrics_probe
      # give the exporter a moment to bind before failing the rollout
      until: metrics_probe is succeeded
      retries: 3
      delay: 2
# group_vars/all/vars.yml (plaintext, committed)
# Indirection pattern: the plaintext var references the vault-encrypted var,
# so `grep grafana_admin_password` still reveals where the value comes from.
grafana_admin_password: "{{ vault_grafana_admin_password }}"
# group_vars/all/vault.yml (encrypted, committed)
vault_grafana_admin_password: "actual-password"
all:
  children:
    docker_hosts:
      vars:
        # shared across the group — override per host only when it differs
        docker_root: /docker-root
      hosts:
        docker-host:
          ansible_host: 10.0.1.200
        app-host:
          ansible_host: 10.0.1.70
# alert-rules.yml.j2
# Jinja statement lines sit flush-left so Ansible's trim_blocks renders
# clean YAML with no stray indentation in the output.
groups:
  - name: homelab
    rules:
{% for svc in monitored_services %}
      - alert: {{ svc.name }}Down
        expr: up{job="{{ svc.job }}"} == 0
        for: 2m
        labels: { severity: critical }
{% endfor %}
# Guard task: abort early if the NAS path is absent rather than letting
# later tasks write into an empty local mountpoint.
- name: Confirm NAS is mounted
  ansible.builtin.stat:
    path: /mnt/nas/docker-data
  register: nas_stat
  failed_when: not nas_stat.stat.exists
- name: Create backup timer
  ansible.builtin.copy:
    dest: /etc/systemd/system/backup.timer
    content: |
      [Unit]
      Description=Nightly backup

      [Timer]
      # Deterministic per-host minute (seeded by hostname) spreads NAS load;
      # zero-padded so the calendar spec is always two digits.
      OnCalendar=*-*-* 02:{{ '%02d' | format(59 | random(seed=inventory_hostname)) }}:00
      RandomizedDelaySec=1800
      Persistent=true

      [Install]
      WantedBy=timers.target
  notify: reload systemd
- name: Wait for PostgreSQL ready
  # {% raw %} stops Jinja from templating Docker's Go-template braces —
  # without it, {{.State.Health.Status}} never reaches the docker CLI.
  ansible.builtin.command: >-
    docker inspect
    --format '{% raw %}{{.State.Health.Status}}{% endraw %}'
    wiki-db
  register: health
  changed_when: false
  until: health.stdout == 'healthy'
  retries: 30
  delay: 2
- name: Install base packages
  ansible.builtin.apt:
    # pass the list directly: one apt transaction instead of one run per
    # package (faster, and dependency resolution happens once)
    name:
      - htop
      - curl
      - jq
      - restic
    state: present
    update_cache: true
    cache_valid_time: 3600
tasks:
  - name: Deploy Caddyfile
    ansible.builtin.copy:
      src: Caddyfile
      dest: /etc/caddy/Caddyfile
    notify: reload caddy

handlers:
  # graceful reload inside the container — no downtime, no restart
  - name: reload caddy
    ansible.builtin.command: >
      docker exec caddy caddy reload
      --config /etc/caddy/Caddyfile
# remediation/restart-container.yml
- hosts: "{{ target_host }}"
  tasks:
    - name: List exited containers
      # {% raw %} keeps Jinja from templating Docker's Go-template braces.
      ansible.builtin.command: >-
        docker ps -a
        --filter "status=exited"
        --format '{% raw %}{{.Names}}{% endraw %}'
      register: stopped
      changed_when: false
    - name: Restart each stopped container
      ansible.builtin.command: "docker restart {{ item }}"
      # no guard needed: looping over an empty list runs zero times
      loop: "{{ stopped.stdout_lines }}"
# Push the workstation's public key to the remote user's authorized_keys.
# lookup('file', ...) reads from the CONTROL node, not the managed host.
- name: Authorize workstation SSH key
  ansible.posix.authorized_key:
    user: "{{ ansible_user }}"
    key: "{{ lookup('file', '~/.ssh/id_ed25519.pub') }}"
    state: present
- name: Deploy promtail config
  ansible.builtin.template:
    src: promtail-config.yml.j2
    dest: /etc/promtail/config.yml
    # %s is replaced with the staged temp file; a config that fails the
    # syntax check never reaches dest, so the running service stays intact
    validate: promtail -config.file=%s -check-syntax
  notify: restart promtail
# Idempotent repo bootstrap: probe first, init only when the probe fails.
- name: Check whether restic repo exists
  ansible.builtin.command: restic -r {{ repo }} snapshots
  environment:
    RESTIC_PASSWORD_FILE: "{{ pw_file }}"
  register: check
  failed_when: false      # a missing repo is expected, not an error
  changed_when: false

- name: Initialise restic repo
  ansible.builtin.command: restic -r {{ repo }} init
  environment:
    RESTIC_PASSWORD_FILE: "{{ pw_file }}"
  when: check.rc != 0
# Symptom — `docker inspect` reports: "OOMKilled": true
# docker-compose.yml — set memory limits
services:
  grafana:
    deploy:
      resources:
        limits:
          # overridable per host via env var; defaults to 2g
          memory: ${GRAFANA_MEM_LIMIT:-2g}
# Symptom — restic fails with: error: open /proc/1/map_files: permission denied
- name: Run restic backup
  # virtual filesystems are unreadable and pointless to back up
  ansible.builtin.command: >-
    restic backup --exclude='/proc' --exclude='/sys' /etc /var
  register: backup
  failed_when: backup.rc not in [0, 3]  # 3 = partial success (some files unreadable)
# Symptom — ls: cannot access '/mnt/nas': Stale file handle
# NOTE(review): `state: remounted` issues `mount -o remount`, which often
# cannot clear a stale NFS handle — if this recurs, split into
# `state: unmounted` followed by `state: mounted`. Verify against the NAS.
- name: Remount NAS export
  ansible.posix.mount:
    path: /mnt/nas
    src: "{{ nas_ip }}:/volume1/shared"
    fstype: nfs
    # soft + short timeouts: I/O errors instead of processes hung in D state
    opts: soft,timeo=100,retrans=3
    state: remounted
# Symptom — node_exporter exits with: listen tcp :9100: bind: address already in use
- name: Kill whatever holds the port
  # head -n1 guards against ss matching several pids: a multi-line $pid
  # would otherwise be passed to kill as one invalid argument
  ansible.builtin.shell: |
    pid=$(ss -tlnp | grep ':{{ port }}' | grep -oP 'pid=\K\d+' | head -n1)
    [ -n "$pid" ] && kill "$pid" || true

- name: Start the service
  ansible.builtin.systemd:
    name: "{{ service_name }}"
    state: started
# Symptom — fatal: [host]: UNREACHABLE! => {"msg": "Failed to connect via ssh"}
# ansible.cfg
[ssh_connection]
# pipelining cuts SSH round-trips per task (requires sudoers without requiretty)
pipelining = True
# multiplex SSH sessions; keep the control socket alive 60s between tasks
ssh_args = -o ControlMaster=auto -o ControlPersist=60s
# retry transient connection failures before marking a host UNREACHABLE
retries = 3
# For noexec /tmp (common in LXCs):
[defaults]
remote_tmp = /var/tmp/.ansible/tmp
~/ansible/
├── ansible.cfg               # SSH, vault path, defaults
├── inventory.yml             # Host map, groups by function
├── group_vars/
│   ├── all/
│   │   ├── vars.yml          # Shared config (plaintext)
│   │   └── vault.yml         # Encrypted secrets
│   └── docker_hosts.yml
├── host_vars/
│   └── docker-host.yml       # Only when a host differs
├── roles/
│   └── restic/
│       ├── tasks/main.yml
│       ├── handlers/main.yml
│       ├── templates/
│       └── defaults/main.yml
├── playbooks/                # You run these
│   ├── deploy-monitoring.yml
│   ├── tls-cert-audit.yml
│   └── fleet-patch.yml
├── remediation/              # Alert pipeline runs these
│   ├── restart-container.yml
│   ├── cleanup-disk.yml
│   └── service-restart.yml
└── templates/
    ├── promtail-config.yml.j2
    └── restic-backup.service.j2
| Command | What it does |
|---|---|
| `ansible-playbook playbooks/deploy.yml --check --diff` | Dry run — show what would change |
| `ansible-playbook playbooks/deploy.yml --limit docker-host` | Run against one host only |
| `ansible-playbook playbooks/deploy.yml --vault-password-file ~/.vault_pass` | Use vault password file |
| `ansible all -m shell -a "df -h / \| tail -1"` | Ad-hoc disk check across fleet |
| `ansible all -m command -a uptime` | Fleet uptime check |
| `ansible-inventory --list --yaml` | Show full inventory structure |
| `ansible-vault encrypt group_vars/all/vault.yml` | Encrypt secrets file |
| `ansible-vault edit group_vars/all/vault.yml` | Edit encrypted vault in-place |
| `ansible all -m ping` | Test connectivity to all hosts |
| `ansible docker-host -m setup -a "filter=ansible_mounts"` | Inspect host facts |