$ ansible-homelab-cheatsheet

20 Patterns You'll Actually Use

1. Restart a crashed container

# Ensure the container named by `container_name` is started, forcing a restart.
- name: "Restart {{ container_name }}"
  community.docker.docker_container:
    restart: true
    state: started
    name: "{{ container_name }}"

2. Handler: restart on config change

# Copy the compose file into /docker-root and notify the handler below.
- name: Deploy compose file
  notify: restart stack
  ansible.builtin.copy:
    dest: /docker-root/docker-compose.yml
    src: docker-compose.yml

handlers:
  # Brings the stack up from /docker-root when notified by the copy task.
  - name: restart stack
    ansible.builtin.command: docker compose up -d
    args:
      chdir: /docker-root

3. Disk cleanup

# Prune Docker data only when the filesystem mounted at "/" has less than
# 2 GiB (2147483648 bytes) available.
# NOTE(review): `--volumes` also removes unused volumes — confirm that no
# stopped stack keeps data in a volume this would delete.
- name: Prune Docker garbage
  when: >-
    ansible_facts.mounts
    | selectattr('mount', 'eq', '/')
    | map(attribute='size_available')
    | first < 2 * 1024 ** 3
  ansible.builtin.command: docker system prune -af --volumes

4. Backup verification (Restic)

# Run `restic check` against `restic_repo` and fail on any non-zero exit code.
- name: Verify repo integrity
  ansible.builtin.command: >-
    restic -r {{ restic_repo }} check
  environment:
    RESTIC_PASSWORD_FILE: "{{ restic_pw }}"
  register: check
  # restic uses exit code 1 for errors, including problems found by `check`,
  # so only 0 may count as a successful verification.
  failed_when: check.rc != 0
  # This task is a verification step; never report the host as changed.
  changed_when: false

5. TLS cert age check

# Read the certificate and let the module evaluate whether it is still valid
# 14 days from now; the result lands in `cert.valid_at.in_two_weeks`.
- name: Inspect homelab certificate
  community.crypto.x509_certificate_info:
    path: /etc/ssl/certs/homelab.pem
    valid_at:
      in_two_weeks: "+14d"
  register: cert

# `timedelta` is not defined in Ansible's Jinja environment, so the expiry
# window is checked through the module's `valid_at` result instead of
# date arithmetic in the condition.
- name: Fail when the certificate is not valid in 14 days
  ansible.builtin.fail:
    msg: "Expires {{ cert.not_after }}"
  when: not cert.valid_at.in_two_weeks

6. Service health check

# Probe Grafana's health endpoint, retrying up to 3 times, 5 seconds apart.
- name: Verify Grafana responds
  ansible.builtin.uri:
    url: "http://{{ ansible_host }}:3000/api/health"
    status_code: 200
    timeout: 5
  register: grafana_health
  # An explicit `until` makes the retry loop effective on ansible-core
  # releases that ignore `retries` when no `until` condition is present.
  until: grafana_health is succeeded
  retries: 3
  delay: 5

7. Fleet status via facts

# Play: print one status line per host in the inventory, built from
# `ansible_facts`.
- hosts: all
  tasks:
    # The message contains the inventory hostname, the `size_available` value
    # of the filesystem mounted at "/" (human-readable), and the uptime in
    # whole days (`uptime_seconds` integer-divided by 86400).
    - debug:
        msg: >
          {{ inventory_hostname }}:
          disk={{ ansible_facts.mounts
            | selectattr('mount','eq','/')
            | map(attribute='size_available')
            | first | human_readable }},
          uptime={{ ansible_facts.uptime_seconds
            | int // 86400 }}d

8. Rolling update (serial)

# Rolling update of node_exporter: one host at a time (`serial: 1`), and the
# run aborts on the first failed host (`max_fail_percentage: 0`).
- hosts: monitoring
  serial: 1
  max_fail_percentage: 0
  tasks:
    - name: Stop node_exporter
      ansible.builtin.systemd:
        name: node_exporter
        state: stopped

    - name: Install node_exporter binary
      ansible.builtin.copy:
        src: node_exporter
        dest: /usr/local/bin/node_exporter
        mode: "0755"

    - name: Start node_exporter
      ansible.builtin.systemd:
        name: node_exporter
        state: started

    # The service may need a moment before it listens on :9100, so poll the
    # endpoint instead of probing exactly once right after the start.
    - name: Wait for the metrics endpoint
      ansible.builtin.uri:
        url: "http://{{ ansible_host }}:9100/metrics"
        status_code: 200
      register: metrics_probe
      until: metrics_probe is succeeded
      retries: 5
      delay: 2
9. Vault secret reference

# group_vars/all/vars.yml (plaintext, committed)
# The plain variable only points at the `vault_`-prefixed one, so the secret
# value itself never appears in this file.
grafana_admin_password: "{{ vault_grafana_admin_password }}"

# group_vars/all/vault.yml (encrypted, committed)
# Shown here in decrypted form; "actual-password" stands in for the real value.
vault_grafana_admin_password: "actual-password"

10. Docker host inventory

# Inventory: a `docker_hosts` group with two members, each carrying its
# connection address and its Docker root directory.
all:
  children:
    docker_hosts:
      hosts:
        docker-host:
          docker_root: "/docker-root"
          ansible_host: "10.0.1.200"
        app-host:
          docker_root: "/docker-root"
          ansible_host: "10.0.1.70"

11. Prometheus alert rule template

# alert-rules.yml.j2
# Emits one "<name>Down" rule per entry in `monitored_services`; each entry
# supplies `name` and `job`.
groups:
  - name: homelab
    rules:
{% for svc in monitored_services %}
      - alert: {{ svc.name }}Down
        expr: up{job="{{ svc.job }}"} == 0
        for: 2m
        labels:
          severity: critical
{% endfor %}

12. Pre-flight: verify NFS mount

# Pre-flight guard: fail the task when /mnt/nas/docker-data does not exist.
- name: Confirm NAS is mounted
  ansible.builtin.stat:
    path: /mnt/nas/docker-data
  register: nas_stat
  # NOTE(review): this checks only that the path exists, not that it lives on
  # an NFS mount — confirm nothing can create this directory on the local disk.
  failed_when: not nas_stat.stat.exists

13. Deploy systemd timer

# Write the backup.timer unit with a per-host start minute inside the 02:00 hour.
- name: Create backup timer
  ansible.builtin.copy:
    dest: /etc/systemd/system/backup.timer
    mode: "0644"
    # `60 | random` yields 0..59 (the upper bound is exclusive), so every
    # minute of the hour is reachable; seeding with the hostname keeps the
    # value stable between runs. The minute is zero-padded to two digits.
    content: |
      [Unit]
      Description=Nightly backup
      [Timer]
      OnCalendar=*-*-* 02:{{ '%02d' | format(60 | random(seed=inventory_hostname)) }}:00
      RandomizedDelaySec=1800
      Persistent=true
      [Install]
      WantedBy=timers.target
  notify: reload systemd

14. Wait for container healthy

# Poll the health status of the `wiki-db` container until it reports
# "healthy", trying up to 30 times with 2 seconds between attempts.
- name: Wait for PostgreSQL ready
  # The Go template must be wrapped in raw/endraw: otherwise Ansible's Jinja
  # tries to evaluate `{{.State.Health.Status}}` itself and the task errors.
  ansible.builtin.command: >-
    docker inspect
    --format '{% raw %}{{.State.Health.Status}}{% endraw %}'
    wiki-db
  register: health
  until: health.stdout == 'healthy'
  retries: 30
  delay: 2
  # Inspecting a container changes nothing on the host.
  changed_when: false

15. Conditional package install

# Install the base tool set in one apt transaction. Passing the list to
# `name` avoids invoking the module once per package, as a loop would.
- name: Install base packages
  ansible.builtin.apt:
    name:
      - htop
      - curl
      - jq
      - restic
    state: present
    update_cache: true
    # Skip the cache refresh when the cache is younger than one hour.
    cache_valid_time: 3600

16. Reload Caddy on config change

tasks:
  # Copy the Caddyfile into place and notify the reload handler.
  - name: Install Caddyfile
    ansible.builtin.copy:
      dest: /etc/caddy/Caddyfile
      src: Caddyfile
    notify: reload caddy

handlers:
  # Asks the Caddy process inside the `caddy` container to reload its config.
  - name: reload caddy
    ansible.builtin.command:
      argv:
        - docker
        - exec
        - caddy
        - caddy
        - reload
        - --config
        - /etc/caddy/Caddyfile

17. Remediation dispatcher

# remediation/restart-container.yml
# Restart every container on `target_host` that is in the "exited" state.
- hosts: "{{ target_host }}"
  tasks:
    # The Go template is wrapped in raw/endraw so that Ansible's Jinja does
    # not try to evaluate `{{.Names}}` itself, which would error.
    - name: List exited containers
      ansible.builtin.command: >-
        docker ps -a
        --filter "status=exited"
        --format '{% raw %}{{.Names}}{% endraw %}'
      register: stopped
      # Listing containers changes nothing on the host.
      changed_when: false

    # An empty `stdout_lines` makes the loop run zero times, so no extra
    # length guard is needed.
    - name: Restart each exited container
      ansible.builtin.command: "docker restart {{ item }}"
      loop: "{{ stopped.stdout_lines }}"

18. SSH key deployment

# Add the ed25519 public key read from ~/.ssh/id_ed25519.pub to the
# authorized keys of `ansible_user`.
- name: Deploy SSH public key
  ansible.posix.authorized_key:
    state: present
    user: "{{ ansible_user }}"
    key: "{{ lookup('file', '~/.ssh/id_ed25519.pub') }}"

19. Promtail config with validation

# Render the Promtail config; the rendered file (passed as %s) is checked
# with `promtail -check-syntax` as part of the task.
- name: Render Promtail config
  notify: restart promtail
  ansible.builtin.template:
    dest: /etc/promtail/config.yml
    src: promtail-config.yml.j2
    validate: promtail -config.file=%s -check-syntax

20. Idempotent Restic repo init

# Probe the repository first. The probe never fails the play and never
# reports a change; its exit code alone decides whether to initialise.
- name: Probe restic repository
  command: "restic -r {{ repo }} snapshots"
  register: check
  changed_when: false
  failed_when: false
  environment:
    RESTIC_PASSWORD_FILE: "{{ pw_file }}"

# Runs only when the probe above exited non-zero.
- name: Initialise restic repository
  command: "restic -r {{ repo }} init"
  when: check.rc != 0
  environment:
    RESTIC_PASSWORD_FILE: "{{ pw_file }}"

5 Common Failures and the Fix

1. Container OOMKilled

docker inspect: "OOMKilled": true

# docker-compose.yml — set memory limits
services:
  grafana:
    deploy:
      resources:
        limits:
          # Read from the GRAFANA_MEM_LIMIT environment variable; falls back
          # to 2g when the variable is unset or empty.
          memory: ${GRAFANA_MEM_LIMIT:-2g}

2. Restic backup exits code 1

error: open /proc/1/map_files: permission denied

# Back up /etc and /var, excluding /proc and /sys, and accept restic's
# "partial success" exit code alongside a clean run.
- command: >
    restic backup --exclude='/proc' --exclude='/sys' /etc /var
  register: backup
  # Only exit codes other than 0 and 3 mark the task as failed.
  failed_when: backup.rc not in [0, 3]  # 3 = partial success

3. NFS mount stale after NAS reboot

ls: cannot access '/mnt/nas': Stale file handle

# Remount the NAS export at /mnt/nas with soft-mount options.
- ansible.posix.mount:
    path: /mnt/nas
    src: "{{ nas_ip }}:/volume1/shared"
    fstype: nfs
    # Presumably chosen so that NFS calls time out instead of hanging when
    # the NAS is unreachable — confirm the timeo/retrans values.
    opts: soft,timeo=100,retrans=3
    # NOTE(review): verify that `remounted` actually clears a stale handle on
    # this setup; an unmount followed by a fresh mount may be required.
    state: remounted

4. "Address already in use"

listen tcp :9100: bind: address already in use

# Terminate whatever is listening on `port`, then start `service_name`.
- name: Kill processes listening on the port
  # The `sport = :N` filter makes ss match the listening port exactly, so a
  # port such as 910 can no longer match :9100. Each PID is handed to `kill`
  # as its own argument (`xargs -r` does nothing when no PID was found), and
  # `|| true` keeps the task from failing when nothing had to be killed.
  ansible.builtin.shell: |
    ss -H -tlnp 'sport = :{{ port }}' \
      | grep -oP 'pid=\K\d+' | sort -u | xargs -r kill || true

- name: Start the service
  ansible.builtin.systemd:
    name: "{{ service_name }}"
    state: started

5. "UNREACHABLE" — SSH works manually

fatal: [host]: UNREACHABLE! => {"msg": "Failed to connect via ssh"}

# ansible.cfg
# SSH connection settings: pipelining enabled, ControlMaster connections
# persisted for 60s, and a retry count of 3.
[ssh_connection]
pipelining = True
ssh_args = -o ControlMaster=auto -o ControlPersist=60s
retries = 3

# For noexec /tmp (common in LXCs):
# Sets Ansible's remote temp directory to a path under /var/tmp.
[defaults]
remote_tmp = /var/tmp/.ansible/tmp

Directory Structure

~/ansible/
├── ansible.cfg               # SSH, vault path, defaults
├── inventory.yml             # Host map, groups by function
├── group_vars/
│   ├── all/
│   │   ├── vars.yml          # Shared config (plaintext)
│   │   └── vault.yml         # Encrypted secrets
│   └── docker_hosts.yml
├── host_vars/
│   └── docker-host.yml       # Only when a host differs
├── roles/
│   └── restic/
│       ├── tasks/main.yml
│       ├── handlers/main.yml
│       ├── templates/
│       └── defaults/main.yml
├── playbooks/                 # You run these
│   ├── deploy-monitoring.yml
│   ├── tls-cert-audit.yml
│   └── fleet-patch.yml
├── remediation/               # Alert pipeline runs these
│   ├── restart-container.yml
│   ├── cleanup-disk.yml
│   └── service-restart.yml
└── templates/
    ├── promtail-config.yml.j2
    └── restic-backup.service.j2

10 Commands You'll Use Most

Command | What it does
ansible-playbook playbooks/deploy.yml --check --diff | Dry run — show what would change
ansible-playbook playbooks/deploy.yml --limit docker-host | Run against one host only
ansible-playbook playbooks/deploy.yml --vault-password-file ~/.vault_pass | Use vault password file
ansible all -m shell -a "df -h / | tail -1" | Ad-hoc disk check across fleet
ansible all -m command -a uptime | Fleet uptime check
ansible-inventory --list --yaml | Show full inventory structure
ansible-vault encrypt group_vars/all/vault.yml | Encrypt secrets file
ansible-vault edit group_vars/all/vault.yml | Edit encrypted vault in-place
ansible all -m ping | Test connectivity to all hosts
ansible docker-host -m setup -a "filter=ansible_mounts" | Inspect host facts