# pve_vm_status.yml
# Proxmox Per-VM CPU and Memory Usage (API-First via community.proxmox)
#
# Overview
# Query and report live CPU and memory usage for all VMs and LXCs across one or more Proxmox nodes.
# Fully API-driven using the community.proxmox collection - no SSH or sudo on nodes required.
# Safe to run repeatedly; does not modify cluster state.
#
# What this playbook does
# * Collects all VM and LXC metadata via the Proxmox REST API
# * Filters out templates
# * Enriches each item with derived fields:
# - USED_Gi (used memory in GiB)
# - MAX_Gi (configured memory in GiB)
# - USED_pct (percent of total)
# - CPU_pct (current CPU usage %)
# - vCPUs (configured vCPU count)
# * Sorts VMs (running first, then by USED_Gi descending)
# * Prints a clean, compact list - one line per VM/LXC plus overall totals
#
# Key features
# * Pure API flow - no Proxmox CLI calls, no SSH into nodes
# * Works from any Ansible control host with Proxmox API access
# * Nicely formatted human-readable output suitable for CI logs or terminal use
# * Supports multiple nodes when pm_node is omitted (auto-discovers all)
# * Idempotent and read-only
#
# Inputs & variables
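# Proxmox API connection:
# pm_api_host, pm_api_port, api_user, api_token_id,
# api_token_secret (falls back to pm_api_token_secret when undefined),
# pm_api_validate_certs, pm_node (optional)
# Optional override:
# host_ram_gib (host RAM in GiB to use for the headroom summary)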
#
# Quick start
# 1) Ensure Proxmox API token credentials are defined (vaulted or via env vars)
# 2) Define pm_api_host, pm_node, etc. in group_vars/all/main.yml
# 3) Run:
# ansible-playbook -i inventory/hosts.ini pve_vm_status.yml
#
# Notes & caveats
# * Requires collection: community.proxmox >= 1.3.0
# * Read-only - does not start, stop, or modify any VMs
# * For JSON or structured output, select a JSON stdout callback (e.g. ANSIBLE_STDOUT_CALLBACK=json) or use `register` vars; ansible-playbook has no `-o json` option
#
# Credit: Thomas Mozdren
---
- name: Proxmox per-VM current usage status (mem + cpu via community.proxmox)
  hosts: localhost
  gather_facts: false

  # API connection settings shared by both community.proxmox info modules, so
  # the tasks below only pass module-specific arguments.
  module_defaults:
    community.proxmox.proxmox_vm_info:
      api_host: "{{ pm_api_host }}"
      api_port: "{{ pm_api_port }}"
      api_user: "{{ api_user }}"
      api_token_id: "{{ api_token_id }}"
      api_token_secret: "{{ api_token_secret | default(pm_api_token_secret) }}"
      validate_certs: "{{ pm_api_validate_certs }}"
    community.proxmox.proxmox_node_info:
      api_host: "{{ pm_api_host }}"
      api_port: "{{ pm_api_port }}"
      api_user: "{{ api_user }}"
      api_token_id: "{{ api_token_id }}"
      api_token_secret: "{{ api_token_secret | default(pm_api_token_secret) }}"
      validate_certs: "{{ pm_api_validate_certs }}"

  tasks:
    # ------------------------------------------------------------------
    # Node discovery
    # ------------------------------------------------------------------
    # Connection arguments come from module_defaults above.
    - name: Get node(s) info (module returns all nodes; we filter locally)
      community.proxmox.proxmox_node_info: {}
      register: node_info

    - name: Fail if node_info contains no nodes
      ansible.builtin.fail:
        msg: "Proxmox API returned no nodes. Check API credentials / pm_api_host."
      when: node_info.proxmox_nodes | default([]) | length == 0

    - name: Initialize node RAM map (node -> max RAM GiB)
      ansible.builtin.set_fact:
        node_ram_map: {}
      changed_when: false

    # One entry per node: maxmem (bytes) converted to GiB, rounded to 2 places.
    - name: Build node_ram_map from node_info
      ansible.builtin.set_fact:
        node_ram_map: >-
          {{ node_ram_map | combine({
               (item.node | default('unknown')):
                 ((item.maxmem | default(0) | int) / (1024.0**3)) | round(2)
             }) }}
      loop: "{{ node_info.proxmox_nodes | default([]) }}"
      changed_when: false

    - name: If pm_node specified - find that node record
      ansible.builtin.set_fact:
        pm_node_record: >-
          {{ (node_info.proxmox_nodes
              | selectattr('node', 'equalto', pm_node)
              | list | first) | default({}) }}
      when: pm_node is defined and pm_node | length > 0
      changed_when: false

    - name: Fail when pm_node specified but not found
      ansible.builtin.fail:
        msg: "pm_node '{{ pm_node }}' not found in Proxmox cluster. Discovered nodes: {{ node_info.proxmox_nodes | map(attribute='node') | list }}"
      when:
        - pm_node is defined
        - pm_node | length > 0
        - pm_node_record | length == 0

    - name: Show discovered node RAM(s) or selected node
      ansible.builtin.debug:
        msg: >-
          {% if pm_node is defined and pm_node | length > 0 %}
          Node "{{ pm_node }}" host max RAM: {{ node_ram_map[pm_node] | default('UNKNOWN') }} GiB
          {% else %}
          Discovered nodes and host max RAM (GiB): {{ node_ram_map }}
          {% endif %}

    # ------------------------------------------------------------------
    # Guest inventory
    # ------------------------------------------------------------------
    # Connection arguments come from module_defaults above; `node` is omitted
    # (query every node) unless pm_node is a non-empty string.
    - name: Get VMs/LXCs (all nodes unless pm_node is defined)
      community.proxmox.proxmox_vm_info:
        type: all
        node: "{{ pm_node if (pm_node | default('') | length > 0) else omit }}"
        config: "none"
      register: vm_info

    - name: Start with an empty row list
      ansible.builtin.set_fact:
        vm_rows: []
      changed_when: false

    # Adds USED_Gi / MAX_Gi / USED_pct / CPU_pct / vCPUs to every non-template
    # guest. The percentage uses an inline `if` rather than ternary(): ternary
    # evaluates both branches, so it would still divide by zero for maxmem == 0.
    - name: Build enriched rows (filter out templates) - append per item
      ansible.builtin.set_fact:
        vm_rows: >-
          {{ vm_rows + [ item | combine({
               'USED_Gi': (item.mem | default(0) | float) / 1073741824,
               'MAX_Gi': (item.maxmem | default(0) | float) / 1073741824,
               'USED_pct': (
                 ((item.mem | default(0) | float) / (item.maxmem | float) * 100.0)
                 if (item.maxmem | default(0) | float) > 0 else 0.0
               ),
               'CPU_pct': (item.cpu | default(0) | float) * 100.0,
               'vCPUs': (item.maxcpu | default(item.cpus | default(0)))
             }) ] }}
      loop: >-
        {{ (vm_info.proxmox_vms | default([]))
           | rejectattr('template', 'equalto', true) | list }}
      no_log: true
      changed_when: false

    - name: Order rows (running first, then by USED_Gi desc)
      ansible.builtin.set_fact:
        vm_sorted: >-
          {{
            ((vm_rows | selectattr('status', 'equalto', 'running') | list)
              | sort(attribute='USED_Gi', reverse=true))
            +
            ((vm_rows | rejectattr('status', 'equalto', 'running') | list)
              | sort(attribute='USED_Gi', reverse=true))
          }}
      changed_when: false

    # ------------------------------------------------------------------
    # Report: one line per guest, then totals
    # ------------------------------------------------------------------
    # The vars below are evaluated lazily, so node_current_ram_used (set by the
    # second task of the block) is available by the time "Totals" reads them.
    # Every value is cast before arithmetic/formatting because templated vars
    # may come back as strings.
    - name: Print simple list (one line per VM, then totals)
      vars:
        # Prefer host-reported used RAM when available, otherwise fall back to
        # the sum of guest USED_Gi.
        total_used_ram_gi: >-
          {{ (node_current_ram_used | float)
             if (node_current_ram_used is defined and (node_current_ram_used | float) > 0)
             else (vm_sorted | map(attribute='USED_Gi') | sum) }}
        # Selected node's RAM, or the sum over all discovered nodes when
        # pm_node is omitted.
        total_max_ram_gi: >-
          {{ (node_ram_map[pm_node] | default(0.0))
             if (pm_node | default('') | length > 0)
             else (node_ram_map | dict2items | map(attribute='value') | map('float') | sum) }}
        # Percentage of the displayed totals; 0.0 when the maximum is unknown.
        used_pct_total: >-
          {{ ((total_used_ram_gi | float) / (total_max_ram_gi | float) * 100.0)
             if (total_max_ram_gi | float) > 0 else 0.0 }}
        running_list: "{{ vm_sorted | selectattr('status', 'equalto', 'running') | list }}"
        running_cpu_sum: "{{ running_list | map(attribute='CPU_pct') | list | sum | default(0.0) }}"
        running_count: "{{ running_list | length | int }}"
        # Mean CPU% over running guests; 0.0 when nothing is running.
        avg_cpu_running: >-
          {{ ((running_cpu_sum | float) / (running_count | int))
             if (running_count | int) > 0 else 0.0 }}
      block:
        # Reuses the precomputed USED_Gi / MAX_Gi so a guest without mem/maxmem
        # still prints a line.
        - name: VM line (node/vmid name status CPU% RAMused/RAMmax)
          ansible.builtin.debug:
            msg: >-
              {{
                "%s/%s %-32s %-7s CPU:%.1f%% RAM: %.1fGi/%.1fGi (%.1f%%)" | format(
                  item.node | default('-'),
                  item.vmid | string,
                  (item.name | default('-'))[:32],
                  item.status | default('-'),
                  item.CPU_pct | default(0.0) | float,
                  item.USED_Gi | default(0.0) | float,
                  item.MAX_Gi | default(0.0) | float,
                  item.USED_pct | default(0.0) | float
                )
              }}
          loop: "{{ vm_sorted }}"
          loop_control:
            label: "{{ item.node }}/{{ item.vmid }}"

        # Used memory in GiB for the selected node, or summed over all nodes.
        # The quotient is parenthesised so round(2) applies to the result and
        # not just to the divisor; nodes without a `mem` value count as 0.
        - name: Get proxmox nodes current ram used
          ansible.builtin.set_fact:
            node_current_ram_used: >-
              {%- if pm_node is defined and pm_node | length > 0 -%}
              {{ ((node_info.proxmox_nodes
                    | selectattr('node', 'equalto', pm_node)
                    | map(attribute='mem', default=0)
                    | list | first | default(0) | float)
                  / (1024.0**3)) | round(2) }}
              {%- else -%}
              {{ ((node_info.proxmox_nodes
                    | map(attribute='mem', default=0)
                    | map('float') | sum)
                  / (1024.0**3)) | round(2) }}
              {%- endif -%}
          changed_when: false

        - name: Totals
          ansible.builtin.debug:
            msg: >-
              {{
                "Total: %d guests | Running: %d | Stopped: %d | RAM Used: %.1f Gi / %.1f Gi (%.1f%%) | Avg CPU (running): %.1f%%" |
                format(
                  (vm_sorted | length),
                  (running_count | int),
                  ((vm_sorted | length) - (running_count | int)),
                  (total_used_ram_gi | float),
                  (total_max_ram_gi | float),
                  (used_pct_total | float),
                  (avg_cpu_running | float)
                )
              }}

    # ------------------------------------------------------------------
    # Headroom: configured guest maximums vs. host RAM
    # ------------------------------------------------------------------
    # Sum of maxmem (bytes) over everything the API returned.
    - name: Compute guests_max_bytes
      ansible.builtin.set_fact:
        guests_max_bytes: "{{ vm_info.proxmox_vms | default([]) | map(attribute='maxmem') | select('defined') | map('int') | sum }}"
      changed_when: false

    - name: Compute guests_max_gib
      ansible.builtin.set_fact:
        guests_max_gib: "{{ ((guests_max_bytes | float) / (1024.0**3)) | round(2) }}"
      changed_when: false

    - name: Build running/all MAX sums and running list (use maxmem bytes -> GiB)
      ansible.builtin.set_fact:
        running_list: "{{ vm_sorted | selectattr('status', 'equalto', 'running') | list }}"
        running_max_sum: >-
          {{ ((vm_sorted
                | selectattr('status', 'equalto', 'running')
                | map(attribute='maxmem', default=0) | map('int') | sum)
              / (1024.0**3)) | float | round(2) }}
        all_max_sum: >-
          {{ ((vm_sorted | map(attribute='maxmem', default=0) | map('int') | sum)
              / (1024.0**3)) | float | round(2) }}
      changed_when: false

    # resolved_host_ram_gib precedence: explicit host_ram_gib, then the selected
    # node's API value, then the sum of all guest maximums.
    - name: If host_ram_gib explicitly provided in vars, use it
      ansible.builtin.set_fact:
        resolved_host_ram_gib: "{{ (host_ram_gib | float) }}"
      when: host_ram_gib is defined

    - name: Else if pm_node specified (and node_ram_map has entry) use node's API value
      ansible.builtin.set_fact:
        resolved_host_ram_gib: "{{ (node_ram_map[pm_node] | float) }}"
      when:
        - resolved_host_ram_gib is not defined
        - pm_node is defined
        - pm_node | length > 0

    - name: Else fallback to conservative sum-of-all-guests MAXs
      ansible.builtin.set_fact:
        resolved_host_ram_gib: "{{ (all_max_sum | float) }}"
      when: resolved_host_ram_gib is not defined

    # Facts set from templates may be strings, so cast before subtracting (the
    # comparisons below already did).
    - name: Compute headroom and status facts
      ansible.builtin.set_fact:
        headroom_running: "{{ ((resolved_host_ram_gib | float) - (running_max_sum | float)) | round(2) }}"
        headroom_all: "{{ ((resolved_host_ram_gib | float) - (all_max_sum | float)) | round(2) }}"
        status_running: "{{ (resolved_host_ram_gib | float >= running_max_sum | float) | ternary('OK', 'TIGHT/RISK') }}"
        status_all: "{{ (resolved_host_ram_gib | float >= all_max_sum | float) | ternary('OK', 'TIGHT/RISK') }}"
      changed_when: false

    - name: Show headroom summary (explicit and safe)
      ansible.builtin.debug:
        msg:
          - "Running guests : {{ running_list | length }} | Running MAX sum : {{ running_max_sum }} Gi"
          - "Headroom (all running at MAX) : {{ headroom_running }} Gi | Status: {{ status_running }}"
          - "All guests : {{ vm_sorted | length }} | All MAX sum : {{ all_max_sum }} Gi"
          - "Headroom (all guests at MAX) : {{ headroom_all }} Gi | Status: {{ status_all }}"
# PLAY 2: Fleet health check — SSH reachability + critical service states
#
# Connects to every host in [lxcs] and [vms] groups, gathers service facts,
# and reports whether expected services are running. Unreachable hosts are
# caught and surfaced in the consolidated summary at the end.
#
# Run stand-alone (skips the Proxmox API play above):
# ansible-playbook pve_vm_status.yml --tags health
- name: Fleet health check — per-host SSH and service status
  hosts: lxcs:vms
  gather_facts: true
  become: false
  # Keep the play going when a host cannot be reached.
  ignore_unreachable: true
  tags: [health]
  vars:
    # Required on every apt-based host; if not running → DEGRADED
    _required_all: [ssh]
    # Required on full VMs only (not LXC containers)
    _required_vms: [qemu-guest-agent]
    # Informational only — reported when present, silently skipped when not
    _optional_svcs:
      - wazuh-agent
      - wazuh-manager
      - nginx
      - authelia
      - grafana-server
      - prometheus
    # Package-manager switches used by every `when` below; both are false when
    # ansible_pkg_mgr is undefined (no facts gathered for the host).
    _is_apt: "{{ ansible_pkg_mgr | default('') == 'apt' }}"
    _is_apk: "{{ ansible_pkg_mgr | default('') == 'apk' }}"
    # True for members of the `vms` group; a missing group is an empty list.
    _is_vm: "{{ inventory_hostname in (groups['vms'] | default([])) }}"

  tasks:
    - name: "[health] Gather service facts (apt-based hosts)"
      ansible.builtin.service_facts:
      when: _is_apt | bool

    # Unit state from service_facts, or a placeholder when the unit is absent.
    - name: "[health] Evaluate required services"
      ansible.builtin.set_fact:
        _ssh_state: "{{ ansible_facts.services['ssh.service'].state | default('not found') }}"
        _sshd_state: "{{ ansible_facts.services['sshd.service'].state | default('not found') }}"
        _qga_state: "{{ ansible_facts.services['qemu-guest-agent.service'].state | default('N/A') }}"
      when: _is_apt | bool

    # default(omit) leaves the fact unset when the unit is not present, so the
    # print task can fall back to 'not installed'.
    - name: "[health] Evaluate optional services"
      ansible.builtin.set_fact:
        _opt_wazuh_agent: "{{ ansible_facts.services['wazuh-agent.service'].state | default(omit) }}"
        _opt_wazuh_mgr: "{{ ansible_facts.services['wazuh-manager.service'].state | default(omit) }}"
        _opt_nginx: "{{ ansible_facts.services['nginx.service'].state | default(omit) }}"
        _opt_authelia: "{{ ansible_facts.services['authelia.service'].state | default(omit) }}"
        _opt_grafana: "{{ ansible_facts.services['grafana-server.service'].state | default(omit) }}"
        _opt_prometheus: "{{ ansible_facts.services['prometheus.service'].state | default(omit) }}"
      when: _is_apt | bool

    # SSH can be named 'ssh' or 'sshd' depending on distro; treat either as OK.
    # Hosts in the `vms` group additionally need qemu-guest-agent running.
    - name: "[health] Determine overall host status"
      ansible.builtin.set_fact:
        _health_ok: >-
          {{
            ((_ssh_state | default('') == 'running') or (_sshd_state | default('') == 'running'))
            and
            (
              (not (_is_vm | bool))
              or (_qga_state | default('') == 'running')
            )
          }}
      when: _is_apt | bool

    - name: "[health] Set Alpine host health"
      ansible.builtin.set_fact:
        _health_ok: true  # SSH reachable; service_facts not checked on apk hosts
      when: _is_apk | bool

    - name: "[health] Print per-host status (Debian/Ubuntu)"
      ansible.builtin.debug:
        msg:
          - "Host : {{ inventory_hostname }} ({{ ansible_host }})"
          - "OS : {{ ansible_distribution }} {{ ansible_distribution_version }}"
          - "ssh : {{ (_ssh_state | default('') == 'running' or _sshd_state | default('') == 'running') | ternary('running', 'NOT RUNNING') }}"
          - "qemu-ga : {{ _qga_state | default('N/A (LXC)') }}{{ ((_is_vm | bool) and _qga_state | default('') != 'running') | ternary(' ← WARN', '') }}"
          - "wazuh-agent: {{ _opt_wazuh_agent | default('not installed') }}"
          - "wazuh-mgr : {{ _opt_wazuh_mgr | default('not installed') }}"
          - "nginx : {{ _opt_nginx | default('not installed') }}"
          - "authelia : {{ _opt_authelia | default('not installed') }}"
          - "grafana : {{ _opt_grafana | default('not installed') }}"
          - "prometheus : {{ _opt_prometheus | default('not installed') }}"
          - "STATUS : {{ _health_ok | default(false) | ternary('OK', 'DEGRADED') }}"
      when: _is_apt | bool

    - name: "[health] Print Alpine host status"
      ansible.builtin.debug:
        msg: "{{ inventory_hostname }} ({{ ansible_host }}) | SSH REACHABLE | Alpine — service_facts skipped"
      when: _is_apk | bool
# PLAY 3: Consolidated fleet summary (runs on control node from hostvars)
- name: Fleet health — consolidated summary
  hosts: localhost
  gather_facts: false
  tags: [health]
  tasks:
    # Builds one table row per fleet host from hostvars and prints them between
    # a header and footer line. Status column:
    #   OK / DEGRADED - _health_ok was set for the host
    #   UNCHECKED     - facts exist (ansible_pkg_mgr defined) but no _health_ok
    #   UNREACHABLE   - neither is available
    - name: "[health] Print consolidated fleet table"
      vars:
        # Union of both groups; `unique` keeps a host that is in both groups
        # from being listed twice.
        _fleet: "{{ ((groups['lxcs'] | default([])) + (groups['vms'] | default([]))) | unique }}"
        _hdr: "=== Fleet Health Summary ============================================"
        _ftr: "====================================================================="
      ansible.builtin.debug:
        msg: >-
          {%- set ns = namespace(rows=[_hdr]) -%}
          {%- for h in _fleet -%}
            {%- set hv = hostvars[h] -%}
            {%- set ip = hv.ansible_host | default('?') -%}
            {%- if hv._health_ok is defined -%}
              {%- set st = hv._health_ok | ternary('OK ', 'DEGRADED ') -%}
              {%- set os = (hv.ansible_distribution | default('?') + ' ' + hv.ansible_distribution_version | default(''))[:14] -%}
            {%- elif hv.ansible_pkg_mgr is defined -%}
              {%- set st = 'UNCHECKED' -%}
              {%- set os = (hv.ansible_distribution | default('?') + ' ' + hv.ansible_distribution_version | default(''))[:14] -%}
            {%- else -%}
              {%- set st = 'UNREACHABLE' -%}
              {%- set os = '?' -%}
            {%- endif -%}
            {%- set _ = ns.rows.append('%-40s %-16s %-15s %s' | format(h, ip, os, st)) -%}
          {%- endfor -%}
          {%- set _ = ns.rows.append(_ftr) -%}
          {{ ns.rows | join('\n') }}