diff --git a/roles/float-base/files/run-node-exporter-script.sh b/roles/float-base/files/run-node-exporter-script.sh
deleted file mode 100644
index bacc64563741481588f52bf12126712404c9e9c6..0000000000000000000000000000000000000000
--- a/roles/float-base/files/run-node-exporter-script.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/sh
-#
-# Execute a metrics-generating script and safely write its
-# output to the /var/lib/prometheus/node-exporter directory.
-#
-# Uses 'runcron' for locking, so don't wrap it in another
-# 'runcron' invocation!
-#
-
-if [ $# -ne 2 ]; then
-    echo "Usage: $0 <snippet-name> <script-path>"
-    exit 2
-fi
-
-script_name="$1"
-script_path="$2"
-output_dir="/var/lib/prometheus/node-exporter"
-
-test -d $output_dir || exit 0
-
-umask 022
-
-output_file="${output_dir}/${script_name}.prom"
-tmp_file="${output_file}.$$"
-trap "rm -f $tmp_file 2>/dev/null" EXIT INT TERM
-
-runcron --no-syslog --no-metrics --splay 60 --name "node-exporter-$script_name" -- \
-    "$script_path" > "$tmp_file"
-if [ $? -gt 0 ]; then
-    rm -f "$tmp_file" 2>/dev/null
-    exit 1
-else
-    mv -f "$tmp_file" "$output_file"
-fi
-
-exit $?
diff --git a/roles/float-base/files/run-node-exporter-scripts.sh b/roles/float-base/files/run-node-exporter-scripts.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9fc01c19e0aaed0a55b02733d6708f9f5b30f80f
--- /dev/null
+++ b/roles/float-base/files/run-node-exporter-scripts.sh
@@ -0,0 +1,57 @@
+#!/bin/sh
+#
+# Run all node-exporter-scripts.
+#
+# Each script listed by run-parts in ${scripts_dir} is run in turn. When
+# a script succeeds, its stdout is moved to ${output_dir}/<name>.prom,
+# where <name> is the script file name minus its last extension.
+#
+# Exit status: the number of scripts that failed or timed out.
+#
+
+scripts_dir="/etc/prometheus/node-exporter-scripts"
+output_dir="/var/lib/prometheus/node-exporter"
+
+# Be sure to drop the current temporary file (global variable) even if
+# terminated while we're running scripts. The trap body is single-quoted
+# so that ${tmp_file} is expanded when the trap fires, not when the trap
+# is installed (the variable is still empty at that point).
+tmp_file=
+trap 'rm -f "${tmp_file}" 2>/dev/null' EXIT INT TERM
+
+# Exit with a non-zero status if any script errors out.
+exit_status=0
+
+# Run each node-exporter script, with a 120s timeout, in sequence.
+# NOTE(review): by default run-parts skips file names containing
+# characters outside [a-zA-Z0-9_-] (e.g. "foo.sh"); confirm that the
+# installed script names match, or pass --regex.
+for script in $(run-parts --list "${scripts_dir}"); do
+    script_basename=$(basename "${script}")
+    script_name="${script_basename%.*}"
+    output_file="${output_dir}/${script_name}.prom"
+    tmp_file="${output_file}.tmp.$$"
+
+    timeout 120 "${script}" > "${tmp_file}"
+    rc=$?
+    case $rc in
+        0)
+            mv -f "${tmp_file}" "${output_file}"
+            echo "${script_basename}: saved to ${output_file}"
+            ;;
+        124)
+            # 124 is the status timeout(1) reports when the limit is hit.
+            echo "error: ${script_basename}: timed out" >&2
+            exit_status=$(( ${exit_status} + 1 ))
+            ;;
+        *)
+            echo "error: ${script_basename}: failed (exit status ${rc})" >&2
+            exit_status=$(( ${exit_status} + 1 ))
+            ;;
+    esac
+
+    # No-op after a successful mv; removes partial output otherwise.
+    rm -f "${tmp_file}"
+done
+
+exit ${exit_status}
diff --git a/roles/float-base/tasks/prometheus.yml b/roles/float-base/tasks/prometheus.yml
index 9835d0a5e9c32a83550301329e10e5259e7389d3..0262ba75ef69d785dc0bf4b433fa6a478f805e63 100644
--- a/roles/float-base/tasks/prometheus.yml
+++ b/roles/float-base/tasks/prometheus.yml
@@ -47,8 +47,8 @@
 # Prometheus metrics for the local node (collected via the node exporter).
 - name: Install run-node-exporter-script wrapper
   copy:
-    src: run-node-exporter-script.sh
-    dest: "/usr/local/bin/run-node-exporter-script"
+    src: run-node-exporter-scripts.sh
+    dest: "/usr/local/bin/run-node-exporter-scripts"
     mode: 0755
 
 - name: Create node-exporter scripts directory
@@ -66,9 +66,43 @@
     mode: 0755
   loop: "{{ node_exporter_scripts }}"
 
-- name: Install node-exporter scripts cron jobs
+# Periodically run all node-exporter scripts from a single systemd timer.
+- name: Install node-exporter scripts systemd timer
   copy:
-    dest: "/etc/cron.d/node-exporter-script-{{ item | basename | regex_replace('\\..*$', '') }}"
+    dest: "/etc/systemd/system/node-exporter-scripts.timer"
     content: |
-      */20 * * * * root /usr/local/bin/run-node-exporter-script {{ item | basename | regex_replace('\..*$', '') }} /etc/prometheus/node-exporter-scripts/{{ item | basename }}
-  loop: "{{ node_exporter_scripts }}"
+      [Unit]
+      Description=Timer for node-exporter-scripts
+      [Timer]
+      OnCalendar=*:0/15
+      RandomizedDelaySec=900
+      FixedRandomDelay=true
+      Persistent=true
+      [Install]
+      WantedBy=timers.target
+  register: node_exporter_scripts_systemd_timer
+
+- name: Install node-exporter scripts systemd unit
+  copy:
+    dest: "/etc/systemd/system/node-exporter-scripts.service"
+    content: |
+      [Unit]
+      Description=Run node-exporter scripts
+      [Service]
+      Type=oneshot
+      ExecStart=/usr/local/bin/run-node-exporter-scripts
+      ProtectHome=true
+  register: node_exporter_scripts_systemd_unit
+
+# daemon_reload is only requested when one of the unit files above changed.
+- name: Reload systemd
+  systemd:
+    name: "node-exporter-scripts.timer"
+    state: started
+    enabled: true
+    daemon_reload: "{{ node_exporter_scripts_systemd_timer.changed or node_exporter_scripts_systemd_unit.changed }}"
+
+# The per-script cron jobs are superseded by the timer above.
+- name: Clean up old node-exporter cron jobs
+  shell: "rm -f /etc/cron.d/node-exporter-script-*"
+  ignore_errors: true