#!/usr/bin/env zsh
# Send alerts when Pressure Stall Information is high
#
# Copyright © 2023 Trey Blancher
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
#
# This script monitors the systemd journal, specifically the
# `psi-monitor.service` unit, and waits for Pressure Stall Information
# monitor events to be logged. The monitor program is shipped in the
# psi-by-example submodule in this git repository; it is released under the
# three-clause BSD license (see its LICENSE file for details).
#
# It is designed to send desktop notifications to a desktop system; it also
# uses the local mail transport agent to send notifications via SMS and email.
# It is assumed the desktop notification system is remote, and it uses the
# local ssh client to connect to the notification daemon on the remote host.
#
# This script expects a number of environment variables to be set in the
# systemd psi-alerts@.service overrides (which will be placed in
# /etc/systemd/system/psi-alerts@.service.d/override.conf). See the
# README.md for details.
svc="psi-monitor.service" cpu="/proc/pressure/cpu" mem="/proc/pressure/memory" io="/proc/pressure/io" host="$(hostname)" email_to="${EMAIL_TO}" sms_dst="${SMS_DST}" sms_domain="$(awk -F@ '{print $NF}' <<< ${SMS_DST})" ssh_port="${SSH_PORT}" ssh_host="${SSH_HOST}" ssh_user="${SSH_USER}" ssh_id_path="${SSH_ID_PATH}" clear_threshold="${CLEAR_THRESHOLD}" notification_cmd="${NOTIFICATION_CMD}" notification_opts="${NOTIFICATION_OPTS}" id_idx="${NOTIFICATION_IDX}" print_psi () { local psi_file="${1}" cat "${(P)$(tr '[[:upper:]]' '[[:lower:]]' <<< "${psi_file}")}" } print_pidstat () { local psi_type="${1}" case "${psi_type}" in CPU) pidstat -ul --human ;; IO) pidstat -dl --human ;; MEM) pidstat -rl --human ;; *) print "Invalid psi_type: ${psi_type}" >&2 ;; esac } send_notice () { #set -x local psi_type="${1}" shift local current_alarms="" if [[ -n "${1}" ]]; then current_alarms="${1}" fi local psi case "${psi_type}" in CPU|MEM|IO) psi="$(print_psi $(tr '[[:upper:]]' '[[:lower:]]' <<< "${psi_type}"))" ;; *) psi="Multiple alarms, current alarms are ${current_alarms}!" ;; esac integer notification_id if ! notification_id=$(ssh -q -i "${ssh_id_path}" "${ssh_user}@${ssh_host}" -p ${ssh_port} \ "${notification_cmd} ${notification_opts} '${host}: PSI ${psi_type} triggered!' '${psi}'"); then print "Connection to notification daemon failed!" >&2 false else echo ${notification_id} true fi #set +x } send () { #set -x if [[ "${#@}" -lt 2 ]] && [[ "${#@}" -gt 3 ]]; then echo "Wrong number of arguments to send()!" 
>&2 return false fi local psi_type="${1}" shift local current_alarms="${1}" shift local dst if [[ "${current_alarms}" =~ "@" ]]; then dst="${current_alarms}" unset current_alarms else dst="${1}" fi local psi="$(print_psi $(tr '[[:upper:]]' '[[:lower:]]' <<< ${psi_type}))" local email=$(mktemp /tmp/psi.eml.XXXX.txt) local subj printf "Pressure Stall Information for ${host} triggered on ${psi_type} at $(date +'%FT%T %Z')\n\n" > ${email} if [[ -n "${current_alarms}" ]]; then current_alarms="${current_alarms}|${psi_type}" subj="PSI on ${host} ${current_alarms} triggered!" printf "Multiple alarms triggered: ${current_alarms}\n\n" >> ${email} else subj="PSI on ${host} ${psi_type} triggered!" current_alarms="${psi_type}" fi # is this an email or SMS? if [[ ! "${dst}" =~ "@${sms_domain}" ]]; then for p in $(tr '|' ' ' <<< "${current_alarms}"); do printf "\npidstat info for ${p}\n\n" >> ${email} print_pidstat "${p}" >> ${email} printf "\n\n" >> ${email} done fi # send the message ( printf "To: ${dst}\n" printf "Subject: ${subj}\n" cat ${email} ) | sendmail -t #set +x } is_clear () { local psi_type="${1}" local psi_file="$(tr '[[:upper:]]' '[[:lower:]]' <<< "${psi_type}")" local avg10=$(grep some "${(P)psi_file}" | awk '{print $2}' | awk -F= '{print $2}') local prev=$(grep some "${(P)psi_file}" | awk '{print $5}' | awk -F= '{print $2}') if [[ ${avg10} -lt ${clear_threshold} ]]; then true else false fi } exec_notices () { local psi_type="${1}" shift local current_alarms="${1}" local dunst_id=-1 if [[ -n "${psi_type}" ]]; then case "${psi_type}" in CPU|MEM|IO) dunst_id=$(send_notice "${psi_type}" "${current_alarms}"); send "${psi_type}" "${current_alarms}" "${sms_dst}" send "${psi_type}" "${current_alarms}" "${email_to}" ;; *) echo "Something went wrong!" 
>&2 false ;; esac fi if [[ "${dunst_id}" -ge 0 ]]; then print "${dunst_id}" true else false fi } check_dunst_id_is_visible () { local dunst_id="${1}" typeset -a ids if ids=$(ssh -q "${ssh_host}" -p ${ssh_port} \ "dunstctl history | jq '.data[0][][${id_idx}].data'"); then echo "Connection to dunst failed!" >&2 return 2 fi if grep -qP "\b${dunst_id}\b" <<< "${ids}"; then true else false fi } #set -x local current_alarm="" local last_alarm="" typeset -A notice_sent typeset -A secs integer last_dunst_id=-1 journalctl -b 0 -fn 3 -u "${svc}" | \ while read line; do local psi_type="$(grep -Eo "(CPU|MEM|IO) PSI event" <<< "${line}" | grep -Eo "CPU|MEM|IO")" if [[ -n "${psi_type}" ]]; then secs+=(${psi_type} $(date +%s)) if [[ "${psi_type}" != "${last_alarm}" ]]; then if [[ ! ${notice_sent[${psi_type}]} ]]; then last_dunst_id=$(exec_notices "${psi_type}" "${current_alarms}") notice_sent+=(${psi_type} true) elif (( last_dunst_id >= 0 )) && ! check_dunst_id_is_visible "${last_dunst_id}"; then continue fi fi last_alarm="${psi_type}" if [[ -z "${current_alarms}" ]]; then current_alarms="${psi_type}" else if ! grep -q "${psi_type}" <<< "${current_alarms}"; then current_alarms="${current_alarms}|${psi_type}" fi fi else typeset -a alarms=( $(tr '|' ' ' <<< "$current_alarms") ) for alarm in ${alarms}; do integer elapsed=$(( $(date +%s) - ${secs[${alarm}]} )) if is_clear "${alarm}" && (( elapsed > 300 )); then current_alarms=$(sed -E "s/${alarm}\|?//" <<< "${current_alarms}") last_alarm=$(awk -F'|' '{print $NF}' <<< "${current_alarms}") fi done fi done #set +x