#!/usr/bin/env zsh
################################################################################
# Send alerts when Pressure Stall Information is high
#
# Copyright © 2023 Trey Blancher $(base64 -d <<< dHJleUBibGFuY2hlci5uZXQK)
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program.  If not, see <https://www.gnu.org/licenses/>.
#
# Submodules may be distributed under a separate software license; see the
# LICENSE file within each submodule.
#
# This script monitors the systemd journal, specifically the
# `psi-monitor.service` unit, and waits for Pressure Stall Information monitor
# events to be logged.  The monitor program is shipped in the psi-by-example
# submodule in this git repository; it is released under the three-clause BSD
# license (see its LICENSE file for details).
#
# It is designed to send desktop notifications to a desktop system; it also
# uses the local mail transport agent to send notifications via SMS and email.
# It is assumed the desktop notification system is remote, and it uses the
# local ssh client to connect to the notification daemon on the remote host.
#
# This script expects a number of environment variables to be set in the
# systemd psi-alerts@.service overrides (placed in
# /etc/systemd/system/psi-alerts@.service.d/override.conf).  See the
# README.md for details.
################################################################################

# ------------------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------------------

# systemd unit whose journal is polled for PSI monitor events.
svc="psi-monitor.service"

# PSI files.  print_psi and is_clear resolve these *by variable name* (the
# lower-cased PSI type), so the names cpu / mem / io must not be renamed.
cpu="/proc/pressure/cpu"
mem="/proc/pressure/memory"
io="/proc/pressure/io"

user="$(whoami)"
host="$(hostname)"

# Values supplied through the environment.
email_to="${EMAIL_TO}"
sms_dst="${SMS_DST}"
# Everything after the last '@' of the SMS address.
sms_domain="${SMS_DST##*@}"
ssh_port="${SSH_PORT}"
ssh_host="${SSH_HOST}"
ssh_user="${SSH_USER}"
ssh_id_path="${SSH_ID_PATH}"
clear_threshold="${CLEAR_THRESHOLD}"
notification_cmd="${NOTIFICATION_CMD}"
notification_hist_cmd="${NOTIFICATION_HIST_CMD}"
notification_opts="${NOTIFICATION_OPTS}"
id_idx="${NOTIFICATION_IDX}"

# ------------------------------------------------------------------------------
# get_ssh_agent
#
# Look for an ssh-agent socket in the /tmp/ssh-* directories owned by the
# current user and export SSH_AUTH_SOCK / SSH_AGENT_PID for it.  When several
# directories qualify, the last one (in glob order) wins.
#
# Returns: 0 if SSH_AUTH_SOCK names a socket afterwards, non-zero otherwise.
# ------------------------------------------------------------------------------
get_ssh_agent () {
  local dir
  local -a socks

  # (N): expand to nothing instead of raising "no matches found".
  for dir in /tmp/ssh-*(N); do
    [[ -O "${dir}" ]] || continue
    socks=( "${dir}"/agent.*(N) )
    # Skip directories without a socket rather than clobbering an
    # SSH_AUTH_SOCK that is already valid.
    (( ${#socks} )) || continue
    # -n: newest matching process only, so the value is a single PID.
    export SSH_AGENT_PID="$(pgrep -n -u "${user}" ssh-agent)"
    export SSH_AUTH_SOCK="${socks[-1]}"
  done

  [[ -S "${SSH_AUTH_SOCK}" ]]
}

# ------------------------------------------------------------------------------
# print_psi <psi_type>
#
# Write the PSI file for <psi_type> (CPU, MEM or IO, any letter case) to
# stdout.
#
# Returns: status of cat, or 1 for an unknown type.
# ------------------------------------------------------------------------------
print_psi () {
  local psi_var="${(L)1}"

  case "${psi_var}" in
    cpu|mem|io)
      # (P): use the value of psi_var as the name of the variable to expand.
      cat -- "${(P)psi_var}"
      ;;
    *)
      print -u2 -- "Invalid psi_type: ${1}"
      return 1
      ;;
  esac
}

# ------------------------------------------------------------------------------
# print_stats <psi_type>
#
# Write per-process statistics for <psi_type> (CPU, IO or MEM) to stdout
# using top / iotop / pidstat.  An unknown type is reported on stderr.
# ------------------------------------------------------------------------------
print_stats () {
  local psi_type="${1}"

  case "${psi_type}" in
    CPU)
      top -bcn1 -o %CPU -w 512 | head -n 30
      printf '\n\n'
      pidstat -ul --human
      ;;
    IO)
      sudo iotop --batch --only --iter=10
      printf '\n\n'
      pidstat -dl --human
      ;;
    MEM)
      top -bcn1 -o %MEM -w 512 | head -n 30
      printf '\n\n'
      pidstat -rl --human
      ;;
    *)
      print -u2 -- "Invalid psi_type: ${psi_type}"
      ;;
  esac
}

# ------------------------------------------------------------------------------
# send_notice <psi_type> [current_alarms]
#
# Run ${notification_cmd} on the remote host over ssh to raise a desktop
# notification.  An ssh-agent is preferred; the identity file in
# ${ssh_id_path} is used when no agent socket is found.
#
# Outputs: the integer printed by the remote command (the notification id).
# Returns: 0 on success, 1 when ssh fails or no ssh credentials are available.
# ------------------------------------------------------------------------------
send_notice () {
  local psi_type="${1}"
  local current_alarms="${2}"
  local psi title remote_cmd
  local -a ssh_args
  integer notification_id

  case "${psi_type}" in
    CPU|MEM|IO)
      psi="$(print_psi "${psi_type}")"
      ;;
    *)
      psi="Multiple alarms, current alarms are ${current_alarms}!"
      ;;
  esac

  if get_ssh_agent; then
    ssh_args=( -q )
  elif [[ -n "${ssh_id_path}" ]]; then
    ssh_args=( -q -i "${ssh_id_path}" )
  else
    print -u2 -- "No SSH notifications configured. Returning."
    return 1
  fi
  # Only pass -p when a port is configured; a bare -p would swallow the
  # next argument.
  if [[ -n "${ssh_port}" ]]; then
    ssh_args+=( -p "${ssh_port}" )
  fi
  ssh_args+=( "${ssh_user}@${ssh_host}" )

  # (qq): single-quote the title and body for the remote shell, escaping
  # any embedded single quotes.
  title="${host}: PSI ${psi_type} triggered!"
  remote_cmd="${notification_cmd} ${notification_opts} ${(qq)title} ${(qq)psi}"

  if ! notification_id="$(ssh "${ssh_args[@]}" "${remote_cmd}")"; then
    print -u2 -- "Connection to notification daemon failed!"
    return 1
  fi

  print -- "${notification_id}"
}

# ------------------------------------------------------------------------------
# send <psi_type> [current_alarms] <destination>
#
# Mail a PSI report to <destination> through `sendmail -t`.  If the second
# argument contains an '@' it is taken as the destination (two-argument form).
# Per-process statistics are appended unless the destination is an address in
# ${sms_domain}.
#
# Returns: status of sendmail, or 1 on bad arguments.
# ------------------------------------------------------------------------------
send () {
  if (( $# < 2 || $# > 3 )); then
    print -u2 -- "Wrong number of arguments to send()!"
    return 1
  fi

  local psi_type="${1}"
  local current_alarms="${2}"
  local dst="${3}"
  local subj p
  integer multiple=0 is_sms=0

  if [[ "${current_alarms}" == *@* ]]; then
    dst="${current_alarms}"
    current_alarms=""
  fi

  if [[ -z "${dst}" ]]; then
    print -u2 -- "send(): no destination address given"
    return 1
  fi

  if [[ -n "${current_alarms}" ]]; then
    multiple=1
    current_alarms="${current_alarms}|${psi_type}"
    subj="PSI on ${host} ${current_alarms} triggered!"
  else
    current_alarms="${psi_type}"
    subj="PSI on ${host} ${psi_type} triggered!"
  fi

  # Literal substring match: dots in the domain must not act as regex
  # wildcards, and an empty domain must not classify every address as SMS.
  if [[ -n "${sms_domain}" && "${dst}" == *"@${sms_domain}"* ]]; then
    is_sms=1
  fi

  # The message is streamed straight into sendmail, so no temporary file is
  # left behind in /tmp.
  {
    printf 'To: %s\n' "${dst}"
    # The empty line after the last header separates headers from the body.
    printf 'Subject: %s\n\n' "${subj}"

    printf 'Pressure Stall Information for %s triggered on %s at %s\n\n' \
      "${host}" "${psi_type}" "$(date +'%FT%T %Z')"
    if (( multiple )); then
      printf 'Multiple alarms triggered: %s\n\n' "${current_alarms}"
    fi
    print_psi "${psi_type}"
    printf '\n\n'

    if (( ! is_sms )); then
      # (s:|:): split the '|'-separated alarm list into words.
      for p in ${(s:|:)current_alarms}; do
        printf '\n\nStatistics info for %s\n\n' "${p}"
        print_stats "${p}"
        printf '\n\n'
      done
    fi
  } | sendmail -t
}

# ------------------------------------------------------------------------------
# is_clear <psi_type>
#
# Read the avg300 value from the line of the PSI file whose first field is
# "some" and compare it with ${clear_threshold}.
#
# Returns: 0 if avg300 is below the threshold, non-zero otherwise (including
#          an unknown type or a value that could not be read).
# ------------------------------------------------------------------------------
is_clear () {
  local psi_var="${(L)1}"
  local psi_path avg300

  case "${psi_var}" in
    cpu|mem|io) psi_path="${(P)psi_var}" ;;
    *) return 1 ;;
  esac

  avg300="$(awk '
    $1 == "some" {
      for (i = 2; i <= NF; i++) {
        if (sub(/^avg300=/, "", $i)) { print $i; exit }
      }
    }' "${psi_path}")"
  [[ -n "${avg300}" ]] || return 1

  # zsh arithmetic handles the fractional values read from the PSI file.
  (( avg300 < clear_threshold ))
}

# ------------------------------------------------------------------------------
# exec_notices <psi_type> [current_alarms]
#
# Raise the desktop notification and send the SMS and email reports for
# <psi_type> (CPU, MEM or IO).
#
# Outputs: the notification id, only when the desktop notification succeeded.
# Returns: 0 if an id was printed, 1 otherwise.
# ------------------------------------------------------------------------------
exec_notices () {
  local psi_type="${1}"
  local current_alarms="${2}"
  local dunst_id=-1

  case "${psi_type}" in
    CPU|MEM|IO)
      # Restore the -1 sentinel on failure; an empty string would compare
      # as 0 and be mistaken for a valid id.
      dunst_id="$(send_notice "${psi_type}" "${current_alarms}")" \
        || dunst_id=-1
      # stdout of this function is captured by the caller as the id, so keep
      # anything the mailer prints away from it.
      if [[ -n "${sms_dst}" ]]; then
        send "${psi_type}" "${current_alarms}" "${sms_dst}" >&2
      fi
      if [[ -n "${email_to}" ]]; then
        send "${psi_type}" "${current_alarms}" "${email_to}" >&2
      fi
      ;;
    "")
      ;;
    *)
      print -u2 -- "Something went wrong!"
      ;;
  esac

  # <-> matches a non-negative decimal integer.
  if [[ "${dunst_id}" == <-> ]]; then
    print -- "${dunst_id}"
    return 0
  fi
  return 1
}

# ------------------------------------------------------------------------------
# check_dunst_id_is_visible <dunst_id>
#
# Fetch the notification ids listed by ${notification_hist_cmd} on the remote
# host and test whether <dunst_id> is among them.
#
# Returns: 0 if the id is NOT in the history (the alert is taken to be still
#          visible), 1 if it is in the history, 2 if the remote host could not
#          be queried.
# ------------------------------------------------------------------------------
check_dunst_id_is_visible () {
  local dunst_id="${1}"
  local ids
  local remote_cmd="${notification_hist_cmd} | jq '.data[0][].id.data'"
  local -a target

  # send_notice runs inside a command substitution, so the agent variables it
  # exported never reach this shell; look the agent up again here.
  get_ssh_agent

  target=( -l "${ssh_user}" )
  if [[ -n "${ssh_port}" ]]; then
    target+=( -p "${ssh_port}" )
  fi
  target+=( "${ssh_host}" )

  if ! ids="$(ssh -q "${target[@]}" "${remote_cmd}")"; then
    # Fall back to the identity file, if one is configured.
    if [[ -z "${ssh_id_path}" ]] \
      || ! ids="$(ssh -q -i "${ssh_id_path}" "${target[@]}" "${remote_cmd}")"; then
      print -u2 -- "Connection to dunst failed!"
      return 2
    fi
  fi

  # if the alert is visible, it's not in the dunst history
  if grep -qwF -- "${dunst_id}" <<< "${ids}"; then
    return 1
  fi
  return 0
}

# ------------------------------------------------------------------------------
# main
#
# Poll the newest journal entry of ${svc} once per second.
#   * A new entry less than 3 seconds old that contains
#     "(CPU|MEM|IO) PSI event" raises the notices for that type once and adds
#     the type to the '|'-separated current_alarms list.
#   * While the newest entry is unchanged and at least 3 seconds old, an alarm
#     is cleared once its last event is at least 300 seconds old and is_clear
#     succeeds for it.
# Never returns.
# ------------------------------------------------------------------------------
main () {
  local current_alarms="" last_line="" line psi_type alarm
  local -A notice_sent secs
  local -a remaining
  # Set by the =~ operator; declared so they do not leak into global scope.
  local -a match mbegin mend
  local MATCH MBEGIN MEND
  integer last_dunst_id=-1 now last_timestamp time_diff elapsed

  while true; do
    # -q suppresses informational lines such as "-- Journal begins at ...",
    # which would otherwise end up in ${line}.
    line="$(journalctl --no-pager -q -u "${svc}" -n 1)"
    now="$(date +%s)"

    # The first three fields of the entry are handed to date(1) as the
    # timestamp.  An empty or unparsable entry is treated as stale; date's
    # complaint is discarded because that case is handled right here.
    last_timestamp=0
    if [[ -n "${line}" ]]; then
      last_timestamp="$(date -d "$(awk 'NR == 1 {print $1" "$2" "$3}' \
        <<< "${line}")" +%s 2>/dev/null)" || last_timestamp=0
    fi
    time_diff=$(( now - last_timestamp ))

    if [[ "${last_line}" == "${line}" ]]; then
      # last line hasn't changed, check to see if we can clear alarms
      if (( time_diff >= 3 )) && [[ -n "${current_alarms}" ]]; then
        remaining=()
        for alarm in ${(s:|:)current_alarms}; do
          elapsed=$(( now - ${secs[${alarm}]:-0} ))
          if (( elapsed >= 300 )) && is_clear "${alarm}"; then
            unset "notice_sent[${alarm}]"
            unset "secs[${alarm}]"
          else
            remaining+=( "${alarm}" )
          fi
        done
        # Rebuilding the list from the survivors cannot leave a stray '|'.
        current_alarms="${(j:|:)remaining}"
      fi
      sleep 1
      continue
    fi
    last_line="${line}"

    if (( time_diff < 3 )); then
      psi_type=""
      if [[ "${line}" =~ '(CPU|MEM|IO) PSI event' ]]; then
        psi_type="${match[1]}"
      fi

      if [[ -n "${psi_type}" ]]; then
        secs[${psi_type}]="${now}"

        if [[ -z "${notice_sent[${psi_type}]}" ]]; then
          # Keep the -1 sentinel when no notification id was returned.
          last_dunst_id="$(exec_notices "${psi_type}" "${current_alarms}")" \
            || last_dunst_id=-1
          notice_sent[${psi_type}]=true
        elif (( last_dunst_id >= 0 )) \
          && check_dunst_id_is_visible "${last_dunst_id}"; then
          sleep 1
          continue
        fi

        if [[ -z "${current_alarms}" ]]; then
          current_alarms="${psi_type}"
        elif [[ "|${current_alarms}|" != *"|${psi_type}|"* ]]; then
          current_alarms+="|${psi_type}"
        fi
      fi
    fi

    sleep 1
  done
}

main "$@"