#!/usr/bin/env zsh # svc="psi-monitor.service" cpu="/proc/pressure/cpu" mem="/proc/pressure/memory" io="/proc/pressure/io" email_to="trey@blancher.net" sms_dst="2517511550@msg.fi.google.com" sms_domain="msg.fi.google.com" port="5999" ssh_host="localhost" clear_threshold="5.0" notification_cmd="dunstify" notification_opts="--timeout=0 --printid --urgency=critical --icon=/usr/share/icons/breeze-dark/emblems/16/emblem-warning.svg" id_idx=15 print_psi () { local psi_file="${1}" cat "${(P)psi_file}" } print_pidstat () { local psi_type="${1}" local opts="-l --human" case "${psi_type}" in MEM) opts="-r ${opts}" ;; IO) opts="-d ${opts}" ;; CPU) opts="-u ${opts}" ;; *) print "Invalid psi_type: ${psi_type}" >&2 ;; esac pidstat "${opts}" } send_notice () { local psi_type="${1}" shift local current_alarms="" if [[ -n "${1}" ]]; then current_alarms="${1}" fi local psi case "${psi_type}" in CPU|MEM|IO) psi="$(print_psi $(tr '[[:upper:]]' '[[:lower:]]' <<< "${psi_type}"))" ;; *) psi="Multiple alarms, current alarms are ${current_alarms}!" ;; esac integer dunst_id if ! dunst_id=$(ssh -q "${ssh_host}" -p ${port} \ "${notification_cmd} ${notification_opts} 'deltachunk: PSI ${psi_type} triggered!' '${psi}'"); then print "Connection to dunst failed!" >&2 false else echo ${dunst_id} true fi } send () { if [[ "${#@}" -lt 2 ]] && [[ "${#@}" -gt 3 ]]; then echo "Wrong number of arguments to send()!" >&2 return false fi local psi_type="${1}" shift local current_alarms="${1}" shift local dst if [[ "${current_alarms}" =~ "@" ]]; then dst="${current_alarms}" unset current_alarms else dst="${1}" fi local psi="$(print_psi $(tr '[[:upper:]]' '[[:lower:]]' <<< ${psi_type}))" local subj="PSI on deltachunk ${psi_type} triggered!" local body="Pressure Stall Information triggered on ${psi_type} at $(date +'%FT%T %Z')" if [[ -n "${current_alarms}" ]]; then body="${body}\nMultiple alarms triggered: ${current_alarms}" # if this is not an SMS, include pidstat info if [[ ! "${dst}" =~ "${sms_domain}" ]]; then for p in $(tr '|' ' ' <<< "${current_alarms}"); do body="${body}\n\n$(print_pidstat ${p})" done fi fi local email=$(mktemp /tmp/psi.eml.XXXX) cat <<-EOF > ${email} ${body} EOF /usr/bin/mail --resource-files=/ \ --subject="${subj}" \ --end-options \ ${dst} < ${email} } is_clear () { local psi_type="${1}" local psi_file="$(tr '[[:upper:]]' '[[:lower:]]' <<< "${psi_type}")" local avg10=$(grep some "${(P)psi_file}" | awk '{print $2}' | awk -F= '{print $2}') local prev=$(grep some "${(P)psi_file}" | awk '{print $5}' | awk -F= '{print $2}') if [[ ${avg10} -lt ${clear_threshold} ]]; then return 0 else return 1 fi } exec_notices () { local psi_type="${1}" shift local current_alarms="${1}" local dunst_id=-1 if [[ -n "${psi_type}" ]]; then case "${psi_type}" in CPU|MEM|IO) dunst_id=$(send_notice "${psi_type}" "${current_alarms}"); send "${psi_type}" "${current_alarms}" "${sms_dst}" send "${psi_type}" "${current_alarms}" "${email_to}" ;; *) echo "Something went wrong!" >&2 false ;; esac fi if [[ "${dunst_id}" -ge 0 ]]; then print "${dunst_id}" true else false fi } check_dunst_id_is_visible () { local dunst_id="${1}" typeset -a ids if ids=$(ssh -q "${ssh_host}" -p ${port} \ "dunstctl history | jq '.data[0][][${id_idx}].data'"); then echo "Connection to dunst failed!" >&2 return 2 fi if grep -qP "\b${dunst_id}\b" <<< "${ids}"; then true else false fi } #set -x local current_alarm="" local last_alarm="" typeset -A notice_sent typeset -A secs integer last_dunst_id=-1 journalctl -b 0 -fu "${svc}" | \ while read line; do local psi_type="$(grep -Eo "(CPU|MEM|IO) PSI event" <<< "${line}" | grep -Eo "CPU|MEM|IO")" if [[ -n "${psi_type}" ]]; then secs+=(${psi_type} $(date +%s)) if [[ "${psi_type}" != "${last_alarm}" ]]; then if [[ ! ${notice_sent[${psi_type}]} ]]; then last_dunst_id=$(exec_notices "${psi_type}" "${current_alarms}") notice_sent+=(${psi_type} true) elif (( last_dunst_id >= 0 )) && ! check_dunst_id_is_visible "${last_dunst_id}"; then continue fi fi last_alarm="${psi_type}" if [[ -z "${current_alarms}" ]]; then current_alarms="${psi_type}" else if ! grep -q "${psi_type}" <<< "${current_alarms}"; then current_alarms="${current_alarms}|${psi_type}" fi fi else typeset -a alarms=( $(tr '|' ' ' <<< "$current_alarms") ) for alarm in ${alarms}; do integer elapsed=$(( $(date +%s) - ${secs[${alarm}]} )) if is_clear "${alarm}" && (( elapsed > 300 )); then current_alarms=$(sed -E "s/${alarm}\|?//" <<< "${current_alarms}") last_alarm=$(awk -F'|' '{print $NF}' <<< "${current_alarms}") fi done fi done #set +x