psi-alerts/psi-alerts.sh

225 lines
5.6 KiB
Bash
Raw Normal View History

2023-08-07 22:20:41 -04:00
#!/usr/bin/env zsh
#
# --- Configuration ----------------------------------------------------------
# systemd unit whose journal is followed for "<TYPE> PSI event" lines.
svc="psi-monitor.service"
# Pressure-stall files. print_psi and is_clear look these up indirectly by
# variable name (cpu / mem / io), so the variable names themselves matter.
cpu="/proc/pressure/cpu"
mem="/proc/pressure/memory"
io="/proc/pressure/io"
# Mail recipients handed to send(): a regular mailbox and an SMS gateway.
email_to="trey@blancher.net"
sms_dst="2517511550@msg.fi.google.com"
# Destinations containing this domain are treated as SMS by send(), which
# then leaves the pidstat output out of the message body.
sms_domain="msg.fi.google.com"
# ssh endpoint used to run the notification and dunstctl commands.
port="5999"
ssh_host="localhost"
# is_clear reports an alarm as clear when the "some" avg10 value is below this.
clear_threshold="5.0"
# Command and options executed over ssh by send_notice.
notification_cmd="dunstify"
notification_opts="--timeout=0 --printid --urgency=critical --icon=/usr/share/icons/breeze-dark/emblems/16/emblem-warning.svg"
# Index spliced into the jq filter in check_dunst_id_is_visible.
# NOTE(review): presumably the position of the id field in a dunst history
# entry - confirm against the dunstctl output of the installed version.
id_idx=15
# Print the contents of one pressure-stall file.
# Arguments: $1 - NAME of the global variable (cpu, mem, io) holding the path.
# Outputs:   the file contents on stdout.
print_psi () {
  local var_name="${1}"
  # (P) is zsh indirection: expand the variable whose name is in var_name.
  local pressure_path="${(P)var_name}"
  cat "${pressure_path}"
}
# Run pidstat with the report selector that matches a PSI type.
# Arguments: $1 - PSI type: MEM (-r), IO (-d) or CPU (-u).
# Outputs:   pidstat's report on stdout; a diagnostic on stderr for a bad type.
# Returns:   pidstat's exit status, or 1 when the PSI type is not recognised.
print_pidstat () {
  local psi_type="${1}"
  # Options live in an array so each flag reaches pidstat as its own
  # argument; a single quoted string would arrive as one bogus argument.
  local -a opts
  opts=(-l --human)
  case "${psi_type}" in
    MEM)
      opts=(-r "${opts[@]}")
      ;;
    IO)
      opts=(-d "${opts[@]}")
      ;;
    CPU)
      opts=(-u "${opts[@]}")
      ;;
    *)
      printf '%s\n' "Invalid psi_type: ${psi_type}" >&2
      # Do not run pidstat without a report selector.
      return 1
      ;;
  esac
  pidstat "${opts[@]}"
}
# Raise a desktop notification on the remote dunst instance via ssh.
# Arguments: $1 - PSI type (CPU, MEM, IO); any other value selects the
#                 "multiple alarms" message
#            $2 - optional list of current alarms, used in that message
# Outputs:   the id printed by the notification command on stdout
# Returns:   0 on success, 1 when the ssh command fails
send_notice () {
  local psi_type="${1}"
  local current_alarms="${2}"
  local psi
  local psi_var

  case "${psi_type}" in
    CPU|MEM|IO)
      # The pressure file variables are named after the lower-cased type.
      psi_var="$(tr '[[:upper:]]' '[[:lower:]]' <<< "${psi_type}")"
      psi="$(print_psi "${psi_var}")"
      ;;
    *)
      psi="Multiple alarms, current alarms are ${current_alarms}!"
      ;;
  esac

  local remote_cmd="${notification_cmd} ${notification_opts} 'deltachunk: PSI ${psi_type} triggered!' '${psi}'"
  integer dunst_id
  if ! dunst_id=$(ssh -q "${ssh_host}" -p ${port} "${remote_cmd}"); then
    print "Connection to dunst failed!" >&2
    return 1
  fi
  echo ${dunst_id}
  return 0
}
# Mail a PSI alert to one destination.
# Usage:     send PSI_TYPE [CURRENT_ALARMS] DESTINATION
# Arguments: $1 - PSI type that triggered
#            $2 - '|'-separated list of active alarms; if it contains '@' it
#                 is taken to be the destination (two-argument form)
#            $3 - destination address
# Globals:   sms_domain (read) - destinations containing it get no pidstat
# Returns:   1 on a bad argument count or missing destination, otherwise the
#            exit status of the mail command
send () {
  # Accept exactly two or three arguments.
  if (( $# < 2 || $# > 3 )); then
    echo "Wrong number of arguments to send()!" >&2
    return 1
  fi
  local psi_type="${1}"
  local current_alarms="${2}"
  local dst="${3:-}"
  if [[ "${current_alarms}" == *@* ]]; then
    dst="${current_alarms}"
    current_alarms=""
  fi
  if [[ -z "${dst}" ]]; then
    echo "No destination given to send()!" >&2
    return 1
  fi

  local subj="PSI on deltachunk ${psi_type} triggered!"
  local body="Pressure Stall Information triggered on ${psi_type} at $(date +'%FT%T %Z')"
  local p
  if [[ -n "${current_alarms}" ]]; then
    # $'\n' yields real newlines; a plain "\n" would be mailed verbatim.
    body+=$'\n'"Multiple alarms triggered: ${current_alarms}"
    # if this is not an SMS, include pidstat info
    if [[ "${dst}" != *"${sms_domain}"* ]]; then
      for p in $(tr '|' ' ' <<< "${current_alarms}"); do
        body+=$'\n\n'"$(print_pidstat "${p}")"
      done
    fi
  fi

  # Feed the body through a pipe so no temporary file is left in /tmp.
  printf '%s\n' "${body}" | /usr/bin/mail --resource-files=/ \
    --subject="${subj}" \
    --end-options \
    "${dst}"
}
# Report whether pressure for a PSI type has dropped below the threshold.
# Arguments: $1 - PSI type (CPU, MEM, IO); its lower-cased form names the
#                 global variable that holds the pressure file path
# Globals:   cpu / mem / io (read, via indirection), clear_threshold (read)
# Returns:   0 when the "some" avg10 value is below clear_threshold,
#            1 otherwise or when the value cannot be read
is_clear () {
  local psi_type="${1}"
  local psi_file="${(L)psi_type}"
  # (P) is zsh indirection: expand the variable whose name is in psi_file.
  local psi_path="${(P)psi_file}"
  local avg10
  # Second field of the "some" line is "avg10=<value>"; keep only the value.
  avg10=$(awk '$1 == "some" { sub(/^avg10=/, "", $2); print $2; exit }' "${psi_path}") \
    || return 1
  # An empty value would evaluate to 0 and wrongly look like "clear".
  [[ -n "${avg10}" ]] || return 1
  # zsh arithmetic handles the floating point comparison.
  (( avg10 < clear_threshold ))
}
# Send every notification for one PSI event: desktop notice, SMS and e-mail.
# Arguments: $1 - PSI type (CPU, MEM, IO)
#            $2 - '|'-separated list of currently active alarms (may be empty)
# Globals:   sms_dst, email_to (read)
# Outputs:   the desktop notification id on stdout when one was obtained
# Returns:   0 when a notification id was obtained, 1 otherwise
exec_notices () {
  local psi_type="${1}"
  local current_alarms="${2}"
  local dunst_id=-1
  case "${psi_type}" in
    CPU|MEM|IO)
      # Keep the -1 sentinel when the desktop notice fails; an empty value
      # would compare as 0 and be reported as a valid id.
      dunst_id=$(send_notice "${psi_type}" "${current_alarms}") || dunst_id=-1
      # Mail is sent regardless of whether the desktop notice worked.
      send "${psi_type}" "${current_alarms}" "${sms_dst}"
      send "${psi_type}" "${current_alarms}" "${email_to}"
      ;;
    "")
      # No PSI type given: nothing to send.
      ;;
    *)
      echo "Something went wrong!" >&2
      ;;
  esac
  # Only a non-negative integer counts as a notification id.
  if [[ "${dunst_id}" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "${dunst_id}"
    return 0
  fi
  return 1
}
# Check whether a notification id appears in the remote dunst history output.
# Arguments: $1 - notification id to look for
# Globals:   ssh_host, port, id_idx (read)
# Returns:   0 when the id is listed, 1 when it is not (or no id was given),
#            2 when the ssh query fails
check_dunst_id_is_visible () {
  local dunst_id="${1}"
  local ids
  [[ -n "${dunst_id}" ]] || return 1
  # Report a connection problem only when the ssh command actually fails.
  if ! ids=$(ssh -q "${ssh_host}" -p "${port}" \
    "dunstctl history | jq '.data[0][][${id_idx}].data'"); then
    echo "Connection to dunst failed!" >&2
    return 2
  fi
  # Whole-word, fixed-string match so id 4 does not match 42.
  grep -qwF -- "${dunst_id}" <<< "${ids}"
}
#set -x
# --- Main watch loop --------------------------------------------------------
# Follows the journal of ${svc}. A line containing "<TYPE> PSI event" raises
# an alarm for that type; any other line triggers a check of whether active
# alarms can be cleared.
#
# State:
#   current_alarms - '|'-separated PSI types currently considered active
#   last_alarm     - PSI type of the most recent event that was processed
#   notice_sent    - PSI type -> "true" once notices were sent for it
#   secs           - PSI type -> epoch seconds of its latest event
#   last_dunst_id  - id printed by the last exec_notices call, -1 if none
#
# Every scalar is declared once, with a value, before the loop: re-running a
# bare typeset on an existing parameter inside a zsh loop prints its value.
typeset current_alarms=""
typeset last_alarm=""
typeset -A notice_sent
typeset -A secs
integer last_dunst_id=-1
typeset line="" psi_type="" alarm="" new_id=""
typeset -a alarms=()
integer elapsed=0

journalctl -b 0 -fu "${svc}" | \
while IFS= read -r line; do
  psi_type="$(grep -Eo "(CPU|MEM|IO) PSI event" <<< "${line}" | grep -Eo "CPU|MEM|IO")"
  if [[ -n "${psi_type}" ]]; then
    secs[${psi_type}]=$(date +%s)
    if [[ "${psi_type}" != "${last_alarm}" ]]; then
      if [[ -z "${notice_sent[${psi_type}]}" ]]; then
        # Capture into a scalar first: assigning an empty result straight to
        # the integer would turn a failed notification into id 0.
        new_id=$(exec_notices "${psi_type}" "${current_alarms}")
        if [[ "${new_id}" =~ ^[0-9]+$ ]]; then
          last_dunst_id=${new_id}
        else
          last_dunst_id=-1
        fi
        notice_sent[${psi_type}]=true
      elif (( last_dunst_id >= 0 )) && ! check_dunst_id_is_visible "${last_dunst_id}"; then
        # Skips the bookkeeping below for this event.
        continue
      fi
    fi
    last_alarm="${psi_type}"
    if [[ -z "${current_alarms}" ]]; then
      current_alarms="${psi_type}"
    elif [[ "|${current_alarms}|" != *"|${psi_type}|"* ]]; then
      current_alarms="${current_alarms}|${psi_type}"
    fi
  else
    # Split on '|'; empty fields are dropped.
    alarms=( ${(s:|:)current_alarms} )
    # The for-list is expanded once, so shrinking alarms in the body is safe.
    for alarm in "${alarms[@]}"; do
      elapsed=$(( $(date +%s) - ${secs[${alarm}]:-0} ))
      # An alarm clears once its last event is over 300 s old and pressure
      # is back under the threshold.
      if (( elapsed > 300 )) && is_clear "${alarm}"; then
        # Rebuild the list from the array so no stray '|' is left behind.
        alarms=( "${(@)alarms:#${alarm}}" )
        current_alarms="${(j:|:)alarms}"
        last_alarm="${alarms[-1]}"
        # NOTE(review): notice_sent[${alarm}] is never reset here, so a type
        # that cleared will not produce another notice during this run.
        # Confirm whether that is intended.
      fi
    done
  fi
done
#set +x