#!/usr/bin/env zsh
#
# Follows the journal of psi-monitor.service and, for each CPU/MEM/IO
# "PSI event" line, raises a dunst notification over ssh and sends SMS and
# e-mail alerts.
#

# --- systemd unit whose journal is followed ---
svc='psi-monitor.service'

# --- kernel pressure-stall files, looked up by the lower-case names cpu/mem/io ---
cpu='/proc/pressure/cpu'
mem='/proc/pressure/memory'
io='/proc/pressure/io'

# --- alert recipients ---
email_to='trey@blancher.net'
sms_domain='msg.fi.google.com'
sms_dst='2517511550@msg.fi.google.com'

# --- ssh endpoint used to reach the desktop session running dunst ---
ssh_host='localhost'
port='5999'

# --- avg10 value below which an alarm counts as cleared ---
clear_threshold='5.0'

# --- desktop notification command and its options ---
notification_cmd='dunstify'
notification_opts='--timeout=0 --printid --urgency=critical'
notification_opts+=' --icon=/usr/share/icons/breeze-dark/emblems/16/emblem-warning.svg'

# --- index used in the jq filter applied to `dunstctl history` output ---
id_idx=15
#######################################
# Print the current contents of one pressure-stall (PSI) file.
# Globals:   cpu, mem, io (read) - paths of the PSI files
# Arguments: $1 - resource name: "cpu", "mem" or "io"
# Outputs:   the file contents on stdout; a diagnostic on stderr if $1 is
#            not a known resource
# Returns:   cat's exit status, or 1 for an unknown resource name
#######################################
print_psi () {
  local psi_file="${1}"
  local psi_path

  # Map the name explicitly rather than dereferencing whatever variable the
  # argument happens to name; an empty or mistyped argument now fails with a
  # clear message instead of running `cat ""`.
  case "${psi_file}" in
    cpu) psi_path="${cpu}" ;;
    mem) psi_path="${mem}" ;;
    io)  psi_path="${io}" ;;
    *)
      printf 'Invalid psi file: %s\n' "${psi_file}" >&2
      return 1
      ;;
  esac

  cat -- "${psi_path}"
}
#######################################
# Run pidstat with the report type that matches a PSI resource.
# Arguments: $1 - "CPU" (-u), "MEM" (-r) or "IO" (-d)
# Outputs:   pidstat's report on stdout; a diagnostic on stderr for an
#            unknown type
# Returns:   pidstat's exit status, or 1 for an unknown type
#######################################
print_pidstat () {
  local psi_type="${1}"

  # Keep the options in an array so each flag reaches pidstat as its own
  # argument; a single quoted string would be passed as one word.
  local -a opts
  opts=(-l --human)

  case "${psi_type}" in
    MEM)
      opts=(-r "${opts[@]}")
      ;;
    IO)
      opts=(-d "${opts[@]}")
      ;;
    CPU)
      opts=(-u "${opts[@]}")
      ;;
    *)
      printf 'Invalid psi_type: %s\n' "${psi_type}" >&2
      # Do not fall through and run pidstat for a type we do not know.
      return 1
      ;;
  esac

  pidstat "${opts[@]}"
}
#######################################
# Raise a desktop notification for a PSI alarm by running the notification
# command on ${ssh_host} over ssh.
# Globals:   ssh_host, port, notification_cmd, notification_opts (read)
# Arguments: $1 - alarm type; CPU, MEM and IO put the matching PSI file
#                 contents in the notification body, anything else yields a
#                 "multiple alarms" body
#            $2 - optional list of currently active alarms
# Outputs:   the value captured from the remote command on stdout; a
#            diagnostic on stderr when the ssh command fails
# Returns:   0 on success, 1 when the ssh command fails
#######################################
send_notice () {
  local psi_type="${1}"
  shift

  # After the shift ${1} is the optional alarm list; absent means empty.
  local current_alarms="${1}"

  local psi
  local lowered
  if [[ "${psi_type}" == "CPU" || "${psi_type}" == "MEM" || "${psi_type}" == "IO" ]]; then
    lowered="$(tr '[[:upper:]]' '[[:lower:]]' <<< "${psi_type}")"
    psi="$(print_psi ${lowered})"
  else
    psi="Multiple alarms, current alarms are ${current_alarms}!"
  fi

  # The whole notification command line is handed to ssh as a single string.
  local remote_cmd="${notification_cmd} ${notification_opts} 'deltachunk: PSI ${psi_type} triggered!' '${psi}'"

  integer dunst_id
  if dunst_id=$(ssh -q "${ssh_host}" -p ${port} "${remote_cmd}"); then
    echo ${dunst_id}
    return 0
  fi

  print "Connection to dunst failed!" >&2
  return 1
}
#######################################
# Mail a PSI alert to one recipient.
# Globals:   sms_domain (read) - recipients in this domain get the short
#            form without pidstat output
# Arguments: either  $1 type, $2 recipient
#            or      $1 type, $2 "|"-separated active alarms, $3 recipient
#            A second argument containing "@" is taken as the recipient.
# Outputs:   diagnostics on stderr
# Returns:   1 on bad arguments, otherwise the exit status of mail
#######################################
send () {
  # The accepted range is 2..3 arguments. The test must use "or": a count
  # cannot be both below 2 and above 3, so an "and" guard never fires.
  if (( $# < 2 || $# > 3 )); then
    echo "Wrong number of arguments to send()!" >&2
    return 1
  fi

  local psi_type="${1}"
  local current_alarms="${2}"
  local dst="${3}"

  # Two-argument form: the second argument is the recipient, not an alarm list.
  if [[ "${current_alarms}" == *@* ]]; then
    dst="${current_alarms}"
    current_alarms=""
  fi

  if [[ -z "${dst}" ]]; then
    echo "No destination given to send()!" >&2
    return 1
  fi

  local subj="PSI on deltachunk ${psi_type} triggered!"
  local body="Pressure Stall Information triggered on ${psi_type} at $(date +'%FT%T %Z')"

  local alarm
  if [[ -n "${current_alarms}" ]]; then
    # $'\n' is a real newline; "\n" inside double quotes would reach the
    # message as a literal backslash followed by "n".
    body+=$'\n'"Multiple alarms triggered: ${current_alarms}"
    # if this is not an SMS, include pidstat info
    if [[ "${dst}" != *"${sms_domain}"* ]]; then
      for alarm in $(tr '|' ' ' <<< "${current_alarms}"); do
        body+=$'\n\n'"$(print_pidstat "${alarm}")"
      done
    fi
  fi

  # Feed the body straight to mail; no temporary file is left behind in /tmp.
  printf '%s\n\n' "${body}" | /usr/bin/mail --resource-files=/ \
    --subject="${subj}" \
    --end-options \
    "${dst}"
}
#######################################
# Tell whether pressure on one resource has dropped below the clear threshold.
# Reads the avg10 field of the "some" line of the matching PSI file.
# Globals:   cpu, mem, io (read) - paths of the PSI files
#            clear_threshold (read) - avg10 must be strictly below this
# Arguments: $1 - "CPU", "MEM" or "IO" (lower case is accepted too)
# Outputs:   diagnostics on stderr
# Returns:   0 if avg10 < clear_threshold
#            1 if it is not, or no avg10 value could be found
#            2 if the type is unknown or the file cannot be read
#######################################
is_clear () {
  local psi_type="${1}"
  local psi_path

  case "${psi_type}" in
    CPU|cpu) psi_path="${cpu}" ;;
    MEM|mem) psi_path="${mem}" ;;
    IO|io)   psi_path="${io}" ;;
    *)
      printf 'Invalid psi_type: %s\n' "${psi_type}" >&2
      return 2
      ;;
  esac

  if [[ ! -r "${psi_path}" ]]; then
    printf 'Cannot read PSI file: %s\n' "${psi_path}" >&2
    return 2
  fi

  # One awk call extracts avg10 and does the floating-point comparison.
  # A missing reading counts as "not clear": an empty value must not be
  # treated as zero pressure.
  awk -v limit="${clear_threshold}" '
    $1 == "some" {
      for (i = 2; i <= NF; i++) {
        if (index($i, "avg10=") == 1) {
          value = substr($i, 7) + 0
          found = 1
        }
      }
    }
    END {
      if (found && value < limit + 0) exit 0
      exit 1
    }
  ' "${psi_path}"
}
#######################################
# Send every kind of alert for one PSI alarm: the desktop notification, then
# the SMS and e-mail messages.
# Globals:   sms_dst, email_to (read)
# Arguments: $1 - alarm type: CPU, MEM or IO
#            $2 - "|"-separated list of currently active alarms (may be empty)
# Outputs:   the desktop notification id on stdout when one was obtained;
#            diagnostics on stderr
# Returns:   0 if a notification id was obtained, 1 otherwise
#######################################
exec_notices () {
  local psi_type="${1}"
  local current_alarms="${2}"

  local dunst_id=-1
  case "${psi_type}" in
    CPU|MEM|IO)
      # Honour send_notice's exit status: on failure it prints nothing, and
      # an empty id would otherwise compare as 0 and be reported as success.
      if ! dunst_id="$(send_notice "${psi_type}" "${current_alarms}")"; then
        dunst_id=-1
      fi
      # The mail alerts go out whether or not the desktop notification worked.
      send "${psi_type}" "${current_alarms}" "${sms_dst}"
      send "${psi_type}" "${current_alarms}" "${email_to}"
      ;;
    '')
      # Nothing to report for an empty type.
      ;;
    *)
      echo "Something went wrong!" >&2
      ;;
  esac

  # Only a non-negative integer is a usable notification id.
  case "${dunst_id}" in
    ''|*[!0-9]*)
      return 1
      ;;
  esac

  printf '%s\n' "${dunst_id}"
}
#######################################
# Look for a notification id in the list obtained by running
# `dunstctl history | jq ...` on ${ssh_host} over ssh.
# Globals:   ssh_host, port, id_idx (read)
# Arguments: $1 - notification id to look for
# Outputs:   a diagnostic on stderr when the ssh command fails
# Returns:   0 if the id is listed, 1 if it is not (or $1 is empty),
#            2 if the ssh command fails
#######################################
check_dunst_id_is_visible () {
  local dunst_id="${1}"

  if [[ -z "${dunst_id}" ]]; then
    return 1
  fi

  # The failure branch must run when ssh FAILS, hence the "!".
  # NOTE(review): the jq filter indexes each history entry with ${id_idx};
  # confirm this matches the JSON layout of the installed dunstctl.
  local ids
  if ! ids="$(ssh -q "${ssh_host}" -p "${port}" \
      "dunstctl history | jq '.data[0][][${id_idx}].data'")"; then
    echo "Connection to dunst failed!" >&2
    return 2
  fi

  # Whole-word, fixed-string match, so id 1 does not match 12.
  if grep -qwF -- "${dunst_id}" <<< "${ids}"; then
    return 0
  fi
  return 1
}
#set -x

# Event-loop state.
#   current_alarms - "|"-separated list of alarm types currently active
#   last_alarm     - type of the most recent PSI event
#   notice_sent    - alarm types for which alerts have already been sent
#   secs           - epoch seconds of the latest event, per alarm type
#   last_dunst_id  - id of the last desktop notification, -1 if none
# The list variable is declared under the name the loop actually uses
# (current_alarms, plural).
local current_alarms=""
local last_alarm=""
typeset -A notice_sent
typeset -A secs
integer last_dunst_id=-1

# Follow the unit's journal for the current boot and react to every line.
journalctl -b 0 -fu "${svc}" | \
while IFS= read -r line; do
  # Non-empty only for lines that announce a CPU/MEM/IO PSI event.
  local psi_type="$(grep -Eo "(CPU|MEM|IO) PSI event" <<< "${line}" | grep -Eo "CPU|MEM|IO")"
  if [[ -n "${psi_type}" ]]; then
    secs+=(${psi_type} $(date +%s))
    if [[ "${psi_type}" != "${last_alarm}" ]]; then
      if [[ ! ${notice_sent[${psi_type}]} ]]; then
        # A failed notification must leave -1 here; assigning empty output
        # to the integer variable would otherwise turn it into id 0.
        if ! last_dunst_id=$(exec_notices "${psi_type}" "${current_alarms}"); then
          last_dunst_id=-1
        fi
        # NOTE(review): notice_sent is never reset when an alarm clears, so
        # each type is alerted at most once per run - confirm this is intended.
        notice_sent+=(${psi_type} true)
      elif (( last_dunst_id >= 0 )) && ! check_dunst_id_is_visible "${last_dunst_id}"; then
        continue
      fi
    fi
    last_alarm="${psi_type}"
    if [[ -z "${current_alarms}" ]]; then
      current_alarms="${psi_type}"
    else
      if ! grep -q "${psi_type}" <<< "${current_alarms}"; then
        current_alarms="${current_alarms}|${psi_type}"
      fi
    fi
  else
    # Any other journal line is used as a tick to re-check active alarms.
    typeset -a alarms=( $(tr '|' ' ' <<< "$current_alarms") )
    for alarm in ${alarms}; do
      integer elapsed=$(( $(date +%s) - ${secs[${alarm}]} ))
      # Drop an alarm once its pressure is below the threshold and its last
      # event is more than 300 seconds old.
      if is_clear "${alarm}" && (( elapsed > 300 )); then
        # Remove the entry together with one adjacent separator so no
        # leading, trailing or doubled "|" is left in the list.
        current_alarms=$(sed -E 's/(^|\|)'"${alarm}"'(\||$)/\1/; s/\|$//' <<< "${current_alarms}")
        last_alarm=$(awk -F'|' '{print $NF}' <<< "${current_alarms}")
      fi
    done
  fi
done

#set +x