2023-08-07 22:20:41 -04:00
|
|
|
#!/usr/bin/env zsh
|
2023-08-07 22:28:14 -04:00
|
|
|
|
2023-08-19 13:56:47 -04:00
|
|
|
################################################################################
|
2023-08-07 22:28:14 -04:00
|
|
|
# Send alerts when Pressure Stall Information is high
|
|
|
|
#
|
2023-08-19 13:56:47 -04:00
|
|
|
# Copyright © 2023 Trey Blancher $(base64 -d <<< dHJleUBibGFuY2hlci5uZXQK)
|
2023-08-07 22:28:14 -04:00
|
|
|
#
|
|
|
|
# This program is free software: you can redistribute it and/or modify it
|
|
|
|
# under the terms of the GNU General Public License as published by the Free
|
|
|
|
# Software Foundation, either version 3 of the License, or (at your option)
|
|
|
|
# any later version.
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful, but
|
|
|
|
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
|
|
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
|
|
# for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License along
|
|
|
|
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
2023-08-19 13:56:47 -04:00
|
|
|
#
|
|
|
|
# Submodules may be distributed under a separate software license; see the
|
|
|
|
# LICENSE file within each submodule.
|
2023-08-07 22:28:14 -04:00
|
|
|
#
|
|
|
|
# This script monitors the systemd journal, specifically the
|
|
|
|
# `psi-monitor.service` and waits for Pressure Stall Information monitor events
|
|
|
|
# to be logged. The monitor program is shipped in the psi-by-example submodule
|
|
|
|
# in this git repository; it is released under the three-clause BSD license
|
|
|
|
# (see its LICENSE file for details).
|
2023-08-07 22:20:41 -04:00
|
|
|
#
|
2023-08-07 22:28:14 -04:00
|
|
|
# It is designed to send desktop notifications to a desktop system; it also
|
|
|
|
# uses the local mail transport agent to send notifications via SMS and email.
|
|
|
|
# It is assumed the desktop notification system is remote, and it uses the
|
|
|
|
# local ssh client to connect to the notification daemon on the remote host.
|
|
|
|
#
|
|
|
|
# This script expects a number of environment variables to be set, in the
|
|
|
|
# systemd psi-alerts@<user>.service overrides (will be placed in
|
|
|
|
# /etc/systemd/system/psi-alerts@<user>.service.d/override.conf). See the
|
|
|
|
# README.md for details.
|
2023-08-19 13:56:47 -04:00
|
|
|
################################################################################
|
2023-08-07 22:20:41 -04:00
|
|
|
|
|
|
|
# systemd unit whose journal is polled for PSI monitor events.
svc="psi-monitor.service"

# Kernel PSI files. print_psi and is_clear resolve these through indirect
# expansion on the lower-cased alarm name, so the variable NAMES (cpu, mem, io)
# are part of the contract and must not be renamed.
cpu="/proc/pressure/cpu"
mem="/proc/pressure/memory"
io="/proc/pressure/io"

user="$(whoami)"
host="$(hostname)"

# Settings taken from the environment (see the header comment above).
email_to="${EMAIL_TO}"
sms_dst="${SMS_DST}"
# Everything after the last '@' of the SMS address; send() uses it to tell an
# SMS destination from an e-mail destination.
sms_domain="${SMS_DST##*@}"
# Default to ssh's standard port: with an empty value `-p ${ssh_port}` would
# collapse to a bare `-p` and swallow the following argument.
ssh_port="${SSH_PORT:-22}"
ssh_host="${SSH_HOST}"
ssh_user="${SSH_USER}"
ssh_id_path="${SSH_ID_PATH}"
# avg300 value below which is_clear reports an alarm as clear.
clear_threshold="${CLEAR_THRESHOLD}"
notification_cmd="${NOTIFICATION_CMD}"
notification_hist_cmd="${NOTIFICATION_HIST_CMD}"
notification_opts="${NOTIFICATION_OPTS}"
id_idx="${NOTIFICATION_IDX}"
|
2023-08-07 22:20:41 -04:00
|
|
|
|
2023-08-24 09:59:45 -04:00
|
|
|
# Look under /tmp for an ssh-agent socket owned by the current user and export
# SSH_AUTH_SOCK / SSH_AGENT_PID for it.
#
# Globals:  user (read); SSH_AUTH_SOCK, SSH_AGENT_PID (exported)
# Returns:  0 when SSH_AUTH_SOCK names an existing socket afterwards (either a
#           freshly found one or one already present in the environment),
#           non-zero otherwise
get_ssh_agent () {
  # An unmatched glob must expand to nothing rather than raise
  # "no matches found"; local_options restores the setting on return.
  setopt local_options null_glob

  local dir agent_pid
  local found_sock=""
  local -a socks

  for dir in /tmp/ssh-*; do
    [[ -O "${dir}" ]] || continue
    socks=( "${dir}"/agent.* )
    (( ${#socks[@]} )) || continue
    # only choose the last agent: later directories override earlier ones
    found_sock="${socks[-1]}"
  done

  if [[ -n "${found_sock}" ]]; then
    export SSH_AUTH_SOCK="${found_sock}"
    # newest ssh-agent process that belongs to ${user}; a single PID
    if agent_pid=$(pgrep -n -u "${user}" -x ssh-agent); then
      export SSH_AGENT_PID="${agent_pid}"
    fi
  fi

  [[ -S "${SSH_AUTH_SOCK}" ]]
}
|
2023-08-07 22:20:41 -04:00
|
|
|
# Write the kernel PSI file for one resource to stdout.
#   $1 - resource name in any letter case (cpu, mem or io). It is lower-cased
#        and then used as the NAME of the script variable that holds the path
#        of the file to print.
print_psi () {
  local path_var="${(L)1}"
  cat "${(P)path_var}"
}
|
|
|
|
|
2023-08-26 09:03:15 -04:00
|
|
|
# Print process-level statistics for one alarm type to stdout: a snapshot from
# top (CPU, MEM) or iotop (IO), two newlines, then the matching pidstat report.
#   $1 - CPU, IO or MEM; anything else only produces a message on stderr
print_stats () {
  local psi_type="${1}"
  local pidstat_flags

  if [[ "${psi_type}" == "CPU" ]]; then
    top -bcn1 -o %CPU -w 512 | head -n 30
    pidstat_flags="-ul"
  elif [[ "${psi_type}" == "IO" ]]; then
    sudo iotop --batch --only --iter=10
    pidstat_flags="-dl"
  elif [[ "${psi_type}" == "MEM" ]]; then
    top -bcn1 -o %MEM -w 512 | head -n 30
    pidstat_flags="-rl"
  else
    print "Invalid psi_type: ${psi_type}" >&2
    return
  fi

  printf "\n\n"
  pidstat "${pidstat_flags}" --human
}
|
|
|
|
|
2023-08-08 18:20:04 -04:00
|
|
|
|
2023-08-07 22:20:41 -04:00
|
|
|
# Raise a desktop notification on the remote host through ssh.
#   $1 - alarm type; for CPU, MEM or IO the body is the matching PSI file,
#        for anything else it is a "multiple alarms" message
#   $2 - (optional) '|'-separated list of the alarms currently raised
# Authentication uses an ssh-agent when get_ssh_agent finds one, otherwise the
# identity file in ssh_id_path.
# Outputs:  whatever the remote notification command printed, as an integer
# Returns:  0 on success, 1 when ssh fails or no authentication is available
send_notice () {
  #set -x
  local psi_type="${1}"
  shift
  local current_alarms="${1}"

  local psi
  case "${psi_type}" in
    CPU|MEM|IO)
      psi="$(print_psi $(tr '[[:upper:]]' '[[:lower:]]' <<< "${psi_type}"))"
      ;;
    *)
      psi="Multiple alarms, current alarms are ${current_alarms}!"
      ;;
  esac

  integer notification_id

  # Work out the authentication options first, then run a single ssh call.
  local -a ssh_auth
  if get_ssh_agent && [[ -S ${SSH_AUTH_SOCK} ]]; then
    ssh_auth=()
  elif [[ -n "${ssh_id_path}" ]]; then
    ssh_auth=(-i "${ssh_id_path}")
  else
    echo "No SSH notifications configured. Returning." >&2
    return 1
  fi

  local remote_cmd="${notification_cmd} ${notification_opts} '${host}: PSI ${psi_type} triggered!' '${psi}'"
  if notification_id=$(ssh -q "${ssh_auth[@]}" "${ssh_user}@${ssh_host}" -p ${ssh_port} \
      "${remote_cmd}"); then
    echo ${notification_id}
    return 0
  fi

  print "Connection to notification daemon failed!" >&2
  return 1
  #set +x
}
|
|
|
|
|
|
|
|
# Compose a PSI alert and hand it to `sendmail -t`.
#
# Usage:
#   send <psi_type> <destination>                    (destination contains '@')
#   send <psi_type> <current_alarms> <destination>
#
#   psi_type        alarm that just fired (CPU, MEM or IO)
#   current_alarms  '|'-separated list of alarms that were already raised;
#                   may be empty
#   destination     recipient address; when it contains "@${sms_domain}" the
#                   message is treated as an SMS and the statistics section
#                   is left out
#
# Globals:  host, sms_domain (read)
# Returns:  1 on a wrong argument count, otherwise the status of sendmail
send () {
  #set -x
  if (( $# < 2 || $# > 3 )); then
    echo "Wrong number of arguments to send()!" >&2
    return 1
  fi

  local psi_type="${1}"
  local current_alarms="${2}"
  local dst="${3}"

  # Two-argument form: the second argument is really the destination.
  if [[ "${current_alarms}" == *"@"* ]]; then
    dst="${current_alarms}"
    current_alarms=""
  fi

  local subj
  local multiple=0
  if [[ -n "${current_alarms}" ]]; then
    multiple=1
    current_alarms="${current_alarms}|${psi_type}"
    subj="PSI on ${host} ${current_alarms} triggered!"
  else
    subj="PSI on ${host} ${psi_type} triggered!"
    current_alarms="${psi_type}"
  fi

  # The message is streamed straight into sendmail, so no temporary file is
  # left behind in /tmp.
  local p
  {
    printf 'To: %s\n' "${dst}"
    printf 'Subject: %s\n' "${subj}"
    # an empty line terminates the header section
    printf '\n'

    printf 'Pressure Stall Information for %s triggered on %s at %s\n\n' \
      "${host}" "${psi_type}" "$(date +'%FT%T %Z')"
    if (( multiple )); then
      printf 'Multiple alarms triggered: %s\n\n' "${current_alarms}"
    fi
    print_psi "${psi_type}"
    printf '\n\n'

    # is this an email or SMS? Only e-mail gets the (long) statistics.
    if [[ "${dst}" != *"@${sms_domain}"* ]]; then
      for p in $(tr '|' ' ' <<< "${current_alarms}"); do
        printf '\n\nStatistics info for %s\n\n' "${p}"
        print_stats "${p}"
        printf '\n\n'
      done
    fi
  } | sendmail -t
  #set +x
}
|
|
|
|
|
|
|
|
# Decide whether the pressure for one resource has dropped far enough to clear
# its alarm.
#   $1 - alarm type (CPU, MEM or IO); lower-cased, it names the script variable
#        that holds the path of the PSI file
# Globals:  clear_threshold (read)
# Returns:  0 when the avg300 value on the "some" line is below
#           clear_threshold, non-zero otherwise or when no value could be read
is_clear () {
  local psi_type="${1}"
  local psi_file="${(L)psi_type}"
  local avg300

  # Fourth field of the "some" line is avg300=<value>; keep only the value.
  avg300=$(awk '/some/ { split($4, kv, "="); print kv[2] }' "${(P)psi_file}")

  # An empty value would be evaluated as 0 and look "clear"; treat missing
  # data as "not clear" instead.
  [[ -n "${avg300}" ]] || return 1

  [[ ${avg300} -lt ${clear_threshold} ]]
}
|
|
|
|
|
|
|
|
# Fan one alarm out to every channel: desktop notification, SMS and e-mail.
#   $1 - alarm type (CPU, MEM or IO)
#   $2 - '|'-separated list of alarms that are already raised (may be empty)
# Globals:  sms_dst, email_to (read)
# Outputs:  the id printed by send_notice, when it is a non-negative integer
# Returns:  0 when such an id was obtained, 1 otherwise (failed desktop
#           notification, empty or unknown alarm type)
exec_notices () {
  local psi_type="${1}"
  local current_alarms="${2}"
  local dunst_id=""

  case "${psi_type}" in
    CPU|MEM|IO)
      # A failed send_notice prints nothing; make sure that is not mistaken
      # for an id later on (an empty string compares as 0).
      dunst_id=$(send_notice "${psi_type}" "${current_alarms}") || dunst_id=""
      # Mail is still sent when the desktop notification failed, but only to
      # destinations that are actually configured.
      if [[ -n "${sms_dst}" ]]; then
        send "${psi_type}" "${current_alarms}" "${sms_dst}"
      fi
      if [[ -n "${email_to}" ]]; then
        send "${psi_type}" "${current_alarms}" "${email_to}"
      fi
      ;;
    "")
      # nothing to announce
      ;;
    *)
      echo "Something went wrong!" >&2
      ;;
  esac

  if [[ "${dunst_id}" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "${dunst_id}"
    return 0
  fi
  return 1
}
|
|
|
|
|
|
|
|
# Check whether a notification is still on screen by looking for its id in the
# remote notification history.
#   $1 - notification id
# Globals:  ssh_host, ssh_port, ssh_user, ssh_id_path, notification_hist_cmd
# Returns:  0 when the id is NOT in the history (taken to mean still visible),
#           1 when it is in the history,
#           2 when the history could not be fetched
check_dunst_id_is_visible () {
  local dunst_id="${1}"
  local remote_cmd="${notification_hist_cmd} | jq '.data[0][].id.data'"
  local port="${ssh_port:-22}"
  local ids

  # First try with whatever authentication the environment offers (agent),
  # then fall back to the identity file, provided one is configured.
  if ! ids=$(ssh -q -p "${port}" -l "${ssh_user}" "${ssh_host}" "${remote_cmd}"); then
    if [[ -z "${ssh_id_path}" ]] || \
       ! ids=$(ssh -q -i "${ssh_id_path}" -p "${port}" -l "${ssh_user}" \
               "${ssh_host}" "${remote_cmd}"); then
      echo "Connection to dunst failed!" >&2
      return 2
    fi
  fi

  # if the alert is visible, it's not in the dunst history
  # (-w -F: whole-word, literal match; the id is never treated as a pattern)
  if grep -qwF -- "${dunst_id}" <<< "${ids}"; then
    return 1
  fi
  return 0
}
|
2023-08-24 09:59:45 -04:00
|
|
|
|
2023-08-07 22:20:41 -04:00
|
|
|
# ---- main loop ---------------------------------------------------------------
# Once per second the newest journal line of ${svc} is read.  A fresh
# "<TYPE> PSI event" line raises an alarm (notices are sent once per type);
# when the line stops changing, raised alarms are cleared again as soon as
# is_clear agrees and the last event for that type is at least 300 s old.

# '|'-separated list of the alarm types that are currently raised
typeset current_alarms=""
# alarm type -> "true" once its notices have been sent
typeset -A notice_sent
# alarm type -> epoch second of the most recent monitor event of that type
typeset -A secs
# id printed by exec_notices for the latest desktop notification (-1: none)
integer last_dunst_id=-1
typeset last_line=""

typeset line="" now="" last_timestamp="" psi_type="" alarm="" notice_id=""
typeset -a remaining
integer time_diff=0 elapsed=0

#set -x
while true; do
  line=$(journalctl -u ${svc} -n1)
  now=$(date +%s)
  # the first three fields of a journal line are its timestamp
  last_timestamp=$(date -d "$(awk '{print $1" "$2" "$3}' <<< "${line}")" +%s)
  time_diff=$(( now - last_timestamp ))

  if [[ "${last_line}" == "${line}" ]]; then
    # last line hasn't changed, check to see if we can clear alarms
    if (( time_diff >= 3 )) && [[ -n "${current_alarms}" ]]; then
      # haven't seen a monitor alert for 3 seconds, see if we can clear them
      remaining=()
      for alarm in ${(s:|:)current_alarms}; do
        elapsed=$(( now - ${secs[${alarm}]:-0} ))
        if (( elapsed >= 300 )) && is_clear "${alarm}"; then
          unset "notice_sent[${alarm}]"
          unset "secs[${alarm}]"
        else
          remaining+=("${alarm}")
        fi
      done
      # re-join the survivors so that no stray '|' is left behind
      current_alarms="${(j:|:)remaining}"
    fi
    sleep 1
    continue
  fi

  last_line="${line}"
  if (( time_diff < 3 )); then
    psi_type="$(grep -Eo "(CPU|MEM|IO) PSI event" <<< "${line}" | grep -Eo "CPU|MEM|IO")"
    if [[ -n "${psi_type}" ]]; then
      secs[${psi_type}]=${now}
      if [[ -z "${notice_sent[${psi_type}]}" ]]; then
        # Only accept a numeric id; a failed notification must not be
        # remembered as id 0.
        if notice_id=$(exec_notices "${psi_type}" "${current_alarms}") \
            && [[ "${notice_id}" =~ ^[0-9]+$ ]]; then
          last_dunst_id=${notice_id}
        else
          last_dunst_id=-1
        fi
        notice_sent[${psi_type}]=true
      elif (( last_dunst_id >= 0 )) && check_dunst_id_is_visible "${last_dunst_id}"; then
        sleep 1
        continue
      fi

      if [[ -z "${current_alarms}" ]]; then
        current_alarms="${psi_type}"
      elif [[ "${current_alarms}" != *"${psi_type}"* ]]; then
        current_alarms="${current_alarms}|${psi_type}"
      fi
    fi
  fi
  sleep 1
done
|
2023-08-25 17:35:58 -04:00
|
|
|
#set +x
|
2023-08-07 22:20:41 -04:00
|
|
|
|