Compare commits
No commits in common. "b414e81081f40a2e39a667c08ee43142a2e21b64" and "428ed91dd1f5975b1d31528723178aacaadc4f1c" have entirely different histories.
b414e81081
...
428ed91dd1
9
.gitignore
vendored
9
.gitignore
vendored
@ -1,9 +0,0 @@
|
|||||||
*
|
|
||||||
!psi-alerts.sh
|
|
||||||
!psi-alerts@.service
|
|
||||||
!psi-monitor.service
|
|
||||||
!psi-by-example
|
|
||||||
!.gitignore
|
|
||||||
!CONFIGURE.md
|
|
||||||
!INSTALL.md
|
|
||||||
!README.md
|
|
38
CONFIGURE.md
38
CONFIGURE.md
@ -25,7 +25,6 @@ Environment=EMAIL_TO="email@domain.tld"
|
|||||||
Environment=SMS_DST="phone_number@sms.domain.tld"
|
Environment=SMS_DST="phone_number@sms.domain.tld"
|
||||||
Environment=NOTIFICATION_CMD="dunstify"
|
Environment=NOTIFICATION_CMD="dunstify"
|
||||||
Environment=NOTIFICATION_OPTS="--timeout=0 --printid --urgency=critical --icon=/usr/share/icons/breeze-dark/emblems/16/emblem-warning.svg"
|
Environment=NOTIFICATION_OPTS="--timeout=0 --printid --urgency=critical --icon=/usr/share/icons/breeze-dark/emblems/16/emblem-warning.svg"
|
||||||
Environment=NOTIFICATION_HIST_CMD="dunstctl history"
|
|
||||||
Environment=NOTIFICATION_IDX=15
|
Environment=NOTIFICATION_IDX=15
|
||||||
Environment=SSH_USER="username"
|
Environment=SSH_USER="username"
|
||||||
Environment=SSH_HOST="localhost"
|
Environment=SSH_HOST="localhost"
|
||||||
@ -50,40 +49,3 @@ Environment=CLEAR_THRESHOLD="5.0"
|
|||||||
# WantedBy=multi-user.target
|
# WantedBy=multi-user.target
|
||||||
```
|
```
|
||||||
|
|
||||||
All of these are required except where noted; there are no default options
|
|
||||||
(defaults may be added in the future). A brief description of each:
|
|
||||||
* **EMAIL_TO**: the email address the notification should be sent to. The
|
|
||||||
output of `pidstat` will be included in the body of this email, for each
|
|
||||||
triggered resource type (CPU, I/O, Memory), at the time the monitor alerted.
|
|
||||||
* **SMS_DST**: the email-to-SMS address, as defined by your mobile carrier.
|
|
||||||
Please review your mobile carrier's documentation. For Google Fi, based in
|
|
||||||
the US, the format is
|
|
||||||
`<mobile_number_without_country_code>@msg.fi.google.com`. This email address
|
|
||||||
does **not** get the output of `pidstat` in the body of the message.
|
|
||||||
* **NOTIFICATION_CMD**: The command on the remote host to run to display
|
|
||||||
notifications, e.g. `notify-send` or `dunstify`.
|
|
||||||
* **NOTIFICATION_OPTS**: Options for the `${NOTIFICATION_CMD}`. Should
|
|
||||||
include `--print-id` (`--printid` for `dunstify`) if supported by the command.
|
|
||||||
* **NOTIFICATION_HIST_CMD**: The command to display the notification history
|
|
||||||
(e.g. `dunstctl history`).
|
|
||||||
* **NOTIFICATION_IDX**: The index of the JSON structure that contains the
|
|
||||||
notification ID. `dunst`, as of version 1.9.2-1, displays its history as a
|
|
||||||
JSON structure. For other notification daemons, some other history mechanism
|
|
||||||
will likely be required; patches needed and welcome!
|
|
||||||
* **SSH_USER**: The SSH username to connect to the remote host that will
|
|
||||||
display the notifications to the system administrator.
|
|
||||||
* **SSH_HOST**: The SSH host to connect to. This is where
|
|
||||||
`${NOTIFICATION_CMD} ${NOTIFICATION_OPTS}` and `${NOTIFICATION_HIST_CMD}`
|
|
||||||
will run.
|
|
||||||
* **SSH_PORT**: The SSH port to connect to.
|
|
||||||
* **SSH_ID_PATH**: The path to the SSH id (private key file) to use for
|
|
||||||
authenticating to the remote host. This can be excluded if the local user
|
|
||||||
already has an ssh-agent running, with the necessary key and passphrase
|
|
||||||
entered. If ssh-agent is not desired, then this SSH id (private key file)
|
|
||||||
should have an empty passphrase (i.e., no passphrase). Not having this
|
|
||||||
environment variable, and no ssh-agent will disable the desktop notifications
|
|
||||||
(SMS and email will still work, as they don't use SSH)
|
|
||||||
* **CLEAR_THRESHOLD**: The percentage that the `some` avg300 value
|
|
||||||
should be below before considering the alert cleared. This will depend
|
|
||||||
highly on the workload running on
|
|
||||||
|
|
||||||
|
13
README.md
13
README.md
@ -17,18 +17,6 @@ near real time.
|
|||||||
* psi-by-example (a modified version of this is included in this project as a
|
* psi-by-example (a modified version of this is included in this project as a
|
||||||
submodule)
|
submodule)
|
||||||
* a libnotify-compatible desktop notification system
|
* a libnotify-compatible desktop notification system
|
||||||
* any notification program should use the `--print-id` parameter if
|
|
||||||
possible
|
|
||||||
* both `notify-send` and `dunstify` (part of
|
|
||||||
[dunst](https://dunst-project.org/)) support this
|
|
||||||
* note, this has only been tested with `dunst`, since it has the capability
|
|
||||||
of showing notification history
|
|
||||||
* `notify-send` specifically does not appear to retain a history, so the
|
|
||||||
`check_dunst_id_is_visible` function won't work with it (and the logic to
|
|
||||||
skip sending a new notification if one is already sent will be broken).
|
|
||||||
* since I don't use `notify-send`, I'm not sure how to solve this
|
|
||||||
* patches welcome!
|
|
||||||
* jq (for the aforementioned `dunst` integration)
|
|
||||||
|
|
||||||
## History
|
## History
|
||||||
|
|
||||||
@ -110,7 +98,6 @@ below the configurable threshold for at least five minutes).
|
|||||||
* consider reworking this for a user service, not a system service
|
* consider reworking this for a user service, not a system service
|
||||||
* this could make desktop notifications simpler, and not having to use
|
* this could make desktop notifications simpler, and not having to use
|
||||||
SSH keys without passphrases
|
SSH keys without passphrases
|
||||||
* possibly learn how to connect to an existing ssh-agent
|
|
||||||
* need to become much more familiar with user services
|
* need to become much more familiar with user services
|
||||||
* consider reworking all code in a compiled language (other than C)
|
* consider reworking all code in a compiled language (other than C)
|
||||||
* time to learn Go
|
* time to learn Go
|
||||||
|
122
psi-alerts.sh
122
psi-alerts.sh
@ -1,9 +1,8 @@
|
|||||||
#!/usr/bin/env zsh
|
#!/usr/bin/env zsh
|
||||||
|
|
||||||
################################################################################
|
|
||||||
# Send alerts when Pressure Stall Information is high
|
# Send alerts when Pressure Stall Information is high
|
||||||
#
|
#
|
||||||
# Copyright © 2023 Trey Blancher $(base64 -d <<< dHJleUBibGFuY2hlci5uZXQK)
|
# Copyright © 2023 Trey Blancher
|
||||||
#
|
#
|
||||||
# This program is free software: you can redistribute it and/or modify it
|
# This program is free software: you can redistribute it and/or modify it
|
||||||
# under the terms of the GNU General Public License as published by the Free
|
# under the terms of the GNU General Public License as published by the Free
|
||||||
@ -17,9 +16,6 @@
|
|||||||
#
|
#
|
||||||
# You should have received a copy of the GNU General Public License along
|
# You should have received a copy of the GNU General Public License along
|
||||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
#
|
|
||||||
# Submodules may be distributed under a separate software license; see the
|
|
||||||
# LICENSE file within each submodule.
|
|
||||||
#
|
#
|
||||||
# This script monitors the systemd journal, specifically the
|
# This script monitors the systemd journal, specifically the
|
||||||
# `psi-monitor.service` and waits for Pressure Stall Information monitor events
|
# `psi-monitor.service` and waits for Pressure Stall Information monitor events
|
||||||
@ -36,7 +32,7 @@
|
|||||||
# systemd psi-alerts@<user>.service overrides (will be placed in
|
# systemd psi-alerts@<user>.service overrides (will be placed in
|
||||||
# /etc/systemd/system/psi-alerts@<user>.service.d/override.conf. See the
|
# /etc/systemd/system/psi-alerts@<user>.service.d/override.conf. See the
|
||||||
# README.md for details.
|
# README.md for details.
|
||||||
################################################################################
|
|
||||||
|
|
||||||
svc="psi-monitor.service"
|
svc="psi-monitor.service"
|
||||||
cpu="/proc/pressure/cpu"
|
cpu="/proc/pressure/cpu"
|
||||||
@ -52,25 +48,9 @@ ssh_user="${SSH_USER}"
|
|||||||
ssh_id_path="${SSH_ID_PATH}"
|
ssh_id_path="${SSH_ID_PATH}"
|
||||||
clear_threshold="${CLEAR_THRESHOLD}"
|
clear_threshold="${CLEAR_THRESHOLD}"
|
||||||
notification_cmd="${NOTIFICATION_CMD}"
|
notification_cmd="${NOTIFICATION_CMD}"
|
||||||
notification_hist_cmd="${NOTIFICATION_HIST_CMD}"
|
|
||||||
notification_opts="${NOTIFICATION_OPTS}"
|
notification_opts="${NOTIFICATION_OPTS}"
|
||||||
id_idx="${NOTIFICATION_IDX}"
|
id_idx="${NOTIFICATION_IDX}"
|
||||||
|
|
||||||
get_ssh_agent () {
|
|
||||||
for dir in /tmp/ssh-*; do
|
|
||||||
if [[ -O ${dir} ]]; then
|
|
||||||
# only choose the last agent
|
|
||||||
export SSH_AUTH_SOCK=$(ls ${dir}/agent.* | tail -1)
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
if [[ -S ${SSH_AUTH_SOCK} ]]; then
|
|
||||||
# we found an ssh_agent socket
|
|
||||||
true
|
|
||||||
else
|
|
||||||
false
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
print_psi () {
|
print_psi () {
|
||||||
local psi_file="${1}"
|
local psi_file="${1}"
|
||||||
cat "${(P)$(tr '[[:upper:]]' '[[:lower:]]' <<< "${psi_file}")}"
|
cat "${(P)$(tr '[[:upper:]]' '[[:lower:]]' <<< "${psi_file}")}"
|
||||||
@ -117,27 +97,13 @@ send_notice () {
|
|||||||
esac
|
esac
|
||||||
|
|
||||||
integer notification_id
|
integer notification_id
|
||||||
if get_ssh_agent && [[ -S ${SSH_AUTH_SOCK} ]]; then
|
if ! notification_id=$(ssh -q -i "${ssh_id_path}" "${ssh_user}@${ssh_host}" -p ${ssh_port} \
|
||||||
if ! notification_id=$(ssh -q "${ssh_user}@${ssh_host}" -p ${ssh_port} \
|
"${notification_cmd} ${notification_opts} '${host}: PSI ${psi_type} triggered!' '${psi}'"); then
|
||||||
"${notification_cmd} ${notification_opts} '${host}: PSI ${psi_type} triggered!' '${psi}'"); then
|
print "Connection to notification daemon failed!" >&2
|
||||||
print "Connection to notification daemon failed!" >&2
|
false
|
||||||
false
|
|
||||||
else
|
|
||||||
echo ${notification_id}
|
|
||||||
true
|
|
||||||
fi
|
|
||||||
elif [[ -n "${ssh_id_path}" ]]; then
|
|
||||||
if ! notification_id=$(ssh -q -i "${ssh_id_path}" "${ssh_user}@${ssh_host}" -p ${ssh_port} \
|
|
||||||
"${notification_cmd} ${notification_opts} '${host}: PSI ${psi_type} triggered!' '${psi}'"); then
|
|
||||||
print "Connection to notification daemon failed!" >&2
|
|
||||||
false
|
|
||||||
else
|
|
||||||
echo ${notification_id}
|
|
||||||
true
|
|
||||||
fi
|
|
||||||
else
|
else
|
||||||
echo "No SSH notifications configured. Returning." >&2
|
echo ${notification_id}
|
||||||
false
|
true
|
||||||
fi
|
fi
|
||||||
#set +x
|
#set +x
|
||||||
}
|
}
|
||||||
@ -201,11 +167,11 @@ is_clear () {
|
|||||||
local psi_type="${1}"
|
local psi_type="${1}"
|
||||||
local psi_file="$(tr '[[:upper:]]' '[[:lower:]]' <<< "${psi_type}")"
|
local psi_file="$(tr '[[:upper:]]' '[[:lower:]]' <<< "${psi_type}")"
|
||||||
|
|
||||||
local avg300=$(grep some "${(P)psi_file}" | awk '{print $4}' | awk -F= '{print $2}')
|
local avg10=$(grep some "${(P)psi_file}" | awk '{print $2}' | awk -F= '{print $2}')
|
||||||
local prev=$(grep some "${(P)psi_file}" | awk '{print $5}' | awk -F= '{print $2}')
|
local prev=$(grep some "${(P)psi_file}" | awk '{print $5}' | awk -F= '{print $2}')
|
||||||
|
|
||||||
|
|
||||||
if [[ ${avg300} -lt ${clear_threshold} ]]; then
|
if [[ ${avg10} -lt ${clear_threshold} ]]; then
|
||||||
true
|
true
|
||||||
else
|
else
|
||||||
false
|
false
|
||||||
@ -243,69 +209,35 @@ check_dunst_id_is_visible () {
|
|||||||
local dunst_id="${1}"
|
local dunst_id="${1}"
|
||||||
|
|
||||||
typeset -a ids
|
typeset -a ids
|
||||||
if ! ids=$(ssh -q "${ssh_host}" -p ${ssh_port} -l "${ssh_user}" \
|
if ids=$(ssh -q "${ssh_host}" -p ${ssh_port} \
|
||||||
"${notification_hist_cmd} | jq '.data[0][].id.data'"); then
|
"dunstctl history | jq '.data[0][][${id_idx}].data'"); then
|
||||||
if ! ids=$(ssh -qi "${ssh_id_path}" -p ${ssh_port} -l "${ssh_user}" \
|
echo "Connection to dunst failed!" >&2
|
||||||
"${ssh_host}" "${notification_hist_cmd} | jq '.data[0][].id.data'"); then
|
return 2
|
||||||
echo "Connection to dunst failed!" >&2
|
|
||||||
return 2
|
|
||||||
fi
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# if the alert is visible, it's not in the dunst history
|
|
||||||
if grep -qP "\b${dunst_id}\b" <<< "${ids}"; then
|
if grep -qP "\b${dunst_id}\b" <<< "${ids}"; then
|
||||||
false
|
|
||||||
else
|
|
||||||
true
|
true
|
||||||
|
else
|
||||||
|
false
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
#set -x
|
||||||
local current_alarm=""
|
local current_alarm=""
|
||||||
local last_alarm=""
|
local last_alarm=""
|
||||||
typeset -A notice_sent
|
typeset -A notice_sent
|
||||||
typeset -A secs
|
typeset -A secs
|
||||||
integer last_dunst_id=-1
|
integer last_dunst_id=-1
|
||||||
local last_line=""
|
|
||||||
|
|
||||||
set -x
|
journalctl -b 0 -fn 3 -u "${svc}" | \
|
||||||
while true; do
|
while read line; do
|
||||||
local line=$(journalctl -u ${svc} -n1)
|
|
||||||
if [[ "${last_line}" == "${line}" ]]; then
|
|
||||||
# line hasn't changed since last run, do nothing
|
|
||||||
sleep 1
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
last_line="${line}"
|
|
||||||
local now=$(date +%s)
|
|
||||||
local last_timestamp=$(date -d $(awk '{print $1" "$2" "$3}' <<< "${line}") +%s)
|
|
||||||
local time_diff=$(( now - last_timestamp ))
|
|
||||||
if (( time_diff >= 3 )); then
|
|
||||||
# haven't seen a monitor alert for 3 seconds, see if we can clear them
|
|
||||||
if [[ -n "${current_alarms}" ]]; then
|
|
||||||
typeset -a alarms=( $(tr '|' ' ' <<< "$current_alarms") )
|
|
||||||
for alarm in ${alarms}; do
|
|
||||||
integer elapsed=$(( now - ${secs[${alarm}]} ))
|
|
||||||
if is_clear "${alarm}" && (( elapsed >= 300 )); then
|
|
||||||
current_alarms=$(sed -E "s/${alarm}\|?//" <<< "${current_alarms}")
|
|
||||||
last_alarm=$(awk -F'|' '{print $NF}' <<< "${current_alarms}")
|
|
||||||
unset "notice_sent[${alarm}]"
|
|
||||||
unset "secs[${alarm}]"
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
sleep 1
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
local psi_type="$(grep -Eo "(CPU|MEM|IO) PSI event" <<< "${line}" | grep -Eo "CPU|MEM|IO")"
|
local psi_type="$(grep -Eo "(CPU|MEM|IO) PSI event" <<< "${line}" | grep -Eo "CPU|MEM|IO")"
|
||||||
if [[ -n "${psi_type}" ]]; then
|
if [[ -n "${psi_type}" ]]; then
|
||||||
secs+=(${psi_type} ${now})
|
secs+=(${psi_type} $(date +%s))
|
||||||
if [[ "${psi_type}" != "${last_alarm}" ]]; then
|
if [[ "${psi_type}" != "${last_alarm}" ]]; then
|
||||||
if [[ ! ${notice_sent[${psi_type}]} ]]; then
|
if [[ ! ${notice_sent[${psi_type}]} ]]; then
|
||||||
last_dunst_id=$(exec_notices "${psi_type}" "${current_alarms}")
|
last_dunst_id=$(exec_notices "${psi_type}" "${current_alarms}")
|
||||||
notice_sent+=(${psi_type} true)
|
notice_sent+=(${psi_type} true)
|
||||||
elif (( last_dunst_id >= 0 )) && check_dunst_id_is_visible "${last_dunst_id}"; then
|
elif (( last_dunst_id >= 0 )) && ! check_dunst_id_is_visible "${last_dunst_id}"; then
|
||||||
last_alarm="${psi_type}"
|
|
||||||
sleep 1
|
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
@ -317,8 +249,16 @@ while true; do
|
|||||||
current_alarms="${current_alarms}|${psi_type}"
|
current_alarms="${current_alarms}|${psi_type}"
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
else
|
||||||
|
typeset -a alarms=( $(tr '|' ' ' <<< "$current_alarms") )
|
||||||
|
for alarm in ${alarms}; do
|
||||||
|
integer elapsed=$(( $(date +%s) - ${secs[${alarm}]} ))
|
||||||
|
if is_clear "${alarm}" && (( elapsed > 300 )); then
|
||||||
|
current_alarms=$(sed -E "s/${alarm}\|?//" <<< "${current_alarms}")
|
||||||
|
last_alarm=$(awk -F'|' '{print $NF}' <<< "${current_alarms}")
|
||||||
|
fi
|
||||||
|
done
|
||||||
fi
|
fi
|
||||||
sleep 1
|
|
||||||
done
|
done
|
||||||
set +x
|
#set +x
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user