Compare commits
	
		
			26 Commits
		
	
	
		
			ae32ba4ae4
			...
			primary
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					5cab66dfbb | ||
| 976370452e | |||
| cf33dc92f3 | |||
| 5997bcb6f8 | |||
| 7f5806c0a9 | |||
| e18e94bb23 | |||
| 23ab5a2371 | |||
| 030f4f34f1 | |||
| 7c6742a1b2 | |||
| 427fb181d1 | |||
| b414e81081 | |||
| 68372af7d7 | |||
| 0540fed30b | |||
| 15f6697dad | |||
| 428ed91dd1 | |||
| 7432e06f58 | |||
| c22b59dc85 | |||
| d3e6d66a3f | |||
| 68f72e2d8c | |||
| a99e3c3ab9 | |||
| 73eb6adb4a | |||
| 345de155fa | |||
| 53a4f7f73c | |||
| ea42a066ec | |||
| 42f94bbf77 | |||
| 354088b245 | 
							
								
								
									
										12
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										12
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							@@ -0,0 +1,12 @@
 | 
			
		||||
*
 | 
			
		||||
!.gitignore
 | 
			
		||||
!CONFIGURE.md
 | 
			
		||||
!INSTALL.md
 | 
			
		||||
!README.md
 | 
			
		||||
!psi-alerts-user.service
 | 
			
		||||
!psi-alerts.sh
 | 
			
		||||
!psi-alerts@.service
 | 
			
		||||
!psi-by-example
 | 
			
		||||
!psi-monitor-user.service
 | 
			
		||||
!psi-monitor.service
 | 
			
		||||
!psi-monitor.sh
 | 
			
		||||
							
								
								
									
										100
									
								
								CONFIGURE.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										100
									
								
								CONFIGURE.md
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,100 @@
 | 
			
		||||
# CONFIGURE Included in this project are a number of systemd units:
 | 
			
		||||
    * psi-monitor.service
 | 
			
		||||
        * uses psi-monitor executable (in /usr/bin/)
 | 
			
		||||
    * psi-alerts@.service (systemd template service)
 | 
			
		||||
        * uses psi-alerts.sh script in */usr/local/bin/*
 | 
			
		||||
    * psi-alerts-user.service (systemd user service)
 | 
			
		||||
        * also uses psi-alerts.sh script in *~/bin/* (or wherever you want to
 | 
			
		||||
          put it)
 | 
			
		||||
 | 
			
		||||
The `psi-alerts.sh` is essentially a daemon (a systemd simple service), and for
 | 
			
		||||
now the systemd template needs to be instantiated with the username that will
 | 
			
		||||
execute `psi-alerts.sh` (if using the systemd template).  Also, a systemd unit
 | 
			
		||||
override should be created, like so:  
 | 
			
		||||
 | 
			
		||||
```
 | 
			
		||||
sudo cp psi-alerts@.service /etc/systemd/system/
 | 
			
		||||
sudo systemctl edit psi-alerts@<user>.service
 | 
			
		||||
```
 | 
			
		||||
--OR--
 | 
			
		||||
```
 | 
			
		||||
cp psi-alerts-user.service ~/.config/systemd/user/psi-alerts.service
 | 
			
		||||
systemctl --user edit psi-alerts.service
 | 
			
		||||
```
 | 
			
		||||
This will open an editor.  In later versions of systemd, the editor buffer includes comments that clearly show where the override should be entered:
 | 
			
		||||
 | 
			
		||||
```
 | 
			
		||||
### Editing /etc/systemd/system/psi-alerts@trey.service.d/override.conf
 | 
			
		||||
### Anything between here and the comment below will become the contents of the drop-in file
 | 
			
		||||
 | 
			
		||||
[Service]
 | 
			
		||||
Environment=EMAIL_TO="email@domain.tld"
 | 
			
		||||
Environment=SMS_DST="phone_number@sms.domain.tld"
 | 
			
		||||
Environment=NOTIFICATION_CMD="dunstify"
 | 
			
		||||
Environment=NOTIFICATION_OPTS="--timeout=0 --printid --urgency=critical --icon=/usr/share/icons/breeze-dark/emblems/16/emblem-warning.svg"
 | 
			
		||||
Environment=NOTIFICATION_HIST_CMD="dunstctl history"
 | 
			
		||||
Environment=NOTIFICATION_IDX=15
 | 
			
		||||
Environment=SSH_USER="username"
 | 
			
		||||
Environment=SSH_HOST="localhost"
 | 
			
		||||
Environment=SSH_PORT=5999
 | 
			
		||||
Environment=SSH_ID_PATH="~user/.ssh/psi-alerts"
 | 
			
		||||
Environment=CLEAR_THRESHOLD="5.0"
 | 
			
		||||
ExecStart=  # Clear ExecStart for user unit
 | 
			
		||||
ExecStart=/path/to/psi-alerts.sh --user # User unit
 | 
			
		||||
 | 
			
		||||
### Edits below this comment will be discarded
 | 
			
		||||
 | 
			
		||||
### /etc/systemd/system/psi-alerts@.service
 | 
			
		||||
# [Unit]
 | 
			
		||||
# Description=Pressure Stall Information (PSI) alerts
 | 
			
		||||
# PartOf=multi-user.target  # system template
 | 
			
		||||
# PartOf=default.target     # user service
 | 
			
		||||
# After=psi-monitor.service
 | 
			
		||||
#
 | 
			
		||||
# [Service]
 | 
			
		||||
# 
 | 
			
		||||
# User=%i # User unit will not have User=%i
 | 
			
		||||
# Type=simple
 | 
			
		||||
# ExecStart=psi-alerts.sh
 | 
			
		||||
#
 | 
			
		||||
# [Install]
 | 
			
		||||
# WantedBy=multi-user.target
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
All of these are required except where noted; there are no default options
 | 
			
		||||
(defaults may be added in the future).  A brief description of each:
 | 
			
		||||
* **EMAIL_TO**:  the email address the notification should be sent to.  The
 | 
			
		||||
  output of `pidstat` will be included in the body of this email, for each
 | 
			
		||||
  triggered resource type (CPU, I/O, Memory), at the time the monitor alerted.
 | 
			
		||||
* **SMS_DST**:  the email-to-SMS address, as defined by your mobile carrier.
 | 
			
		||||
  Please review your mobile carrier's documentation.  For Google Fi, based in
 | 
			
		||||
  the US, the format is
 | 
			
		||||
  `<mobile_number_without_country_code>@msg.fi.google.com`.  This email address
 | 
			
		||||
  does **not** get the output of `pidstat` in the body of the message.
 | 
			
		||||
* **NOTIFICATION_CMD**:   The command on the remote host to run to display
 | 
			
		||||
  notifications, e.g. `notify-send` or `dunstify`.
 | 
			
		||||
* **NOTIFICATION_OPTS**:   Options for the `${NOTIFICATION_CMD}`.  Should
 | 
			
		||||
  include `--print-id` if supported by the command.
 | 
			
		||||
* **NOTIFICATION_HIST_CMD**:  The command to display the notification history
 | 
			
		||||
  (e.g. `dunstctl history`).
 | 
			
		||||
* **NOTIFICATION_IDX**:  The index of the JSON structure that contains the
 | 
			
		||||
  notification ID.  `dunst`, as of version 1.9.2-1, displays its history as a
 | 
			
		||||
  JSON structure.  For other notification daemons, some other history mechanism
 | 
			
		||||
  will likely be required;  patches needed and welcome!
 | 
			
		||||
* **SSH_USER**:  The SSH username to connect to the remote host that will
 | 
			
		||||
  display the notifications to the system administrator.
 | 
			
		||||
* **SSH_HOST**:  The SSH host to connect to.  This is where
 | 
			
		||||
  `${NOTIFICATION_CMD} ${NOTIFICATION_OPTS}` and `${NOTIFICATION_HIST_CMD}`
 | 
			
		||||
  will run.
 | 
			
		||||
* **SSH_PORT**:  The SSH port to connect to.
 | 
			
		||||
* **SSH_ID_PATH**:  The path to the SSH id (private key file) to use for
 | 
			
		||||
  authenticating to the remote host.  This can be excluded if the local user
 | 
			
		||||
  already has an ssh-agent running, with the necessary key and passphrase
 | 
			
		||||
  entered.  If ssh-agent is not desired, then this SSH id (private key file)
 | 
			
		||||
  should have an empty passphrase (i.e., no passphrase).  Not having this
 | 
			
		||||
  environment variable, and no ssh-agent will disable the desktop notifications
 | 
			
		||||
  (SMS and email will still work, as they don't use SSH).
 | 
			
		||||
* **CLEAR_THRESHOLD**:  The percentage threshold the "some" avg300 value
 | 
			
		||||
  should be below before considering the alert cleared.  This will depend
 | 
			
		||||
  highly on the workload running on the system.
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										63
									
								
								INSTALL.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										63
									
								
								INSTALL.md
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,63 @@
 | 
			
		||||
# INSTALL
 | 
			
		||||
First, clone this repository with the `--recurse-submodules` flag:
 | 
			
		||||
```
 | 
			
		||||
$ git clone --recurse-submodules https://git.eldon.me/trey/psi-alerts.git
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
`--recurse-submodules` is only necessary if you wish to use the modified
 | 
			
		||||
psi-by-example program for `psi-monitor`.  I found this too noisy to be of use;
 | 
			
		||||
it alerts too quickly so I wrote my own with relaxed timing.
 | 
			
		||||
 | 
			
		||||
If you want to use the psi-by-example/psi-monitor code, you'll need to compile
 | 
			
		||||
it: 
 | 
			
		||||
 | 
			
		||||
```
 | 
			
		||||
gcc -o psi-monitor psi-monitor.c
 | 
			
		||||
``` 
 | 
			
		||||
 | 
			
		||||
## Using the systemd template unit
 | 
			
		||||
1.  Copy the `psi-alerts.sh` and `psi-monitor.sh` scripts to */usr/local/bin*:
 | 
			
		||||
 | 
			
		||||
    ```
 | 
			
		||||
    sudo cp psi-alerts.sh /usr/local/bin
 | 
			
		||||
    sudo cp psi-monitor.sh /usr/local/bin/psi-monitor 
 | 
			
		||||
    ### OR ###
 | 
			
		||||
    sudo cp psi-by-example/psi-monitor /usr/local/bin
 | 
			
		||||
    ```
 | 
			
		||||
 | 
			
		||||
2.  Copy the systemd units to */etc/systemd/system*:
 | 
			
		||||
 | 
			
		||||
    ```
 | 
			
		||||
    sudo cp psi-alerts@.service psi-monitor.service /etc/systemd/system/
 | 
			
		||||
    ```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## Using the systemd user units
 | 
			
		||||
1.  Copy the `psi-alerts.sh` and `psi-monitor.sh` scripts to *~/bin* (or
 | 
			
		||||
    wherever you want them):
 | 
			
		||||
    
 | 
			
		||||
    ```
 | 
			
		||||
    cp -a psi-alerts.sh psi-monitor.sh ~/bin/
 | 
			
		||||
    ```
 | 
			
		||||
 | 
			
		||||
2.  Copy the systemd user units to *~/.config/systemd/user/*:
 | 
			
		||||
 | 
			
		||||
    ```
 | 
			
		||||
    cp psi-alerts-user.service ~/.config/systemd/user/psi-alerts.service
 | 
			
		||||
    cp psi-monitor-user.service ~/.config/systemd/user/psi-monitor.service
 | 
			
		||||
    ```
 | 
			
		||||
 | 
			
		||||
# CONFIGURE
 | 
			
		||||
See *CONFIGURE.md* in this repository
 | 
			
		||||
 | 
			
		||||
# ENABLE and START
 | 
			
		||||
## system template instance:
 | 
			
		||||
```
 | 
			
		||||
sudo systemctl enable --now psi-monitor.service psi-alerts@<user>.service
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## User instance
 | 
			
		||||
```
 | 
			
		||||
systemctl --user enable --now psi-monitor.service psi-alerts.service
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										106
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										106
									
								
								README.md
									
									
									
									
									
								
							@@ -2,28 +2,102 @@
 | 
			
		||||
## PURPOSE
 | 
			
		||||
 | 
			
		||||
This project aims to deliver Pressure Stall Information (PSI) alerts via
 | 
			
		||||
standard Linux graphical desktop (through `libnotify` compatible daemons and
 | 
			
		||||
CLI programs), and email (email-to-SMS is also supported).  This can alert the
 | 
			
		||||
system administrator of CPU, I/O, or Memory (RAM) pressure in near real time.
 | 
			
		||||
standard Linux graphical desktop notifications (through `libnotify` compatible
 | 
			
		||||
daemons and CLI programs), and email (email-to-SMS is also supported).  This
 | 
			
		||||
can alert the system administrator of CPU, I/O, or Memory (RAM) pressure in
 | 
			
		||||
near real time.
 | 
			
		||||
 | 
			
		||||
## PREREQUISITES
 | 
			
		||||
* A Linux system with kernel 5.2.0 or greater
 | 
			
		||||
* A Linux system with kernel 5.2.0 or greater, with the /proc filesystem
 | 
			
		||||
  enabled
 | 
			
		||||
* systemd
 | 
			
		||||
* zsh
 | 
			
		||||
* sysstat (for pidstat)
 | 
			
		||||
* ssh (OpenSSH, for desktop notifications)
 | 
			
		||||
* psi-by-example (a modified version of this is included in this project as a
 | 
			
		||||
  submodule)
 | 
			
		||||
* a libnotify-compatible desktop notification system
 | 
			
		||||
    * any notification program should use the `--print-id` parameter if
 | 
			
		||||
      possible
 | 
			
		||||
        * both `notify-send` and `dunstify` (part of
 | 
			
		||||
          [dunst](https://dunst-project.org/)) support this
 | 
			
		||||
    * note, this has only been tested with `dunst`, since it has the capability
 | 
			
		||||
      of showing notification history
 | 
			
		||||
        * `notify-send` specifically does not appear to retain a history, so the
 | 
			
		||||
          `check_dunst_id_is_visible` function won't work with it (and the logic to
 | 
			
		||||
          skip sending a new notification if one is already sent will be broken).  
 | 
			
		||||
            * since I don't use `notify-send`, I'm not sure how to solve this
 | 
			
		||||
            * patches welcome!
 | 
			
		||||
* jq (for the aforementioned `dunst` integration)
 | 
			
		||||
 | 
			
		||||
## History
 | 
			
		||||
 | 
			
		||||
When I first learned about [Pressure Stall
 | 
			
		||||
Information](https://docs.kernel.org/accounting/psi.html) (PSI), I was
 | 
			
		||||
intrigued.  This provides a real-time view into the performance and typical
 | 
			
		||||
resource contention Linux system administrators need to worry about:  CPU, I/O,
 | 
			
		||||
and Memory (RAM).  During this research, I found [this
 | 
			
		||||
post](https://unixism.net/2019/08/linux-pressure-stall-information-psi-by-example/)
 | 
			
		||||
complete with a C code example;  albeit, it was light on I/O details and the
 | 
			
		||||
example C code the author provided didn't even include Memory pressure at all
 | 
			
		||||
(so I modified it to include Memory pressure).  
 | 
			
		||||
 | 
			
		||||
A quick and dirty description of PSI:  whenever one or more processes are
 | 
			
		||||
waiting for some measurable resource (CPU, I/O, or RAM), the percentage of
 | 
			
		||||
processes waiting on the resource will begin to increase.  Initially, the
 | 
			
		||||
percentage will be low, but as resource contention increases, more and more
 | 
			
		||||
processes will be waiting to be processed by the CPU for that resource.  If not
 | 
			
		||||
all processes are waiting on this resource, PSI calls this the "some"
 | 
			
		||||
contention for resources.  If all processes are waiting on the resource, this
 | 
			
		||||
is known as the "full" resource contention.  
 | 
			
		||||
 | 
			
		||||
The pressure information is exposed in the _/proc_ filesystem in these three
 | 
			
		||||
virtual files: _/proc/pressure/cpu_, _/proc/pressure/io_,
 | 
			
		||||
_/proc/pressure/memory_.  Each file reports both some and full, and has the
 | 
			
		||||
following output:
 | 
			
		||||
 | 
			
		||||
## INSTALL
 | 
			
		||||
First, clone this repository with the `--recurse-submodules` flag:
 | 
			
		||||
```
 | 
			
		||||
$ git clone --recurse-submodules https://git.eldon.me/trey/psi-alerts.git
 | 
			
		||||
some avg10=0.02 avg60=0.43 avg300=0.55 total=711489361
 | 
			
		||||
full avg10=0.02 avg60=0.43 avg300=0.54 total=681874430
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## CONFIGURE
 | 
			
		||||
Included in this project are a number of systemd units:
 | 
			
		||||
    * psi-monitor.service
 | 
			
		||||
    * psi-alerts@.service (template service)
 | 
			
		||||
This example is taken from _/proc/pressure/io_, for I/O pressure.   The full
 | 
			
		||||
CPU pressure information really depends on the cgroups, which this project
 | 
			
		||||
doesn't pay close attention to at this time.  The percentages are a measure of
 | 
			
		||||
the average resource pressure over the last 10s, 60s, and 300s (5 minutes).
 | 
			
		||||
The total is the number of microseconds that any processes were waiting for the
 | 
			
		||||
resource;  this is a counter that is reset on boot, and will continuously update
 | 
			
		||||
as processes wait for the resource.  They always have to wait for the resource,
 | 
			
		||||
even if it's on the order of hundreds of microseconds or less.  Even if the
 | 
			
		||||
percentages were all zeroes, the total counter will be nonzero (at least for
 | 
			
		||||
the some metrics), and even the full metrics will have a nonzero total except
 | 
			
		||||
for CPU, because the full CPU total only really applies to cgroups (and are out
 | 
			
		||||
of scope for this project at present).
 | 
			
		||||
 | 
			
		||||
The monitor code (from psi-by-example listed above) only considers the "some"
 | 
			
		||||
pressure for all three resources, which will usually alert before the system
 | 
			
		||||
becomes critical (and in the case of full Memory usage/thrashing, completely
 | 
			
		||||
unusable for any workload).  Thus the alerts should come in well before the full
 | 
			
		||||
resource pressure gets maxed out.
 | 
			
		||||
 | 
			
		||||
Now, I don't know C very well, but this _monitor.c_ code was easy enough to
 | 
			
		||||
extend to include memory pressure.  However, the _create_load.c_ only creates
 | 
			
		||||
CPU and I/O load (memory load is too detrimental to system performance).  
 | 
			
		||||
 | 
			
		||||
This was developed on an [SSDNodes VPS](https://ssdnodes.com) (Virtual Private
 | 
			
		||||
Server), which is a KVM virtual machine, backed by SSD hardware.  It is very
 | 
			
		||||
well provisioned with virtual hardware:  8 vCPUs, 32GiB RAM, and 640GiB SSD
 | 
			
		||||
disk space.  Currently, there is very little load on this system, even with
 | 
			
		||||
four different websites on it, with corresponding database engines, and an
 | 
			
		||||
nginx reverse proxy.  I plan on putting
 | 
			
		||||
[mailcow-dockerized](https://docs.mailcow.email/) on this VPS soon, which has
 | 
			
		||||
the potential to increase the load significantly.
 | 
			
		||||
 | 
			
		||||
Now, once the regular workload of this VPS increases, my current configuration
 | 
			
		||||
may become too noisy.  However, I've tried to configure `psi-alerts.sh` in such
 | 
			
		||||
a way that it only alerts once when the pressure on a resource increases, and
 | 
			
		||||
won't alert again until that pressure subsides (and the some percentages drop
 | 
			
		||||
below the configurable threshold for at least five minutes).
 | 
			
		||||
 | 
			
		||||
## TODO
 | 
			
		||||
* finish INSTALL section
 | 
			
		||||
@@ -33,3 +107,13 @@ Included in this project are a number of systemd units:
 | 
			
		||||
    * about defining an instance and editing it
 | 
			
		||||
        * `sudo systemctl edit psi-alerts@<user>.service`
 | 
			
		||||
        * mainly for `Environment=` variables
 | 
			
		||||
    * consider reworking this for a user service, not a system service
 | 
			
		||||
        * this could make desktop notifications simpler, and not having to use
 | 
			
		||||
          SSH keys without passphrases
 | 
			
		||||
          * possibly learn how to connect to an existing ssh-agent
 | 
			
		||||
        * need to become much more familiar with user services
 | 
			
		||||
* consider reworking all code in a compiled language (other than C)
 | 
			
		||||
    * time to learn Go
 | 
			
		||||
    * or continue learning Rust
 | 
			
		||||
    * need to know how to use kernel syscalls in these languages (if possible)
 | 
			
		||||
    * also, convert psi-alerts.sh script to either of these languages
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										12
									
								
								psi-alerts-user.service
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										12
									
								
								psi-alerts-user.service
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,12 @@
 | 
			
		||||
[Unit]
 | 
			
		||||
Description=Pressure Stall Information (PSI) alerts
 | 
			
		||||
PartOf=default.target
 | 
			
		||||
After=psi-monitor.service
 | 
			
		||||
 | 
			
		||||
[Service]
 | 
			
		||||
Type=simple
 | 
			
		||||
ExecStart=psi-alerts.sh
 | 
			
		||||
 | 
			
		||||
[Install]
 | 
			
		||||
WantedBy=default.target
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										237
									
								
								psi-alerts.sh
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							
							
						
						
									
										237
									
								
								psi-alerts.sh
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							@@ -1,8 +1,9 @@
 | 
			
		||||
#!/usr/bin/env zsh
 | 
			
		||||
 | 
			
		||||
################################################################################
 | 
			
		||||
# Send alerts when Pressure Stall Information is high
 | 
			
		||||
# 
 | 
			
		||||
#    Copyright © 2023 Trey Blancher
 | 
			
		||||
#    Copyright © 2023 Trey Blancher $(base64 -d <<< dHJleUBibGFuY2hlci5uZXQK)
 | 
			
		||||
#
 | 
			
		||||
#    This program is free software: you can redistribute it and/or modify it
 | 
			
		||||
#    under the terms of the GNU General Public License as published by the Free
 | 
			
		||||
@@ -16,6 +17,9 @@
 | 
			
		||||
#
 | 
			
		||||
#    You should have received a copy of the GNU General Public License along
 | 
			
		||||
#    with this program.  If not, see <https://www.gnu.org/licenses/>.
 | 
			
		||||
#
 | 
			
		||||
#    Submodules may be distributed under a separate software license;  see the
 | 
			
		||||
#    LICENSE file within each submodule.
 | 
			
		||||
# 
 | 
			
		||||
# This script monitors the systemd journal, specifically the
 | 
			
		||||
# `psi-monitor.service` and waits for Pressure Stall Information monitor events
 | 
			
		||||
@@ -32,51 +36,85 @@
 | 
			
		||||
# systemd psi-alerts@<user>.service overrides (will be placed in
 | 
			
		||||
# /etc/systemd/system/psi-alerts@<user>.service.d/override.conf).  See the
 | 
			
		||||
# README.md for details.
 | 
			
		||||
 | 
			
		||||
################################################################################
 | 
			
		||||
 | 
			
		||||
svc="psi-monitor.service"
 | 
			
		||||
cpu="/proc/pressure/cpu"
 | 
			
		||||
mem="/proc/pressure/memory"
 | 
			
		||||
io="/proc/pressure/io"
 | 
			
		||||
user="$(whoami)"
 | 
			
		||||
host="$(hostname)"
 | 
			
		||||
email_to="${EMAIL_TO}"
 | 
			
		||||
sms_dst="${SMS_DST}"
 | 
			
		||||
sms_domain="$(awk -F@ '{print $NF}' <<< ${SMS_DST})"
 | 
			
		||||
ssh_port="${SSH_PORT}"
 | 
			
		||||
ssh_host="${SSH_HOST}"
 | 
			
		||||
ssh_user="${SSH_USER}"
 | 
			
		||||
ssh_id_path="${SSH_ID_PATH}"
 | 
			
		||||
clear_threshold="${CLEAR_THRESHOLD}"
 | 
			
		||||
notification_cmd="${NOTIFICATION_CMD}"
 | 
			
		||||
notification_hist_cmd="${NOTIFICATION_HIST_CMD}"
 | 
			
		||||
notification_opts="${NOTIFICATION_OPTS}"
 | 
			
		||||
id_idx="${NOTIFICATION_IDX}"
 | 
			
		||||
user=false
 | 
			
		||||
 | 
			
		||||
if [[ -n "${1}" ]]; then
 | 
			
		||||
    if  [[ "${1}" == "-u" ]] || \
 | 
			
		||||
        [[ "${1}" == "--user" ]]; then
 | 
			
		||||
        user=true
 | 
			
		||||
    fi 
 | 
			
		||||
fi
 | 
			
		||||
       
 | 
			
		||||
get_ssh_agent () {
 | 
			
		||||
    for dir in /tmp/ssh-*; do
 | 
			
		||||
        if [[ -O ${dir} ]]; then
 | 
			
		||||
            # only choose the last agent
 | 
			
		||||
            export SSH_AGENT_PID=$(ps -eaf | grep '[s]sh-agent' | \
 | 
			
		||||
                                   grep ${user} | awk '{print $2}')
 | 
			
		||||
            export SSH_AUTH_SOCK=$(ls ${dir}/agent.* | tail -1)
 | 
			
		||||
        fi
 | 
			
		||||
    done
 | 
			
		||||
 | 
			
		||||
    if [[ -S ${SSH_AUTH_SOCK} ]]; then
 | 
			
		||||
        # we found an ssh_agent socket
 | 
			
		||||
        true
 | 
			
		||||
    else
 | 
			
		||||
        false
 | 
			
		||||
    fi
 | 
			
		||||
}
 | 
			
		||||
print_psi () {
 | 
			
		||||
    local psi_file="${1}"
 | 
			
		||||
    cat "${(P)psi_file}"
 | 
			
		||||
    cat "${(P)$(tr '[[:upper:]]' '[[:lower:]]' <<< "${psi_file}")}"
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
print_pidstat () {
 | 
			
		||||
print_stats () {
 | 
			
		||||
    local psi_type="${1}"
 | 
			
		||||
    local opts="-l --human"
 | 
			
		||||
 | 
			
		||||
    case "${psi_type}" in
 | 
			
		||||
        CPU)
 | 
			
		||||
            opts="-u ${opts}"
 | 
			
		||||
            top -bcn1 -o %CPU -w 512 | head -n 30
 | 
			
		||||
            printf "\n\n"
 | 
			
		||||
            pidstat -ul --human
 | 
			
		||||
            ;;
 | 
			
		||||
        IO)
 | 
			
		||||
            opts="-d ${opts}"
 | 
			
		||||
            sudo iotop --batch --only --iter=10
 | 
			
		||||
            printf "\n\n"
 | 
			
		||||
            pidstat -dl --human
 | 
			
		||||
            ;;
 | 
			
		||||
        MEM)
 | 
			
		||||
            opts="-r ${opts}"
 | 
			
		||||
            top -bcn1 -o %MEM -w 512 | head -n 30
 | 
			
		||||
            printf "\n\n"
 | 
			
		||||
            pidstat -rl --human
 | 
			
		||||
            ;;
 | 
			
		||||
        *)
 | 
			
		||||
            print "Invalid psi_type:  ${psi_type}" >&2
 | 
			
		||||
            ;;
 | 
			
		||||
    esac
 | 
			
		||||
 | 
			
		||||
    pidstat "${opts}"
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
send_notice () {
 | 
			
		||||
    #set -x
 | 
			
		||||
    local psi_type="${1}"
 | 
			
		||||
    shift
 | 
			
		||||
    
 | 
			
		||||
@@ -96,19 +134,35 @@ send_notice () {
 | 
			
		||||
    esac
 | 
			
		||||
 | 
			
		||||
    integer notification_id
 | 
			
		||||
    if ! notification_id=$(ssh -q "${ssh_host}" -p ${ssh_port} \
 | 
			
		||||
        "${notification_cmd} ${notification_opts} '${host}:  PSI ${psi_type} triggered!' '${psi}'"); then
 | 
			
		||||
            print "Connection to notification daemon failed!" >&2
 | 
			
		||||
            false
 | 
			
		||||
    if get_ssh_agent && [[ -S ${SSH_AUTH_SOCK} ]]; then
 | 
			
		||||
        if ! notification_id=$(ssh -q "${ssh_user}@${ssh_host}" -p ${ssh_port} \
 | 
			
		||||
            "${notification_cmd} ${notification_opts} '${host}:  PSI ${psi_type} triggered!' '${psi}'"); then
 | 
			
		||||
                print "Connection to notification daemon failed!" >&2
 | 
			
		||||
                false
 | 
			
		||||
        else
 | 
			
		||||
            print ${notification_id}
 | 
			
		||||
            true
 | 
			
		||||
        fi
 | 
			
		||||
    elif [[ -n "${ssh_id_path}" ]]; then
 | 
			
		||||
        if ! notification_id=$(ssh -q -i "${ssh_id_path}" "${ssh_user}@${ssh_host}" -p ${ssh_port} \
 | 
			
		||||
            "${notification_cmd} ${notification_opts} '${host}:  PSI ${psi_type} triggered!' '${psi}'"); then
 | 
			
		||||
                print "Connection to notification daemon failed!" >&2
 | 
			
		||||
                false
 | 
			
		||||
        else
 | 
			
		||||
            print ${notification_id}
 | 
			
		||||
            true
 | 
			
		||||
        fi
 | 
			
		||||
    else
 | 
			
		||||
        echo ${notification_id}
 | 
			
		||||
        true
 | 
			
		||||
        print "No SSH notifications configured.  Returning." >&2
 | 
			
		||||
        false
 | 
			
		||||
    fi
 | 
			
		||||
    #set +x
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
send () {
 | 
			
		||||
    #set -x
 | 
			
		||||
    if [[ "${#@}" -lt 2 ]] && [[ "${#@}" -gt 3 ]]; then
 | 
			
		||||
        echo "Wrong number of arguments to send()!" >&2
 | 
			
		||||
        print "Wrong number of arguments to send()!" >&2
 | 
			
		||||
        return false
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
@@ -128,47 +182,51 @@ send () {
 | 
			
		||||
 | 
			
		||||
    local psi="$(print_psi $(tr '[[:upper:]]' '[[:lower:]]' <<< ${psi_type}))"
 | 
			
		||||
 | 
			
		||||
    local subj="PSI on deltachunk ${psi_type} triggered!"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    local body="Pressure Stall Information triggered on ${psi_type} at $(date +'%FT%T %Z')"
 | 
			
		||||
    local email=$(mktemp /tmp/psi.eml.XXXX.txt)
 | 
			
		||||
    local subj
 | 
			
		||||
    printf "Pressure Stall Information for ${host} triggered on ${psi_type} at $(date +'%FT%T %Z')\n\n" > ${email}
 | 
			
		||||
    if [[ -n "${current_alarms}" ]]; then
 | 
			
		||||
        body="${body}\nMultiple alarms triggered:  ${current_alarms}"
 | 
			
		||||
        # if this is not an SMS, include pidstat info
 | 
			
		||||
        if [[ ! "${dst}" =~ "${sms_domain}" ]]; then
 | 
			
		||||
            for p in $(tr '|' ' ' <<< "${current_alarms}"); do
 | 
			
		||||
                body="${body}\n\n$(print_pidstat ${p})"
 | 
			
		||||
            done
 | 
			
		||||
        fi
 | 
			
		||||
        current_alarms="${current_alarms}|${psi_type}"
 | 
			
		||||
        subj="PSI on ${host} ${current_alarms} triggered!"
 | 
			
		||||
        printf "Multiple alarms triggered:  ${current_alarms}\n\n" >> ${email}
 | 
			
		||||
    else
 | 
			
		||||
        subj="PSI on ${host} ${psi_type} triggered!"
 | 
			
		||||
        current_alarms="${psi_type}"
 | 
			
		||||
    fi
 | 
			
		||||
    print_psi "${psi_type}" >> ${email}
 | 
			
		||||
    printf "\n\n" >> ${email}
 | 
			
		||||
    # is this an email or SMS?
 | 
			
		||||
    if [[ ! "${dst}" =~ "@${sms_domain}" ]]; then
 | 
			
		||||
        for p in $(tr '|' ' ' <<< "${current_alarms}"); do
 | 
			
		||||
            printf "\n\nStatistics info for ${p}\n\n" >> ${email}
 | 
			
		||||
            print_stats "${p}" >> ${email}
 | 
			
		||||
            printf "\n\n" >> ${email}
 | 
			
		||||
        done
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    local email=$(mktemp /tmp/psi.eml.XXXX)
 | 
			
		||||
 | 
			
		||||
    cat <<-EOF > ${email}
 | 
			
		||||
        ${body}
 | 
			
		||||
 | 
			
		||||
EOF
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    /usr/bin/mail --resource-files=/ \
 | 
			
		||||
                  --subject="${subj}" \
 | 
			
		||||
                  --end-options \
 | 
			
		||||
                  ${dst} < ${email}
 | 
			
		||||
    # send the message
 | 
			
		||||
    (
 | 
			
		||||
       printf "To:  ${dst}\n"
 | 
			
		||||
       printf "Subject:  ${subj}\n"
 | 
			
		||||
       cat ${email}
 | 
			
		||||
    ) | sendmail -t
 | 
			
		||||
         
 | 
			
		||||
    #set +x
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
is_clear () {
 | 
			
		||||
    local psi_type="${1}"
 | 
			
		||||
    local psi_file="$(tr '[[:upper:]]' '[[:lower:]]' <<< "${psi_type}")"
 | 
			
		||||
 | 
			
		||||
    local avg10=$(grep some "${(P)psi_file}" | awk '{print $2}' | awk -F= '{print $2}')
 | 
			
		||||
    local avg300=$(grep some "${(P)psi_file}" | awk '{print $4}' | awk -F= '{print $2}')
 | 
			
		||||
    local prev=$(grep some "${(P)psi_file}" | awk '{print $5}' | awk -F= '{print $2}')
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    if [[ ${avg10} -lt ${clear_threshold} ]]; then
 | 
			
		||||
        return 0
 | 
			
		||||
    if [[ ${avg300} -lt ${clear_threshold} ]]; then
 | 
			
		||||
        true
 | 
			
		||||
    else
 | 
			
		||||
        return 1
 | 
			
		||||
        false
 | 
			
		||||
    fi
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -186,7 +244,7 @@ exec_notices () {
 | 
			
		||||
                send "${psi_type}" "${current_alarms}" "${email_to}"
 | 
			
		||||
                ;;
 | 
			
		||||
            *)
 | 
			
		||||
                echo "Something went wrong!" >&2
 | 
			
		||||
                print "Something went wrong!" >&2
 | 
			
		||||
                false
 | 
			
		||||
                ;;
 | 
			
		||||
        esac
 | 
			
		||||
@@ -203,56 +261,83 @@ check_dunst_id_is_visible () {
 | 
			
		||||
    local dunst_id="${1}"
 | 
			
		||||
 | 
			
		||||
    typeset -a ids
 | 
			
		||||
    if ids=$(ssh -q "${ssh_host}" -p ${ssh_port} \
 | 
			
		||||
        "dunstctl history | jq '.data[0][][${id_idx}].data'"); then
 | 
			
		||||
            echo "Connection to dunst failed!" >&2
 | 
			
		||||
            return 2
 | 
			
		||||
    if ! ids=$(ssh -q "${ssh_host}" -p ${ssh_port} -l "${ssh_user}" \
 | 
			
		||||
        "${notification_hist_cmd} | jq '.data[0][].id.data'"); then
 | 
			
		||||
        if ! ids=$(ssh -qi "${ssh_id_path}" -p ${ssh_port} -l "${ssh_user}" \
 | 
			
		||||
            "${ssh_host}" "${notification_hist_cmd} | jq '.data[0][].id.data'"); then
 | 
			
		||||
                print "Connection to dunst failed!" >&2
 | 
			
		||||
                return 2
 | 
			
		||||
        fi
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
    # if the alert is visible, it's not in the dunst history
 | 
			
		||||
    if grep -qP "\b${dunst_id}\b" <<< "${ids}"; then
 | 
			
		||||
        true
 | 
			
		||||
    else
 | 
			
		||||
        false
 | 
			
		||||
    else
 | 
			
		||||
        true
 | 
			
		||||
    fi
 | 
			
		||||
}
 | 
			
		||||
#set -x
 | 
			
		||||
 | 
			
		||||
local current_alarm=""
 | 
			
		||||
local last_alarm=""
 | 
			
		||||
typeset -A notice_sent
 | 
			
		||||
typeset -A secs
 | 
			
		||||
integer last_dunst_id=-1
 | 
			
		||||
local last_line=""
 | 
			
		||||
 | 
			
		||||
journalctl -b 0 -fu "${svc}"  | \
 | 
			
		||||
while read line; do
 | 
			
		||||
    local psi_type="$(grep -Eo "(CPU|MEM|IO) PSI event" <<< "${line}" | grep -Eo "CPU|MEM|IO")"
 | 
			
		||||
    if [[ -n "${psi_type}" ]]; then
 | 
			
		||||
        secs+=(${psi_type} $(date +%s))
 | 
			
		||||
        if [[ "${psi_type}" != "${last_alarm}" ]]; then
 | 
			
		||||
#set -x
 | 
			
		||||
while true; do
 | 
			
		||||
    if ${user}; then
 | 
			
		||||
        line=$(journalctl --user -u ${svc} -n1)
 | 
			
		||||
    else
 | 
			
		||||
        line=$(journalctl -u ${svc} -n1)
 | 
			
		||||
    fi
 | 
			
		||||
    now=$(date +%s)
 | 
			
		||||
    last_timestamp=$(date -d "$(awk '{print $1" "$2" "$3}' <<< "${line}")" +%s)
 | 
			
		||||
    time_diff=$(( now - last_timestamp ))
 | 
			
		||||
    if [[ "${last_line}" == "${line}" ]]; then
 | 
			
		||||
        # last line hasn't changed, check to see if we can clear alarms
 | 
			
		||||
        if (( time_diff >= 3 )); then
 | 
			
		||||
            # haven't seen a monitor alert for 3 seconds, see if we can clear them
 | 
			
		||||
            if [[ -n "${current_alarms}" ]]; then
 | 
			
		||||
                typeset -a alarms=( $(tr '|' ' ' <<< "$current_alarms") )
 | 
			
		||||
                for alarm in ${alarms}; do
 | 
			
		||||
                    integer elapsed=$(( now - ${secs[${alarm}]} ))
 | 
			
		||||
                    if is_clear "${alarm}" && (( elapsed >= 300 )); then
 | 
			
		||||
                        current_alarms=$(sed -E "s/${alarm}\|?//; s/|$//" <<< "${current_alarms}")
 | 
			
		||||
                        unset "notice_sent[${alarm}]"
 | 
			
		||||
                        unset "secs[${alarm}]"
 | 
			
		||||
                    fi
 | 
			
		||||
                done
 | 
			
		||||
            fi
 | 
			
		||||
            sleep 1
 | 
			
		||||
            continue
 | 
			
		||||
        fi
 | 
			
		||||
    
 | 
			
		||||
        sleep 1
 | 
			
		||||
        continue
 | 
			
		||||
    fi
 | 
			
		||||
    last_line="${line}"
 | 
			
		||||
    if (( time_diff < 3 )); then
 | 
			
		||||
        local psi_type="$(grep -Eo "(CPU|MEM|IO) PSI event" <<< "${line}" | grep -Eo "CPU|MEM|IO")"
 | 
			
		||||
        if [[ -n "${psi_type}" ]]; then
 | 
			
		||||
            secs+=(${psi_type} ${now})
 | 
			
		||||
            if [[ ! ${notice_sent[${psi_type}]} ]]; then
 | 
			
		||||
                last_dunst_id=$(exec_notices "${psi_type}" "${current_alarms}")
 | 
			
		||||
                notice_sent+=(${psi_type} true)
 | 
			
		||||
            elif (( last_dunst_id >= 0 )) && ! check_dunst_id_is_visible "${last_dunst_id}"; then
 | 
			
		||||
            elif (( last_dunst_id >= 0 )) && check_dunst_id_is_visible "${last_dunst_id}"; then
 | 
			
		||||
                sleep 1
 | 
			
		||||
                continue
 | 
			
		||||
            fi
 | 
			
		||||
        fi
 | 
			
		||||
        last_alarm="${psi_type}"
 | 
			
		||||
        if [[ -z "${current_alarms}" ]]; then
 | 
			
		||||
            current_alarms="${psi_type}"
 | 
			
		||||
        else
 | 
			
		||||
            if ! grep -q "${psi_type}" <<< "${current_alarms}"; then
 | 
			
		||||
                current_alarms="${current_alarms}|${psi_type}"
 | 
			
		||||
            if [[ -z "${current_alarms}" ]]; then
 | 
			
		||||
                current_alarms="${psi_type}"
 | 
			
		||||
            else
 | 
			
		||||
                if ! grep -q "${psi_type}" <<< "${current_alarms}"; then
 | 
			
		||||
                    current_alarms="${current_alarms}|${psi_type}"
 | 
			
		||||
                fi
 | 
			
		||||
            fi
 | 
			
		||||
        fi
 | 
			
		||||
    else
 | 
			
		||||
        typeset -a alarms=( $(tr '|' ' ' <<< "$current_alarms") )
 | 
			
		||||
        for alarm in ${alarms}; do
 | 
			
		||||
            integer elapsed=$(( $(date +%s) - ${secs[${alarm}]} ))
 | 
			
		||||
            if is_clear "${alarm}" && (( elapsed > 300 )); then
 | 
			
		||||
                current_alarms=$(sed -E "s/${alarm}\|?//" <<< "${current_alarms}")
 | 
			
		||||
                last_alarm=$(awk -F'|' '{print $NF}' <<< "${current_alarms}")
 | 
			
		||||
            fi
 | 
			
		||||
        done
 | 
			
		||||
    fi
 | 
			
		||||
    sleep 1
 | 
			
		||||
done
 | 
			
		||||
#set +x
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
 Submodule psi-by-example updated: 2f9d714642...e09aacd35f
									
								
							
							
								
								
									
										10
									
								
								psi-monitor-user.service
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								psi-monitor-user.service
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,10 @@
 | 
			
		||||
# systemd user unit that runs the PSI monitor script.
[Unit]
Description=Pressure Stall Information (PSI) Monitor
PartOf=default.target

[Service]
Type=simple
# %h expands to the home directory of the user whose service manager runs
# this unit, so the unit is not tied to a single account's home path.
# The trailing argument is the alert threshold handed to psi-monitor.sh.
ExecStart=%h/bin/psi-monitor.sh 80

[Install]
WantedBy=default.target
 | 
			
		||||
							
								
								
									
										62
									
								
								psi-monitor.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										62
									
								
								psi-monitor.sh
									
									
									
									
									
										Executable file
									
								
							@@ -0,0 +1,62 @@
 | 
			
		||||
#!/usr/bin/env zsh
 | 
			
		||||
#
 | 
			
		||||
# Pressure Stall Information monitor
 | 
			
		||||
#
 | 
			
		||||
#    Copyright © 2023 Trey Blancher $(base64 -d <<< dHJleUBibGFuY2hlci5uZXQK)
 | 
			
		||||
#
 | 
			
		||||
#    This program is free software: you can redistribute it and/or modify it
 | 
			
		||||
#    under the terms of the GNU General Public License as published by the Free
 | 
			
		||||
#    Software Foundation, either version 3 of the License, or (at your option)
 | 
			
		||||
#    any later version.
 | 
			
		||||
#
 | 
			
		||||
#    This program is distributed in the hope that it will be useful, but
 | 
			
		||||
#    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 | 
			
		||||
#    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 | 
			
		||||
#    for more details.
 | 
			
		||||
#
 | 
			
		||||
#    You should have received a copy of the GNU General Public License along
 | 
			
		||||
#    with this program.  If not, see <https://www.gnu.org/licenses/>.
 | 
			
		||||
#
 | 
			
		||||
#    Submodules may be distributed under a separate software license;  see the
 | 
			
		||||
#    LICENSE file within each submodule.
 | 
			
		||||
# 
 | 
			
		||||
# This script monitors the three pressure stall information files
 | 
			
		||||
# /proc/pressure/{cpu,io,memory} and reports if any resource is above threshold
 | 
			
		||||
# for the "some" values.  It takes an optional single argument, the threshold at
 | 
			
		||||
# which to alert.  If this is not supplied, it defaults to a threshold of 30.0 
 | 
			
		||||
# percent.
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
# PSI files to poll, keyed by the label that appears in the event messages.
typeset -A psi_files=(
    CPU "/proc/pressure/cpu"
    IO  "/proc/pressure/io"
    MEM "/proc/pressure/memory"
)

# Number of events reported so far for each resource.
typeset -A event_count=(CPU 0 IO 0 MEM 0)

# Alert threshold (percent); overridden by the optional first argument.
typeset threshold=30.0

# Accepts "30", "30.", "30.5" and ".5"; kept in a variable so the shell
# treats it as a regular expression on the right-hand side of =~.
typeset numeric_re='^([0-9]+[.]?[0-9]*|[.][0-9]+)$'

typeset label=""
typeset pct=""

# Print the avg10 value of the "some" line of the PSI file given as $1.
# Prints nothing when the file has no "some" line or cannot be read.
some_avg10 () {
    awk '$1 == "some" { split($2, kv, "="); print kv[2]; exit }' "${1}"
}

if [[ -n "${1}" ]]; then
    # A non-numeric threshold would otherwise be evaluated as an arithmetic
    # expression in the comparison below, so reject it up front.
    if [[ ! "${1}" =~ ${numeric_re} ]]; then
        printf 'psi-monitor: threshold must be a non-negative number, got "%s"\n' "${1}" >&2
        exit 2
    fi
    threshold=${1}
fi

# Fail early when a pressure file is missing or unreadable instead of
# producing a malformed comparison on every polling cycle.
for label in CPU IO MEM; do
    if [[ ! -r "${psi_files[${label}]}" ]]; then
        printf 'psi-monitor: cannot read %s\n' "${psi_files[${label}]}" >&2
        exit 1
    fi
done

# main loop
while true; do
    for label in CPU IO MEM; do
        pct=$(some_avg10 "${psi_files[${label}]}")

        # The file may exist and still yield no value; skip this resource
        # for the current cycle rather than compare an empty string.
        if [[ -z "${pct}" ]]; then
            printf 'psi-monitor: no "some" value in %s\n' "${psi_files[${label}]}" >&2
            continue
        fi

        if (( pct > threshold )); then
            event_count[${label}]=$(( ${event_count[${label}]} + 1 ))
            printf "%s PSI event %d triggered.\n" "${label}" "${event_count[${label}]}"
        fi
    done

    sleep 10
done
 | 
			
		||||
		Reference in New Issue
	
	Block a user