Compare commits
	
		
			2 Commits
		
	
	
		
			ae32ba4ae4
			...
			42f94bbf77
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 42f94bbf77 | |||
| 354088b245 | 
							
								
								
									
										51
									
								
								CONFIGURE.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										51
									
								
								CONFIGURE.md
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,51 @@
 | 
				
			|||||||
 | 
					# CONFIGURE
 | 
				
			||||||
 | 
					Included in this project are a number of systemd units:
 | 
				
			||||||
 | 
					    * psi-monitor.service
 | 
				
			||||||
 | 
					        * uses psi-monitor executable (in /usr/bin/)
 | 
				
			||||||
 | 
					    * psi-alerts@.service (system template service)
 | 
				
			||||||
 | 
					        * uses psi-alerts.sh script
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The `psi-alerts.sh` script is essentially a daemon (a systemd simple service), and for
 | 
				
			||||||
 | 
					now the systemd template needs to be instantiated with the username that will
 | 
				
			||||||
 | 
					execute `psi-alerts.sh`.  Also, a systemd unit override should be created, like
 | 
				
			||||||
 | 
					so:  
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					sudo systemctl edit psi-alerts@<user>.service
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					This will open an editor. In later versions of systemd, guiding comments are included that clearly show where the override should be entered:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					### Editing /etc/systemd/system/psi-alerts@trey.service.d/override.conf
 | 
				
			||||||
 | 
					### Anything between here and the comment below will become the contents of the drop-in file
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					[Service]
 | 
				
			||||||
 | 
					Environment=EMAIL_TO="email@domain.tld"
 | 
				
			||||||
 | 
					Environment=SMS_DST="phone_number@sms.domain.tld"
 | 
				
			||||||
 | 
					Environment=NOTIFICATION_CMD="dunstify"
 | 
				
			||||||
 | 
					Environment=NOTIFICATION_OPTS="--timeout=0 --printid --urgency=critical --icon=/usr/share/icons/breeze-dark/emblems/16/emblem-warning.svg"
 | 
				
			||||||
 | 
					Environment=NOTIFICATION_IDX=15
 | 
				
			||||||
 | 
					Environment=SSH_USER="username"
 | 
				
			||||||
 | 
					Environment=SSH_HOST="localhost"
 | 
				
			||||||
 | 
					Environment=SSH_PORT=5999
 | 
				
			||||||
 | 
					Environment=SSH_ID_PATH="~trey/.ssh/psi-alerts"
 | 
				
			||||||
 | 
					Environment=CLEAR_THRESHOLD="5.0"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### Edits below this comment will be discarded
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### /etc/systemd/system/psi-alerts@.service
 | 
				
			||||||
 | 
					# [Unit]
 | 
				
			||||||
 | 
					# Description=Pressure Stall Information (PSI) alerts
 | 
				
			||||||
 | 
					# PartOf=multi-user.target
 | 
				
			||||||
 | 
					# After=psi-monitor.service
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# [Service]
 | 
				
			||||||
 | 
					# User=%i
 | 
				
			||||||
 | 
					# Type=simple
 | 
				
			||||||
 | 
					# ExecStart=psi-alerts.sh
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# [Install]
 | 
				
			||||||
 | 
					# WantedBy=multi-user.target
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
							
								
								
									
										7
									
								
								INSTALL.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										7
									
								
								INSTALL.md
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,7 @@
 | 
				
			|||||||
 | 
					# INSTALL
 | 
				
			||||||
 | 
					First, clone this repository with the `--recurse-submodules` flag:
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					$ git clone --recurse-submodules https://git.eldon.me/trey/psi-alerts.git
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
							
								
								
									
										93
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										93
									
								
								README.md
									
									
									
									
									
								
							@@ -2,28 +2,90 @@
 | 
				
			|||||||
## PURPOSE
 | 
					## PURPOSE
 | 
				
			||||||
 | 
					
 | 
				
			||||||
This project aims to deliver Pressure Stall Information (PSI) alerts via
 | 
					This project aims to deliver Pressure Stall Information (PSI) alerts via
 | 
				
			||||||
standard Linux graphical desktop (through `libnotify` compatible daemons and
 | 
					standard Linux graphical desktop notifications (through `libnotify` compatible
 | 
				
			||||||
CLI programs), and email (email-to-SMS is also supported).  This can alert the
 | 
					daemons and CLI programs), and email (email-to-SMS is also supported).  This
 | 
				
			||||||
system administrator of CPU, I/O, or Memory (RAM) pressure in near real time.
 | 
					can alert the system administrator of CPU, I/O, or Memory (RAM) pressure in
 | 
				
			||||||
 | 
					near real time.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## PREREQUISITES
 | 
					## PREREQUISITES
 | 
				
			||||||
* A Linux system with kernel 5.2.0 or greater
 | 
					* A Linux system with kernel 5.2.0 or greater, with the /proc filesystem
 | 
				
			||||||
 | 
					  enabled
 | 
				
			||||||
* systemd
 | 
					* systemd
 | 
				
			||||||
* zsh
 | 
					* zsh
 | 
				
			||||||
 | 
					* sysstat (for pidstat)
 | 
				
			||||||
* ssh (OpenSSH, for desktop notifications)
 | 
					* ssh (OpenSSH, for desktop notifications)
 | 
				
			||||||
* psi-by-example (a modified version of this is included in this project as a
 | 
					* psi-by-example (a modified version of this is included in this project as a
 | 
				
			||||||
  submodule)
 | 
					  submodule)
 | 
				
			||||||
 | 
					* a libnotify-compatible desktop notification system
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## History
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					When I first learned about [Pressure Stall
 | 
				
			||||||
 | 
					Information](https://docs.kernel.org/accounting/psi.html) (PSI), I was
 | 
				
			||||||
 | 
					intrigued.  This provides a real-time view into the performance and typical
 | 
				
			||||||
 | 
					resource contention Linux system administrators need to worry about:  CPU, I/O,
 | 
				
			||||||
 | 
					and Memory (RAM).  During this research, I found [this
 | 
				
			||||||
 | 
					post](https://unixism.net/2019/08/linux-pressure-stall-information-psi-by-example/)
 | 
				
			||||||
 | 
					complete with a C code example;  albeit, it was light on I/O details and the
 | 
				
			||||||
 | 
					example C code the author provided didn't even include Memory pressure at all
 | 
				
			||||||
 | 
					(so I modified it to include Memory pressure).  
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					A quick and dirty description of PSI:  whenever one or more processes are
 | 
				
			||||||
 | 
					waiting for some measurable resource (CPU, I/O, or RAM), the percentage of
 | 
				
			||||||
 | 
					processes waiting on the resource will begin to increase.  Initially, the
 | 
				
			||||||
 | 
					percentage will be low, but as resource contention increases, more and more
 | 
				
			||||||
 | 
					processes will be waiting to be processed by the CPU for that resource.  If not
 | 
				
			||||||
 | 
					all processes are waiting on this resource, PSI calls this the "some"
 | 
				
			||||||
 | 
					contention for resources.  If all processes are waiting on the resource, this
 | 
				
			||||||
 | 
					is known as the "full" resource contention.  
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The pressure information is exposed in the _/proc_ filesystem in these three
 | 
				
			||||||
 | 
					virtual files: _/proc/pressure/cpu_, _/proc/pressure/io_,
 | 
				
			||||||
 | 
					_/proc/pressure/memory_.  Each file reports both some and full, and has the
 | 
				
			||||||
 | 
					following output:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## INSTALL
 | 
					 | 
				
			||||||
First, clone this repository with the `--recurse-submodules` flag:
 | 
					 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
$ git clone --recurse-submodules https://git.eldon.me/trey/psi-alerts.git
 | 
					some avg10=0.02 avg60=0.43 avg300=0.55 total=711489361
 | 
				
			||||||
 | 
					full avg10=0.02 avg60=0.43 avg300=0.54 total=681874430
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## CONFIGURE
 | 
					This example is taken from _/proc/pressure/io_, for I/O pressure.   The full
 | 
				
			||||||
Included in this project are a number of systemd units:
 | 
					CPU pressure information really depends on the cgroups, which this project
 | 
				
			||||||
    * psi-monitor.service
 | 
					doesn't pay close attention to at this time.  The percentages are a measure of
 | 
				
			||||||
    * psi-alerts@.service (template service)
 | 
					the average resource pressure over the last 10s, 60s, and 300s (5 minutes).
 | 
				
			||||||
 | 
					The total is the number of microseconds that any processes were waiting for the
 | 
				
			||||||
 | 
					resource;  this is a counter that is reset on boot, and will continuously update
 | 
				
			||||||
 | 
					as processes wait for the resource.  They always have to wait for the resource,
 | 
				
			||||||
 | 
					even if it's on the order of hundreds of microseconds or less.  Even if the
 | 
				
			||||||
 | 
					percentages were all zeroes, the total counter will be nonzero (at least for
 | 
				
			||||||
 | 
					the some metrics), and even the full metrics will have a nonzero total except
 | 
				
			||||||
 | 
					for CPU, because the full CPU total only really applies to cgroups (which are out
 | 
				
			||||||
 | 
					of scope for this project at present).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The monitor code (from psi-by-example listed above) only considers the "some"
 | 
				
			||||||
 | 
					pressure for all three resources, which will usually alert before the system
 | 
				
			||||||
 | 
					becomes critical (and in the case of full Memory usage/thrashing, completely
 | 
				
			||||||
 | 
					unusable for any workload).  Thus the alerts should come in well before the full
 | 
				
			||||||
 | 
					resource pressure gets maxed out.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Now, I don't know C very well, but this _monitor.c_ code was easy enough to
 | 
				
			||||||
 | 
					extend to include memory pressure.  However, the _create_load.c_ only creates
 | 
				
			||||||
 | 
					CPU and I/O load (memory load is too detrimental to system performance).  
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					This was developed on an [SSDNodes VPS](https://ssdnodes.com) (Virtual Private
 | 
				
			||||||
 | 
					Server), which is a KVM virtual machine, backed by SSD hardware.  It is very
 | 
				
			||||||
 | 
					well provisioned with virtual hardware:  8 vCPUs, 32GiB RAM, and 640GiB SSD
 | 
				
			||||||
 | 
					disk space.  Currently, there is very little load on this system, even with
 | 
				
			||||||
 | 
					four different websites on it, with corresponding database engines, and an
 | 
				
			||||||
 | 
					nginx reverse proxy.  I plan on putting
 | 
				
			||||||
 | 
					[mailcow-dockerized](https://docs.mailcow.email/) on this VPS soon, which has
 | 
				
			||||||
 | 
					the potential to increase the load significantly.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Now, once the regular workload of this VPS increases, my current configuration
 | 
				
			||||||
 | 
					may become too noisy.  However, I've tried to configure `psi-alerts.sh` in such
 | 
				
			||||||
 | 
					a way that it only alerts once when the pressure on a resource increases, and
 | 
				
			||||||
 | 
					won't alert again until that pressure subsides (and the some percentages drop
 | 
				
			||||||
 | 
					below the configurable threshold for at least five minutes).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## TODO
 | 
					## TODO
 | 
				
			||||||
* finish INSTALL section
 | 
					* finish INSTALL section
 | 
				
			||||||
@@ -33,3 +95,12 @@ Included in this project are a number of systemd units:
 | 
				
			|||||||
    * about defining an instance and editing it
 | 
					    * about defining an instance and editing it
 | 
				
			||||||
        * `sudo systemctl edit psi-alerts@<user>.service`
 | 
					        * `sudo systemctl edit psi-alerts@<user>.service`
 | 
				
			||||||
        * mainly for `Environment=` variables
 | 
					        * mainly for `Environment=` variables
 | 
				
			||||||
 | 
					    * consider reworking this for a user service, not a system service
 | 
				
			||||||
 | 
					        * this could make desktop notifications simpler, and avoid having to use
 | 
				
			||||||
 | 
					          SSH keys without passphrases
 | 
				
			||||||
 | 
					        * need to become much more familiar with user services
 | 
				
			||||||
 | 
					* consider reworking all code in a compiled language (other than C)
 | 
				
			||||||
 | 
					    * time to learn Go
 | 
				
			||||||
 | 
					    * or continue learning Rust
 | 
				
			||||||
 | 
					    * need to know how to use kernel syscalls in these languages (if possible)
 | 
				
			||||||
 | 
					    * also, convert psi-alerts.sh script to either of these languages
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										55
									
								
								psi-alerts.sh
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							
							
						
						
									
										55
									
								
								psi-alerts.sh
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							@@ -44,6 +44,8 @@ sms_dst="${SMS_DST}"
 | 
				
			|||||||
sms_domain="$(awk -F@ '{print $NF}' <<< ${SMS_DST})"
 | 
					sms_domain="$(awk -F@ '{print $NF}' <<< ${SMS_DST})"
 | 
				
			||||||
ssh_port="${SSH_PORT}"
 | 
					ssh_port="${SSH_PORT}"
 | 
				
			||||||
ssh_host="${SSH_HOST}"
 | 
					ssh_host="${SSH_HOST}"
 | 
				
			||||||
 | 
					ssh_user="${SSH_USER}"
 | 
				
			||||||
 | 
					ssh_id_path="${SSH_ID_PATH}"
 | 
				
			||||||
clear_threshold="${CLEAR_THRESHOLD}"
 | 
					clear_threshold="${CLEAR_THRESHOLD}"
 | 
				
			||||||
notification_cmd="${NOTIFICATION_CMD}"
 | 
					notification_cmd="${NOTIFICATION_CMD}"
 | 
				
			||||||
notification_opts="${NOTIFICATION_OPTS}"
 | 
					notification_opts="${NOTIFICATION_OPTS}"
 | 
				
			||||||
@@ -51,32 +53,31 @@ id_idx="${NOTIFICATION_IDX}"
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
print_psi () {
 | 
					print_psi () {
 | 
				
			||||||
    local psi_file="${1}"
 | 
					    local psi_file="${1}"
 | 
				
			||||||
    cat "${(P)psi_file}"
 | 
					    cat "${(P)$(tr '[[:upper:]]' '[[:lower:]]' <<< "${psi_file}")}"
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
print_pidstat () {
 | 
					print_pidstat () {
 | 
				
			||||||
    local psi_type="${1}"
 | 
					    local psi_type="${1}"
 | 
				
			||||||
    local opts="-l --human"
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    case "${psi_type}" in
 | 
					    case "${psi_type}" in
 | 
				
			||||||
        CPU)
 | 
					        CPU)
 | 
				
			||||||
            opts="-u ${opts}"
 | 
					            pidstat -ul --human
 | 
				
			||||||
            ;;
 | 
					            ;;
 | 
				
			||||||
        IO)
 | 
					        IO)
 | 
				
			||||||
            opts="-d ${opts}"
 | 
					            pidstat -dl --human
 | 
				
			||||||
            ;;
 | 
					            ;;
 | 
				
			||||||
        MEM)
 | 
					        MEM)
 | 
				
			||||||
            opts="-r ${opts}"
 | 
					            pidstat -rl --human
 | 
				
			||||||
            ;;
 | 
					            ;;
 | 
				
			||||||
        *)
 | 
					        *)
 | 
				
			||||||
            print "Invalid psi_type:  ${psi_type}" >&2
 | 
					            print "Invalid psi_type:  ${psi_type}" >&2
 | 
				
			||||||
            ;;
 | 
					            ;;
 | 
				
			||||||
    esac
 | 
					    esac
 | 
				
			||||||
 | 
					 | 
				
			||||||
    pidstat "${opts}"
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
send_notice () {
 | 
					send_notice () {
 | 
				
			||||||
 | 
					    #set -x
 | 
				
			||||||
    local psi_type="${1}"
 | 
					    local psi_type="${1}"
 | 
				
			||||||
    shift
 | 
					    shift
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
@@ -96,7 +97,7 @@ send_notice () {
 | 
				
			|||||||
    esac
 | 
					    esac
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    integer notification_id
 | 
					    integer notification_id
 | 
				
			||||||
    if ! notification_id=$(ssh -q "${ssh_host}" -p ${ssh_port} \
 | 
					    if ! notification_id=$(ssh -q -i "${ssh_id_path}" "${ssh_user}@${ssh_host}" -p ${ssh_port} \
 | 
				
			||||||
        "${notification_cmd} ${notification_opts} '${host}:  PSI ${psi_type} triggered!' '${psi}'"); then
 | 
					        "${notification_cmd} ${notification_opts} '${host}:  PSI ${psi_type} triggered!' '${psi}'"); then
 | 
				
			||||||
            print "Connection to notification daemon failed!" >&2
 | 
					            print "Connection to notification daemon failed!" >&2
 | 
				
			||||||
            false
 | 
					            false
 | 
				
			||||||
@@ -104,6 +105,7 @@ send_notice () {
 | 
				
			|||||||
        echo ${notification_id}
 | 
					        echo ${notification_id}
 | 
				
			||||||
        true
 | 
					        true
 | 
				
			||||||
    fi
 | 
					    fi
 | 
				
			||||||
 | 
					    #set +x
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
send () {
 | 
					send () {
 | 
				
			||||||
@@ -128,29 +130,28 @@ send () {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    local psi="$(print_psi $(tr '[[:upper:]]' '[[:lower:]]' <<< ${psi_type}))"
 | 
					    local psi="$(print_psi $(tr '[[:upper:]]' '[[:lower:]]' <<< ${psi_type}))"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    local subj="PSI on deltachunk ${psi_type} triggered!"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    local body="Pressure Stall Information triggered on ${psi_type} at $(date +'%FT%T %Z')"
 | 
					 | 
				
			||||||
    if [[ -n "${current_alarms}" ]]; then
 | 
					 | 
				
			||||||
        body="${body}\nMultiple alarms triggered:  ${current_alarms}"
 | 
					 | 
				
			||||||
        # if this is not an SMS, include pidstat info
 | 
					 | 
				
			||||||
        if [[ ! "${dst}" =~ "${sms_domain}" ]]; then
 | 
					 | 
				
			||||||
            for p in $(tr '|' ' ' <<< "${current_alarms}"); do
 | 
					 | 
				
			||||||
                body="${body}\n\n$(print_pidstat ${p})"
 | 
					 | 
				
			||||||
            done
 | 
					 | 
				
			||||||
        fi
 | 
					 | 
				
			||||||
    fi
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    local email=$(mktemp /tmp/psi.eml.XXXX)
 | 
					    local email=$(mktemp /tmp/psi.eml.XXXX)
 | 
				
			||||||
 | 
					    local subj
 | 
				
			||||||
 | 
					    printf "Pressure Stall Information triggered on ${psi_type} at $(date +'%FT%T %Z')\n\n" > ${email}
 | 
				
			||||||
 | 
					    if [[ -n "${current_alarms}" ]]; then
 | 
				
			||||||
 | 
					        subj="PSI on deltachunk ${current_alarms} triggered!"
 | 
				
			||||||
 | 
					        printf "Multiple alarms triggered:  ${current_alarms}\n\n" >> ${email}
 | 
				
			||||||
 | 
					    else
 | 
				
			||||||
 | 
					        subj="PSI on deltachunk ${psi_type} triggered!"
 | 
				
			||||||
 | 
					        current_alarms="${psi_type}"
 | 
				
			||||||
 | 
					    fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cat <<-EOF > ${email}
 | 
					    # is this an email or SMS?
 | 
				
			||||||
        ${body}
 | 
					    if [[ ! "${dst}" =~ "${sms_domain}" ]]; then
 | 
				
			||||||
 | 
					        for p in $(tr '|' ' ' <<< "${current_alarms}"); do
 | 
				
			||||||
EOF
 | 
					            printf "\npidstat info for ${p}\n\n" >> ${email}
 | 
				
			||||||
 | 
					            print_pidstat "${p}" >> ${email}
 | 
				
			||||||
 | 
					            printf "\n\n" >> ${email}
 | 
				
			||||||
 | 
					        done
 | 
				
			||||||
 | 
					    fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # send the message
 | 
				
			||||||
    /usr/bin/mail --resource-files=/ \
 | 
					    /usr/bin/mail --resource-files=/ \
 | 
				
			||||||
                  --subject="${subj}" \
 | 
					                  --subject="${subj}" \
 | 
				
			||||||
                  --end-options \
 | 
					                  --end-options \
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user