#!/bin/sh

# Purpose {{{
## If Xymon server says that a service is in error on a remote host, try to
## restart this service.
## 1. Create a ssh keyring for xymon user {{{
# sudo mkdir -p -- /var/lib/xymon/.ssh/
# sudo ssh-keygen -f /var/lib/xymon/.ssh/id_rsa -N '' -q
# sudo chown -R xymon:xymon /var/lib/xymon/.ssh/
## }}}
## 2. Remote user {{{
# Ensure to have the ${REMOTE_SSH_USER} available on remote hosts and allowed
# to connect with SSH.
# Restrict the SSH access to a single SSH key from the Xymon server IP
# (~${REMOTE_SSH_USER}/.ssh/authorized_keys) :
## from="IP.SRV.XYM.ON" ssh-rsa AAAAA…
# Allow sudo commands to restart services (/etc/sudoers.d/xymon-ssh) :
## xymon-ssh ALL=(root:root) NOPASSWD: /bin/systemctl restart *
## }}}
## 3. Xymon Configuration {{{
# PROC monitoring needs to display the real service name in its description :
## PROC %^/sbin/rpcbind MIN=1 MAX=1 COLOR=red "TEXT=rpcbind"
# You can add more information about this proc if you add an underscore "_" :
## PROC %^/usr/sbin/rpc.idmapd MIN=1 MAX=1 COLOR=red "TEXT=NFS-server_rpc.idmapd"
## This way, the script will only take the text before the underscore "_" as the
## service name to be restarted.
# Don't add whitespaces in the description of a process.
## }}}
## 4. Environment {{{
# The script reads BBSVCNAME (probe name), BBHOSTNAME (host in error) and
# BBALPHAMSG (status message) from its environment.
## }}}
# }}}

# Vars {{{
# Debug mode is ENABLED when DEBUG is 0; any other integer disables it and
# makes the script remove its temporary directory on exit.
DEBUG=1

REMOTE_SSH_USER="xymon-ssh"

# Without a private temp directory every log path below would resolve to "/",
# so give up immediately if it can't be created.
temp_dir=$(mktemp -d -t xymon-procs-alert-XXXXXX.tmp) || exit 1
[ -d "${temp_dir}" ] || exit 1

debug_stdout="${temp_dir}/debug.stdout"
debug_stderr="${temp_dir}/debug.stderr"
service_list="${temp_dir}/services.error.list"
# }}}

# Functions {{{
## Print a colored debug line on stdout when debug mode is enabled {{{
## $1 : message (the "DEBUG : " prefix is added here).
## Always returns 0 so it can be used anywhere without affecting control flow.
debug_message() {
  # \033 is used instead of \e : \e is not a POSIX printf escape and is
  # printed literally by some /bin/sh implementations.
  [ "${DEBUG}" -eq "0" ] && printf '\033[1;35m%-6s\033[m\n' "DEBUG : ${1}"
  return 0
}
## }}}
## Succeed only if $1 is a non-empty string of decimal digits {{{
is_integer() {
  case "${1}" in
    '' | *[!0-9]*) return 1 ;;
    *) return 0 ;;
  esac
}
## }}}
## Succeed only if $1 is safe to embed in the remote command line {{{
## The name comes from the status message sent by the monitored host and ends
## up in a string interpreted by the remote shell, so anything outside a
## conservative unit-name character set (or starting with "-") is refused.
is_valid_service_name() {
  case "${1}" in
    '' | -* | *[!a-z0-9@.:-]*) return 1 ;;
    *) return 0 ;;
  esac
}
## }}}
## Restart a systemd service on ${BBHOSTNAME} through SSH {{{
## $1 : service name (without the ".service" suffix).
## $2 : context label used as prefix of the debug line.
## Remote output is appended to the debug stdout/stderr files.
restart_remote_service() {
  debug_message "${2} — ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${REMOTE_SSH_USER}@${BBHOSTNAME} sudo systemctl restart ${1}.service" >> "${debug_stdout}"
  ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
    "${REMOTE_SSH_USER}@${BBHOSTNAME}" "sudo systemctl restart ${1}.service" \
    >> "${debug_stdout}" 2>> "${debug_stderr}"
}
## }}}
## Cleanup executed on every exit path {{{
cleanup() {
  # Remove empty error file
  [ -s "${debug_stderr}" ] || rm -f -- "${debug_stderr}"
  # Remove temp_dir if DEBUG is disabled
  [ "${DEBUG}" -eq "0" ] || rm -rf -- "${temp_dir:?}"
}
trap cleanup EXIT
## }}}
# }}}

# Create log files
touch "${debug_stdout}" "${debug_stderr}"

# Manage only procs probe {{{
if [ "${BBSVCNAME}" = "procs" ]; then
  debug_message "${BBHOSTNAME} — ${BBSVCNAME} error" >> "${debug_stdout}"
else
  debug_message "${BBHOSTNAME} — ${BBSVCNAME} probe is not managed." >> "${debug_stderr}"
  exit 0
fi
# }}}

# Get the list of processes with an error
# (printf rather than echo : the message must not have its backslashes
# interpreted.)
printf '%s\n' "${BBALPHAMSG}" \
  | grep -E "&(red|yellow)" \
  | cut -d" " -f2- \
  | tr '[:upper:]' '[:lower:]' > "${service_list}"

# If any error on a process
if [ -s "${service_list}" ]; then
  debug_message "process list — Some processes seems to be in error." >> "${debug_stdout}"

  while IFS= read -r line; do
    # Start every line from a clean state : a line that matches no known
    # pattern must never reuse the values parsed from a previous line.
    service_name=""
    process_found=""
    process_min=""
    process_max=""

    ## Pattern "req. between" {{{
    if printf '%s\n' "${line}" | grep -q -E -- ".* \\(found .*, req. between .* and .*\\)"; then
      debug_message "while process loop — Pattern \"req. between\"." >> "${debug_stdout}"
      service_name="$(printf '%s\n' "${line}" | cut -d" " -f1 | sed 's/_.*//')"
      process_found="$(printf '%s\n' "${line}" | cut -d" " -f3 | tr -d ',')"
      process_min="$(printf '%s\n' "${line}" | cut -d" " -f6)"
      process_max="$(printf '%s\n' "${line}" | cut -d" " -f8 | tr -d ')')"
    fi
    ## }}}
    ## Pattern "req. .* or more" {{{
    if printf '%s\n' "${line}" | grep -q -E -- ".* \\(found .*, req. .* or more\\)"; then
      debug_message "while process loop — Pattern \"req. .* or more\"." >> "${debug_stdout}"
      service_name="$(printf '%s\n' "${line}" | cut -d" " -f1 | sed 's/_.*//')"
      process_found="$(printf '%s\n' "${line}" | cut -d" " -f3 | tr -d ',')"
      process_min="$(printf '%s\n' "${line}" | cut -d" " -f5)"
      process_max="nolimit"
    fi
    ## }}}

    debug_message "while process loop — Found ${process_found} process(es) for ${service_name} service and require between ${process_min} and ${process_max}." >> "${debug_stdout}"

    # Restart service if needed {{{
    # Only restart when fewer processes than the minimum were found, and only
    # with a sane service name and numeric counters.
    if is_valid_service_name "${service_name}" \
      && is_integer "${process_found}" \
      && is_integer "${process_min}" \
      && [ "${process_found}" -lt "${process_min}" ]; then
      debug_message "while process loop — ${service_name} need to be restarted." >> "${debug_stdout}"
      restart_remote_service "${service_name}" "while process loop"
    else
      debug_message "while process loop — ${service_name} service is not managed." >> "${debug_stdout}"
    fi
    # }}}
  done < "${service_list}"

  # Also restart xymon-client service {{{
  debug_message "process list — xymon-client also need to be restarted." >> "${debug_stdout}"
  restart_remote_service "xymon-client" "process list"
  # }}}
else
  debug_message "process list — No error on any process." >> "${debug_stdout}"
fi

# Empty error file and temp_dir removal are handled by the EXIT trap (cleanup).
exit 0