shell:实现linux服务器资源监控并发送告警邮件详解程序员

1、安装方式

wget http://10.8.225.126/wsmonitor/install.sh;sh install.sh user@example.com

2、install.sh

#!/bin/sh
# install.sh - set up sendmail and the wsmonitor resource monitor.
#
# Usage:  sh install.sh MAIL_ADDRESS[,MAIL_ADDRESS...]
#   MAIL_ADDRESS  recipient(s) of the alert mails; separate several addresses
#                 with ASCII commas.
#
# Steps:
#   1. prepare the system for sendmail, install it if missing and restart it;
#   2. download the monitor script, its init script and the ReadMe;
#   3. write the recipient list into the monitor script and restart the
#      wsmonitor service.
#
# Exit status: 0 on success, 1 on a usage error or when a step fails.

if [ $# -ne 1 ]; then
  echo "########################################################"
  echo "# 用法:sh install.sh 邮箱地址;                        #"
  echo "# 如:sh install.sh user@example.com                   #"
  echo "# 若邮箱地址多个用英文逗号分隔                         #"
  echo "########################################################"
  exit 1
fi

mail_to=$1
base_url=http://10.8.225.126/wsmonitor
install_dir=/usr/local/wsmonitor

# --- sendmail ---------------------------------------------------------------
# Clear the immutable flag on the account databases (presumably so that the
# package installation can add its accounts); the flag is not restored here.
/usr/bin/chattr -i /etc/shadow /etc/passwd /etc/gshadow /etc/group /etc/group-
# NOTE(review): mode 777 is kept from the original script; a tighter mode is
# probably enough - confirm against the sendmail packaging.
mkdir -p /var/spool/clientmqueue
chmod 777 /var/spool/clientmqueue

sendmail_ready=0
for attempt in 1 2 3; do
  echo "$attempt"
  if ! rpm -q --quiet sendmail; then
    # Treat every non-zero status as a failure, not only status 1.
    if ! yum install -y sendmail; then
      echo "######ERROR!sendmail install fail!######"
      echo "######Please try [yum install -y sendmail]######"
      exit 1
    fi
  fi
  if rpm -q --quiet sendmail; then
    if ! service sendmail restart; then
      echo "######ERROR!sendmail start fail!######"
      echo "######Please try [service sendmail restart]######"
      exit 1
    fi
    chkconfig --add sendmail
    sendmail_ready=1
    break
  fi
done
if [ "$sendmail_ready" -ne 1 ]; then
  # The package never showed up as installed: alerts could not be mailed.
  echo "######ERROR!sendmail install fail!######"
  echo "######Please try [yum install -y sendmail]######"
  exit 1
fi

# --- wsmonitor --------------------------------------------------------------
# bak/ is created but not populated by this installer; an existing
# wsmonitor.sh is overwritten in place by the download below.
mkdir -p "$install_dir" "$install_dir/bak"

# Fetch the monitor script and its init script, retrying up to three times.
downloaded=0
for attempt in 1 2 3; do
  echo "$attempt"
  if wget "$base_url/wsmonitor.sh" -O "$install_dir/wsmonitor.sh" &&
    wget "$base_url/wsmonitor" -O /etc/init.d/wsmonitor; then
    downloaded=1
    break
  fi
done
if [ "$downloaded" -ne 1 ]; then
  echo "######ERROR!wsmonitor download fail!######"
  exit 1
fi
# The ReadMe is documentation only, so a failed download is not fatal.
if ! wget "$base_url/ReadMe_wsmonitor.txt" -O "$install_dir/ReadMe_wsmonitor.txt"; then
  echo "######WARNING!ReadMe_wsmonitor.txt download fail!######"
fi

# Put the recipient list into the monitor script.  The characters that are
# special in a sed replacement (\, & and the | delimiter) are escaped first so
# the address list is inserted literally.
mail_escaped=$(printf '%s\n' "$mail_to" | sed 's/[\\&|]/\\&/g')
if ! sed -i "s|^MAILTO=.*|MAILTO=${mail_escaped}|" "$install_dir/wsmonitor.sh"; then
  echo "######ERROR!set mail address fail!######"
  exit 1
fi

# Start the service.
chmod +x /etc/init.d/wsmonitor
if ! service wsmonitor restart; then
  echo "######ERROR!wsmonitor start fail!######"
  echo "######Please try [service wsmonitor restart]######"
  exit 1
fi
echo "#################################################################"
echo "#####Install wsmonitor success! #####"
echo "#################################################################"
echo "#####If you want to change mailaddress,edit [/usr/local/wsmonitor/wsmonitor.sh]! #####"
echo "#####Please use [service wsmonitor start/stop/restart] #####"
echo "使用说明见:/usr/local/wsmonitor/ReadMe_wsmonitor.txt"
exit 0

 

 

3、wsmonitor.sh

#!/bin/bash
# wsmonitor.sh - resource monitor: samples memory, disk, TCP connection,
# network traffic and CPU figures in a loop and mails an alert when one of
# them exceeds the threshold configured below.

# Set the system clock from an NTP pool server once at start-up.
ntpdate cn.pool.ntp.org

# Alert recipients; separate several addresses with ASCII commas, e.g.
#MAILTO=cc@wangsu.com,someone@example.com
#MAILTO=cc@wangsu.com
# The addresses below are placeholders - replace them with real mailboxes.
MAILTO=alerts@example.com,ops@example.com
# 1. Sampling interval in seconds.
TIME_INTERVAL=5
# 2. Memory usage threshold in percent.
# NOTE(review): 1% looks like a test value - confirm the intended threshold.
MEM_RATE=1
# 3. Traffic monitoring.
# Network interface to watch.
ETH_NAME=eth0
# Inbound / outbound traffic thresholds, in MB.
FLOWIN_MB=10240
FLOWOUT_MB=10240
# 4. Maximum number of TCP connections in each state.
TIME_WAIT=1000
ESTABLISHED=1000
# 5. CPU usage threshold in percent; an alert is sent above this value.
CPU_RATE=80
# 6. Disk usage threshold in percent.
DISK_RATE=80
############################################################
######## Nothing below this line needs to be edited ########
############################################################
# Disabled: check whether the keep-alive job is registered in crontab.
#username=`users|awk '{print $1}'`
#crontab_path=/var/spool/cron/$username
#script_path=`pwd`
#ifexist=`crontab -l|grep wsmonitor_keepalive.sh|wc -l`
#ifexist_mail=`crontab -l|grep wsmonitor_keepalive.sh|grep $MAILTO|wc -l`
#if [ $ifexist -eq 0 ];then
#        echo "1 * * * * sh $script_path/wsmonitor_keepalive.sh $MAILTO  > /dev/null 2>&1" >>$crontab_path
#elif [[ $ifexist -eq 1  &&  $ifexist_mail -eq 0 ]];then
#        sed -i '/wsmonitor_keepalive.sh/d' $crontab_path
#        echo "1 * * * * sh $script_path/wsmonitor_keepalive.sh $MAILTO  > /dev/null 2>&1" >>$crontab_path
#fi
while [ 1 ] 
do 
# Collect this host's IPv4 addresses (loopback and IPv6 lines excluded).
# ifconfig prints either "inet addr:1.2.3.4" or "inet 1.2.3.4"; sub() strips
# the optional "addr:" prefix.  Several addresses are joined with commas so
# the value stays on one line - it is embedded in the mail subject header.
SERVER_IP=$(ifconfig -a |
  awk '/inet/ && !/inet6/ && !/127\.0\.0\.1/ {
         sub(/^addr:/, "", $2)
         ips = ips sep $2
         sep = ","
       }
       END { print ips }')
# Building blocks of the alert mail.
MAILSUBJECT="WARNING_[$SERVER_IP]"
# Sender address; HOSTNAME is set by bash itself.
MAILFROM="wsmonitor@${HOSTNAME:-localhost}"
#######################################
# Mail one threshold alert through sendmail.
# Globals:   MAILFROM, MAILTO, MAILSUBJECT, SERVER_IP, time (all read)
# Arguments: $1 - name of the monitored item (e.g. CPU)
#            $2 - value that was just measured
#            $3 - configured threshold
# Returns:   exit status of sendmail
#######################################
do_sendmail() {
# The empty line after Content-Type ends the header section; the terminator
# line must consist of the word EOF alone (no trailing blanks).
sendmail -t <<EOF
from:wsmonitor <$MAILFROM>
to:$MAILTO
subject:[$1]_$MAILSUBJECT
Content-Type: text/html;charset=utf-8

<html>
<body>
$time  $1当前监控记录值为:$2,大于设置的阈值:$3,详情登陆系统查看[$SERVER_IP]。当天同一资源超过阈值只告警一次。<br />
备注:请勿回复此邮件!
</body></html>
EOF
}
# ---------------------------------------------------------------------------
# One monitoring cycle.  Every resource is sampled and one line per item is
# appended to the daily log in the form
#   <date> <time> <ITEM> <value> <threshold> <tag>
# where tag is 1 when value > threshold and 0 otherwise.
# ---------------------------------------------------------------------------

# Print 1 when $1 is numerically greater than $2, otherwise 0.
threshold_tag() {
  awk -v now="$1" -v max="$2" \
    'BEGIN { tag = 0; if (now + 0 > max + 0) tag = 1; print tag }'
}

# Print "<rx_bytes> <tx_bytes>" for interface $1 from /proc/net/dev.
# The interface name is compared exactly; the counters are the 1st and 9th
# numbers that follow the "name:" prefix.
read_flow() {
  awk -v dev="$1" '
    {
      line = $0
      sub(/^[ \t]+/, "", line)
      colon = index(line, ":")
      if (colon > 0 && substr(line, 1, colon - 1) == dev) {
        split(substr(line, colon + 1), counter, " ")
        print counter[1], counter[9]
        exit
      }
    }' /proc/net/dev
}

# Print "<idle> <total>" from the aggregate "cpu" line of /proc/stat, where
# total is the sum of the first seven counters on that line.
cpu_sample() {
  local user nice system idle iowait irq softirq
  read -r _ user nice system idle iowait irq softirq _ < <(grep '^cpu ' /proc/stat)
  echo "${idle:-0} $((${user:-0} + ${nice:-0} + ${system:-0} + ${idle:-0} + ${iowait:-0} + ${irq:-0} + ${softirq:-0}))"
}

mkdir -p /usr/local/wsmonitor/log
date=$(date +"%Y%m%d")
time=$(date +"%Y-%m-%d %H:%M:%S")
LOG_PATH=/usr/local/wsmonitor/log/monitor_$date.log

# First samples for the two rate-based figures (CPU and traffic); the second
# samples are taken after the sleep below.
read -r idle_1 total_1 <<<"$(cpu_sample)"
read -r flowin_old flowout_old <<<"$(read_flow "$ETH_NAME")"

# Sampling period.
sleep "$TIME_INTERVAL"

# ----------------------------- memory --------------------------------------
# Buffers and cache do not count as used from an application's point of view,
# so the "-/+ buffers/cache" row is preferred.  free(1) from newer procps-ng
# no longer prints that row; the "used" column of the Mem row is taken then.
mem_report=$(free -m)
memuse=$(awk '/buffers\/cache/ { print $3 }' <<<"$mem_report")
if [[ -z "$memuse" ]]; then
  memuse=$(awk '/^Mem/ { print $3 }' <<<"$mem_report")
fi
memtotal=$(awk '/^Mem/ { print $2 }' <<<"$mem_report")
# Integer percentage, truncated; 0 when the figures are missing.
if [[ "$memuse" =~ ^[0-9]+$ && "$memtotal" =~ ^[0-9]+$ ]] && ((memtotal > 0)); then
  MEM_NOW=$((memuse * 100 / memtotal))
else
  MEM_NOW=0
fi
MEM_TAG=$(threshold_tag "$MEM_NOW" "$MEM_RATE")
echo "$time MEM $MEM_NOW $MEM_RATE $MEM_TAG" >>"$LOG_PATH"

# ------------------------------ disk ---------------------------------------
# Highest usage percentage among the local file systems.  -P keeps every file
# system on a single line so the percentage is always the 5th column.
disk_log=/tmp/disk_tmp.log
df -lhP | grep -v Filesystem | awk '{ print $5 }' >"$disk_log"
DISK_NOW=$(awk 'BEGIN { max = 0 } { pct = $1 + 0; if (pct > max) max = pct } END { print max }' "$disk_log")
DISK_TAG=$(threshold_tag "$DISK_NOW" "$DISK_RATE")
echo "$time DISK $DISK_NOW $DISK_RATE $DISK_TAG" >>"$LOG_PATH"

# ------------------------- TCP connections ---------------------------------
# Only the ESTABLISHED and TIME_WAIT states are evaluated.
tcpconn_log=/tmp/tcpconn.log
netstat -n | awk '/^tcp/ {++S[$NF]} END {for(a in S) print a, S[a]}' >"$tcpconn_log"
# A state without connections is absent from the tally; record it as 0.
grep -q ESTABLISHED "$tcpconn_log" || echo "ESTABLISHED 0 " >>"$tcpconn_log"
ESTABLISHED_NOW=$(awk '$1 == "ESTABLISHED" { print $2; exit }' "$tcpconn_log")
ESTABLISHED_TAG=$(threshold_tag "$ESTABLISHED_NOW" "$ESTABLISHED")
echo "$time ESTABLISHED $ESTABLISHED_NOW $ESTABLISHED $ESTABLISHED_TAG" >>"$LOG_PATH"
grep -q TIME_WAIT "$tcpconn_log" || echo "TIME_WAIT 0 " >>"$tcpconn_log"
TIME_WAIT_NOW=$(awk '$1 == "TIME_WAIT" { print $2; exit }' "$tcpconn_log")
TIME_WAIT_TAG=$(threshold_tag "$TIME_WAIT_NOW" "$TIME_WAIT")
echo "$time TIME_WAIT $TIME_WAIT_NOW $TIME_WAIT $TIME_WAIT_TAG" >>"$LOG_PATH"

# ----------------------------- traffic -------------------------------------
read -r flowin flowout <<<"$(read_flow "$ETH_NAME")"
# Guard the divisor; a missing or non-positive interval counts as 1 second.
interval=${TIME_INTERVAL:-1}
[[ "$interval" =~ ^[1-9][0-9]*$ ]] || interval=1
# Bytes per second over the sampling period (0 if the interface is missing).
FLOWIN=$(((${flowin:-0} - ${flowin_old:-0}) / interval))
FLOWOUT=$(((${flowout:-0} - ${flowout_old:-0}) / interval))
#echo "Recv rate: $((${FLOWIN}/1024)) KB/s   Sent rate:  $((${FLOWOUT}/1024)) KB/s "
# Rates in MB.
FLOWIN_MB_NOW=$((FLOWIN / 1024 / 1024))
FLOWOUT_MB_NOW=$((FLOWOUT / 1024 / 1024))
FLOWIN_MB_TAG=$(threshold_tag "$FLOWIN_MB_NOW" "$FLOWIN_MB")
FLOWOUT_MB_TAG=$(threshold_tag "$FLOWOUT_MB_NOW" "$FLOWOUT_MB")
echo "$time FLOWIN_MB $FLOWIN_MB_NOW $FLOWIN_MB $FLOWIN_MB_TAG" >>"$LOG_PATH"
echo "$time FLOWOUT_MB $FLOWOUT_MB_NOW $FLOWOUT_MB $FLOWOUT_MB_TAG" >>"$LOG_PATH"
# Rates in KB, compared against the MB thresholds converted to KB.
FLOWIN_KB_NOW=$((FLOWIN / 1024))
FLOWOUT_KB_NOW=$((FLOWOUT / 1024))
flowin_kb_max=$((FLOWIN_MB * 1024))
flowout_kb_max=$((FLOWOUT_MB * 1024))
FLOWIN_KB_TAG=$(threshold_tag "$FLOWIN_KB_NOW" "$flowin_kb_max")
FLOWOUT_KB_TAG=$(threshold_tag "$FLOWOUT_KB_NOW" "$flowout_kb_max")
echo "$time FLOWIN_KB $FLOWIN_KB_NOW $flowin_kb_max $FLOWIN_KB_TAG" >>"$LOG_PATH"
echo "$time FLOWOUT_KB $FLOWOUT_KB_NOW $flowout_kb_max $FLOWOUT_KB_TAG" >>"$LOG_PATH"

# ------------------------------- CPU ---------------------------------------
# Usage = share of non-idle time between the two /proc/stat samples,
# as a truncated integer percentage; 0 when no time elapsed.
read -r idle_2 total_2 <<<"$(cpu_sample)"
idle_delta=$((${idle_2:-0} - ${idle_1:-0}))
total_delta=$((${total_2:-0} - ${total_1:-0}))
if ((total_delta > 0)); then
  CPU_NOW=$(((total_delta - idle_delta) * 100 / total_delta))
else
  CPU_NOW=0
fi
CPU_TAG=$(threshold_tag "$CPU_NOW" "$CPU_RATE")
echo "$time CPU $CPU_NOW $CPU_RATE $CPU_TAG" >>"$LOG_PATH"
#######################################
# Send an alert for one item the first time it exceeds its threshold today.
# A log line counts as an overrun when its 3rd field is the item name and its
# 6th field (the tag) is 1.  The mail goes out only when the day's log holds
# exactly one overrun for the item and that line is among the newest $2
# lines; further overruns on the same day stay silent.
# Globals:   LOG_PATH (read)
# Arguments: $1 - item name as written to the log
#            $2 - number of trailing log lines to treat as the newest ones
#            $3 - current value, passed on to the mail
#            $4 - threshold, passed on to the mail
#######################################
check_ifoverload() {
  local item=$1 recent=$2 now=$3 limit=$4
  local all_count latest_count
  local count_overruns='$3 == item && $6 == 1 { n++ } END { print n + 0 }'
  all_count=$(awk -v item="$item" "$count_overruns" "$LOG_PATH")
  latest_count=$(tail -n "$recent" "$LOG_PATH" | awk -v item="$item" "$count_overruns")
  if [[ $all_count -eq 1 && $latest_count -eq 1 ]]; then
    do_sendmail "$item" "$now" "$limit"
  fi
}
# Number of trailing log lines check_ifoverload inspects.  It equals the
# number of lines written per cycle (MEM, DISK, ESTABLISHED, TIME_WAIT,
# FLOWIN_MB, FLOWOUT_MB, FLOWIN_KB, FLOWOUT_KB, CPU).  The two *_KB items are
# logged but not checked here.
check_item=9
# Arguments are quoted so that an empty reading cannot shift the value and
# threshold positions.
check_ifoverload MEM "$check_item" "${MEM_NOW}%" "${MEM_RATE}%"
check_ifoverload CPU "$check_item" "${CPU_NOW}%" "${CPU_RATE}%"
check_ifoverload DISK "$check_item" "${DISK_NOW}%" "${DISK_RATE}%"
check_ifoverload ESTABLISHED "$check_item" "$ESTABLISHED_NOW" "$ESTABLISHED"
check_ifoverload TIME_WAIT "$check_item" "$TIME_WAIT_NOW" "$TIME_WAIT"
check_ifoverload FLOWIN_MB "$check_item" "$FLOWIN_MB_NOW" "$FLOWIN_MB"
check_ifoverload FLOWOUT_MB "$check_item" "$FLOWOUT_MB_NOW" "$FLOWOUT_MB"
done

 

原创文章,作者:Maggie-Hunter,如若转载,请注明出处:https://blog.ytso.com/2064.html

(0)
上一篇 2021年7月15日
下一篇 2021年7月15日

相关推荐

发表回复

登录后才能评论