#!/bin/bash
#########################################################################
# File Name: monitor.sh
# AUTHOR: WUGJ
# Mail: [email protected]
# Created time: Monday, 16 November 2015, 15:14:19
#
# Monitors system load, memory and CPU usage of a cluster node and emits
# warning messages when a threshold is exceeded.
#
# Globals defined by this setup section (read by the functions below):
#   hostname, ip, cpu_num, stat_path, cur_time, stat_file, err_log,
#   load_warn
#########################################################################

# Host name with a trailing ".local" suffix removed. Only a literal suffix
# is stripped; an unescaped sed pattern such as 's/.local//g' would also
# eat any other character followed by "local" anywhere in the name.
hostname=$(hostname)
hostname=${hostname%.local}

# IPv4 address of eth0. Handles both the legacy net-tools output
# ("inet addr:1.2.3.4") and the newer layout ("inet 1.2.3.4").
ip=$(ifconfig eth0 2>/dev/null \
  | awk '/inet / { sub(/^addr:/, "", $2); print $2; exit }')
echo "IP: $ip"

# Number of logical CPUs.
cpu_num=$(grep -c 'model name' /proc/cpuinfo)
echo "Cpu_num: $cpu_num"

# Directory that holds the per-node status reports and the kill log.
stat_path=/share/nas1/wugj/script/shell/log
echo "persent static path: $stat_path"
cur_time=$(date +%y%m%d)

# Per-node, per-day status report file.
stat_file="${hostname}${cur_time}.xls"

# Monthly log of killed processes; write the column header on first use.
err_log="$(date +%y%m).log"
if [[ ! -f "${stat_path}/${err_log}" ]]; then
  echo "user pid host date command" > "${stat_path}/${err_log}"
fi
echo "$stat_file"

# Warning threshold for the per-core 15-minute load average.
load_warn=0.70
#######################################
# Compare the 15-minute load average per CPU core against two thresholds
# and print an alert line when one of them is exceeded.
# Globals:   cpu_num (read)   - number of CPU cores, must be a positive integer
#            load_warn (read) - warning threshold for the per-core load
#            hostname (read)  - node name used in the alert text
# Arguments: $1 - optional 15-minute load average; when omitted the last
#                 field of `uptime` is used
# Outputs:   progress values and, if applicable, one alert line on stdout;
#            diagnostics on stderr
# Returns:   0 after a completed check, 1 when the inputs are not usable
#######################################
Watc_cpu_test() {
  local load_15=${1:-}
  local average_load average_int

  if [[ -z "$load_15" ]]; then
    # LC_ALL=C keeps the decimal separator a dot regardless of locale.
    load_15=$(LC_ALL=C uptime | awk '{print $NF}')
  fi
  echo "load_15: $load_15"

  if ! [[ "$load_15" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    echo "Watc_cpu_test: load average '$load_15' is not numeric" >&2
    return 1
  fi
  if ! [[ "${cpu_num:-}" =~ ^[1-9][0-9]*$ ]]; then
    echo "Watc_cpu_test: cpu_num '${cpu_num:-}' is not a positive integer" >&2
    return 1
  fi

  # Load per core, three decimals, always with a leading integer digit.
  average_load=$(awk -v load="$load_15" -v cores="$cpu_num" \
    'BEGIN { printf "%.3f", load / cores }')
  echo "$average_load"
  average_int=${average_load%%.*}
  echo "Average_int: $average_int"

  if (( average_int > 0 )); then
    # A per-core load of 1.0 or more is always critical.
    echo "$hostname 15-minute system average load of $average_load, exceeding the alert value of 1.0, please deal with it now!!!"
  elif awk -v avg="$average_load" -v warn="$load_warn" \
    'BEGIN { exit !(avg + 0 > warn + 0) }'; then
    # Numeric comparison; `expr a \> b` would compare decimals as strings.
    echo "$hostname 15 minutes of the average system load of $average_load, exceeding the alert value of ${load_warn}, please timely processing."
  fi
}
#######################################
# Run a command (or shell function) in the background and send it SIGHUP
# if it is still running after a time limit.
# NOTE: this function shadows the coreutils `timeout` program inside this
# script.
# Globals:   TIMEOUT_WAITSEC (read, optional) - limit in seconds, default 5
# Arguments: $@ - command and its arguments, passed through unchanged
# Returns:   exit status of the command (128+signal when it was killed)
#######################################
timeout() {
  local waitsec=${TIMEOUT_WAITSEC:-5}
  local pid watchdog status=0

  # "$@" keeps every argument intact; $* would re-split on whitespace.
  "$@" &
  pid=$!

  # Watchdog output is detached so a caller capturing stdout is never
  # kept waiting on the timer.
  ( sleep "$waitsec" && kill -HUP "$pid" ) >/dev/null 2>&1 &
  watchdog=$!

  wait "$pid" 2>/dev/null || status=$?

  # Always cancel the timer, also when the command failed: a timer left
  # running would later signal whatever process has reused the PID.
  pkill -HUP -P "$watchdog" 2>/dev/null || kill -HUP "$watchdog" 2>/dev/null
  wait "$watchdog" 2>/dev/null

  return "$status"
}
# Memory usage (percent) above which a process is killed.
mem_quota=90

#######################################
# Compute the memory usage percentage (total minus free, buffers and page
# cache) and react when it exceeds the quota.
# Globals:   mem_quota (read)      - usage limit in percent
#            mem_usage (written)   - computed usage in percent
#            mem_message (written) - warning text, set only when over quota
# Arguments: $1 - optional path of a meminfo-format file,
#                 default /proc/meminfo
# Outputs:   diagnostics on stderr
# Returns:   0 when usage is within the quota,
#            1 when it is over the quota (Kill_proc has been called),
#            2 when the memory figures could not be read
#######################################
Watch_mem() {
  local meminfo=${1:-/proc/meminfo}
  local memtotal memfree buffers cached

  # One pass over the file; exact key match so that e.g. "SwapCached"
  # is never mistaken for "Cached".
  read -r memtotal memfree buffers cached < <(
    awk '
      $1 == "MemTotal:" { total = $2 }
      $1 == "MemFree:"  { free = $2 }
      $1 == "Buffers:"  { buf = $2 }
      $1 == "Cached:"   { cache = $2 }
      END { printf "%.0f %.0f %.0f %.0f\n", total, free, buf, cache }
    ' "$meminfo" 2>/dev/null
  )

  if ! [[ "$memtotal" =~ ^[1-9][0-9]*$ ]]; then
    echo "Watch_mem: cannot read MemTotal from $meminfo" >&2
    return 2
  fi

  mem_usage=$(( 100 - memfree * 100 / memtotal \
    - buffers * 100 / memtotal - cached * 100 / memtotal ))

  if (( mem_usage > mem_quota )); then
    mem_message="warn! The Memory usage is over than ${mem_usage}%"
    Kill_proc
    return 1
  fi
  return 0
}
#######################################
# Print the aggregate busy and idle CPU tick counters.
# Reads the first line of a /proc/stat-format file and sums
# fields 2-4 as "used" and fields 5-8 as "unused".
# Arguments: $1 - optional path of the stat file, default /proc/stat
# Outputs:   "<used> <unused>" on stdout
# Returns:   exit status of awk
#######################################
Get_cpu_info() {
  local stat_source=${1:-/proc/stat}

  # %.0f keeps large counters in plain integer form; some awk
  # implementations print big numbers in exponent notation with `print`.
  awk 'NR == 1 {
    used = $2 + $3 + $4
    unused = $5 + $6 + $7 + $8
    printf "%.0f %.0f\n", used, unused
    exit
  }' "$stat_source"
}
#######################################
# Append the five processes with the highest %CPU (third column of
# `ps aux`) to the status report.
# Globals:   stat_path, stat_file (read) - location of the report
# Outputs:   appends up to five lines to ${stat_path}/${stat_file}
#######################################
Proc_cpu_ps5() {
  # Drop the header row so it can never be counted among the five;
  # LC_ALL=C makes sort parse "12.5" the same way in every locale.
  ps aux \
    | awk 'NR > 1' \
    | LC_ALL=C sort -k3,3nr \
    | head -n 5 >> "${stat_path}/${stat_file}"
}
#######################################
# Append the `top` column header and the five most CPU-hungry processes
# to the status report, then mail the report.
# The mail goes to the owner of the busiest process with the administrator
# in copy. The literal addresses are not recoverable from the published
# script, so they are taken from the environment.
# Globals:   stat_path, stat_file (read)      - location of the report
#            MONITOR_ADMIN_MAIL (read, opt.)  - admin address, default "root"
#            MONITOR_MAIL_DOMAIN (read, opt.) - domain appended to the user
#                                               name, default biomarker.com.cn
# Outputs:   appends to the report; sends one mail
# Returns:   exit status of `mail`, or of `top` when it fails
#######################################
Proc_cpu_top5() {
  local admin=${MONITOR_ADMIN_MAIL:-root}
  local domain=${MONITOR_MAIL_DOMAIN:-biomarker.com.cn}
  local report="${stat_path}/${stat_file}"
  local snapshot procs usr

  # Take a single snapshot so header, ranking and recipient all describe
  # the same moment.
  snapshot=$(top -n 1 -b) || return

  # Column header line of the process table.
  printf '%s\n' "$snapshot" \
    | awk '/PID/ && /COMMAND/ { print; exit }' >> "$report"

  # Process rows start with a numeric PID; field 9 is %CPU.
  procs=$(printf '%s\n' "$snapshot" \
    | awk '$1 ~ /^[0-9]+$/' \
    | LC_ALL=C sort -k9,9nr \
    | head -n 5)
  printf '%s\n' "$procs" >> "$report"

  # Owner (field 2) of the busiest process.
  usr=$(printf '%s\n' "$procs" | awk 'NR == 1 { print $2 }')

  if [[ -n "$usr" ]]; then
    mail -s "CPU Load High" -c "$admin" "${usr}@${domain}" < "$report"
  else
    mail -s "CPU Load High" "$admin" < "$report"
  fi
}
#######################################
# Kill the non-root process with the largest resident memory (RES column
# of `top`), record it in the kill log and notify its owner by mail.
# Processes owned by root are never selected, matching the earlier
# variants of this function which filtered root out; killing system
# daemons could take the node down.
# Globals:   stat_path, err_log (read)        - location of the kill log
#            hostname (read)                  - node name for log and mail
#            MONITOR_ADMIN_MAIL (read, opt.)  - admin address, default "root"
#            MONITOR_MAIL_DOMAIN (read, opt.) - domain appended to the user
#                                               name, default biomarker.com.cn
# Outputs:   "killed process <pid>" on stdout; one line
#            "user pid host date command" appended to the kill log;
#            one mail to the process owner with the admin in copy
# Returns:   0 when a process was killed or none qualified,
#            1 when the kill failed, or the status of `top` when it fails
#######################################
Kill_proc() {
  local admin=${MONITOR_ADMIN_MAIL:-root}
  local domain=${MONITOR_MAIL_DOMAIN:-biomarker.com.cn}
  local host=${hostname:-$(hostname)}
  local snapshot victim user pid command now

  snapshot=$(top -n 1 -b -c) || return

  # Rows of the process table start with a numeric PID. RES (field 6) is
  # in KiB unless it carries an m/g/t suffix; normalise to KiB and keep
  # the largest. Fields 12..NF are the command line (top -c).
  victim=$(printf '%s\n' "$snapshot" | awk '
    $1 ~ /^[0-9]+$/ && $2 != "root" {
      kib = $6 + 0
      unit = tolower(substr($6, length($6)))
      if (unit == "m") kib *= 1024
      else if (unit == "g") kib *= 1024 * 1024
      else if (unit == "t") kib *= 1024 * 1024 * 1024
      if (kib > max) {
        max = kib
        cmd = $12
        for (i = 13; i <= NF; i++) cmd = cmd " " $i
        best = $2 "\t" $1 "\t" cmd
      }
    }
    END { if (best != "") print best }')

  # Nothing to kill (no non-root process with resident memory).
  [[ -n "$victim" ]] || return 0
  IFS=$'\t' read -r user pid command <<< "$victim"

  if ! kill -9 "$pid"; then
    echo "Kill_proc: failed to kill process $pid of $user" >&2
    return 1
  fi
  now=$(date)
  echo "killed process $pid"
  echo "$user $pid $host $now $command" >> "${stat_path}/${err_log}"
  echo "Dear $user, you process $pid have been killed of '$host' at $now" \
    | mail -s "killed pid warn" -c "$admin" "${user}@${domain}"
}
# CPU usage (percent) above which the top consumers are reported.
cpu_quota=80

#######################################
# Sample the CPU tick counters twice, compute the usage percentage over
# the interval, record it in the status report and, when it exceeds the
# quota, report the top CPU consumers.
# Globals:   cpu_quota (read)            - usage limit in percent
#            stat_path, stat_file (read) - location of the report
#            cpu_message (written)       - warning text, set when over quota
# Arguments: $1 - optional sampling interval in seconds, default 10
# Outputs:   appends "cpu_usage: <pct>" (and the warning) to the report
# Returns:   0 normally; non-zero when a counter sample could not be taken
#######################################
Watch_cpu() {
  local interval=${1:-10}
  local report="${stat_path}/${stat_file}"
  local sample_1 sample_2 cpu_usage

  sample_1=$(Get_cpu_info) || return
  sleep "$interval"
  sample_2=$(Get_cpu_info) || return

  # Each sample is "<used> <unused>"; usage = delta used / delta total.
  cpu_usage=$(printf '%s %s\n' "$sample_1" "$sample_2" | awk '{
    used = $3 - $1
    total = $3 + $4 - $1 - $2
    if (total <= 0) print 0
    else print used * 100 / total
  }')
  echo "cpu_usage: $cpu_usage" >> "$report"

  # Numeric comparison in awk: inside [[ ]] the > operator compares
  # strings, so "9" would rank above "80" and "100" below it.
  if awk -v usage="$cpu_usage" -v quota="$cpu_quota" \
    'BEGIN { exit !(usage + 0 > quota + 0) }'; then
    cpu_message="warn! The CPU Usage is over than ${cpu_quota}%"
    echo "cpu_message: $cpu_message" >> "$report"
    # Bounded by the script's own timeout wrapper so a hung `top` or
    # `mail` cannot block the monitor.
    timeout Proc_cpu_top5
  fi
}
#######################################
# (Re)write the status report with the node's static facts and the latest
# memory figures. Any previous content of the report is replaced.
# Globals:   stat_path, stat_file (read) - location of the report
#            ip, hostname, cpu_num, mem_usage, mem_message (read)
# Outputs:   a note on stdout when the report did not exist before;
#            five "Label: value" lines written to the report
# Returns:   non-zero when the report cannot be written
#######################################
Update_file() {
  local report="${stat_path}/${stat_file}"

  if [[ ! -f "$report" ]]; then
    touch "$report" && echo "Make new file successful"
  fi

  # printf with %s writes the values verbatim; splicing them into an awk
  # program breaks as soon as a value contains a quote or a backslash.
  {
    printf 'IP: %s\n' "$ip"
    printf 'Host: %s\n' "$hostname"
    printf 'Cpu_num: %s\n' "$cpu_num"
    printf 'Mem_usage: %s%%\n' "$mem_usage"
    printf 'Mem_message: %s\n' "$mem_message"
  } > "$report"
}
# Entry point: check memory first (Watch_mem sets mem_usage/mem_message and
# itself calls Kill_proc when the quota is exceeded), then rewrite the
# status report, then sample CPU usage and append it to the report.
#
# Kill_proc is deliberately NOT called unconditionally here: doing so
# would kill the largest process on every run, whether or not the memory
# quota has been exceeded.
Watch_mem
Update_file
Watch_cpu
# This article is from the "Life Need Wife" blog; please keep this source attribution: http://drsin.blog.51cto.com/10182098/1717192
# Cluster monitoring scripts