Cluster monitoring scripts

Source: Internet
Author: User

#!/bin/bash
#########################################################################
# File Name: monitor.sh
# AUTHOR: WUGJ
# Mail: [email protected]
# Created time: Monday, 16 November 2015, 15:14:19
#########################################################################
#
# Monitors system load, memory and CPU usage of this node and raises
# warnings when a limit is exceeded.
#
# This section gathers the per-host values that the functions further down
# read as globals: hostname, ip, cpu_num, stat_path, stat_file, err_log and
# load_warn.

# Host name with any literal ".local" removed (the dot is escaped so that
# it no longer matches an arbitrary character).
hostname=$(hostname | sed 's/\.local//g')

# IPv4 address of eth0. Accepts both the old net-tools layout
# ("inet addr:10.0.0.1 ...") and the newer one ("inet 10.0.0.1 ...").
ip=$(ifconfig eth0 | awk '/inet / { sub(/^addr:/, "", $2); print $2; exit }')
echo "IP: $ip"

# Number of CPUs, counted from the "model name" lines of /proc/cpuinfo.
cpu_num=$(grep -c 'model name' /proc/cpuinfo)
echo "Cpu_num: $cpu_num"

# Directory that holds the node status reports and the kill log.
stat_path=/share/nas1/wugj/script/shell/log
echo "persent static path: $stat_path"

cur_time=$(date +%y%m%d)

# Per-host, per-day node status file.
stat_file="${hostname}${cur_time}.xls"

# Per-month log of killed processes.
err_log="$(date +%y%m).log"

# Start a new monthly log with its column header.
if [[ ! -f "$stat_path/$err_log" ]]; then
  touch "$stat_path/$err_log"
  echo "User PID host date command" > "$stat_path/$err_log"
fi

echo "$stat_file"

# Warning threshold for the per-core load average.
load_warn=0.70

#######################################
# Compares the 15-minute load average per CPU core with the alert limits
# and prints a warning when one of them is exceeded.
#
# A per-core load of 1.0 or more produces the urgent message; otherwise the
# value is compared numerically with $load_warn.
#
# Globals:   hostname, cpu_num, load_warn (read)
# Arguments: $1 - 15-minute load average (optional; default: last field
#                 of `uptime`)
#            $2 - number of CPU cores (optional; default: $cpu_num)
# Outputs:   the intermediate values and, if a limit is exceeded, one
#            warning line on stdout
#######################################
Watc_cpu_test() {
  local load_15 cores average_load average_int
  local warn_limit=${load_warn:-0.70}

  # 15-minute system load.
  load_15=${1:-$(LC_ALL=C uptime | awk '{ print $NF }')}
  echo "load_15: $load_15"

  # Guard against an empty or zero core count (would divide by zero).
  cores=${2:-${cpu_num:-1}}
  [[ $cores =~ ^[1-9][0-9]*$ ]] || cores=1

  # 15-minute load per core, always printed with a leading integer digit.
  average_load=$(awk -v l="$load_15" -v n="$cores" \
    'BEGIN { printf "%.3f", l / n }')
  echo "$average_load"

  average_int=${average_load%%.*}
  echo "Average_int: $average_int"

  if (( average_int > 0 )); then
    # Per-core load is at least 1.0: warn immediately.
    echo "$hostname 15-minute system average load of $average_load, exceeding the alert value of 1.0, please deal with it now!!!"
  elif awk -v a="$average_load" -v w="$warn_limit" \
    'BEGIN { exit !(a + 0 > w + 0) }'; then
    # Below 1.0 but above the configured warning limit. The comparison is
    # done in awk because the shell cannot compare decimals numerically.
    echo "$hostname 15 minutes of the average system load of $average_load, exceeding the alert value of $warn_limit, please timely processing."
  fi
}

#######################################
# Runs a command in the background and sends it SIGHUP if it has not
# finished within the time limit.
#
# Note: this function shadows the coreutils `timeout` program inside this
# script.
#
# Globals:   TIMEOUT_WAITSEC (read, optional) - limit in seconds, default 5
# Arguments: the command to run and its arguments
# Returns:   the exit status of the command (above 128 when it was killed
#            by the watchdog)
#######################################
function timeout() {
  local waitsec=${TIMEOUT_WAITSEC:-5}
  local pid watchdog status

  "$@" &
  pid=$!

  # Watchdog: sleep, then hang up the command. On SIGTERM it kills its own
  # sleep and exits, so no stray process outlives this call. Its output is
  # discarded so it never holds the caller's stdout or stderr open.
  (
    napper=
    trap '[[ -n $napper ]] && kill "$napper" 2>/dev/null; exit 0' TERM
    sleep "$waitsec" &
    napper=$!
    wait "$napper" && kill -HUP "$pid"
  ) > /dev/null 2>&1 &
  watchdog=$!

  wait "$pid" 2>/dev/null
  status=$?

  # The command is done (or was killed): retire the watchdog either way.
  kill -TERM "$watchdog" 2>/dev/null
  wait "$watchdog" 2>/dev/null

  return "$status"
}

# Memory usage limit in percent.
mem_quota=90

#######################################
# Computes the memory usage (total minus free, buffers and page cache) as
# a percentage and kills the biggest process when it exceeds the quota.
#
# Globals:   mem_quota (read), mem_usage and mem_message (written)
# Arguments: $1 - meminfo file to read (optional; default: /proc/meminfo)
# Returns:   0 when usage is within the quota,
#            1 when it is above the quota (Kill_proc has been called),
#            2 when MemTotal could not be read
#######################################
function Watch_mem() {
  local meminfo=${1:-/proc/meminfo}
  local total free buffers cached

  # One pass over the file; exact key match keeps "SwapCached" out of
  # "Cached".
  read -r total free buffers cached < <(
    awk '
      $1 == "MemTotal:" { total = $2 }
      $1 == "MemFree:"  { free = $2 }
      $1 == "Buffers:"  { buffers = $2 }
      $1 == "Cached:"   { cached = $2 }
      END { printf "%d %d %d %d\n", total, free, buffers, cached }
    ' "$meminfo"
  )

  if ! [[ $total =~ ^[1-9][0-9]*$ ]]; then
    echo "Watch_mem: cannot read MemTotal from $meminfo" >&2
    return 2
  fi

  # Single division so the three terms are not truncated separately.
  mem_usage=$(( (total - free - buffers - cached) * 100 / total ))

  if (( mem_usage > ${mem_quota:-90} )); then
    mem_message="warn! The Memory usage is over than $mem_usage%"
    Kill_proc
    return 1
  fi

  mem_message=""
  return 0
}

#######################################
# Prints two counters taken from the first line of the stat file: the sum
# of fields 2-4 ("used") and the sum of fields 5-8 ("unused").
#
# Arguments: $1 - stat file to read (optional; default: /proc/stat)
# Outputs:   "<used> <unused>" on stdout
#######################################
function Get_cpu_info() {
  local source_file=${1:-/proc/stat}

  # %.0f keeps large counters from being printed in exponent form or
  # clipped by awk implementations with 32-bit integer output.
  awk 'NR == 1 {
         printf "%.0f %.0f\n", $2 + $3 + $4, $5 + $6 + $7 + $8
         exit
       }' "$source_file"
}

#######################################
# Appends the five `ps aux` lines with the highest value in column 3 to
# the node status file.
#
# Globals: stat_path, stat_file (read)
#######################################
function Proc_cpu_ps5() {
  ps aux \
    | LC_ALL=C sort -k3,3nr \
    | head -n 5 >> "$stat_path/$stat_file"
}

#######################################
# Appends the `top` column header and the five process rows with the
# highest value in column 9 to the node status file, then mails that file.
#
# The recipient addresses were redacted in the copy of the script this was
# restored from, so they are taken from the environment instead.
#
# Globals:   stat_path, stat_file (read); usr (written: user of the top
#            row); ALERT_MAIL_TO (read, default "root");
#            ALERT_MAIL_CC (read, optional)
#######################################
function Proc_cpu_top5() {
  local report="$stat_path/$stat_file"
  local mail_to=${ALERT_MAIL_TO:-root}
  local mail_cc=${ALERT_MAIL_CC:-}
  local snapshot busiest
  local -a cc_opt=()

  # Take a single snapshot so header, rows and usr all describe the same
  # moment.
  snapshot=$(LC_ALL=C top -n 1 -b)

  # Process rows are the lines whose first field is a numeric PID; this
  # drops every summary line regardless of how it starts.
  busiest=$(awk '$1 ~ /^[0-9]+$/' <<< "$snapshot" \
    | LC_ALL=C sort -k9,9nr \
    | head -n 5)

  {
    awk '$1 == "PID" { print; exit }' <<< "$snapshot"
    [[ -n $busiest ]] && printf '%s\n' "$busiest"
  } >> "$report"

  usr=$(awk 'NR == 1 { print $2 }' <<< "$busiest")

  [[ -n $mail_cc ]] && cc_opt=(-c "$mail_cc")
  mail -s "CPU Load High" "${cc_opt[@]}" "$mail_to" < "$report"
}

#######################################
# Kills (SIGKILL) the process with the largest resident memory according
# to one `top` snapshot, records the kill in the monthly log and mails the
# owner of the process.
#
# Only rows whose first field is a numeric PID are considered, so the
# summary lines of `top` can never be mistaken for a process. Processes of
# the users listed in KILL_PROC_SKIP_USERS are never killed; the default
# "root" matches the root exclusion of the older, commented-out variants
# of this function. Set the variable to an empty string to disable it.
#
# Globals:   hostname, stat_path, err_log (read);
#            KILL_PROC_SKIP_USERS (read, default "root": comma or space
#            separated user names); ALERT_MAIL_TO (read, optional extra
#            recipient; the original address was redacted in this copy of
#            the script)
# Outputs:   "Killed process <pid>" on stdout
# Returns:   0 when a process was killed or none qualified,
#            1 when the kill failed
#######################################
function Kill_proc() {
  local skip_users=${KILL_PROC_SKIP_USERS-root}
  local mail_domain=biomarker.com.cn
  local admin=${ALERT_MAIL_TO:-}
  local user res pid com when

  # Pick the row with the largest RES (column 6), normalised to KiB.
  # Output fields: user, RES in KiB, pid, first word of the command.
  IFS=$'\t' read -r user res pid com < <(
    LC_ALL=C top -n 1 -b -c | awk -v skip="$skip_users" '
      BEGIN {
        n = split(skip, names, /[ ,]+/)
        for (i = 1; i <= n; i++) if (names[i] != "") exempt[names[i]] = 1
      }
      $1 ~ /^[0-9]+$/ && !($2 in exempt) {
        kib = $6 + 0
        if ($6 ~ /[mM]$/) kib *= 1024
        else if ($6 ~ /[gG]$/) kib *= 1024 * 1024
        else if ($6 ~ /[tT]$/) kib *= 1024 * 1024 * 1024
        if (!found || kib > top_kib) {
          found = 1
          top_kib = kib
          best = sprintf("%s\t%.0f\t%s\t%s", $2, kib, $1, ($12 == "" ? "-" : $12))
        }
      }
      END { if (found) print best }
    '
  )

  # Nothing qualified (empty snapshot or only exempt users).
  [[ -n $pid ]] || return 0

  # SIGKILL is kept from the original: this is the last-resort reaction to
  # memory exhaustion, where a process may not get to handle SIGTERM.
  if ! kill -9 "$pid"; then
    echo "Kill_proc: could not kill process $pid of $user" >&2
    return 1
  fi
  echo "Killed process $pid"

  when=$(date)

  # Same columns as the log header: user, pid, host, date, command.
  printf '%s\t%s\t%s\t%s\t%s\n' "$user" "$pid" "$hostname" "$when" "$com" \
    >> "$stat_path/$err_log"

  mail -s "killed PID warn" ${admin:+"$admin"} "${user}@${mail_domain}" \
    <<< "Dear $user, your process $pid has been killed on $hostname at $when"

  return 0
}

# CPU usage limit in percent.
cpu_quota=80

#######################################
# Measures CPU usage between two Get_cpu_info samples, appends it to the
# node status file and, when it is above the quota, records a warning and
# reports the busiest processes.
#
# Globals:   cpu_quota, stat_path, stat_file (read);
#            cpu_usage and cpu_message (written)
# Arguments: $1 - seconds between the two samples (optional; default: 10)
#######################################
function Watch_cpu() {
  local interval=${1:-10}
  local report="$stat_path/$stat_file"
  local time_point_1 time_point_2

  time_point_1=$(Get_cpu_info)
  sleep "$interval"
  time_point_2=$(Get_cpu_info)

  # Fields: used1 unused1 used2 unused2. A zero or negative total (no
  # ticks between the samples) is reported as 0 instead of dividing by it.
  cpu_usage=$(awk '{
      used = $3 - $1
      total = $3 + $4 - $1 - $2
      if (total > 0) print used * 100 / total; else print 0
    }' <<< "$time_point_1 $time_point_2")

  echo "cpu_usage: $cpu_usage" >> "$report"

  # Compare as numbers: inside [[ ]] the > operator compares strings, which
  # ranks "100" below "80" and "9" above it.
  if awk -v u="$cpu_usage" -v q="${cpu_quota:-80}" \
    'BEGIN { exit !(u + 0 > q + 0) }'; then
    cpu_message="warn! The CPU Usage is over than ${cpu_quota:-80}%"
    echo "cpu_message: $cpu_message" >> "$report"
    #timeout Proc_cpu_ps5
    timeout Proc_cpu_top5
    #Kill_proc
  fi
}

#######################################
# (Re)writes the node status file with the host summary: IP, host name,
# CPU count, memory usage and the current memory warning.
#
# The file is overwritten, not appended to.
#
# Globals: stat_path, stat_file, ip, hostname, cpu_num, mem_usage,
#          mem_message (read)
# Outputs: "Make new file successful" on stdout when the file did not
#          exist yet
#######################################
function Update_file() {
  local report="$stat_path/$stat_file"

  if [[ ! -f $report ]]; then
    touch "$report"
    echo "Make new file successful"
  fi

  # printf with %s prints the values verbatim; splicing them into awk
  # source text breaks as soon as a value contains a quote or backslash.
  {
    printf 'IP: %s\n' "$ip"
    printf 'Host: %s\n' "$hostname"
    printf 'Cpu_num: %s\n' "$cpu_num"
    printf 'Mem_usage: %s%%\n' "$mem_usage"
    printf 'Mem_message: %s\n' "$mem_message"
  } > "$report"
}

# Main sequence. The order matters: each step uses state left by the
# previous one.

# 1. Memory check. It computes the values Update_file prints and calls
#    Kill_proc itself when usage is above the quota.
Watch_mem

# 2. Overwrite the node status file with the host summary.
Update_file

# 3. CPU check. It appends its result to the status file written in step 2.
Watch_cpu

# NOTE(review): Kill_proc runs here unconditionally, whatever the memory
# and CPU checks found, so every run ends by killing a process. This looks
# like a leftover test call; confirm it is intended.
Kill_proc


This article is from the "Life Need Wife" blog; please keep this source link when reposting: http://drsin.blog.51cto.com/10182098/1717192

Cluster monitoring scripts

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.