背景:线上服务器硬盘发生故障,导致服务和数据失效;
目的:确保第一时间发现硬盘状态是否正常;
方案:使用 Nagios 自定义脚本监控硬盘状态;
注意:下面的脚本仅适用于已安装 hpacucli 或 MegaCli 软件的服务器。
#!/bin/bash
# Author: Marc.wang, 2014/06/17
#
# Global settings used by the RAID check further down in this script.

# Append the system tool directories so the admin commands invoked below
# can be found on PATH.
PATH="${PATH}:/usr/sbin/:/sbin/:/usr/bin/"
export PATH

# First address printed by `hostname -I`; it is the host field of every
# result line this script sends.
Get_localhost_Hostname=$(hostname -I | awk '{print $1}')

# Host that send_nsca is pointed at (-H).
Nagios='nagios.org'

# First whitespace-separated word after the colon on the first dmidecode
# line that contains "Vendor".
SERVER_TYPE=$(
  /usr/sbin/dmidecode \
    | awk -F: '/Vendor/ && !seen { seen = 1; split($2, words, " "); print words[1] }'
)
# Pipe whatever arrives on stdin into send_nsca, addressed to the host in
# $Nagios.
# Globals:   Nagios (read) - value passed to send_nsca -H
# Input:     stdin - result lines; ";" is given to send_nsca as the field
#            delimiter (-d)
# Returns:   exit status of send_nsca
Send_nsca_ssl_message() {
  # Quote the host so an empty or space-containing value stays one argument
  # instead of shifting the option list.
  /usr/local/nagios/bin/send_nsca -H "${Nagios}" -d ";" -c /usr/local/nagios/etc/send_nsca.cfg
}
# Report whether any installed rpm package name contains "hpacucli".
# Outputs: grep's exit status on stdout (0 = match found, 1 = no match).
HP_DISK_STATUS_COMMAND() {
  local rc=0
  rpm -qa | grep hpacucli > /dev/null 2>&1 || rc=$?
  printf '%s\n' "${rc}"
}
# Report whether any installed rpm package name contains "MegaCli".
# Outputs: grep's exit status on stdout (0 = match found, 1 = no match).
DELL_IBM_DISK_STATUS_COMMAND() {
  local rc=0
  rpm -qa | grep MegaCli > /dev/null 2>&1 || rc=$?
  printf '%s\n' "${rc}"
}
# Count the processes in `ps ax` whose line mentions hpacucli (the grep
# helpers themselves are filtered out). Any hit is reported as CRITICAL and
# the script stops before running its own checks.
bug_test=$(
  ps ax \
    | grep hpacucli \
    | grep -v grep \
    | wc -l
)
[[ "${bug_test}" == "0" ]] || {
  printf '%s\n' "${Get_localhost_Hostname};check_raid;2; hpacucli command run not data." \
    | Send_nsca_ssl_message
  exit 2
}
# Print the number of "Status:" lines that are not "Status:OK" among each
# "physicaldrive" line and the four lines following it in the hpacucli
# config detail output. Spaces are stripped before matching.
CHECK_RAID_STATUS_HP() {
  /usr/sbin/hpacucli ctrl all show config detail \
    | grep -A 4 physicaldrive \
    | tr -d ' ' \
    | grep 'Status:' \
    | grep -v 'Status:OK' \
    | wc -l
}
# Send one "check_raid" passive result through Send_nsca_ssl_message and
# terminate the script with the same state code.
#   $1 - state code placed in the result line and used as exit status
#        (this script uses 0 for OK and 2 for critical)
#   $2 - text placed after the state field, reproduced verbatim
_report_raid_result() {
  local state=$1 detail=$2
  echo "$Get_localhost_Hostname;check_raid;${state};${detail}" | Send_nsca_ssl_message
  exit "${state}"
}

# Pick the vendor-specific RAID check based on SERVER_TYPE. Every path ends
# by reporting exactly one result and exiting with the reported state.
case "$SERVER_TYPE" in
  HP|hp|Hp|Hewlett-Packard)
    # Prints the exit status of a plain hpacucli run, to tell "tool failed"
    # apart from "tool ran and found no bad drives".
    TEST_HP_COMMAND() {
      hpacucli ctrl all show config detail >> /dev/null 2>&1
      echo $?
    }

    HP_RPM=$(HP_DISK_STATUS_COMMAND)
    # NOTE(review): the reason for the 3-second pauses is not evident from
    # this script; they are kept as in the original.
    sleep 3
    if [[ "${HP_RPM}" != "0" ]]; then
      _report_raid_result 2 " $SERVER_TYPE command hpacucli Not Found"
    fi

    HP_RAID_STATUS_NUMBER=$(CHECK_RAID_STATUS_HP)
    sleep 3
    TEST_HP=$(TEST_HP_COMMAND)

    if [[ "${TEST_HP}" != "0" ]]; then
      # hpacucli itself failed, so the drive count cannot be trusted.
      _report_raid_result 2 "Check_Raid_status: run command hpacucli Error"
    elif [[ "${HP_RAID_STATUS_NUMBER}" != "0" ]]; then
      _report_raid_result 2 "Check_Raid_status:Critical"
    else
      _report_raid_result 0 "Check_Raid_status:OK"
    fi
    ;;

  DELL|Dell|DEll|DeLL|dell|IBM|ibm|Ibm|IBm)
    # Use the binary under /opt/MegaRAID when that file exists; otherwise
    # rely on a MegaCli command being found via PATH.
    if [[ -f "/opt/MegaRAID/MegaCli/MegaCli64" ]]; then
      megacli_bin=/opt/MegaRAID/MegaCli/MegaCli64
    else
      megacli_bin=MegaCli
    fi

    # Prints the sum of all "Media Error Count" and "Other Error Count"
    # values reported for adapter 0; prints an empty line when no such
    # lines are present.
    CHECK_RAID_STATUS_IBM_DELL() {
      "${megacli_bin}" -LdPdInfo -a0 \
        | grep -E "(Media Error Count:|Other Error Count:)" \
        | awk -F: '{sum1 += $2} END {print sum1}'
    }
    # Prints the exit status of a plain MegaCli run.
    TEST_DELL_COMMAND() {
      "${megacli_bin}" -LdPdInfo -a0 >> /dev/null
      echo $?
    }

    # Sums strictly above this value are reported as critical.
    error_sum_limit=2000

    IBM_DELL_RPM=$(DELL_IBM_DISK_STATUS_COMMAND)
    if [[ "${IBM_DELL_RPM}" != "0" ]]; then
      # Previously this case sent nothing at all; report it the same way
      # the HP arm reports a missing hpacucli package.
      _report_raid_result 2 " $SERVER_TYPE command MegaCli Not Found"
    fi

    TEST_OTHER_COMMAND=$(TEST_DELL_COMMAND)
    DELL_IBM_STATUS_NUMBER=$(CHECK_RAID_STATUS_IBM_DELL)

    if [[ -z "${DELL_IBM_STATUS_NUMBER}" ]]; then
      # No error-count lines came back from MegaCli.
      _report_raid_result 2 "Check_Raid_status:MegaCli Command Not Found!"
    elif [[ "${DELL_IBM_STATUS_NUMBER}" -gt "${error_sum_limit}" ]]; then
      _report_raid_result 2 "Check_Raid_status:Critical"
    elif [[ "${TEST_OTHER_COMMAND}" != "0" ]]; then
      # MegaCli exited non-zero; previously this fell through silently.
      _report_raid_result 2 "Check_Raid_status: run command MegaCli Error"
    else
      # Covers every sum up to and including the limit, so a sum equal to
      # the limit no longer slips between the -gt and -lt comparisons.
      _report_raid_result 0 "Check_Raid_status:OK"
    fi
    ;;

  *)
    # Unknown vendor: report critical and exit non-zero like every other
    # critical path in this script.
    _report_raid_result 2 "This machine is not IBM DELL or HP!"
    ;;
esac
免责声明:本站发布的内容(图片、视频和文字)以原创、转载和分享为主,文章观点不代表本网站立场,如果涉及侵权请联系站长邮箱:is@yisu.com进行举报,并提供相关证据,一经查实,将立刻删除涉嫌侵权内容。
网络异常,请检查网络