라떼군 이야기
NHN 호스팅(구 고도몰) 데몬 체크 서비스 오동작 이슈
Problem
Linux 환경에서 Apache로 NHN 호스팅(구 고도몰)[1]을 이용하는 경우 5분에 한 번씩 Apache 서비스가 재시작되는 이슈가 있었다. /usr/local/apache/logs/error_log 파일에 아래와 같은 로그가 남아 있어 서비스가 재시작되고 있다는 것을 인지할 수 있었다.
[Fri Feb 25 11:25:02.644188 2022] [mpm_prefork:notice] [pid 10385] AH00173: SIGHUP received. Attempting to restart
[Fri Feb 25 11:25:07.807513 2022] [mpm_prefork:notice] [pid 10385] AH00163: Apache/2.4.43 (Unix) OpenSSL/1.0.2k-fips configured -- resuming normal operations
[Fri Feb 25 11:25:07.807545 2022] [core:notice] [pid 10385] AH00094: Command line: '/usr/local/apache/bin/httpd'
[Fri Feb 25 11:25:09.295129 2022] [mpm_prefork:notice] [pid 10385] AH00173: SIGHUP received. Attempting to restart
[Fri Feb 25 11:25:14.373213 2022] [mpm_prefork:notice] [pid 10385] AH00163: Apache/2.4.43 (Unix) OpenSSL/1.0.2k-fips configured -- resuming normal operations
[Fri Feb 25 11:25:14.373247 2022] [core:notice] [pid 10385] AH00094: Command line: '/usr/local/apache/bin/httpd'
[Fri Feb 25 11:25:15.766019 2022] [mpm_prefork:notice] [pid 10385] AH00173: SIGHUP received. Attempting to restart
[Fri Feb 25 11:25:20.822169 2022] [mpm_prefork:notice] [pid 10385] AH00163: Apache/2.4.43 (Unix) OpenSSL/1.0.2k-fips configured -- resuming normal operations
[Fri Feb 25 11:25:20.822205 2022] [core:notice] [pid 10385] AH00094: Command line: '/usr/local/apache/bin/httpd'
Solution
원인은 호스팅 관리를 위해 NHN에서 작성한 관리 스크립트가 서버를 과부하 상태로 판단하고 서비스를 재시작하는 것이었다. 해당 스크립트는 crontab에 등록되어 5분에 한 번씩 실행되고 있었다. 서비스 업체에 문의한 결과, 오동작하는 crontab 스크립트를 비활성화하라는 답변을 받았다.
*/5 * * * * root /usr/local/Godo_Manage/srvh_daemon_check.sh
#!/bin/bash
# Daemon check script: the functions defined below restart Apache or kill
# stray processes owned by "nobody" when the thresholds in this section are
# crossed, append what they did to a monthly log file and mail it to ADMIN.
#
# bash is requested explicitly because the functions in this script rely on
# `local` and on `==` inside `[ ]`, neither of which plain POSIX sh guarantees.

# Apache is restarted when the httpd process count is above this value ...
Http_Limit_Number="900"
# ... or below this value.
Http_min_Number="3"
# Apache is restarted when the integer part of the first field of
# /proc/loadavg exceeds this value.
Load_Limit_Number="20"
# Apache is restarted when the 4th column ("Used") of /proc/swaps exceeds
# this value.
Swap_Limit_Number="1000000"
# grep pattern (alternatives separated by \|) of commands that
# Nobody_deamon_check must NOT kill even though they run as user "nobody".
Nobody_exclude="/usr/local/apache/bin/httpd\|/usr/local/apache_php4/bin/httpd\|/usr/local/apache_php5/bin/httpd\|/usr/local/godoweb_ssl/bin/httpd\|\[httpd\]\|nginx\|ftp\|INIsecurepay\|du$\|/usr/sbin/sendmail\|sendmail\|/card/"
# Recipient of the report mails.
ADMIN="hosting-server@godohosting.com"

# Timestamp used as the prefix of every log line; taken once at start-up.
TO_DATE=$(date +%Y%m%d-%H:%M:%S)
# Year+month suffix that selects the monthly log file.
LOG_MONTH=$(date +%Y%m)

# Legacy net-tools ifconfig prints "inet addr:1.2.3.4"; newer versions print
# "inet 1.2.3.4" instead, which this pipeline does not match.
server_ip=$(/sbin/ifconfig -a 2>/dev/null | grep "inet addr:" | grep -v "127.0.0.1" | awk '{ print $2}' | cut -d: -f2)
if [ -z "$server_ip" ]; then
  # Fallback for hosts where the legacy format (or ifconfig itself) is
  # unavailable: use the first address reported by hostname -I.
  server_ip=$(/bin/hostname -I 2>/dev/null | awk '{ print $1 }')
fi
server_host=$(/bin/hostname -f)
Subject="[godo_hosting_report] $server_host ( $server_ip ) daemon check mail"

# -p creates missing parent directories and is a no-op when the directory
# already exists, so no separate existence test is needed.
mkdir -p "/usr/local/Godo_Manage/logs"
Deamon_Check_Log_File="/usr/local/Godo_Manage/logs/daemon_check_${LOG_MONTH}.txt"
# Restart Apache, retrying up to three times until more than 3 httpd
# processes are running again.
# Globals read: TO_DATE, Deamon_Check_Log_File, Subject, ADMIN,
#               Apachectl_Path (optional, default /usr/local/apache/bin/apachectl),
#               Restart_Wait_Seconds (optional, default 5)
# Outputs:      appends progress lines to the log file; mails the log tail
#               on success.
# Returns:      0 when a restart brought httpd back, 1 after three failed
#               attempts.
_Apache_restart_until_healthy(){
  local apachectl="${Apachectl_Path:-/usr/local/apache/bin/apachectl}"
  local settle="${Restart_Wait_Seconds:-5}"
  local attempt=0
  local running

  while [ "$attempt" -lt 3 ]; do
    attempt=$((attempt + 1))
    if "$apachectl" restart; then
      # Give the freshly started server time to fork before counting.
      sleep "$settle"
      running=$(pgrep -x httpd | wc -l)
      running=$((running))   # strip any padding wc may emit
      if [ "$running" -gt 3 ]; then
        echo "[$TO_DATE] Apchec restart success " >> "$Deamon_Check_Log_File"
        tail "$Deamon_Check_Log_File" | mail -s "$Subject" "$ADMIN"
        return 0
      fi
    else
      echo "[$TO_DATE] Apache restart fail" >> "$Deamon_Check_Log_File"
    fi
  done

  # Only reached when none of the attempts succeeded, so a success on the
  # last attempt is no longer followed by a contradictory failure line.
  echo "[$TO_DATE] Apchec restart Fail over 2 " >> "$Deamon_Check_Log_File"
  return 1
}

# Check the number of running httpd processes and restart Apache when it is
# above Http_Limit_Number or below Http_min_Number.
# Globals read:    TO_DATE, Deamon_Check_Log_File, Http_Limit_Number,
#                  Http_min_Number (plus those of the helper above)
# Globals written: num (left global on purpose: other code in this file
#                  reads it)
# Outputs:         appends to the log file.
Apache_deamon_check(){
  echo " " >> "$Deamon_Check_Log_File"
  echo "[$TO_DATE] ########### Apche daemon check start############" >> "$Deamon_Check_Log_File"

  # Count processes named exactly "httpd". Parsing pstree output depended on
  # its column layout and could produce an empty or non-numeric value, which
  # made every numeric test below fail with an error. Unlike the pstree
  # "N*[httpd]" figure, this count includes the Apache parent process.
  num=$(pgrep -x httpd | wc -l)
  num=$((num))
  echo "[$TO_DATE] deamon num $num" >> "$Deamon_Check_Log_File"

  if [ "$num" -gt "$Http_Limit_Number" ] || [ "$num" -lt "$Http_min_Number" ]; then
    echo "[$TO_DATE] apache restart by deamon num $num" >> "$Deamon_Check_Log_File"
    _Apache_restart_until_healthy
  fi

  echo "[$TO_DATE] ########### Apche daemon check end############" >> "$Deamon_Check_Log_File"
}
# Kill every process owned by user "nobody" whose command is not covered by
# the Nobody_exclude pattern, and report each kill by log and mail.
# Globals read:    TO_DATE, Deamon_Check_Log_File, Nobody_exclude, Subject, ADMIN
# Globals written: Nobody_daemon (newline-separated "pid:command" entries)
# Outputs:         prints each candidate entry to stdout; appends to the log
#                  file; mails the log tail after every kill attempt.
Nobody_deamon_check(){
  echo " " >> "$Deamon_Check_Log_File"
  echo "[$TO_DATE] ########### Nobody daemon check start############" >> "$Deamon_Check_Log_File"

  # Match the user column exactly so that accounts whose name merely starts
  # with "nobody" are not affected. Field 8 of `ps -ef` is the first word of
  # the command line.
  Nobody_daemon=$(ps -ef | awk '$1 == "nobody" { print $2 ":" $8 }' | grep -v "${Nobody_exclude}")

  # Read the entries line by line instead of word-splitting an unquoted
  # variable: names such as "[foo]" would otherwise be subject to globbing,
  # and no dummy "echo" word is needed to keep the loop syntactically valid.
  printf '%s\n' "$Nobody_daemon" | while IFS= read -r entry; do
    [ -n "$entry" ] || continue
    echo "${entry}"
    kill_pid=${entry%%:*}
    kill_daemon_name=${entry#*:}
    echo "[$TO_DATE] kill daemon name [$kill_daemon_name] " >> "$Deamon_Check_Log_File"
    if kill -9 "$kill_pid"; then
      echo "[$TO_DATE] nobody daemon kill success " >> "$Deamon_Check_Log_File"
    else
      echo "[$TO_DATE] nobody daemon kill fail" >> "$Deamon_Check_Log_File"
    fi
    # The admin is notified whether or not the kill worked.
    tail "$Deamon_Check_Log_File" | mail -s "$Subject" "$ADMIN"
  done

  # The closing marker previously named the Apache check (copy/paste slip).
  echo "[$TO_DATE] ########### Nobody daemon check end############" >> "$Deamon_Check_Log_File"
}
# Restart Apache when the integer part of the first /proc/loadavg field is
# above Load_Limit_Number, retrying up to three times until more than 3
# httpd processes are running again.
# Globals read:    TO_DATE, Deamon_Check_Log_File, Load_Limit_Number, Subject,
#                  ADMIN,
#                  Loadavg_File (optional, default /proc/loadavg),
#                  Apachectl_Path (optional, default /usr/local/apache/bin/apachectl),
#                  Restart_Wait_Seconds (optional, default 5)
# Globals written: Load_daemon
# Outputs:         prints the load value to stdout; appends to the log file;
#                  mails the log tail after a successful restart.
Load_check(){
  local loadavg_file="${Loadavg_File:-/proc/loadavg}"
  local apachectl="${Apachectl_Path:-/usr/local/apache/bin/apachectl}"
  local settle="${Restart_Wait_Seconds:-5}"
  local attempt=0
  local recovered="false"
  local running

  echo " " >> "$Deamon_Check_Log_File"
  echo "[$TO_DATE] ########### Load check start############" >> "$Deamon_Check_Log_File"

  Load_daemon=$(awk '{ print int($1); exit }' "$loadavg_file")
  echo "$Load_daemon"
  echo "[$TO_DATE] Load= [$Load_daemon]" >> "$Deamon_Check_Log_File"

  case "$Load_daemon" in
    ''|*[!0-9]*)
      # Without a numeric value the comparison below would only raise a
      # shell error, so record the problem and skip the check instead.
      echo "[$TO_DATE] Load check skipped: load average not readable" >> "$Deamon_Check_Log_File"
      ;;
    *)
      if [ "$Load_daemon" -gt "$Load_Limit_Number" ]; then
        echo "[$TO_DATE] apache restart by load num $Load_daemon" >> "$Deamon_Check_Log_File"
        while [ "$attempt" -lt 3 ] && [ "$recovered" = "false" ]; do
          attempt=$((attempt + 1))
          if "$apachectl" restart; then
            # Give the freshly started server time to fork before counting.
            sleep "$settle"
            # Count processes named exactly "httpd"; parsing pstree output
            # could yield an empty value and make the test below error out.
            running=$(pgrep -x httpd | wc -l)
            running=$((running))
            if [ "$running" -gt 3 ]; then
              recovered="true"
              echo "[$TO_DATE] Apchec restart success " >> "$Deamon_Check_Log_File"
              tail "$Deamon_Check_Log_File" | mail -s "$Subject" "$ADMIN"
            fi
          else
            echo "[$TO_DATE] Apache restart fail" >> "$Deamon_Check_Log_File"
          fi
        done
        # Logged only when every attempt failed, so a success on the last
        # attempt is not followed by a contradictory failure line.
        if [ "$recovered" = "false" ]; then
          echo "[$TO_DATE] Apchec restart Fail over 2 " >> "$Deamon_Check_Log_File"
        fi
      fi
      ;;
  esac

  echo "[$TO_DATE] ########### Load check end############" >> "$Deamon_Check_Log_File"
}
# Restart Apache when the swap space in use (sum of the "Used" column of
# /proc/swaps) is above Swap_Limit_Number, retrying up to three times until
# more than 3 httpd processes are running again.
# Globals read:    TO_DATE, Deamon_Check_Log_File, Swap_Limit_Number, Subject,
#                  ADMIN,
#                  Swaps_File (optional, default /proc/swaps),
#                  Apachectl_Path (optional, default /usr/local/apache/bin/apachectl),
#                  Restart_Wait_Seconds (optional, default 5)
# Globals written: Swap_daemon
# Outputs:         prints the swap usage to stdout; appends to the log file;
#                  mails the log tail after a successful restart.
Swap_check(){
  local swaps_file="${Swaps_File:-/proc/swaps}"
  local apachectl="${Apachectl_Path:-/usr/local/apache/bin/apachectl}"
  local settle="${Restart_Wait_Seconds:-5}"
  local attempt=0
  local recovered="false"
  local running

  echo " " >> "$Deamon_Check_Log_File"
  echo "[$TO_DATE] ########### Swap use check start############" >> "$Deamon_Check_Log_File"

  # Skip the header line and add up every swap area. Looking only at the
  # last line returned the header word "Used" on hosts without swap and
  # ignored all but one area on hosts with several.
  Swap_daemon=$(awk 'NR > 1 { used += $4 } END { printf "%.0f\n", used + 0 }' "$swaps_file")
  # This line used to print the load value left over from Load_check.
  echo "$Swap_daemon"
  echo "[$TO_DATE] Swap_use= [$Swap_daemon]" >> "$Deamon_Check_Log_File"

  case "$Swap_daemon" in
    ''|*[!0-9]*)
      # Without a numeric value the comparison below would only raise a
      # shell error, so record the problem and skip the check instead.
      echo "[$TO_DATE] Swap check skipped: swap usage not readable" >> "$Deamon_Check_Log_File"
      ;;
    *)
      if [ "$Swap_daemon" -gt "$Swap_Limit_Number" ]; then
        echo "[$TO_DATE] apache restart by Swap use $Swap_daemon" >> "$Deamon_Check_Log_File"
        while [ "$attempt" -lt 3 ] && [ "$recovered" = "false" ]; do
          attempt=$((attempt + 1))
          if "$apachectl" restart; then
            # Give the freshly started server time to fork before counting.
            sleep "$settle"
            # Judge success by a fresh count taken after the restart; the
            # old test read $num, a value this function never assigned.
            running=$(pgrep -x httpd | wc -l)
            running=$((running))
            if [ "$running" -gt 3 ]; then
              recovered="true"
              echo "[$TO_DATE] Apchec restart success " >> "$Deamon_Check_Log_File"
              tail "$Deamon_Check_Log_File" | mail -s "$Subject" "$ADMIN"
            fi
          else
            echo "[$TO_DATE] Apache restart fail" >> "$Deamon_Check_Log_File"
          fi
        done
        # Logged only when every attempt failed, so a success on the last
        # attempt is not followed by a contradictory failure line.
        if [ "$recovered" = "false" ]; then
          echo "[$TO_DATE] Apchec restart Fail over 2 " >> "$Deamon_Check_Log_File"
        fi
      fi
      ;;
  esac

  echo "[$TO_DATE] ########### Swap use check end############" >> "$Deamon_Check_Log_File"
}
## Start
# Run the health checks one after another, in this fixed order.
# NOTE(review): Swap_check is not invoked in the portion of the script visible
# here - confirm whether that is intentional.
for check_fn in Apache_deamon_check Nobody_deamon_check Load_check; do
  "$check_fn"
done