前言:
公司目前还是传统方式的微服务集群,使用的是SpringCloud架构。最近晚上经常出现服务注册失败,consul上的服务会下线,导致整个微服务应用无法使用,出现宕机状态。这个时候到集群节点上通过 ps -ef|grep xxx.jar 发现服务进程还在,继续查看相应日志后找到了原因:微服务会每隔一段时间自检mysql、mq的连通性,如果出现一次超时(公司使用aws,难免会出现超时),服务就认为自己处于异常状态,从而自动退出。鉴于这种情况,博主自己设计了一套服务治理方案,可以在服务出现异常退出(异常是指由于网络延时、连接超时等)时,优雅地重启服务,减少人工干预。
环境介绍:
主机名 | 角色 | 操作系统 |
opsServer | 跳板机+ansible+python | Ubuntu 18.04 |
cloudServer1p | 微服务+consul | Ubuntu 18.04 |
cloudServer2p | 微服务+consul | Ubuntu 18.04 |
cloudServer3p | 微服务+consul | Ubuntu 18.04 |
操作步骤:
1. 在 opsServer上操作:
安装ansible:
apt-get update #更新软件源中的所有软件列表
apt-get install ansible #安装ansible
配置ansible:
vim /etc/ansible/ansible.cfg
[defaults]
# Inventory file used when no -i option is given on the command line.
inventory = /etc/ansible/hosts
# Maximum number of hosts Ansible works on in parallel.
forks = 5
# SSH port used for hosts that do not specify their own.
remote_port = 22
# Do not verify remote SSH host keys (avoids the interactive prompt for new hosts).
host_key_checking = False
# Connection timeout, in seconds.
timeout = 10
# File that Ansible writes its run log to.
log_path = /var/log/ansible.log
# The remaining sections are declared without any settings.
[inventory]
[privilege_escalation]
[paramiko_connection]
[ssh_connection]
[persistent_connection]
[accelerate]
[selinux]
[colors]
[diff]
vim /etc/ansible/hosts
[webservers]
# The private key must be given as ansible_ssh_private_key_file; an unknown
# variable such as ansible_ssh_key is ignored by the SSH connection plugin,
# which then falls back to the invoking user's default key.
cloudServer1p ansible_ssh_user=cloud ansible_ssh_private_key_file=/home/cloud/.ssh/id_rsa
cloudServer2p ansible_ssh_user=cloud ansible_ssh_private_key_file=/home/cloud/.ssh/id_rsa
cloudServer3p ansible_ssh_user=cloud ansible_ssh_private_key_file=/home/cloud/.ssh/id_rsa
解释:
确保opsServer与cloudServer1p ,cloudServer2p ,cloudServer3p的ssh 打通,这里是用密钥进行认证
配置crontab任务:
crontab -l
# Every 5 minutes: fetch the services registered in each node's consul agent
# and hand the JSON to the matching check script as ONE argument. The command
# substitution is quoted so the shell neither splits the JSON on whitespace
# nor glob-expands it.
*/5 * * * * /usr/bin/python /home/cloud/ops/manageCloudService1p.py "$(curl -s http://cloudServer1p:8530/v1/agent/services)" >/home/cloud/ops/cloudServer1p.log
*/5 * * * * /usr/bin/python /home/cloud/ops/manageCloudService2p.py "$(curl -s http://cloudServer2p:8530/v1/agent/services)" >/home/cloud/ops/cloudServer2p.log
*/5 * * * * /usr/bin/python /home/cloud/ops/manageCloudService3p.py "$(curl -s http://cloudServer3p:8530/v1/agent/services)" >/home/cloud/ops/cloudServer3p.log
注释:
curl -s http://cloudServer1p:8530/v1/agent/services 通过调用consul接口获取consul注册的微服务然后作为参数传递给脚本去处理。
manageCloudService1p.py内容:
# Usage: python manageCloudService1p.py '<json text returned by consul /v1/agent/services>'
import os
import sys
import json
import smtplib
from email.mime.text import MIMEText
# Number of command-line arguments, including the script name itself.
length = len(sys.argv)
def sendMail(name):
    """Send an alert e-mail reporting that service ``name`` failed its check.

    Delivery is best-effort: SMTP and network errors are printed and
    swallowed so the caller can still go on to restart the service.

    Args:
        name: Identifier of the failed service; it is placed at the start of
            the message body.
    """
    # NOTE(review): the credentials are hard-coded in plain text; consider
    # reading them from the environment or a protected config file.
    mail_host = 'mail.163.com'
    mail_user = 'alert@163.com'
    mail_pass = 'test123'
    sender = 'alert@163.com'
    receivers = ['alert@163.com']

    message = MIMEText(name + ' server check failure! i will start this server.', 'plain', 'utf-8')
    message['Subject'] = 'cloud server check'
    message['From'] = sender
    message['To'] = receivers[0]

    # A timeout keeps an unreachable mail server from blocking the cron job.
    smtpObj = smtplib.SMTP(timeout=30)
    try:
        smtpObj.connect(mail_host, 25)
        smtpObj.login(mail_user, mail_pass)
        smtpObj.sendmail(sender, receivers, message.as_string())
        smtpObj.quit()
        print('success')
    except (smtplib.SMTPException, IOError, OSError) as e:
        # IOError/OSError cover socket-level failures (DNS, refused, timeout)
        # that are not SMTPException on every Python version.
        print('error', e)
    finally:
        # close() is safe whether or not the connection was ever opened.
        smtpObj.close()
if length > 1:
try:
jsonstr = sys.argv[1]
jsonObj = json.loads(jsonstr)
serverName=["gateway-service-9999","order-service-8020","portal-service-8080","product-service-8010"]
realServerName=[]
for key in jsonObj.keys():
realServerName.append(key)
for name in serverName:
if name not in realServerName:
print(name+"is down")
if name == 'gateway-service-9999':
sendMail(name)
os.system("ansible-playbook /etc/ansible/restartService.yml -e serverList=cloudServer1p -e serverName=restartGatewayService")
elif name == 'order-service-8020':
sendMail(name)
os.system("ansible-playbook /etc/ansible/restartService.yml -e serverList=cloudServer1p -e serverName=restartOrderService")
elif name == 'portal-service-8080':
sendMail(name)
os.system("ansible-playbook /etc/ansible/restartService.yml -e serverList=cloudServer1p -e serverName=restartPortalService")
elif name == 'product-service-8010':
sendMail(name)
os.system("ansible-playbook /etc/ansible/restartService.yml -e serverList=cloudServer1p -e serverName=restartProductService")
else:
print(name+"not exist!")
else:
print(name+"is up")
except Exception:
print("json parse error.")
else :
print("argv's length is 1, no json text input.")
manageCloudService2p.py内容:
# Usage: python manageCloudService2p.py '<json text returned by consul /v1/agent/services>'
import os
import sys
import json
import smtplib
from email.mime.text import MIMEText
# Number of command-line arguments, including the script name itself.
length = len(sys.argv)
def sendMail(name):
    """Send an alert e-mail reporting that service ``name`` failed its check.

    Delivery is best-effort: SMTP and network errors are printed and
    swallowed so the caller can still go on to restart the service.

    Args:
        name: Identifier of the failed service; it is placed at the start of
            the message body.
    """
    # NOTE(review): the credentials are hard-coded in plain text; consider
    # reading them from the environment or a protected config file.
    mail_host = 'mail.163.com'
    mail_user = 'alert@163.com'
    mail_pass = 'test123'
    sender = 'alert@163.com'
    receivers = ['alert@163.com']

    message = MIMEText(name + ' server check failure! i will start this server.', 'plain', 'utf-8')
    message['Subject'] = 'cloud server check'
    message['From'] = sender
    message['To'] = receivers[0]

    # A timeout keeps an unreachable mail server from blocking the cron job.
    smtpObj = smtplib.SMTP(timeout=30)
    try:
        smtpObj.connect(mail_host, 25)
        smtpObj.login(mail_user, mail_pass)
        smtpObj.sendmail(sender, receivers, message.as_string())
        smtpObj.quit()
        print('success')
    except (smtplib.SMTPException, IOError, OSError) as e:
        # IOError/OSError cover socket-level failures (DNS, refused, timeout)
        # that are not SMTPException on every Python version.
        print('error', e)
    finally:
        # close() is safe whether or not the connection was ever opened.
        smtpObj.close()
if length > 1:
try:
jsonstr = sys.argv[1]
jsonObj = json.loads(jsonstr)
serverName=["gateway-service-9999","order-service-8020","portal-service-8080","product-service-8010"]
realServerName=[]
for key in jsonObj.keys():
realServerName.append(key)
for name in serverName:
if name not in realServerName:
print(name+"is down")
if name == 'gateway-service-9999':
sendMail(name)
os.system("ansible-playbook /etc/ansible/restartService.yml -e serverList=cloudServer2p -e serverName=restartGatewayService")
elif name == 'order-service-8020':
sendMail(name)
os.system("ansible-playbook /etc/ansible/restartService.yml -e serverList=cloudServer2p -e serverName=restartOrderService")
elif name == 'portal-service-8080':
sendMail(name)
os.system("ansible-playbook /etc/ansible/restartService.yml -e serverList=cloudServer2p -e serverName=restartPortalService")
elif name == 'product-service-8010':
sendMail(name)
os.system("ansible-playbook /etc/ansible/restartService.yml -e serverList=cloudServer2p -e serverName=restartProductService")
else:
print(name+"not exist!")
else:
print(name+"is up")
except Exception:
print("json parse error.")
else :
print("argv's length is 1, no json text input.")
manageCloudService3p.py内容:
# Usage: python manageCloudService3p.py '<json text returned by consul /v1/agent/services>'
import os
import sys
import json
import smtplib
from email.mime.text import MIMEText
# Number of command-line arguments, including the script name itself.
length = len(sys.argv)
def sendMail(name):
    """Send an alert e-mail reporting that service ``name`` failed its check.

    Delivery is best-effort: SMTP and network errors are printed and
    swallowed so the caller can still go on to restart the service.

    Args:
        name: Identifier of the failed service; it is placed at the start of
            the message body.
    """
    # NOTE(review): the credentials are hard-coded in plain text; consider
    # reading them from the environment or a protected config file.
    mail_host = 'mail.163.com'
    mail_user = 'alert@163.com'
    mail_pass = 'test123'
    sender = 'alert@163.com'
    receivers = ['alert@163.com']

    message = MIMEText(name + ' server check failure! i will start this server.', 'plain', 'utf-8')
    message['Subject'] = 'cloud server check'
    message['From'] = sender
    message['To'] = receivers[0]

    # A timeout keeps an unreachable mail server from blocking the cron job.
    smtpObj = smtplib.SMTP(timeout=30)
    try:
        smtpObj.connect(mail_host, 25)
        smtpObj.login(mail_user, mail_pass)
        smtpObj.sendmail(sender, receivers, message.as_string())
        smtpObj.quit()
        print('success')
    except (smtplib.SMTPException, IOError, OSError) as e:
        # IOError/OSError cover socket-level failures (DNS, refused, timeout)
        # that are not SMTPException on every Python version.
        print('error', e)
    finally:
        # close() is safe whether or not the connection was ever opened.
        smtpObj.close()
if length > 1:
try:
jsonstr = sys.argv[1]
jsonObj = json.loads(jsonstr)
serverName=["gateway-service-9999","order-service-8020","portal-service-8080","product-service-8010"]
realServerName=[]
for key in jsonObj.keys():
realServerName.append(key)
for name in serverName:
if name not in realServerName:
print(name+"is down")
if name == 'gateway-service-9999':
sendMail(name)
os.system("ansible-playbook /etc/ansible/restartService.yml -e serverList=cloudServer3p -e serverName=restartGatewayService")
elif name == 'order-service-8020':
sendMail(name)
os.system("ansible-playbook /etc/ansible/restartService.yml -e serverList=cloudServer3p -e serverName=restartOrderService")
elif name == 'portal-service-8080':
sendMail(name)
os.system("ansible-playbook /etc/ansible/restartService.yml -e serverList=cloudServer3p -e serverName=restartPortalService")
elif name == 'product-service-8010':
sendMail(name)
os.system("ansible-playbook /etc/ansible/restartService.yml -e serverList=cloudServer3p -e serverName=restartProductService")
else:
print(name+"not exist!")
else:
print(name+"is up")
except Exception:
print("json parse error.")
else :
print("argv's length is 1, no json text input.")
restartService.yml内容如下:
---
# Runs /home/cloud/startService.sh on the selected host(s) and prints its output.
# Extra vars (passed with -e):
#   serverList - host pattern the play runs against
#   serverName - argument handed to startService.sh
- hosts: '{{ serverList }}'
  tasks:
    - name: restart server
      command: "/bin/bash /home/cloud/startService.sh {{serverName}}"
      register: result

    - name: show debug info
      debug:
        var: result.stdout
        verbosity: 0
2. 在微服务节点上操作:
将 startService.sh 这个脚本分别拷贝到三个微服务节点的 /home/cloud/ 目录下面:
startService.sh 内容如下:
#!/bin/bash
# Restart one service whose jar and log live in /home/cloud/services.
#   $1 - service name: basename of the jar and log file, and the pattern used
#        to find the running process
#   $2 - port passed to the JVM as -Dserver.port
# A matching process is sent SIGTERM and given 10 seconds before the new JVM
# is launched in the background.
_restartService()
{
    local service_name="$1"
    local service_port="$2"
    local service_home="/home/cloud/services"
    local service_pid

    # Keep only lines that also contain "prod" and do not contain "task";
    # "grep -v grep" drops the grep processes of this pipeline itself.
    service_pid=$(ps -ef | grep "${service_name}" | grep -v task | grep prod | grep -v grep | awk '{print $2}')

    if [ -z "$service_pid" ]; then
        echo "${service_name} is already stopped,i will start ${service_name} now!"
    else
        echo "${service_name} is running,i will kill -15 ${service_name} and start it now!"
        # Unquoted on purpose: more than one PID may have matched.
        kill -15 $service_pid
        sleep 10
        # The new JVM cannot bind the port while the old one is still alive.
        if kill -0 $service_pid 2>/dev/null; then
            echo "warning: ${service_name} is still running 10 seconds after kill -15"
        fi
    fi

    # 2>&1 sends the JVM's stderr to the log as well, so it is not lost and
    # does not stay attached to the caller's output pipe.
    /usr/bin/nohup java -jar -Xms512m -Xmx512m -Dserver.port="${service_port}" -Dspring.profiles.active=prod -Dlogging.level.root=info "${service_home}/${service_name}.jar" >> "${service_home}/${service_name}.log" 2>&1 &

    # As before, pause after a cold start only.
    if [ -z "$service_pid" ]; then
        sleep 10
    fi
}

# Restart gateway-service (port 9999).
restartGatewayService()
{
    _restartService "gateway-service" 9999
}

# Restart order-service (port 8020).
restartOrderService()
{
    _restartService "order-service" 8020
}

# Restart portal-service (port 8080).
restartPortalService()
{
    _restartService "portal-service" 8080
}

# Restart product-service (port 8010).
restartProductService()
{
    _restartService "product-service" 8010
}
# Restart every managed service, one after another, in the fixed order
# gateway, order, portal, product.
restartAllService()
{
    local restart_fn
    for restart_fn in restartGatewayService restartOrderService restartPortalService restartProductService; do
        "$restart_fn"
    done
}
# Print a header line followed by the output of `jps -l`.
serviceStatus()
{
    printf '%s\n' "service status as:"
    jps -l
}
# Dispatch on the first argument: run the requested restart, then print the
# service status. An unknown or missing argument prints the usage and exits
# non-zero so that callers can detect the failure.
case "$1" in
    "all")
        echo "restart all service!"
        restartAllService
        serviceStatus
        ;;
    "restartGatewayService")
        echo "restart gateway service!"
        restartGatewayService
        serviceStatus
        ;;
    "restartOrderService")
        echo "restart order service!"
        restartOrderService
        serviceStatus
        ;;
    "restartPortalService")
        echo "restart portal service!"
        restartPortalService
        serviceStatus
        ;;
    "restartProductService")
        echo "restart product service!"
        restartProductService
        serviceStatus
        ;;
    *)
        echo "input parameter error! USAGE: bash startService.sh all|restartGatewayService|restartOrderService|restartPortalService|restartProductService"
        exit 1
        ;;
esac
此方案已经在生产环境中运行,经测试运行效果良好,如有问题请及时联系博主。
原创文章,作者:ItWorker,如若转载,请注明出处:https://blog.ytso.com/tech/pnotes/114916.html