需求
由于我们的业务报警比较频繁,之前是针对每个报警进行具体处理,但是有时还会重复出现,或者后续处理有时忘记跟进等,因此进行报警短信的统计,可以针对一些问题与业务跟进,明确后续的优化方向等。
实现
实现原理如下图:
其中核心部分zbx_statis,其实就是我编写的一个python脚本,它会从zabbixDB中查询过去一周的所有报警信息,并按不同维度统计每周的报表上传到公司的git上,同时将一条汇总的sql插入到cmdb的库表中展示。
报警格式依赖
报表的分析统计可以分两个维度:
报警类型维度;
业务维度;
不管从哪个维度进行的统计,都需要一个前提:报警格式规范化。
针对报警内容的需求,我们对zabbix的trigger名称、主机名hostname等进行了规范化。
举例:
[17][15:31:04][productname-test-mysql-00][PROBLEM][005][cpu idle too low (<30%)][0.10 %][负责人:张学岩][15:31:07]
productname-test-mysql-00 是主机名,按业务等级进行命名,用于报警统计的业务维度统计;
cpu idle too low (<30%) 是报警的类型,可以据此项进行类型维度的统计;
通过维护一个主要业务列表,然后根据hostname匹配可以从业务维度进行统计;
通过将报警类型规范化,用固定的格式放在报警信息的固定位置,可以按类型进行统计。
报表展示
以下是截取的部分报表的展示。
按报警类型维度:
最下面的详细信息跳转即业务维度的统计。
按业务维度:
CMDB统计图表:
很直观的展示每周的报警数量,如果优化比较好的话,会看到整体应该是下降的趋势。
附件
上文提到的报警统计 python 脚本,写的时间比较久了,现在看内容还是比较杂乱,我也懒得改了,放出来供大家参考,内容如下:
#!/usr/bin/env python26
# encoding: utf-8
"""Weekly Zabbix alert statistics.

Queries the Zabbix database for the alerts sent during the previous calendar
week (Monday 00:00 to Monday 00:00), groups them by business line and by
alert type, splits them into daytime and night, writes Markdown reports under
``BASE_DIR``, pushes them with git and inserts one summary row into the CMDB
database.
"""
import copy
import datetime
import operator
import sys
import time
import traceback

try:
    import MySQLdb
except ImportError:
    # Without the driver every connection attempt is reported as a failure
    # by Connection.get_connection(); the pure statistics helpers stay usable.
    MySQLdb = None

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str <-> unicode conversions use UTF-8 so
    # that str() on non-ASCII alert subjects does not raise.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding("utf-8")

HOST = 'zabbix_db_host'
DB = 'zabbix'
PORT = 3306
RETRY_TIMES = 3

# Business types; an alert belongs to the first entry found in its subject.
GROUP_TYPE = ['A', 'B', 'C', '...', 'X', 'Y', ]
BASE_DIR = 'alerts_statistic/'

# Read the clock once so both boundaries are derived from the same instant.
_NOW = datetime.datetime.now()
_THIS_MONDAY = _NOW - datetime.timedelta(days=_NOW.weekday())
START_TIME = (_THIS_MONDAY - datetime.timedelta(days=7)).strftime("%Y%m%d")
END_TIME = _THIS_MONDAY.strftime("%Y%m%d")

# Totals of the last report written by write_group().
DAY_SUM = 0
NIGHT_SUM = 0

# Alerts at or after this hour count as daytime, earlier ones as night.
DAY_START_HOUR = 7

# Placeholder used when a subject does not follow the expected layout.
UNKNOWN = 'unknown'

# Same text as the original query; only the Python layout differs.
ALERT_SQL = (
    "select from_unixtime(a.clock),a.subject "
    "from alerts a,events b left join triggers c on b.objectid=c.triggerid "
    "where a.eventid=b.eventid and a.alerttype=0 "
    "and a.subject not like '%test-%' \n"
    "and a.subject not like '%-test%' "
    "and a.clock>={start_time} and a.clock< {end_time} "
    "group by a.subject order by a.clock"
)


class Connection(object):
    """Holds MySQL connection arguments and opens connections with retries."""

    def __init__(self, *args, **kwargs):
        """Store the arguments later passed to ``MySQLdb.connect``.

        ``user``, ``passwd`` and ``connect_timeout`` are always overwritten;
        ``port`` and ``db`` get defaults when the caller omits them.
        """
        self.args = args
        self.kwargs = kwargs
        self.kwargs['user'] = "user"
        self.kwargs['passwd'] = "password"
        self.kwargs.setdefault('port', 3306)
        self.kwargs.setdefault('db', "information_schema")
        self.kwargs['connect_timeout'] = 1

    def get_connection(self):
        """Try to connect up to ``RETRY_TIMES`` times.

        Returns a dict with ``errno`` (0 on success, -1 on failure),
        ``errmsg`` (host plus the last error text) and ``value`` (the
        connection, or None).
        """
        ret = {"errno": 0, 'errmsg': "", 'value': None}
        if MySQLdb is None:
            ret['errno'] = -1
            ret['errmsg'] = "MySQLdb is not installed"
            return ret
        for _ in range(RETRY_TIMES):
            try:
                conn = MySQLdb.connect(*self.args, **self.kwargs)
            except Exception as err:
                # Remember the failure and let the loop try again.
                ret['errno'] = -1
                ret['errmsg'] = str(self.kwargs.get('host', '')) + str(err)
                traceback.print_exc()
                continue
            if conn:
                ret['errno'] = 0
                ret['errmsg'] = ""
                ret['value'] = conn
                break
        return ret


def create_connection(*args, **kwargs):
    """Return an open connection, or None when connecting failed."""
    ret = Connection(*args, **kwargs).get_connection()
    if ret['errno']:
        return None
    return ret['value']


def _day_start_timestamp(day):
    """Return the local-time Unix timestamp of 00:00:00 on ``day`` (YYYYMMDD)."""
    moment = datetime.datetime.strptime(day + ' 00:00:00', "%Y%m%d %H:%M:%S")
    return int(time.mktime(moment.timetuple()))


def get_alert():
    """Fetch the (time, subject) rows of alerts in [START_TIME, END_TIME).

    Returns the rows from ``cursor.fetchall()``, or None when the connection
    or the query failed (the error is printed, not raised).
    """
    conn = create_connection(host=HOST, db=DB, port=PORT, charset='utf8')
    if not conn:
        return None
    sql = ALERT_SQL.format(start_time=_day_start_timestamp(START_TIME),
                           end_time=_day_start_timestamp(END_TIME))
    print(sql)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql)
            return cursor.fetchall()
        finally:
            cursor.close()
    except Exception:
        traceback.print_exc()
        return None
    finally:
        conn.close()


def alert_statistic(alert_list):
    """Bucket alerts by business type.

    Each alert goes to the first ``GROUP_TYPE`` entry that occurs as a
    substring of its subject (``alert[1]``); the rest go to ``'other'``.
    The result also carries ``'status'``: 0 when there were alerts, 1 (and no
    other key) when ``alert_list`` was empty or None.
    """
    result = {}
    if not alert_list:
        result['status'] = 1
        return result
    remaining = list(alert_list)
    for group in GROUP_TYPE:
        result[group] = [alert for alert in remaining if group in alert[1]]
        remaining = [alert for alert in remaining if group not in alert[1]]
    result['other'] = remaining
    result['status'] = 0
    return result


def write_to_file(result, day):
    """Write the per-business detail report for one period.

    ``result`` maps group name to a list of (time, subject) alerts; ``day``
    is the period label used in the file name and heading. Nothing is written
    when ``result`` is empty.
    """
    if not result:
        return
    file_name = (BASE_DIR + 'detail/' + START_TIME + '-' + END_TIME
                 + '_' + day + '.md')
    # 'status' is bookkeeping, not a list of alerts.
    groups = [(name, alerts) for name, alerts in result.items()
              if name != 'status']
    alert_sum = sum(len(alerts) for _, alerts in groups)
    parts = ['## ' + START_TIME + ' - ' + END_TIME + ': ' + day,
             '\n\n**短信总数:' + str(alert_sum) + '**']
    for name, alerts in groups:
        if not alerts:
            continue
        parts.append('\n\n### ' + name + '(' + str(len(alerts)) + ')' + '\n\n')
        parts.append('|报警内容|报警时间|\n|---|---|\n')
        for alert in alerts:
            parts.append('|' + str(alert[1]) + '|' + str(alert[0]) + '|\n')
    with open(file_name, 'w') as writer:
        writer.write(''.join(parts))


def _in_period(alert_time, day):
    """Tell whether ``alert_time`` falls into the period named by ``day``."""
    is_daytime = alert_time.hour >= DAY_START_HOUR
    if day == 'light':
        return is_daytime
    if day == 'night':
        return not is_daytime
    # Any other label filters nothing.
    return True


def day_night_split(result, day='light'):
    """Keep only the alerts of one period in every group.

    ``day`` is ``'light'`` (hour >= 7) or ``'night'`` (hour < 7). Returns a
    new dict without the ``'status'`` key; the input is not modified.
    """
    results = {}
    if not result:
        return results
    for group in result:
        if group == 'status':
            continue
        results[group] = [alert for alert in result[group]
                          if _in_period(alert[0], day)]
    return results


def _parse_subject(subject):
    """Extract (alert_type, hostname) from a ``[..][..]`` formatted subject.

    Subjects with 9 fields carry a trigger id, so the type is field 5;
    otherwise it is field 4. Missing fields yield ``UNKNOWN``.
    """
    fields = subject.split('][')
    hostname = fields[2] if len(fields) > 2 else UNKNOWN
    type_index = 5 if len(fields) == 9 else 4
    if len(fields) > type_index:
        alert_type = fields[type_index].split(',')[0]
    else:
        alert_type = UNKNOWN
    return alert_type, hostname


def alert_groupby(alert_list):
    """Count alerts per alert type.

    ``alert_list`` maps group name to a list of (time, subject) alerts.
    Returns a list of dicts with ``type``, ``hostlist`` (comma-joined unique
    host names in first-seen order) and ``count`` (as a string), sorted by
    count in descending order.
    """
    if not alert_list:
        return []
    seen_order = []
    stats = {}
    for group in alert_list:
        if group == 'status':
            continue
        for alert in alert_list[group]:
            alert_type, hostname = _parse_subject(alert[1])
            entry = stats.get(alert_type)
            if entry is None:
                entry = stats[alert_type] = {'count': 0, 'hosts': []}
                seen_order.append(alert_type)
            entry['count'] += 1
            if hostname not in entry['hosts']:
                entry['hosts'].append(hostname)
    group_list = [{'type': alert_type,
                   'hostlist': ",".join(stats[alert_type]['hosts']),
                   'count': str(stats[alert_type]['count'])}
                  for alert_type in seen_order]
    group_list.sort(key=lambda item: int(item['count']), reverse=True)
    return group_list


def _summary_section(title, total, groups, detail_file):
    """Render one period of the summary report as Markdown text."""
    parts = ['### ' + title + str(total) + '\n\n',
             "|报警类型|报警数量|报警主机|\n|---|---|---|\n"]
    for group in groups:
        parts.append("|" + group['type'] + "|" + group['count'] + "|"
                     + group['hostlist'] + "|\n")
    parts.append("\n[详细报警信息](detail/" + detail_file + ")\n\n")
    return ''.join(parts)


def write_group(group_light, group_night):
    """Write the per-type summary report and update DAY_SUM / NIGHT_SUM.

    Both arguments are lists as returned by ``alert_groupby``. The report is
    written when at least one of them has entries, so a week without night
    (or without daytime) alerts is still reported.
    """
    global DAY_SUM, NIGHT_SUM
    group_light = group_light or []
    group_night = group_night or []
    if not (group_light or group_night):
        return
    file_name = BASE_DIR + START_TIME + '-' + END_TIME + '.md'
    file_detail_light = START_TIME + '-' + END_TIME + '_白天' + '.md'
    file_detail_night = START_TIME + '-' + END_TIME + '_夜间' + '.md'
    DAY_SUM = sum(int(group['count']) for group in group_light)
    NIGHT_SUM = sum(int(group['count']) for group in group_night)
    content = ('## ' + START_TIME + '-' + END_TIME + '\n\n'
               + _summary_section('白天:', DAY_SUM, group_light,
                                  file_detail_light)
               + _summary_section('夜间:', NIGHT_SUM, group_night,
                                  file_detail_night))
    with open(file_name, 'w') as writer:
        writer.write(content)


def write_trend(sql):
    """Execute ``sql`` against the CMDB database and commit.

    Returns the rows from ``cursor.fetchall()``, or None when the connection
    or the statement failed (the error is printed, not raised).
    """
    host = 'cmdb_host'
    db = 'cmdb_db'
    port = 3306
    conn = create_connection(host=host, db=db, port=port, charset='utf8')
    if not conn:
        return None
    print(sql)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql)
            ret = cursor.fetchall()
        finally:
            cursor.close()
        conn.commit()
        return ret
    except Exception as e:
        print(e)
        return None
    finally:
        conn.close()


def git_push():
    """Commit and push the generated reports; return the shell exit status."""
    import os
    # NOTE(review): reports are written under BASE_DIR ('alerts_statistic/')
    # while this command works in 'alerts_statis' - confirm the real path.
    status = os.system("cd alerts_statis && git add alerts_statis && git commit -m 'update' && git push")
    if status != 0:
        print('git push failed, exit status: ' + str(status))
    return status


def main():
    """Run the whole weekly report pipeline."""
    # Alert rows of the previous week.
    alert_list = get_alert()
    # Bucket by business type.
    result = alert_statistic(alert_list)
    if result['status'] != 0:
        print("There's no alert warning or something error.")
        return
    # Split into daytime and night.
    result_day = day_night_split(result, 'light')
    result_night = day_night_split(result, 'night')
    # Aggregate by alert type.
    light_alert = alert_groupby(result_day)
    night_alert = alert_groupby(result_night)
    # Write the reports and publish them.
    write_group(light_alert, night_alert)
    write_to_file(result_day, '白天')
    write_to_file(result_night, '夜间')
    git_push()
    sql = 'insert into alerts (start_time,end_time,all_count,day_count,night_count)values("' + START_TIME + '","' + END_TIME + '",' + str(DAY_SUM + NIGHT_SUM) + ',' + str(DAY_SUM) + ',' + str(NIGHT_SUM) + ');'
    # Store the weekly totals in the CMDB.
    write_trend(sql)


if __name__ == '__main__':
    main()
原创文章,作者:ItWorker,如若转载,请注明出处:https://blog.ytso.com/118592.html