This post does a simple near-real-time traffic analysis of nginx by parsing its access.log; for this to work, the access.log must be written in JSON format.
Set the nginx log format as follows:
log_format main '{"remote_addr":"$remote_addr","remote_user":"$remote_user","time_local":"$time_iso8601","request":"$request","status":"$status","body_bytes_sent":"$body_bytes_sent","http_referer":"$http_referer","http_user_agent":"$http_user_agent","http_x_forwarded_for":"$http_x_forwarded_for","server_name":"$server_name"}';
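Apply the format with an access_log directive; the path here simply matches the script's default below. Each log line then comes out as a single JSON object, roughly like this (the values are illustrative):

access_log /var/nginx.log main;

{"remote_addr":"203.0.113.7","remote_user":"-","time_local":"2017-11-08T10:15:30+08:00","request":"GET /api/list?page=1 HTTP/1.1","status":"200","body_bytes_sent":"5123","http_referer":"-","http_user_agent":"curl/7.47.0","http_x_forwarded_for":"-","server_name":"example.com"}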
The script reads the response body size from each line's body_bytes_sent field and accumulates the totals. Usage:
python nginx_netflow.py -f /var/nginx.log -n 100    tail the last 100 lines
python nginx_netflow.py                             whole file; if it exceeds 100 MB, only the last LAST_LINE_NUM lines (10,000 by default) are read
Run:
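Against a live log the script prints something like the following; the numbers are made up for illustration, but the layout follows the print statements in the code below:

====================
203.0.113.7 match 1842 times,bring 312.5 MB
198.51.100.23 match 640 times,bring 97.3 MB
...
====================
/api/download match 957 times,bring 268.0 MB
/static/app.js match 3021 times,bring 44.1 MB
...
====================
example.com match 4790 times,bring 736.4 MB
first line start: 2017-11-08T10:15:30+08:00
total: 736.4 MB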
The code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# description: computes traffic from an nginx access log by summing the
#              body_bytes_sent field, grouped by request url (and by ip /
#              server_name) -- the requests that concentrate the most
#              traffic are the ones most worth optimizing
# usage:
#   python nginx_netflow.py -n 100   tail the last 100 lines
#   python nginx_netflow.py          whole file; if it exceeds 100 MB,
#                                    only the last LAST_LINE_NUM lines are read
#
# todo: given an ip, list its top N requested urls; output KB when > 10 MB
__author__ = "richardzgt"

import os
import sys
import re
import json
import time
import linecache
from optparse import OptionParser

NGINX_FILE = '/var/nginx.log'
FILE_MAX_SIZE = 100        # MB; larger files are tailed instead of read whole
TOP_N = 5                  # print the top 5 traffic sources per group
LAST_LINE_NUM = 10000      # lines to tail when the file is too big
REQUEST_WITH_Q = re.compile(r'(GET|POST)\s(/\S+)\?')
REQUEST_WITHOUT_Q = re.compile(r'(GET|POST)\s(/\S+)')

def exeTime(func):
    """Decorator that reports how long the wrapped function took."""
    def newFunc(*args, **kwargs):
        t0 = time.time()
        print("@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__))
        back = func(*args, **kwargs)
        print("@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__))
        print("@%.3fs taken for {%s}" % (time.time() - t0, func.__name__))
        return back
    return newFunc

def args_options():
    parser = OptionParser()
    parser.add_option("-f", "--file", dest="file", action="store",
                      help="path to the access.log file")
    parser.add_option("-n", dest="nums", action="store",
                      help="tail the access log file, default is [%s]" % LAST_LINE_NUM)
    return parser

def group_by(group_dict, group_str, body_bytes_sent):
    # Accumulate bytes and hit count under the given key (ip / url / server name).
    if group_dict.get(group_str):
        group_dict[group_str]['sum_bytes'] += body_bytes_sent
        group_dict[group_str]['count'] += 1
    else:
        group_dict[group_str] = {'sum_bytes': body_bytes_sent, 'count': 1}
    return group_dict

def order_bytes(group_dict):
    # Keys sorted by accumulated traffic, biggest first.
    return sorted(group_dict, key=lambda name: group_dict[name]['sum_bytes'], reverse=True)

def print_order(group_dict, ordered_keys):
    print("%s" % ('==' * 10))
    for index, each in enumerate(ordered_keys):
        if index >= TOP_N:
            break
        sum_bytes = group_dict[each]['sum_bytes']
        count = group_dict[each]['count']
        print("%s match %s times,bring \033[1;31m %.1f MB \033[0m"
              % (each, count, sum_bytes / 1024.0 / 1024))

def request_url(url):
    # Strip the method and any query string, keeping only the url path.
    if url.find('?') > 0:
        regex = REQUEST_WITH_Q.match(url)
    else:
        regex = REQUEST_WITHOUT_Q.match(url)
    if regex is not None:
        return regex.groups()[1]
    return 'NONE_REGEX_CONTENT'

# @exeTime
def all_file_handle(file_obj):
    group_by_ip = {}
    group_by_request = {}
    group_by_server_name = {}
    total_bytes = 0
    start_time = ""
    for eachline in file_obj:
        try:
            data = json.loads(eachline)
            if not start_time:
                start_time = data['time_local']
            body_bytes_sent = int(data['body_bytes_sent'])
            total_bytes += body_bytes_sent
            group_by(group_by_ip, data['remote_addr'], body_bytes_sent)
            group_by(group_by_request, request_url(data['request']), body_bytes_sent)
            group_by(group_by_server_name, data['server_name'], body_bytes_sent)
        except (ValueError, KeyError, TypeError):
            # Skip lines that are not valid JSON or miss an expected field.
            continue
    for each_group in (group_by_ip, group_by_request, group_by_server_name):
        print_order(each_group, order_bytes(each_group))
    print("first line start: %s" % start_time)
    print("total: %.1f MB" % (total_bytes / 1024.0 / 1024))

def echo_color(content):
    print("\033[;1m%s\033[0m" % content)

def main(nums=0):
    try:
        file_obj = open(NGINX_FILE, 'r')
    except IOError:
        echo_color("file not found")
        sys.exit()
    if nums != 0:
        handle_file = linecache.getlines(NGINX_FILE)[-nums:]
    elif os.stat(NGINX_FILE).st_size < FILE_MAX_SIZE * 1024 * 1024:
        handle_file = file_obj
    else:
        print('file too big, using the last %s lines' % LAST_LINE_NUM)
        handle_file = linecache.getlines(NGINX_FILE)[-LAST_LINE_NUM:]
    all_file_handle(handle_file)

if __name__ == '__main__':
    parser = args_options()
    (options, args) = parser.parse_args()
    if options.file:
        NGINX_FILE = options.file
    if options.nums:
        if options.nums.isdigit():
            main(int(options.nums))
        else:
            print("invalid line count for -n")
    else:
        main()
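Since the script is a one-shot sum, a rough real-time view (just a suggestion, not part of the original script) is to re-run it on the tail of the log at an interval:

watch -n 10 python nginx_netflow.py -f /var/nginx.log -n 1000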
2017-11-08
Improved efficiency by using the linecache module to fetch the last n lines of the file.
9s ==> 2s. The remaining bottleneck is linecache itself, since it has to cache the entire file in memory.
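A memory-friendlier way to tail a file (a minimal sketch with a hypothetical tail_lines helper, not what the script currently does) is to stream it through collections.deque with maxlen, which keeps only the last n lines instead of caching everything the way linecache does; it still reads the file once, but memory stays bounded:

from collections import deque

def tail_lines(path, n):
    # deque with maxlen=n drops older lines as new ones arrive,
    # so at most n lines are ever held in memory.
    with open(path, 'r') as f:
        return list(deque(f, maxlen=n))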
2017-11-27
Found a similar approach that uses pandas; worth improving this script along those lines when time allows, analyzing across several dimensions (rt, size, ip):
http://blog.csdn.net/yanggd1987/article/details/69542669
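For reference, a rough sketch of what such a pandas-based analysis could look like (column names follow the log format above; loading the whole log into memory is assumed acceptable, and the rt dimension would additionally require $request_time in the log_format):

import json
import pandas as pd

def load_log(path):
    # Parse each JSON log line into a record; skip malformed lines.
    records = []
    with open(path) as f:
        for line in f:
            try:
                records.append(json.loads(line))
            except ValueError:
                continue
    df = pd.DataFrame(records)
    df['body_bytes_sent'] = df['body_bytes_sent'].astype(int)
    return df

df = load_log('/var/nginx.log')
# Traffic per client ip, top 5 -- the same grouping the script does
# by hand, expressed as a groupby; other dimensions (url, size, rt)
# work the same way.
top_ip = (df.groupby('remote_addr')['body_bytes_sent']
            .agg(['sum', 'count'])
            .sort_values('sum', ascending=False)
            .head(5))
print(top_ip)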