This post does a simple near-real-time traffic analysis of nginx by parsing its access.log; for this to work, the access.log must be written in JSON format.
Set the nginx log format as follows:
log_format main '{"remote_addr":"$remote_addr","remote_user":"$remote_user","time_local":"$time_iso8601","request":"$request","status":"$status","body_bytes_sent":"$body_bytes_sent","http_referer":"$http_referer","http_user_agent":"$http_user_agent","http_x_forwarded_for":"$http_x_forwarded_for","server_name":"$server_name"}';
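Apply the format with an access_log directive; the path here simply matches the script's default below. Each log line then comes out as a single JSON object, roughly like this (the values are illustrative):

access_log /var/nginx.log main;

{"remote_addr":"203.0.113.7","remote_user":"-","time_local":"2017-11-08T10:15:30+08:00","request":"GET /api/list?page=1 HTTP/1.1","status":"200","body_bytes_sent":"5123","http_referer":"-","http_user_agent":"curl/7.47.0","http_x_forwarded_for":"-","server_name":"example.com"}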
The script reads the response body size from each line's body_bytes_sent field and accumulates the totals. Usage:
python nginx_netflow.py -f /var/nginx.log -n 100    tail the last 100 lines
python nginx_netflow.py                             whole file; if it exceeds 100 MB, only the last LAST_LINE_NUM lines (10,000 by default) are read
Run:
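Against a live log the script prints something like the following; the numbers are made up for illustration, but the layout follows the print statements in the code below:

====================
203.0.113.7 match 1842 times,bring 312.5 MB
198.51.100.23 match 640 times,bring 97.3 MB
...
====================
/api/download match 957 times,bring 268.0 MB
/static/app.js match 3021 times,bring 44.1 MB
...
====================
example.com match 4790 times,bring 736.4 MB
first line start: 2017-11-08T10:15:30+08:00
total: 736.4 MB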
The code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# description: computes traffic from an nginx access log by summing the
#              body_bytes_sent field, grouped by request url (and by ip /
#              server_name) -- the requests that concentrate the most
#              traffic are the ones most worth optimizing
# usage:
#   python nginx_netflow.py -n 100   tail the last 100 lines
#   python nginx_netflow.py          whole file; if it exceeds 100 MB,
#                                    only the last LAST_LINE_NUM lines are read
#
# todo: given an ip, list its top N requested urls; output KB when > 10 MB
__author__ = "richardzgt"

import os
import sys
import re
import json
import time
import linecache
from optparse import OptionParser

NGINX_FILE = '/var/nginx.log'
FILE_MAX_SIZE = 100        # MB; larger files are tailed instead of read whole
TOP_N = 5                  # print the top 5 traffic sources per group
LAST_LINE_NUM = 10000      # lines to tail when the file is too big
REQUEST_WITH_Q = re.compile(r'(GET|POST)\s(/\S+)\?')
REQUEST_WITHOUT_Q = re.compile(r'(GET|POST)\s(/\S+)')

def exeTime(func):
    """Decorator that reports how long the wrapped function took."""
    def newFunc(*args, **kwargs):
        t0 = time.time()
        print("@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__))
        back = func(*args, **kwargs)
        print("@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__))
        print("@%.3fs taken for {%s}" % (time.time() - t0, func.__name__))
        return back
    return newFunc

def args_options():
    parser = OptionParser()
    parser.add_option("-f", "--file", dest="file", action="store",
                      help="path to the access.log file")
    parser.add_option("-n", dest="nums", action="store",
                      help="tail the access log file, default is [%s]" % LAST_LINE_NUM)
    return parser

def group_by(group_dict, group_str, body_bytes_sent):
    # Accumulate bytes and hit count under the given key (ip / url / server name).
    if group_dict.get(group_str):
        group_dict[group_str]['sum_bytes'] += body_bytes_sent
        group_dict[group_str]['count'] += 1
    else:
        group_dict[group_str] = {'sum_bytes': body_bytes_sent, 'count': 1}
    return group_dict

def order_bytes(group_dict):
    # Keys sorted by accumulated traffic, biggest first.
    return sorted(group_dict, key=lambda name: group_dict[name]['sum_bytes'], reverse=True)

def print_order(group_dict, ordered_keys):
    print("%s" % ('==' * 10))
    for index, each in enumerate(ordered_keys):
        if index >= TOP_N:
            break
        sum_bytes = group_dict[each]['sum_bytes']
        count = group_dict[each]['count']
        print("%s match %s times,bring \033[1;31m %.1f MB \033[0m"
              % (each, count, sum_bytes / 1024.0 / 1024))

def request_url(url):
    # Strip the method and any query string, keeping only the url path.
    if url.find('?') > 0:
        regex = REQUEST_WITH_Q.match(url)
    else:
        regex = REQUEST_WITHOUT_Q.match(url)
    if regex is not None:
        return regex.groups()[1]
    return 'NONE_REGEX_CONTENT'

# @exeTime
def all_file_handle(file_obj):
    group_by_ip = {}
    group_by_request = {}
    group_by_server_name = {}
    total_bytes = 0
    start_time = ""
    for eachline in file_obj:
        try:
            data = json.loads(eachline)
            if not start_time:
                start_time = data['time_local']
            body_bytes_sent = int(data['body_bytes_sent'])
            total_bytes += body_bytes_sent
            group_by(group_by_ip, data['remote_addr'], body_bytes_sent)
            group_by(group_by_request, request_url(data['request']), body_bytes_sent)
            group_by(group_by_server_name, data['server_name'], body_bytes_sent)
        except (ValueError, KeyError, TypeError):
            # Skip lines that are not valid JSON or miss an expected field.
            continue
    for each_group in (group_by_ip, group_by_request, group_by_server_name):
        print_order(each_group, order_bytes(each_group))
    print("first line start: %s" % start_time)
    print("total: %.1f MB" % (total_bytes / 1024.0 / 1024))

def echo_color(content):
    print("\033[;1m%s\033[0m" % content)

def main(nums=0):
    try:
        file_obj = open(NGINX_FILE, 'r')
    except IOError:
        echo_color("file not found")
        sys.exit()
    if nums != 0:
        handle_file = linecache.getlines(NGINX_FILE)[-nums:]
    elif os.stat(NGINX_FILE).st_size < FILE_MAX_SIZE * 1024 * 1024:
        handle_file = file_obj
    else:
        print('file too big, using the last %s lines' % LAST_LINE_NUM)
        handle_file = linecache.getlines(NGINX_FILE)[-LAST_LINE_NUM:]
    all_file_handle(handle_file)

if __name__ == '__main__':
    parser = args_options()
    (options, args) = parser.parse_args()
    if options.file:
        NGINX_FILE = options.file
    if options.nums:
        if options.nums.isdigit():
            main(int(options.nums))
        else:
            print("invalid line count for -n")
    else:
        main()
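Since the script is a one-shot sum, a rough real-time view (just a suggestion, not part of the original script) is to re-run it on the tail of the log at an interval:

watch -n 10 python nginx_netflow.py -f /var/nginx.log -n 1000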
2017-11-08
Improved efficiency by using the linecache module to fetch the last n lines of the file.
9s ==> 2s. The remaining bottleneck is linecache itself, since it has to cache the entire file in memory.
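A memory-friendlier way to tail a file (a minimal sketch with a hypothetical tail_lines helper, not what the script currently does) is to stream it through collections.deque with maxlen, which keeps only the last n lines instead of caching everything the way linecache does; it still reads the file once, but memory stays bounded:

from collections import deque

def tail_lines(path, n):
    # deque with maxlen=n drops older lines as new ones arrive,
    # so at most n lines are ever held in memory.
    with open(path, 'r') as f:
        return list(deque(f, maxlen=n))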
2017-11-27
Found a similar approach that uses pandas; worth improving this script along those lines when time allows, analyzing across several dimensions (rt, size, ip):
http://blog.csdn.net/yanggd1987/article/details/69542669
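For reference, a rough sketch of what such a pandas-based analysis could look like (column names follow the log format above; loading the whole log into memory is assumed acceptable, and the rt dimension would additionally require $request_time in the log_format):

import json
import pandas as pd

def load_log(path):
    # Parse each JSON log line into a record; skip malformed lines.
    records = []
    with open(path) as f:
        for line in f:
            try:
                records.append(json.loads(line))
            except ValueError:
                continue
    df = pd.DataFrame(records)
    df['body_bytes_sent'] = df['body_bytes_sent'].astype(int)
    return df

df = load_log('/var/nginx.log')
# Traffic per client ip, top 5 -- the same grouping the script does
# by hand, expressed as a groupby; other dimensions (url, size, rt)
# work the same way.
top_ip = (df.groupby('remote_addr')['body_bytes_sent']
            .agg(['sum', 'count'])
            .sort_values('sum', ascending=False)
            .head(5))
print(top_ip)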