Objective
Pandas is built on top of NumPy and provides more advanced data structures and tools. Just as the core of NumPy is the ndarray, pandas is centered around two core data structures: Series and DataFrame. Series and DataFrame correspond to a one-dimensional sequence and a two-dimensional table structure, respectively. The conventional way to import pandas is as follows:
from pandas import Series, DataFrame
import pandas as pd
1.1. Pandas Analysis steps
1. Load the log data.
2. Load the area_ip data.
3. Count the number of requests per real_ip. The equivalent SQL is similar to the following:
SELECT INET_ATON(l.real_ip),
       COUNT(*),
       a.addr
FROM log AS l
INNER JOIN area_ip AS a
        ON a.start_ip_num <= INET_ATON(l.real_ip)
       AND a.end_ip_num >= INET_ATON(l.real_ip)
GROUP BY real_ip
ORDER BY COUNT(*) DESC
LIMIT 0, 100;
1.2. Code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""pd_ng_log_stat.py -- count nginx requests per real IP and look up each IP's region.

View this file with ``cat pd_ng_log_stat.py``; run it with
``python pd_ng_log_stat.py``.
"""

import socket
import struct

import pandas as pd

# NOTE(review): the class name is reconstructed from the module name
# ``ng_line_parser``; confirm the exact spelling against that module.
from ng_line_parser import NgLineParser


class PdNgLogStat(object):
    """Load access-log records into a DataFrame and rank client IPs by request count."""

    def __init__(self):
        self.ng_line_parser = NgLineParser()
        # Populated by load_data(): one row per parsed log line.
        self.df = None
        # Populated by load_ip_addr(): the IP-range -> address lookup table.
        self.ip_addr_df = None

    def _log_line_iter(self, pathes):
        """Parse every line of every file in *pathes* and yield the parser's ``to_dict()`` result."""
        for path in pathes:
            with open(path, 'r') as f:
                for line in f:
                    self.ng_line_parser.parse(line)
                    yield self.ng_line_parser.to_dict()

    def _ip2num(self, ip):
        """Convert a dotted IPv4 string to its unsigned integer value.

        Returns -1 when *ip* cannot be parsed as an IPv4 address.
        """
        try:
            # "!I" reads the 4 packed bytes in network byte order, which gives
            # the same value as unpacking natively and then applying ntohl().
            return struct.unpack("!I", socket.inet_aton(str(ip)))[0]
        except (socket.error, struct.error, TypeError, ValueError):
            return -1

    def _get_addr_by_ip(self, ip):
        """Return the ``addr`` of the first range containing *ip*, or None if there is none.

        None is also returned when *ip* is not a valid IPv4 address or when
        the lookup table has not been loaded yet.
        """
        ip_num = self._ip2num(ip)
        ranges = self.ip_addr_df
        if ip_num < 0 or ranges is None:
            return None

        hits = ranges[(ranges.ip_start_num <= ip_num)
                      & (ip_num <= ranges.ip_end_num)]
        if hits.empty:
            return None
        return hits['addr'].iloc[0]

    def load_data(self, path):
        """Build ``self.df`` from the log file(s) given by *path*.

        *path* is an iterable of file paths; a single path string is also
        accepted and treated as a one-element list.
        """
        pathes = [path] if isinstance(path, str) else path
        self.df = pd.DataFrame(list(self._log_line_iter(pathes)))

    def uv_real_ip(self, top=100):
        """Return the *top* most frequent ``real_ip`` values.

        The result is a DataFrame indexed by ``real_ip`` with a single
        ``count`` column, sorted by descending count.
        """
        counts = self.df.groupby('real_ip').size().to_frame('count')
        return counts.nlargest(top, 'count')

    def uv_real_ip_addr(self, top=100):
        """Return the result of :meth:`uv_real_ip` with an extra ``addr`` column appended."""
        cnt_df = self.uv_real_ip(top)

        # Append the address column after the existing columns.
        addrs = [self._get_addr_by_ip(ip) for ip in cnt_df.index]
        cnt_df.insert(len(cnt_df.columns), 'addr', addrs)
        return cnt_df

    def load_ip_addr(self, path):
        """Read the IP-range table at *path* into ``self.ip_addr_df`` and return it."""
        cols = ['id', 'ip_start_num', 'ip_end_num',
                'ip_start', 'ip_end', 'addr', 'operator']
        # NOTE(review): the separator is reconstructed as a tab ('\t');
        # confirm against the actual area_ip.csv file.
        self.ip_addr_df = pd.read_csv(path, sep='\t', names=cols, index_col='id')
        return self.ip_addr_df


def main():
    """Load the sample log and IP table, then print the request count and address per IP."""
    file_pathes = ['www.ttmark.com.access.log']

    pd_ng_log_stat = PdNgLogStat()
    pd_ng_log_stat.load_data(file_pathes)

    # Load the IP address table.
    area_ip_path = 'area_ip.csv'
    pd_ng_log_stat.load_ip_addr(area_ip_path)

    # Report request volume and address for each real client IP.
    print(pd_ng_log_stat.uv_real_ip_addr())


if __name__ == '__main__':
    main()
Run statistics and output results
python pd_ng_log_stat.py
                  count  addr
real_ip
60.191.123.80    101013  Hangzhou, Zhejiang province
-                 32691  None
218.30.118.79     22523  Beijing
...
136.243.152.18      889  Germany
157.55.39.219       889  USA
66.249.65.170       888  USA

[100 rows x 2 columns]
Summary
That is the entire content of this article. I hope it is of some help for your study or work; if you have any questions, feel free to leave a message.