1.1. Pandas Analysis steps
Loading data
Count accesses grouped by the hour of access_time, similar to the following SQL:
SELECT DATE_FORMAT(access_time, '%H'), COUNT(*) FROM log GROUP BY DATE_FORMAT(access_time, '%H');
1.2. Code
cat pd_ng_log_stat.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from ng_line_parser import NgLineParser

import pandas as pd
import socket
import struct
class PDNgLogStat(object):
    """Aggregate statistics over an nginx access log using pandas.

    Equivalent to the SQL:
        SELECT DATE_FORMAT(access_time, '%H'), COUNT(*)
        FROM log GROUP BY DATE_FORMAT(access_time, '%H');
    """

    def __init__(self):
        # Project-local line parser that turns one raw log line into a dict.
        self.ng_line_parser = NgLineParser()

    def _log_line_iter(self, pathes):
        """Parse every line of every file in *pathes*, yielding one dict per line.

        :param pathes: iterable of log-file paths.
        """
        for path in pathes:
            with open(path, 'r') as f:
                for index, line in enumerate(f):
                    self.ng_line_parser.parse(line)
                    yield self.ng_line_parser.to_dict()

    def load_data(self, path):
        """Build ``self.df`` (a DataFrame) from the log files.

        :param path: list of log-file paths (passed through to _log_line_iter).
        """
        self.df = pd.DataFrame(self._log_line_iter(path))

    def pv_hour(self):
        """Return the page-view count for each hour of the day.

        Groups rows by the hour extracted from the ``access_time`` column.
        The grouping key assumes values like ``'2016-12-21 10:23:45'``:
        split on whitespace, take the last token (the time), then take the
        part before the first ``':'`` (the hour).  -- TODO confirm the exact
        access_time format emitted by NgLineParser.
        """
        group_by_cols = ['access_time']  # only this column is counted/shown
        pv_hour_grp = self.df[group_by_cols].groupby(
            self.df['access_time'].map(
                lambda x: x.split().pop().split(':')[0]))
        return pv_hour_grp.agg(['count'])
def main():
    """Load the access log and print hourly PV (page view) statistics."""
    file_pathes = ['www.ttmark.com.access.log']

    pd_ng_log_stat = PDNgLogStat()
    pd_ng_log_stat.load_data(file_pathes)

    # Hourly PV statistics
    print(pd_ng_log_stat.pv_hour())


if __name__ == '__main__':
    main()
Run the script and view the output:
Python pd_ng_log_stat.py
Access_time
Count
Access_time
00 31539
01 34824
02 27895
03 29669
04 27742
05 26797
06 29384
07 31102
08 38257
09 43060
10 48064
11 57923
12 56413
13 57971
14 47260
15 46364
16 45721
17 48884
18 49318
19 49162
20 43641
21 42525
22 40371
23 34953