1 分析日誌的python架構awk.py
#
# Custom awk.py module
#
class controller:
    """Drive a set of handlers over the lines of a file-like object.

    A handler is any object providing ``begin()``, ``process_line(s)``,
    ``end()``, ``description()`` and ``result()``; these are the only
    methods this class calls on it.
    """

    # Horizontal rule printed above and below each handler's description.
    _RULE = "------------------------------------------------------"

    def __init__(self, f):
        """Remember the input source and start with no handlers.

        Args:
            f: Object with a ``readline()`` method that returns ``""`` at
                end of input (for example ``sys.stdin``).
        """
        self.m_file = f
        self.m_handlers = []

    def subscribe(self, o):
        """Register handler ``o``; handlers are invoked in subscription order."""
        self.m_handlers.append(o)

    def run(self):
        """Call ``begin()`` on every handler, feed them each line, then ``end()``."""
        for o in self.m_handlers:
            o.begin()
        # readline() yields "" only at end of input (a blank line is "\n"),
        # so the sentinel form stops exactly where the original loop did.
        for line in iter(self.m_file.readline, ""):
            for o in self.m_handlers:
                o.process_line(line)
        for o in self.m_handlers:
            o.end()

    def print_results(self):
        """Print each handler's description and result to standard output."""
        # Single-argument print(...) produces the same output under both
        # Python 2 (statement) and Python 3 (function).
        print("")
        print("Results:")
        print("")
        for o in self.m_handlers:
            print(self._RULE)
            print(o.description())
            print(self._RULE)
            print(o.result())
統計日誌的點擊量 count_lines.py
# Standard sys module
import sys


class count_lines:
    """Awk handler that counts how many lines it is given."""

    def begin(self):
        """Reset the counter before the first line is processed."""
        self.m_count = 0

    def process_line(self, s):
        """Count one line; the content of ``s`` is not inspected."""
        self.m_count += 1

    def end(self):
        """Nothing to finalize for this handler."""
        pass

    def description(self):
        """Return the heading shown above the result."""
        return "# of lines in the file"

    def result(self):
        """Return the number of lines seen since ``begin()``."""
        return self.m_count


def main():
    """Count the lines arriving on standard input and print the total."""
    # Custom awk.py module. Imported here rather than at the top so the
    # handler class above can be imported without awk.py on the path.
    import awk

    # Step 1: Create the Awk controller
    ac = awk.controller(sys.stdin)
    # Step 2: Subscribe the handler
    ac.subscribe(count_lines())
    # Step 3: Run
    ac.run()
    # Step 4: Print the results
    ac.print_results()


# Only run the pipeline when executed as a script
# (e.g. ``cat apachelog.log | python count_lines.py``), not on import.
if __name__ == "__main__":
    main()
使用方法是shell中執行
# cat apachelog.log|python count_lines.py
統計瀏覽次數超過n次的訪問者 visitors.py
How many visitors (distinct IP addresses) have returned to the site on more than N different days?
import re
import sys


class return_visitors:
    """Awk handler counting IP addresses seen on more than ``n`` distinct days."""

    def __init__(self, n):
        """Set the threshold.

        Args:
            n: An IP address is counted when it appears on strictly more
                than ``n`` distinct days.
        """
        self.m_n = n
        # Maps IP address -> list of distinct day strings, in first-seen order.
        self.m_ip_days = {}

    def begin(self):
        """Nothing to prepare; state is created in ``__init__``."""
        pass

    def process_line(self, s):
        """Record the day on which the line's IP address made a request.

        The line is split on whitespace: field 0 is taken as the IP address
        and characters 1..6 of field 3 as the day. Lines with fewer than
        four fields are ignored.
        """
        fields = s.split()
        try:
            ip = fields[0]
            # Presumably field 3 looks like "[10/Oct/2000:13:55:36" (Apache
            # access log), making this "10/Oct" - verify against the log
            # format in use. NOTE: the year is not part of the key.
            day = fields[3][1:7]
        except IndexError:
            # Too few fields to be a log entry; skip it.
            return
        days = self.m_ip_days.setdefault(ip, [])
        if day not in days:
            days.append(day)

    def end(self):
        """Compute how many IP addresses exceed the day threshold."""
        self.m_count = sum(
            1 for days in self.m_ip_days.values() if len(days) > self.m_n
        )

    def description(self):
        """Return the heading shown above the result."""
        return "# of IP addresses that visited more than %s days" % self.m_n

    def result(self):
        """Return the count computed by ``end()``."""
        return self.m_count


def main():
    """Report returning visitors for the log arriving on standard input."""
    # Custom awk.py module. Imported here rather than at the top so the
    # handler class above can be imported without awk.py on the path.
    import awk

    ac = awk.controller(sys.stdin)
    ac.subscribe(return_visitors(2))
    ac.run()
    ac.print_results()


# Only run the pipeline when executed as a script
# (e.g. ``cat apachelog.log | python visitors.py``), not on import.
if __name__ == "__main__":
    main()
# cat apachelog.log|python visitors.py
按照網域名稱統計訪問量domain.py
import functools
import re
import sys


class referring_domains:
    """Awk handler that tallies requests per referring domain."""

    # Matches "//<host>.<tld>/" where <tld> is 2-3 ASCII letters and
    # captures "<host>.<tld>". The original class was "[a-zA-z]", a typo
    # that also accepted "[", "\", "]", "^", "_" and "`".
    _DOMAIN_RE = re.compile(r"//([a-zA-Z0-9\-\.]*\.[a-zA-Z]{2,3})/")

    def __init__(self):
        # Maps domain -> number of lines whose referrer names that domain.
        self.m_domains = {}

    def begin(self):
        """Nothing to prepare; state is created in ``__init__``."""
        pass

    def process_line(self, line):
        """Count the referring domain found in whitespace field 10 of ``line``.

        Lines with fewer than eleven fields, or whose field 10 contains no
        ``//host.tld/`` pattern, are ignored.
        """
        fields = line.split()
        if len(fields) <= 10:
            return
        # Presumably field 10 is the quoted Referer of an Apache
        # "combined" log entry - verify against the log format in use.
        m = self._DOMAIN_RE.search(fields[10])
        if m is None:
            return
        domain = m.group(1)
        self.m_domains[domain] = self.m_domains.get(domain, 0) + 1

    def end(self):
        """Nothing to finalize for this handler."""
        pass

    def description(self):
        """Return the heading shown above the result."""
        return "Referring domains"

    def sort(self, key1, key2):
        """Compare two domains so that higher hit counts sort first.

        Returns -1, 0 or 1 in the style of a classic ``cmp`` function.
        """
        if self.m_domains[key1] > self.m_domains[key2]:
            return -1
        elif self.m_domains[key1] == self.m_domains[key2]:
            return 0
        else:
            return 1

    def result(self):
        """Return one "<domain> <count>" line per domain, busiest first.

        The text always ends with two extra newlines; domains with equal
        counts keep the order in which the dictionary yields them.
        """
        # list.sort(cmp) no longer exists in Python 3; cmp_to_key adapts the
        # existing comparator and works on Python 2.7 as well.
        ordered = sorted(self.m_domains, key=functools.cmp_to_key(self.sort))
        rows = ["%s %s\n" % (domain, self.m_domains[domain]) for domain in ordered]
        return "".join(rows) + "\n\n"


def main():
    """Report referring domains for the log arriving on standard input."""
    # Custom awk.py module. Imported here rather than at the top so the
    # handler class above can be imported without awk.py on the path.
    import awk

    ac = awk.controller(sys.stdin)
    ac.subscribe(referring_domains())
    ac.run()
    ac.print_results()


# Only run the pipeline when executed as a script
# (e.g. ``cat apachelog.log | python domain.py``), not on import.
if __name__ == "__main__":
    main()
# cat apachelog.log|python domain.py