At work I need to crawl some specific resources for analysis. There is no need for high-speed, massive-volume data, so the plan is to write a simple crawler directly in Python and then write templates for targeted parsing; that solves the problem. After all, our requirements on turnaround time and data volume are not that high.
I searched online for some material on writing crawlers in Python and downloaded a few examples to try out and see how they perform. Although this post is marked as original, part of it draws on other people's work; please bear with me. My intention is to take the best of what others have done, recombine and refactor it, and build the system I need.
First, I downloaded a crawler written by someone else to see how well it works. Source: http://blog.csdn.net/cashey1991/article/details/6262704
Test.py
# -*- coding: utf-8 -*-
import WebCrawler

url = raw_input('Entry URL (e.g. http://www.baidu.com): \n')
thNumber = int(raw_input('Number of threads: '))     # raw_input returns a str; missing int() here was an earlier bug
Maxdepth = int(raw_input('Maximum search depth: '))

wc = WebCrawler.WebCrawler(thNumber, Maxdepth)
wc.Craw(url)
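A side note on those int() calls: raw_input always returns a string in Python 2, and the bug mentioned in the comment came from handing the raw strings straight to WebCrawler, where CPython 2 quietly compares an int against a str instead of raising an error. A quick interpreter check (my own example, not from the original post):

>>> 7 < '5'        # CPython 2 orders numbers before strings, so this is True, not a TypeError
True
>>> 7 < int('5')
False

Without the conversion, guards such as depth <= self.Maxdepth and j < self.threadNumber in WebCrawler.py stop limiting anything.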
WebCrawler.py
# -*- coding: utf-8 -*-
import threading
import urllib

import GetUrl

g_mutex = threading.Lock()
g_pages = []       # threads append the content of each downloaded page here
g_dledUrl = []     # every URL that has already been downloaded
g_toDlUrl = []     # URLs to download in the current round
g_failedUrl = []   # URLs that failed to download
g_totalcount = 0   # number of pages downloaded so far


class WebCrawler:
    def __init__(self, threadNumber, Maxdepth):
        self.threadNumber = threadNumber
        self.threadPool = []
        self.Maxdepth = Maxdepth
        self.logfile = file('#log.txt', 'w')                 ##

    def download(self, url, fileName):
        Cth = CrawlerThread(url, fileName)
        self.threadPool.append(Cth)
        Cth.start()

    def downloadAll(self):
        global g_toDlUrl
        global g_totalcount
        i = 0
        while i < len(g_toDlUrl):
            j = 0
            while j < self.threadNumber and i + j < len(g_toDlUrl):
                g_totalcount += 1                            # count this page as scheduled for download
                self.download(g_toDlUrl[i + j], str(g_totalcount) + '.htm')
                print 'Thread started:', i + j, '--File number =', g_totalcount
                j += 1
            i += j
            for th in self.threadPool:
                th.join(30)                                  # wait for the batch to finish, 30 second timeout
            self.threadPool = []                             # empty the thread pool
        g_toDlUrl = []                                       # clear the list

    def updateToDl(self):
        global g_toDlUrl
        global g_dledUrl
        newUrlList = []
        for s in g_pages:
            newUrlList += GetUrl.GetUrl(s)                   # GetUrl is implemented in GetUrl.py
        g_toDlUrl = list(set(newUrlList) - set(g_dledUrl))   # this line once reported "unhashable"

    def Craw(self, entryUrl):                                # searches depth by depth; ends when g_toDlUrl is empty
        g_toDlUrl.append(entryUrl)
        self.logfile.write('>>>Entry:\n')                    ##
        self.logfile.write(entryUrl)                         ##
        depth = 0
        while len(g_toDlUrl) != 0 and depth <= self.Maxdepth:
            depth += 1
            print 'Searching depth ', depth, '...\n\n'
            self.downloadAll()
            self.updateToDl()
            content = '\n>>>Depth ' + str(depth) + ':\n'     ##
            self.logfile.write(content)                      ##
            i = 0                                            ##
            while i < len(g_toDlUrl):                        ##
                content = str(g_totalcount + i + 1) + '->' + g_toDlUrl[i] + '\n'   ##
                self.logfile.write(content)                  ##
                i += 1                                       ##


class CrawlerThread(threading.Thread):
    def __init__(self, url, fileName):
        threading.Thread.__init__(self)
        self.url = url                                       # the URL this thread downloads
        self.fileName = fileName

    def run(self):                                           # thread job: download one HTML page and save it
        global g_mutex
        global g_failedUrl
        global g_dledUrl
        try:
            f = urllib.urlopen(self.url)
            s = f.read()
            fout = file(self.fileName, 'w')
            fout.write(s)
            fout.close()
        except:
            g_mutex.acquire()                                # lock
            g_dledUrl.append(self.url)
            g_failedUrl.append(self.url)
            g_mutex.release()                                # unlock
            print 'Failed downloading and saving', self.url
            return None                                      # remember to return!
        g_mutex.acquire()                                    # lock
        g_pages.append(s)
        g_dledUrl.append(self.url)
        g_mutex.release()                                    # unlock
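The global lists guarded by g_mutex do the job, but they put all of the synchronization bookkeeping on the author. For comparison, here is a minimal sketch (not part of the original crawler; urlQueue, pageQueue and worker are names I made up) of the same download-worker pattern built on the standard Queue module, which is already thread-safe in Python 2:

import threading
import Queue
import urllib

urlQueue = Queue.Queue()      # URLs waiting to be downloaded
pageQueue = Queue.Queue()     # (url, html) pairs produced by the workers

def worker():
    while True:
        url = urlQueue.get()
        try:
            html = urllib.urlopen(url).read()
            pageQueue.put((url, html))
        except IOError:                    # urllib signals network failures as IOError
            print 'Failed downloading', url
        urlQueue.task_done()

for _ in range(5):                         # five workers, same role as threadNumber
    t = threading.Thread(target=worker)
    t.setDaemon(True)
    t.start()

urlQueue.put('http://www.baidu.com')
urlQueue.join()                            # returns once every queued URL has been handled

The link extraction and depth bookkeeping would stay exactly as in WebCrawler.py; only the plumbing around the download threads changes.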
GetUrl.py
# -*- coding: cp936 -*-

urlSep = ['<', '>', '//', '(', ')', r'"', r"'", ' ', '\t', '\n']   # characters that end a URL ('//' never matches a single char, so it is effectively unused)
urlTag = ['http://']                                               # prefixes that start a URL


def is_sep(ch):
    # True if ch is one of the separator characters above
    for c in urlSep:
        if c == ch:
            return True
    return False


def find_first_sep(i, s):
    # index of the first separator in s at or after position i, or len(s) if there is none
    while i < len(s):
        if is_sep(s[i]):
            return i
        i += 1
    return len(s)


def GetUrl(strPage):
    # scan the raw page text and collect every substring that starts with a tag
    # in urlTag and runs up to the next separator character
    rtList = []
    for tag in urlTag:
        i = 0
        i = strPage.find(tag, i, len(strPage))
        while i != -1:
            begin = i
            end = find_first_sep(begin + len(tag), strPage)
            rtList.append(strPage[begin:end])
            i = strPage.find(tag, end, len(strPage))
    return rtList
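To see what GetUrl actually returns, a quick test on a made-up snippet (the HTML string below is my own example, not from the post):

import GetUrl

page = '<a href="http://www.example.com/index.html">link</a> see http://example.org/a.htm'
print GetUrl.GetUrl(page)
# prints: ['http://www.example.com/index.html', 'http://example.org/a.htm']

It is a plain substring scan: a URL ends at the first quote, bracket or whitespace character, links inside scripts and comments are picked up just as readily as real anchors, and anything that does not start with http:// (relative links, https) is missed. That is the first piece to rework when turning this into the targeted parser mentioned at the top.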