Today I integrated a BFS crawler with the HTML body-text extraction. The functionality is still fairly limited. For the text-extraction part, see http://www.fuxiang90.me/2012/02/%E6%8A%BD%E5%8F%96html-%E6%AD%A3%E6%96%87/
- For now it only crawls http URLs, and I have only tested it on the intranet, because the connection to the outside network is not very fast.
- There is one global url queue and one global url set. The queue makes the BFS easy to implement, and the set keeps the crawler from fetching the same page twice. The flow, like the underlying idea, is quite simple (a minimal sketch of this loop appears right after this list).
- It is single-threaded, so it is bound to be slow; later I will look at a multi-threaded version in which fetching pages, extracting URLs, and extracting the body text can run concurrently.
- The crawling approach comes from https://www.ibm.com/developerworks/cn/opensource/os-cn-crawler/ . Besides extracting the URLs from each page, I also extract its body text, so that Chinese word segmentation will be easier when I build an index later.
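Here is the minimal sketch of that queue-plus-set loop mentioned above. It is not the full crawler; fetch_links() is a hypothetical stand-in for "download the page and return the URLs it contains".

import Queue

def bfs_crawl(seed_url, max_pages=100):
    url_queue = Queue.Queue()        # frontier: URLs waiting to be crawled, in BFS order
    seen = set([seed_url])           # every URL ever enqueued, so nothing is crawled twice
    url_queue.put(seed_url)
    crawled = 0
    while not url_queue.empty() and crawled < max_pages:
        url = url_queue.get()
        for link in fetch_links(url):        # hypothetical: fetch the page, return its URLs
            if link not in seen:
                seen.add(link)
                url_queue.put(link)
        crawled += 1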
Pasting the code inline here causes problems, probably because it contains HTML tags; please see http://www.fuxiang90.me/?p=728 instead.
# encoding:utf-8
# use BeautifulSoup to get font|p context
# Single-threaded version: crawl html pages breadth-first and extract the body
# text afterwards; a single thread is admittedly a bit slow.
# Feel free to reuse this code, but please keep the following line
# author : fuxiang ,mail: fuxiang90@gmail.com
from BeautifulSoup import BeautifulSoup  # For processing HTML
import urllib2
import os
import sys
import re
import Queue
import socket
import time

socket.setdefaulttimeout(8)

g_url_queue = Queue.Queue()                  # BFS frontier
g_url_queue.put('http://www.bupt.edu.cn/')
tt = ['http://www.bupt.edu.cn/']
g_url_set = set(tt)                          # URLs already seen
max_deep = 1


# Takes a soup object and extracts the URLs it contains.
def get_url_list(html):
    global g_url_set
    re_html = r'(http://(\w+\.)+\w+)'
    res = html.findAll('a')                  # find all <a> tags
    for x in res:
        t = unicode(x)                       # x is a soup tag object
        m = re.findall(re_html, t)
        if m is None:
            continue
        for xx in m:
            str_url = xx[0]
            if str_url not in g_url_set:
                g_url_queue.put(str_url)
                g_url_set.add(str_url)


#######################################################
def strip_tags(html):
    """
    Strip HTML tags from a string.
    >>> str_text = strip_tags("<font color=red>hello</font>")
    >>> print str_text
    hello
    """
    from HTMLParser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)


#######################################################
# Accepts either a URL or a local file name and extracts the body text from it.
def get_context(url):
    re_html = r'http[s]?://[A-Za-z0-9]+\.[A-Za-z0-9]+\.[A-Za-z0-9]+'
    m = re.match(re_html, str(url))
    if m is None:
        # url is a local file
        fp = open(unicode(url), 'r')
    else:
        fp = urllib2.urlopen(url)
    html = fp.read()
    soup = BeautifulSoup(html)
    allfonttext = soup.findAll(['a', 'p', 'font'])
    if len(allfonttext) <= 0:
        print 'not found text'
    fwrite = open('u' + str(url), 'w')
    for i in allfonttext:
        t = i.renderContents()
        context = strip_tags(t)
        fwrite.write(context)
    fwrite.close()


#######################################################
def main_fun(deep):
    global g_url_set
    global g_url_queue
    if deep > max_deep:
        return
    count = 0
    while g_url_queue.empty() is not True:
        l_url = g_url_queue.get()
        print l_url
        # catch timeouts -- some pages cannot be reached
        try:
            fp = urllib2.urlopen(l_url)
        except:
            continue
        html = fp.read()
        fwrite = open(str(count + 1), 'w')   # save the raw page under a numeric file name
        fwrite.write(html)
        fwrite.close()
        soup = BeautifulSoup(html)
        get_url_list(soup)
        get_context(count + 1)               # extract the body text from the saved file
        count += 1
        if count >= 100:
            return


# incomplete
def get_html_page(url):
    furl = urllib2.urlopen(url)
    html = furl.read()
    soup = BeautifulSoup(html)


if __name__ == "__main__":
    main_fun(1)
    time.sleep(10)
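One possible refinement, sketched here rather than taken from the code above: instead of regex-matching the rendered <a> tag, read the href attribute directly and resolve relative links against the page URL, which also catches links that the http:// regex misses. This assumes BeautifulSoup 3 and Python 2, as in the script above.

from urlparse import urljoin

def extract_links(soup, base_url):
    links = []
    for a in soup.findAll('a', href=True):        # only anchors that actually carry an href
        absolute = urljoin(base_url, a['href'])   # turn relative links into absolute ones
        if absolute.startswith('http://'):        # keep the http-only restriction for now
            links.append(absolute)
    return links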
Next I wanted a multi-threaded version, so that downloading pages and analysing the html (extracting the body text and the URLs) can run concurrently. After some simple changes to the code above it just about runs. The main changes are adding threading and guarding access to the global queues with locks. Since I had never written multi-threaded code before, I would welcome suggestions from anyone passing by.
# encoding:utf-8
# use BeautifulSoup to get font|p context
# Feel free to reuse this code, but please keep the following line
# author : fuxiang ,mail: fuxiang90@gmail.com
from BeautifulSoup import BeautifulSoup  # For processing HTML
import urllib2
import os
import sys
import re
import Queue
import socket
import time
import threading

queue_lock = threading.RLock()
file_lock = threading.RLock()
socket.setdefaulttimeout(8)

g_url_queue = Queue.Queue()                  # URLs waiting to be fetched
g_url_queue.put('http://www.bupt.edu.cn/')
g_file_queue = Queue.Queue()                 # saved pages waiting to be parsed
tt = ['http://www.bupt.edu.cn/']
g_url_set = set(tt)                          # URLs already seen
max_deep = 1


#######################################################
def strip_tags(html):
    """
    Strip HTML tags from a string.
    >>> str_text = strip_tags("<font color=red>hello</font>")
    >>> print str_text
    hello
    """
    from HTMLParser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)


def get_context(soup, url):
    allfonttext = soup.findAll(['a', 'p', 'font'])
    if len(allfonttext) <= 0:
        print 'not found text'
    fwrite = open('u' + str(url), 'w')
    for i in allfonttext:
        t = i.renderContents()
        context = strip_tags(t)
        fwrite.write(context)
    fwrite.close()


# Downloader thread: takes URLs from g_url_queue, saves the raw pages to disk,
# and pushes the file names onto g_file_queue.
class get_page_thread(threading.Thread):
    def __init__(self, name):
        threading.Thread.__init__(self)
        self.t_name = name

    def run(self):
        global g_url_set
        global g_url_queue
        global g_file_queue
        count = 0
        while g_url_queue.empty() is not True:
            print self.t_name
            # guard access to the shared queue with a lock
            queue_lock.acquire()
            l_url = g_url_queue.get()
            queue_lock.release()
            print l_url
            # catch timeouts -- some pages cannot be reached
            try:
                fp = urllib2.urlopen(l_url)
            except:
                continue
            html = fp.read()
            fwrite = open(str(count + 1), 'w')
            fwrite.write(html)
            fwrite.close()
            file_lock.acquire()
            g_file_queue.put(count + 1)
            file_lock.release()
            count += 1
            if count >= 100:
                return


# Parser thread: takes saved pages from g_file_queue, extracts the body text,
# and feeds newly found URLs back into g_url_queue.
class get_url_list_thread(threading.Thread):
    def __init__(self, name):
        threading.Thread.__init__(self)
        self.t_name = name

    def run(self):
        global g_url_set
        global g_file_queue
        global queue_lock
        global file_lock
        while g_file_queue.empty() is not True:
            file_lock.acquire()
            filename = g_file_queue.get()
            file_lock.release()
            fd = open(str(filename), 'r')
            html = fd.read()
            soup = BeautifulSoup(html)
            get_context(soup, filename)
            re_html = r'(http://(\w+\.)+\w+)'
            res = soup.findAll('a')              # find all <a> tags
            for x in res:
                t = unicode(x)                   # x is a soup tag object
                m = re.findall(re_html, t)
                if m is None:
                    continue
                for xx in m:
                    str_url = xx[0]
                    if str_url not in g_url_set:
                        queue_lock.acquire()
                        g_url_queue.put(str_url)
                        queue_lock.release()
                        g_url_set.add(str_url)


# incomplete
def get_html_page(url):
    furl = urllib2.urlopen(url)
    html = furl.read()
    soup = BeautifulSoup(html)


if __name__ == "__main__":
    thread1 = get_page_thread('a')
    thread2 = get_url_list_thread('b')
    thread3 = get_page_thread('c')
    thread4 = get_page_thread('d')
    thread1.start()
    time.sleep(20)
    thread2.start()
    time.sleep(20)
    thread3.start()
    thread4.start()
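A note for a later revision (a sketch, not part of the code above): Queue.Queue already does its own internal locking, so the explicit RLocks around put/get are not strictly needed, and checking empty() before get() becomes racy once several threads share the queue. A worker loop can instead block on get() with a timeout; handle_url() below is a hypothetical stand-in for "fetch the page and hand it to the parser thread".

import Queue

def page_worker(url_queue, handle_url):
    # Rely on Queue.Queue's built-in locking: get(timeout=...) blocks until an
    # item arrives or the timeout expires, which also avoids the
    # empty()-then-get() race between threads.
    while True:
        try:
            url = url_queue.get(timeout=10)   # wait up to 10 seconds for work
        except Queue.Empty:
            break                             # nothing arrived, assume the crawl is done
        handle_url(url)                       # hypothetical: fetch the page, pass it on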